/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <math.h>
#include <stdlib.h>
#include <string.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/inv_txfm.h"

// Inverse 4x4 Walsh-Hadamard transform of a full 16-coefficient block.
// The reconstructed residual is added into dest (clipped to pixel range).
void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;

  // Row pass: inverse WHT of each row into the intermediate buffer.
  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = WRAPLOW(a1);
    op[1] = WRAPLOW(b1);
    op[2] = WRAPLOW(c1);
    op[3] = WRAPLOW(d1);
    ip += 4;
    op += 4;
  }

  // Column pass: same butterfly on each column, result added to dest.
  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
    dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
    dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
    dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));

    ip++;
    dest++;
  }
}

// Inverse 4x4 WHT when only the DC coefficient (input[0]) is non-zero.
void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i;
  tran_high_t a1, e1;
  tran_low_t tmp[4];
  const tran_low_t *ip = input;
  tran_low_t *op = tmp;

  // Row pass degenerates to splitting the DC term into a1 and e1.
  a1 = ip[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  op[0] = WRAPLOW(a1);
  op[1] = op[2] = op[3] = WRAPLOW(e1);

  // Column pass: spread each row value down its column and add to dest.
  ip = tmp;
  for (i = 0; i < 4; i++) {
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
    dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
    dest[stride * 1] = clip_pixel_add(dest[stride * 1], e1);
    dest[stride * 2] = clip_pixel_add(dest[stride * 2], e1);
    dest[stride * 3] = clip_pixel_add(dest[stride * 3], e1);
    ip++;
    dest++;
  }
}

// 4-point 1-D inverse ADST.
void iadst4_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];

  // All-zero input produces all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3)) {
    memset(output, 0, 4 * sizeof(*output));
    return;
  }

  // 32-bit result is enough for the following multiplications.
  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = WRAPLOW(x0 - x2 + x3);

  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
  output[2] = WRAPLOW(dct_const_round_shift(s2));
  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
}

// 4-point 1-D inverse DCT (two-stage butterfly form).
void idct4_c(const tran_low_t *input, tran_low_t *output) {
  int16_t step[4];
  tran_high_t temp1, temp2;

  // stage 1
  temp1 = ((int16_t)input[0] + (int16_t)input[2]) * cospi_16_64;
  temp2 = ((int16_t)input[0] - (int16_t)input[2]) * cospi_16_64;
  step[0] = WRAPLOW(dct_const_round_shift(temp1));
  step[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (int16_t)input[1] * cospi_24_64 - (int16_t)input[3] * cospi_8_64;
  temp2 = (int16_t)input[1] * cospi_8_64 + (int16_t)input[3] * cospi_24_64;
  step[2] = WRAPLOW(dct_const_round_shift(temp1));
  step[3] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2
  output[0] = WRAPLOW(step[0] + step[3]);
  output[1] = WRAPLOW(step[1] + step[2]);
  output[2] = WRAPLOW(step[1] - step[2]);
  output[3] = WRAPLOW(step[0] - step[3]);
}

// Full 4x4 inverse DCT (all 16 coefficients); residual is added to dest.
void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i, j;
  tran_low_t out[4 * 4];
  tran_low_t *outptr = out;
  tran_low_t temp_in[4], temp_out[4];

  // Rows
  for (i = 0; i < 4; ++i) {
    idct4_c(input, outptr);
    input += 4;
    outptr += 4;
  }

  // Columns
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
    idct4_c(temp_in, temp_out);
    for (j = 0; j < 4; ++j) {
      // Final per-pixel rounding shift of 4 for the 4x4 transform.
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
    }
  }
}

// DC-only 4x4 inverse DCT: the same residual a1 is added to every pixel.
void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i;
  tran_high_t a1;
  tran_low_t out =
      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));

  // Two cospi_16_64 scalings = the row and column passes applied to DC.
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(out, 4);

  for (i = 0; i < 4; i++) {
    dest[0] = clip_pixel_add(dest[0], a1);
    dest[1] = clip_pixel_add(dest[1], a1);
    dest[2] = clip_pixel_add(dest[2], a1);
    dest[3] = clip_pixel_add(dest[3], a1);
    dest += stride;
  }
}

// 8-point 1-D inverse ADST. Note the permuted input order below.
void iadst8_c(const tran_low_t *input, tran_low_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;
  tran_high_t x0 = input[7];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[5];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[3];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[1];
  tran_high_t x7 = input[6];

  // All-zero input produces all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1
  s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
  s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
  s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
  s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);

  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));

  // stage 2
  s0 = (int)x0;
  s1 = (int)x1;
  s2 = (int)x2;
  s3 = (int)x3;
  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);

  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));

  // stage 3
  s2 = (int)(cospi_16_64 * (x2 + x3));
  s3 = (int)(cospi_16_64 * (x2 - x3));
  s6 = (int)(cospi_16_64 * (x6 + x7));
  s7 = (int)(cospi_16_64 * (x6 - x7));

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));

  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x4);
  output[2] = WRAPLOW(x6);
  output[3] = WRAPLOW(-x2);
  output[4] = WRAPLOW(x3);
  output[5] = WRAPLOW(-x7);
  output[6] = WRAPLOW(x5);
  output[7] = WRAPLOW(-x1);
}

// 8-point 1-D inverse DCT.
void idct8_c(const tran_low_t *input, tran_low_t *output) {
  int16_t step1[8], step2[8];
  tran_high_t temp1, temp2;

  // stage 1
  step1[0] = (int16_t)input[0];
  step1[2] = (int16_t)input[4];
  step1[1] = (int16_t)input[2];
  step1[3] = (int16_t)input[6];
  temp1 = (int16_t)input[1] * cospi_28_64 - (int16_t)input[7] * cospi_4_64;
  temp2 = (int16_t)input[1] * cospi_4_64 + (int16_t)input[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (int16_t)input[5] * cospi_12_64 - (int16_t)input[3] * cospi_20_64;
  temp2 = (int16_t)input[5] * cospi_20_64 + (int16_t)input[3] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2
  temp1 = (step1[0] + step1[2]) * cospi_16_64;
  temp2 = (step1[0] - step1[2]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  // stage 3
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  // stage 4
  output[0] = WRAPLOW(step1[0] + step1[7]);
  output[1] = WRAPLOW(step1[1] + step1[6]);
  output[2] = WRAPLOW(step1[2] + step1[5]);
  output[3] = WRAPLOW(step1[3] + step1[4]);
  output[4] = WRAPLOW(step1[3] - step1[4]);
  output[5] = WRAPLOW(step1[2] - step1[5]);
  output[6] = WRAPLOW(step1[1] - step1[6]);
  output[7] = WRAPLOW(step1[0] - step1[7]);
}

// Full 8x8 inverse DCT (all 64 coefficients); residual is added to dest.
void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i, j;
  tran_low_t out[8 * 8];
  tran_low_t *outptr = out;
  tran_low_t temp_in[8], temp_out[8];

  // First transform rows
  for (i = 0; i < 8; ++i) {
    idct8_c(input, outptr);
    input += 8;
    outptr += 8;
  }

  // Then transform columns
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
    idct8_c(temp_in, temp_out);
    for (j = 0; j < 8; ++j) {
      // Final per-pixel rounding shift of 5 for the 8x8 transform.
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
    }
  }
}

// 8x8 inverse DCT when at most 12 coefficients (first 4 rows) are non-zero.
void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i, j;
  tran_low_t out[8 * 8] = { 0 };  // remaining rows stay zero
  tran_low_t *outptr = out;
  tran_low_t temp_in[8], temp_out[8];

  // First transform rows
  // Only the first 4 rows have non-zero coefficients.
  for (i = 0; i < 4; ++i) {
    idct8_c(input, outptr);
    input += 8;
    outptr += 8;
  }

  // Then transform columns
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
    idct8_c(temp_in, temp_out);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
    }
  }
}

// DC-only 8x8 inverse DCT: the same residual a1 is added to every pixel.
void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i, j;
  tran_high_t a1;
  tran_low_t out =
      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));

  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(out, 5);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}

// 16-point 1-D inverse ADST. Note the permuted input order below.
void iadst16_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;
  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];

  // All-zero input produces all-zero output; skip the arithmetic.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
        x13 | x14 | x15)) {
    memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
  x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
  x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = WRAPLOW(s0 + s4);
  x1 = WRAPLOW(s1 + s5);
  x2 = WRAPLOW(s2 + s6);
  x3 = WRAPLOW(s3 + s7);
  x4 = WRAPLOW(s0 - s4);
  x5 = WRAPLOW(s1 - s5);
  x6 = WRAPLOW(s2 - s6);
  x7 = WRAPLOW(s3 - s7);
  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
  x8 = WRAPLOW(s8 + s10);
  x9 = WRAPLOW(s9 + s11);
  x10 = WRAPLOW(s8 - s10);
  x11 = WRAPLOW(s9 - s11);
  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));

  // stage 4
  s2 = (-cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (-cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));
  x10 = WRAPLOW(dct_const_round_shift(s10));
  x11 = WRAPLOW(dct_const_round_shift(s11));
  x14 = WRAPLOW(dct_const_round_shift(s14));
  x15 = WRAPLOW(dct_const_round_shift(s15));

  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x8);
  output[2] = WRAPLOW(x12);
  output[3] = WRAPLOW(-x4);
  output[4] = WRAPLOW(x6);
  output[5] = WRAPLOW(x14);
  output[6] = WRAPLOW(x10);
  output[7] = WRAPLOW(x2);
  output[8] = WRAPLOW(x3);
  output[9] = WRAPLOW(x11);
  output[10] = WRAPLOW(x15);
  output[11] = WRAPLOW(x7);
  output[12] = WRAPLOW(x5);
  output[13] = WRAPLOW(-x13);
  output[14] = WRAPLOW(x9);
  output[15] = WRAPLOW(-x1);
}

// 16-point 1-D inverse DCT.
void idct16_c(const tran_low_t *input, tran_low_t *output) {
  int16_t step1[16], step2[16];
  tran_high_t temp1, temp2;

  // stage 1
  // Indices are written as (2*k)/2 to mirror the bit-reversed load order
  // used by the larger transforms.
  step1[0] = (int16_t)input[0 / 2];
  step1[1] = (int16_t)input[16 / 2];
  step1[2] = (int16_t)input[8 / 2];
  step1[3] = (int16_t)input[24 / 2];
  step1[4] = (int16_t)input[4 / 2];
  step1[5] = (int16_t)input[20 / 2];
  step1[6] = (int16_t)input[12 / 2];
  step1[7] = (int16_t)input[28 / 2];
  step1[8] = (int16_t)input[2 / 2];
  step1[9] = (int16_t)input[18 / 2];
  step1[10] = (int16_t)input[10 / 2];
  step1[11] = (int16_t)input[26 / 2];
  step1[12] = (int16_t)input[6 / 2];
  step1[13] = (int16_t)input[22 / 2];
  step1[14] = (int16_t)input[14 / 2];
  step1[15] = (int16_t)input[30 / 2];

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
  step2[15] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  step1[8] = WRAPLOW(step2[8] + step2[9]);
  step1[9] = WRAPLOW(step2[8] - step2[9]);
  step1[10] = WRAPLOW(-step2[10] + step2[11]);
  step1[11] = WRAPLOW(step2[10] + step2[11]);
  step1[12] = WRAPLOW(step2[12] + step2[13]);
  step1[13] = WRAPLOW(step2[12] - step2[13]);
  step1[14] = WRAPLOW(-step2[14] + step2[15]);
  step1[15] = WRAPLOW(step2[14] + step2[15]);

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11]);
  step1[9] = WRAPLOW(step2[9] + step2[10]);
  step1[10] = WRAPLOW(step2[9] - step2[10]);
  step1[11] = WRAPLOW(step2[8] - step2[11]);
  step1[12] = WRAPLOW(-step2[12] + step2[15]);
  step1[13] = WRAPLOW(-step2[13] + step2[14]);
  step1[14] = WRAPLOW(step2[13] + step2[14]);
  step1[15] = WRAPLOW(step2[12] + step2[15]);

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7]);
  step2[1] = WRAPLOW(step1[1] + step1[6]);
  step2[2] = WRAPLOW(step1[2] + step1[5]);
  step2[3] = WRAPLOW(step1[3] + step1[4]);
  step2[4] = WRAPLOW(step1[3] - step1[4]);
  step2[5] = WRAPLOW(step1[2] - step1[5]);
  step2[6] = WRAPLOW(step1[1] - step1[6]);
  step2[7] = WRAPLOW(step1[0] - step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  output[0] = WRAPLOW(step2[0] + step2[15]);
  output[1] = WRAPLOW(step2[1] + step2[14]);
  output[2] = WRAPLOW(step2[2] + step2[13]);
  output[3] = WRAPLOW(step2[3] + step2[12]);
  output[4] = WRAPLOW(step2[4] + step2[11]);
  output[5] = WRAPLOW(step2[5] + step2[10]);
  output[6] = WRAPLOW(step2[6] + step2[9]);
  output[7] = WRAPLOW(step2[7] + step2[8]);
  output[8] = WRAPLOW(step2[7] - step2[8]);
  output[9] = WRAPLOW(step2[6] - step2[9]);
  output[10] = WRAPLOW(step2[5] - step2[10]);
  output[11] = WRAPLOW(step2[4] - step2[11]);
  output[12] = WRAPLOW(step2[3] - step2[12]);
  output[13] = WRAPLOW(step2[2] - step2[13]);
  output[14] = WRAPLOW(step2[1] - step2[14]);
  output[15] = WRAPLOW(step2[0] - step2[15]);
}

// Full 16x16 inverse DCT (all 256 coefficients); residual is added to dest.
void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  int i, j;
  tran_low_t out[16 * 16];
  tran_low_t *outptr = out;
  tran_low_t temp_in[16], temp_out[16];

  // First transform rows
  for (i = 0; i < 16; ++i) {
    idct16_c(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
    idct16_c(temp_in, temp_out);
    for (j = 0; j < 16; ++j) {
      // Final per-pixel rounding shift of 6 for the 16x16 transform.
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}

// 16x16 inverse DCT when at most 38 coefficients (upper-left 8x8) are
// non-zero.
void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  int i, j;
  tran_low_t out[16 * 16] = { 0 };  // remaining rows stay zero
  tran_low_t *outptr = out;
  tran_low_t temp_in[16], temp_out[16];

  // First transform rows. Since all non-zero dct coefficients are in
  // upper-left 8x8 area, we only need to calculate first 8 rows here.
  for (i = 0; i < 8; ++i) {
    idct16_c(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
    idct16_c(temp_in, temp_out);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}

// 16x16 inverse DCT when at most 10 coefficients (upper-left 4x4) are
// non-zero.
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  int i, j;
  tran_low_t out[16 * 16] = { 0 };  // remaining rows stay zero
  tran_low_t *outptr = out;
  tran_low_t temp_in[16], temp_out[16];

  // First transform rows. Since all non-zero dct coefficients are in
  // upper-left 4x4 area, we only need to calculate first 4 rows here.
  for (i = 0; i < 4; ++i) {
    idct16_c(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
    idct16_c(temp_in, temp_out);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}

// DC-only 16x16 inverse DCT: the same residual a1 is added to every pixel.
void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i, j;
  tran_high_t a1;
  tran_low_t out =
      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));

  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(out, 6);
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}

// 32-point 1-D inverse DCT.
void idct32_c(const tran_low_t *input, tran_low_t *output) {
  int16_t step1[32], step2[32];
  tran_high_t temp1, temp2;

  // stage 1
  step1[0] = (int16_t)input[0];
  step1[1] = (int16_t)input[16];
  step1[2] = (int16_t)input[8];
  step1[3] = (int16_t)input[24];
  step1[4] = (int16_t)input[4];
  step1[5] = (int16_t)input[20];
  step1[6] = (int16_t)input[12];
  step1[7] = (int16_t)input[28];
  step1[8] = (int16_t)input[2];
  step1[9] = (int16_t)input[18];
  step1[10] = (int16_t)input[10];
  step1[11] = (int16_t)input[26];
  step1[12] = (int16_t)input[6];
  step1[13] = (int16_t)input[22];
  step1[14] = (int16_t)input[14];
  step1[15] = (int16_t)input[30];

  temp1 = (int16_t)input[1] * cospi_31_64 - (int16_t)input[31] * cospi_1_64;
  temp2 = (int16_t)input[1] * cospi_1_64 + (int16_t)input[31] * cospi_31_64;
  step1[16] = WRAPLOW(dct_const_round_shift(temp1));
  step1[31] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = (int16_t)input[17] * cospi_15_64 - (int16_t)input[15] * cospi_17_64;
  temp2 = (int16_t)input[17] * cospi_17_64 + (int16_t)input[15] *
cospi_15_64; 842 step1[17] = WRAPLOW(dct_const_round_shift(temp1)); 843 step1[30] = WRAPLOW(dct_const_round_shift(temp2)); 844 845 temp1 = (int16_t)input[9] * cospi_23_64 - (int16_t)input[23] * cospi_9_64; 846 temp2 = (int16_t)input[9] * cospi_9_64 + (int16_t)input[23] * cospi_23_64; 847 step1[18] = WRAPLOW(dct_const_round_shift(temp1)); 848 step1[29] = WRAPLOW(dct_const_round_shift(temp2)); 849 850 temp1 = (int16_t)input[25] * cospi_7_64 - (int16_t)input[7] * cospi_25_64; 851 temp2 = (int16_t)input[25] * cospi_25_64 + (int16_t)input[7] * cospi_7_64; 852 step1[19] = WRAPLOW(dct_const_round_shift(temp1)); 853 step1[28] = WRAPLOW(dct_const_round_shift(temp2)); 854 855 temp1 = (int16_t)input[5] * cospi_27_64 - (int16_t)input[27] * cospi_5_64; 856 temp2 = (int16_t)input[5] * cospi_5_64 + (int16_t)input[27] * cospi_27_64; 857 step1[20] = WRAPLOW(dct_const_round_shift(temp1)); 858 step1[27] = WRAPLOW(dct_const_round_shift(temp2)); 859 860 temp1 = (int16_t)input[21] * cospi_11_64 - (int16_t)input[11] * cospi_21_64; 861 temp2 = (int16_t)input[21] * cospi_21_64 + (int16_t)input[11] * cospi_11_64; 862 step1[21] = WRAPLOW(dct_const_round_shift(temp1)); 863 step1[26] = WRAPLOW(dct_const_round_shift(temp2)); 864 865 temp1 = (int16_t)input[13] * cospi_19_64 - (int16_t)input[19] * cospi_13_64; 866 temp2 = (int16_t)input[13] * cospi_13_64 + (int16_t)input[19] * cospi_19_64; 867 step1[22] = WRAPLOW(dct_const_round_shift(temp1)); 868 step1[25] = WRAPLOW(dct_const_round_shift(temp2)); 869 870 temp1 = (int16_t)input[29] * cospi_3_64 - (int16_t)input[3] * cospi_29_64; 871 temp2 = (int16_t)input[29] * cospi_29_64 + (int16_t)input[3] * cospi_3_64; 872 step1[23] = WRAPLOW(dct_const_round_shift(temp1)); 873 step1[24] = WRAPLOW(dct_const_round_shift(temp2)); 874 875 // stage 2 876 step2[0] = step1[0]; 877 step2[1] = step1[1]; 878 step2[2] = step1[2]; 879 step2[3] = step1[3]; 880 step2[4] = step1[4]; 881 step2[5] = step1[5]; 882 step2[6] = step1[6]; 883 step2[7] = step1[7]; 884 885 temp1 = 
step1[8] * cospi_30_64 - step1[15] * cospi_2_64; 886 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; 887 step2[8] = WRAPLOW(dct_const_round_shift(temp1)); 888 step2[15] = WRAPLOW(dct_const_round_shift(temp2)); 889 890 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; 891 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; 892 step2[9] = WRAPLOW(dct_const_round_shift(temp1)); 893 step2[14] = WRAPLOW(dct_const_round_shift(temp2)); 894 895 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; 896 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; 897 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); 898 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); 899 900 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; 901 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; 902 step2[11] = WRAPLOW(dct_const_round_shift(temp1)); 903 step2[12] = WRAPLOW(dct_const_round_shift(temp2)); 904 905 step2[16] = WRAPLOW(step1[16] + step1[17]); 906 step2[17] = WRAPLOW(step1[16] - step1[17]); 907 step2[18] = WRAPLOW(-step1[18] + step1[19]); 908 step2[19] = WRAPLOW(step1[18] + step1[19]); 909 step2[20] = WRAPLOW(step1[20] + step1[21]); 910 step2[21] = WRAPLOW(step1[20] - step1[21]); 911 step2[22] = WRAPLOW(-step1[22] + step1[23]); 912 step2[23] = WRAPLOW(step1[22] + step1[23]); 913 step2[24] = WRAPLOW(step1[24] + step1[25]); 914 step2[25] = WRAPLOW(step1[24] - step1[25]); 915 step2[26] = WRAPLOW(-step1[26] + step1[27]); 916 step2[27] = WRAPLOW(step1[26] + step1[27]); 917 step2[28] = WRAPLOW(step1[28] + step1[29]); 918 step2[29] = WRAPLOW(step1[28] - step1[29]); 919 step2[30] = WRAPLOW(-step1[30] + step1[31]); 920 step2[31] = WRAPLOW(step1[30] + step1[31]); 921 922 // stage 3 923 step1[0] = step2[0]; 924 step1[1] = step2[1]; 925 step1[2] = step2[2]; 926 step1[3] = step2[3]; 927 928 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; 929 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; 930 step1[4] = 
WRAPLOW(dct_const_round_shift(temp1)); 931 step1[7] = WRAPLOW(dct_const_round_shift(temp2)); 932 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; 933 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; 934 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); 935 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); 936 937 step1[8] = WRAPLOW(step2[8] + step2[9]); 938 step1[9] = WRAPLOW(step2[8] - step2[9]); 939 step1[10] = WRAPLOW(-step2[10] + step2[11]); 940 step1[11] = WRAPLOW(step2[10] + step2[11]); 941 step1[12] = WRAPLOW(step2[12] + step2[13]); 942 step1[13] = WRAPLOW(step2[12] - step2[13]); 943 step1[14] = WRAPLOW(-step2[14] + step2[15]); 944 step1[15] = WRAPLOW(step2[14] + step2[15]); 945 946 step1[16] = step2[16]; 947 step1[31] = step2[31]; 948 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; 949 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; 950 step1[17] = WRAPLOW(dct_const_round_shift(temp1)); 951 step1[30] = WRAPLOW(dct_const_round_shift(temp2)); 952 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; 953 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; 954 step1[18] = WRAPLOW(dct_const_round_shift(temp1)); 955 step1[29] = WRAPLOW(dct_const_round_shift(temp2)); 956 step1[19] = step2[19]; 957 step1[20] = step2[20]; 958 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; 959 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; 960 step1[21] = WRAPLOW(dct_const_round_shift(temp1)); 961 step1[26] = WRAPLOW(dct_const_round_shift(temp2)); 962 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; 963 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; 964 step1[22] = WRAPLOW(dct_const_round_shift(temp1)); 965 step1[25] = WRAPLOW(dct_const_round_shift(temp2)); 966 step1[23] = step2[23]; 967 step1[24] = step2[24]; 968 step1[27] = step2[27]; 969 step1[28] = step2[28]; 970 971 // stage 4 972 temp1 = (step1[0] + step1[1]) * cospi_16_64; 973 temp2 = (step1[0] - step1[1]) * cospi_16_64; 974 
step2[0] = WRAPLOW(dct_const_round_shift(temp1)); 975 step2[1] = WRAPLOW(dct_const_round_shift(temp2)); 976 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; 977 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; 978 step2[2] = WRAPLOW(dct_const_round_shift(temp1)); 979 step2[3] = WRAPLOW(dct_const_round_shift(temp2)); 980 step2[4] = WRAPLOW(step1[4] + step1[5]); 981 step2[5] = WRAPLOW(step1[4] - step1[5]); 982 step2[6] = WRAPLOW(-step1[6] + step1[7]); 983 step2[7] = WRAPLOW(step1[6] + step1[7]); 984 985 step2[8] = step1[8]; 986 step2[15] = step1[15]; 987 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; 988 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; 989 step2[9] = WRAPLOW(dct_const_round_shift(temp1)); 990 step2[14] = WRAPLOW(dct_const_round_shift(temp2)); 991 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; 992 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; 993 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); 994 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); 995 step2[11] = step1[11]; 996 step2[12] = step1[12]; 997 998 step2[16] = WRAPLOW(step1[16] + step1[19]); 999 step2[17] = WRAPLOW(step1[17] + step1[18]); 1000 step2[18] = WRAPLOW(step1[17] - step1[18]); 1001 step2[19] = WRAPLOW(step1[16] - step1[19]); 1002 step2[20] = WRAPLOW(-step1[20] + step1[23]); 1003 step2[21] = WRAPLOW(-step1[21] + step1[22]); 1004 step2[22] = WRAPLOW(step1[21] + step1[22]); 1005 step2[23] = WRAPLOW(step1[20] + step1[23]); 1006 1007 step2[24] = WRAPLOW(step1[24] + step1[27]); 1008 step2[25] = WRAPLOW(step1[25] + step1[26]); 1009 step2[26] = WRAPLOW(step1[25] - step1[26]); 1010 step2[27] = WRAPLOW(step1[24] - step1[27]); 1011 step2[28] = WRAPLOW(-step1[28] + step1[31]); 1012 step2[29] = WRAPLOW(-step1[29] + step1[30]); 1013 step2[30] = WRAPLOW(step1[29] + step1[30]); 1014 step2[31] = WRAPLOW(step1[28] + step1[31]); 1015 1016 // stage 5 1017 step1[0] = WRAPLOW(step2[0] + step2[3]); 1018 step1[1] = WRAPLOW(step2[1] + step2[2]); 
1019 step1[2] = WRAPLOW(step2[1] - step2[2]); 1020 step1[3] = WRAPLOW(step2[0] - step2[3]); 1021 step1[4] = step2[4]; 1022 temp1 = (step2[6] - step2[5]) * cospi_16_64; 1023 temp2 = (step2[5] + step2[6]) * cospi_16_64; 1024 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); 1025 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); 1026 step1[7] = step2[7]; 1027 1028 step1[8] = WRAPLOW(step2[8] + step2[11]); 1029 step1[9] = WRAPLOW(step2[9] + step2[10]); 1030 step1[10] = WRAPLOW(step2[9] - step2[10]); 1031 step1[11] = WRAPLOW(step2[8] - step2[11]); 1032 step1[12] = WRAPLOW(-step2[12] + step2[15]); 1033 step1[13] = WRAPLOW(-step2[13] + step2[14]); 1034 step1[14] = WRAPLOW(step2[13] + step2[14]); 1035 step1[15] = WRAPLOW(step2[12] + step2[15]); 1036 1037 step1[16] = step2[16]; 1038 step1[17] = step2[17]; 1039 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; 1040 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; 1041 step1[18] = WRAPLOW(dct_const_round_shift(temp1)); 1042 step1[29] = WRAPLOW(dct_const_round_shift(temp2)); 1043 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; 1044 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; 1045 step1[19] = WRAPLOW(dct_const_round_shift(temp1)); 1046 step1[28] = WRAPLOW(dct_const_round_shift(temp2)); 1047 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; 1048 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; 1049 step1[20] = WRAPLOW(dct_const_round_shift(temp1)); 1050 step1[27] = WRAPLOW(dct_const_round_shift(temp2)); 1051 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; 1052 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; 1053 step1[21] = WRAPLOW(dct_const_round_shift(temp1)); 1054 step1[26] = WRAPLOW(dct_const_round_shift(temp2)); 1055 step1[22] = step2[22]; 1056 step1[23] = step2[23]; 1057 step1[24] = step2[24]; 1058 step1[25] = step2[25]; 1059 step1[30] = step2[30]; 1060 step1[31] = step2[31]; 1061 1062 // stage 6 1063 step2[0] = WRAPLOW(step1[0] + 
step1[7]); 1064 step2[1] = WRAPLOW(step1[1] + step1[6]); 1065 step2[2] = WRAPLOW(step1[2] + step1[5]); 1066 step2[3] = WRAPLOW(step1[3] + step1[4]); 1067 step2[4] = WRAPLOW(step1[3] - step1[4]); 1068 step2[5] = WRAPLOW(step1[2] - step1[5]); 1069 step2[6] = WRAPLOW(step1[1] - step1[6]); 1070 step2[7] = WRAPLOW(step1[0] - step1[7]); 1071 step2[8] = step1[8]; 1072 step2[9] = step1[9]; 1073 temp1 = (-step1[10] + step1[13]) * cospi_16_64; 1074 temp2 = (step1[10] + step1[13]) * cospi_16_64; 1075 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); 1076 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); 1077 temp1 = (-step1[11] + step1[12]) * cospi_16_64; 1078 temp2 = (step1[11] + step1[12]) * cospi_16_64; 1079 step2[11] = WRAPLOW(dct_const_round_shift(temp1)); 1080 step2[12] = WRAPLOW(dct_const_round_shift(temp2)); 1081 step2[14] = step1[14]; 1082 step2[15] = step1[15]; 1083 1084 step2[16] = WRAPLOW(step1[16] + step1[23]); 1085 step2[17] = WRAPLOW(step1[17] + step1[22]); 1086 step2[18] = WRAPLOW(step1[18] + step1[21]); 1087 step2[19] = WRAPLOW(step1[19] + step1[20]); 1088 step2[20] = WRAPLOW(step1[19] - step1[20]); 1089 step2[21] = WRAPLOW(step1[18] - step1[21]); 1090 step2[22] = WRAPLOW(step1[17] - step1[22]); 1091 step2[23] = WRAPLOW(step1[16] - step1[23]); 1092 1093 step2[24] = WRAPLOW(-step1[24] + step1[31]); 1094 step2[25] = WRAPLOW(-step1[25] + step1[30]); 1095 step2[26] = WRAPLOW(-step1[26] + step1[29]); 1096 step2[27] = WRAPLOW(-step1[27] + step1[28]); 1097 step2[28] = WRAPLOW(step1[27] + step1[28]); 1098 step2[29] = WRAPLOW(step1[26] + step1[29]); 1099 step2[30] = WRAPLOW(step1[25] + step1[30]); 1100 step2[31] = WRAPLOW(step1[24] + step1[31]); 1101 1102 // stage 7 1103 step1[0] = WRAPLOW(step2[0] + step2[15]); 1104 step1[1] = WRAPLOW(step2[1] + step2[14]); 1105 step1[2] = WRAPLOW(step2[2] + step2[13]); 1106 step1[3] = WRAPLOW(step2[3] + step2[12]); 1107 step1[4] = WRAPLOW(step2[4] + step2[11]); 1108 step1[5] = WRAPLOW(step2[5] + step2[10]); 1109 step1[6] = 
WRAPLOW(step2[6] + step2[9]); 1110 step1[7] = WRAPLOW(step2[7] + step2[8]); 1111 step1[8] = WRAPLOW(step2[7] - step2[8]); 1112 step1[9] = WRAPLOW(step2[6] - step2[9]); 1113 step1[10] = WRAPLOW(step2[5] - step2[10]); 1114 step1[11] = WRAPLOW(step2[4] - step2[11]); 1115 step1[12] = WRAPLOW(step2[3] - step2[12]); 1116 step1[13] = WRAPLOW(step2[2] - step2[13]); 1117 step1[14] = WRAPLOW(step2[1] - step2[14]); 1118 step1[15] = WRAPLOW(step2[0] - step2[15]); 1119 1120 step1[16] = step2[16]; 1121 step1[17] = step2[17]; 1122 step1[18] = step2[18]; 1123 step1[19] = step2[19]; 1124 temp1 = (-step2[20] + step2[27]) * cospi_16_64; 1125 temp2 = (step2[20] + step2[27]) * cospi_16_64; 1126 step1[20] = WRAPLOW(dct_const_round_shift(temp1)); 1127 step1[27] = WRAPLOW(dct_const_round_shift(temp2)); 1128 temp1 = (-step2[21] + step2[26]) * cospi_16_64; 1129 temp2 = (step2[21] + step2[26]) * cospi_16_64; 1130 step1[21] = WRAPLOW(dct_const_round_shift(temp1)); 1131 step1[26] = WRAPLOW(dct_const_round_shift(temp2)); 1132 temp1 = (-step2[22] + step2[25]) * cospi_16_64; 1133 temp2 = (step2[22] + step2[25]) * cospi_16_64; 1134 step1[22] = WRAPLOW(dct_const_round_shift(temp1)); 1135 step1[25] = WRAPLOW(dct_const_round_shift(temp2)); 1136 temp1 = (-step2[23] + step2[24]) * cospi_16_64; 1137 temp2 = (step2[23] + step2[24]) * cospi_16_64; 1138 step1[23] = WRAPLOW(dct_const_round_shift(temp1)); 1139 step1[24] = WRAPLOW(dct_const_round_shift(temp2)); 1140 step1[28] = step2[28]; 1141 step1[29] = step2[29]; 1142 step1[30] = step2[30]; 1143 step1[31] = step2[31]; 1144 1145 // final stage 1146 output[0] = WRAPLOW(step1[0] + step1[31]); 1147 output[1] = WRAPLOW(step1[1] + step1[30]); 1148 output[2] = WRAPLOW(step1[2] + step1[29]); 1149 output[3] = WRAPLOW(step1[3] + step1[28]); 1150 output[4] = WRAPLOW(step1[4] + step1[27]); 1151 output[5] = WRAPLOW(step1[5] + step1[26]); 1152 output[6] = WRAPLOW(step1[6] + step1[25]); 1153 output[7] = WRAPLOW(step1[7] + step1[24]); 1154 output[8] = WRAPLOW(step1[8] + 
step1[23]); 1155 output[9] = WRAPLOW(step1[9] + step1[22]); 1156 output[10] = WRAPLOW(step1[10] + step1[21]); 1157 output[11] = WRAPLOW(step1[11] + step1[20]); 1158 output[12] = WRAPLOW(step1[12] + step1[19]); 1159 output[13] = WRAPLOW(step1[13] + step1[18]); 1160 output[14] = WRAPLOW(step1[14] + step1[17]); 1161 output[15] = WRAPLOW(step1[15] + step1[16]); 1162 output[16] = WRAPLOW(step1[15] - step1[16]); 1163 output[17] = WRAPLOW(step1[14] - step1[17]); 1164 output[18] = WRAPLOW(step1[13] - step1[18]); 1165 output[19] = WRAPLOW(step1[12] - step1[19]); 1166 output[20] = WRAPLOW(step1[11] - step1[20]); 1167 output[21] = WRAPLOW(step1[10] - step1[21]); 1168 output[22] = WRAPLOW(step1[9] - step1[22]); 1169 output[23] = WRAPLOW(step1[8] - step1[23]); 1170 output[24] = WRAPLOW(step1[7] - step1[24]); 1171 output[25] = WRAPLOW(step1[6] - step1[25]); 1172 output[26] = WRAPLOW(step1[5] - step1[26]); 1173 output[27] = WRAPLOW(step1[4] - step1[27]); 1174 output[28] = WRAPLOW(step1[3] - step1[28]); 1175 output[29] = WRAPLOW(step1[2] - step1[29]); 1176 output[30] = WRAPLOW(step1[1] - step1[30]); 1177 output[31] = WRAPLOW(step1[0] - step1[31]); 1178 } 1179 1180 void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, 1181 int stride) { 1182 int i, j; 1183 tran_low_t out[32 * 32]; 1184 tran_low_t *outptr = out; 1185 tran_low_t temp_in[32], temp_out[32]; 1186 1187 // Rows 1188 for (i = 0; i < 32; ++i) { 1189 int16_t zero_coeff = 0; 1190 for (j = 0; j < 32; ++j) zero_coeff |= input[j]; 1191 1192 if (zero_coeff) 1193 idct32_c(input, outptr); 1194 else 1195 memset(outptr, 0, sizeof(tran_low_t) * 32); 1196 input += 32; 1197 outptr += 32; 1198 } 1199 1200 // Columns 1201 for (i = 0; i < 32; ++i) { 1202 for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; 1203 idct32_c(temp_in, temp_out); 1204 for (j = 0; j < 32; ++j) { 1205 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], 1206 ROUND_POWER_OF_TWO(temp_out[j], 6)); 1207 } 1208 } 1209 } 1210 1211 void 
vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, 1212 int stride) { 1213 int i, j; 1214 tran_low_t out[32 * 32] = { 0 }; 1215 tran_low_t *outptr = out; 1216 tran_low_t temp_in[32], temp_out[32]; 1217 1218 // Rows 1219 // Only upper-left 16x16 has non-zero coeff 1220 for (i = 0; i < 16; ++i) { 1221 idct32_c(input, outptr); 1222 input += 32; 1223 outptr += 32; 1224 } 1225 1226 // Columns 1227 for (i = 0; i < 32; ++i) { 1228 for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; 1229 idct32_c(temp_in, temp_out); 1230 for (j = 0; j < 32; ++j) { 1231 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], 1232 ROUND_POWER_OF_TWO(temp_out[j], 6)); 1233 } 1234 } 1235 } 1236 1237 void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, 1238 int stride) { 1239 int i, j; 1240 tran_low_t out[32 * 32] = { 0 }; 1241 tran_low_t *outptr = out; 1242 tran_low_t temp_in[32], temp_out[32]; 1243 1244 // Rows 1245 // Only upper-left 8x8 has non-zero coeff 1246 for (i = 0; i < 8; ++i) { 1247 idct32_c(input, outptr); 1248 input += 32; 1249 outptr += 32; 1250 } 1251 1252 // Columns 1253 for (i = 0; i < 32; ++i) { 1254 for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; 1255 idct32_c(temp_in, temp_out); 1256 for (j = 0; j < 32; ++j) { 1257 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], 1258 ROUND_POWER_OF_TWO(temp_out[j], 6)); 1259 } 1260 } 1261 } 1262 1263 void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 1264 int i, j; 1265 tran_high_t a1; 1266 tran_low_t out = 1267 WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); 1268 1269 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); 1270 a1 = ROUND_POWER_OF_TWO(out, 6); 1271 1272 for (j = 0; j < 32; ++j) { 1273 for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1); 1274 dest += stride; 1275 } 1276 } 1277 1278 #if CONFIG_VP9_HIGHBITDEPTH 1279 1280 // 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse 1281 // transform 
amplify bits + 1 bit for contingency in rounding and quantizing 1282 #define HIGHBD_VALID_TXFM_MAGNITUDE_RANGE (1 << 25) 1283 1284 static INLINE int detect_invalid_highbd_input(const tran_low_t *input, 1285 int size) { 1286 int i; 1287 for (i = 0; i < size; ++i) 1288 if (abs(input[i]) >= HIGHBD_VALID_TXFM_MAGNITUDE_RANGE) return 1; 1289 return 0; 1290 } 1291 1292 void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, 1293 int stride, int bd) { 1294 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 1295 0.5 shifts per pixel. */ 1296 int i; 1297 tran_low_t output[16]; 1298 tran_high_t a1, b1, c1, d1, e1; 1299 const tran_low_t *ip = input; 1300 tran_low_t *op = output; 1301 1302 for (i = 0; i < 4; i++) { 1303 a1 = ip[0] >> UNIT_QUANT_SHIFT; 1304 c1 = ip[1] >> UNIT_QUANT_SHIFT; 1305 d1 = ip[2] >> UNIT_QUANT_SHIFT; 1306 b1 = ip[3] >> UNIT_QUANT_SHIFT; 1307 a1 += c1; 1308 d1 -= b1; 1309 e1 = (a1 - d1) >> 1; 1310 b1 = e1 - b1; 1311 c1 = e1 - c1; 1312 a1 -= b1; 1313 d1 += c1; 1314 op[0] = HIGHBD_WRAPLOW(a1, bd); 1315 op[1] = HIGHBD_WRAPLOW(b1, bd); 1316 op[2] = HIGHBD_WRAPLOW(c1, bd); 1317 op[3] = HIGHBD_WRAPLOW(d1, bd); 1318 ip += 4; 1319 op += 4; 1320 } 1321 1322 ip = output; 1323 for (i = 0; i < 4; i++) { 1324 a1 = ip[4 * 0]; 1325 c1 = ip[4 * 1]; 1326 d1 = ip[4 * 2]; 1327 b1 = ip[4 * 3]; 1328 a1 += c1; 1329 d1 -= b1; 1330 e1 = (a1 - d1) >> 1; 1331 b1 = e1 - b1; 1332 c1 = e1 - c1; 1333 a1 -= b1; 1334 d1 += c1; 1335 dest[stride * 0] = 1336 highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd); 1337 dest[stride * 1] = 1338 highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd); 1339 dest[stride * 2] = 1340 highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd); 1341 dest[stride * 3] = 1342 highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd); 1343 1344 ip++; 1345 dest++; 1346 } 1347 } 1348 1349 void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, 1350 int 
stride, int bd) {
  int i;
  tran_high_t a1, e1;
  tran_low_t tmp[4];
  const tran_low_t *ip = input;
  tran_low_t *op = tmp;
  (void)bd;  // NOTE(review): bd is also referenced via the macros below.

  // DC-only inverse WHT: the row pass reduces to splitting the dequantized
  // DC value into a1 and its half e1.
  a1 = ip[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  op[0] = HIGHBD_WRAPLOW(a1, bd);
  op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);

  // Column pass: repeat the same split per column and accumulate into the
  // destination with clipped adds.
  ip = tmp;
  for (i = 0; i < 4; i++) {
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
    dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
    dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], e1, bd);
    dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], e1, bd);
    dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], e1, bd);
    ip++;
    dest++;
  }
}

// High-bitdepth 4-point inverse ADST (1-D). Out-of-range or all-zero input
// produces an all-zero output row.
void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];
  (void)bd;

  // Coefficients outside the valid magnitude range decode as zeros (and
  // assert in range-checking builds).
  if (detect_invalid_highbd_input(input, 4)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 4);
    return;
  }

  // Fast path: an all-zero input row yields an all-zero output row.
  if (!(x0 | x1 | x2 | x3)) {
    memset(output, 0, 4 * sizeof(*output));
    return;
  }

  s0 = (tran_high_t)sinpi_1_9 * x0;
  s1 = (tran_high_t)sinpi_2_9 * x0;
  s2 = (tran_high_t)sinpi_3_9 * x1;
  s3 = (tran_high_t)sinpi_4_9 * x2;
  s4 = (tran_high_t)sinpi_1_9 * x2;
  s5 = (tran_high_t)sinpi_2_9 * x3;
  s6 = (tran_high_t)sinpi_4_9 * x3;
  s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);

  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;  // s7 is already tran_high_t, so no cast is needed.

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd);
  output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd);
  output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
  output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
}

// High-bitdepth 4-point inverse DCT (1-D). Out-of-range input decodes as
// all zeros (and asserts in range-checking builds).
void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;
  (void)bd;  // NOTE(review): bd is also referenced via HIGHBD_WRAPLOW below.

  if (detect_invalid_highbd_input(input, 4)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 4);
    return;
  }

  // stage 1: butterflies on the even (0, 2) and odd (1, 3) inputs.
  temp1 = (input[0] + input[2]) * (tran_high_t)cospi_16_64;
  temp2 = (input[0] - input[2]) * (tran_high_t)cospi_16_64;
  step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 =
      input[1] * (tran_high_t)cospi_24_64 - input[3] * (tran_high_t)cospi_8_64;
  temp2 =
      input[1] * (tran_high_t)cospi_8_64 + input[3] * (tran_high_t)cospi_24_64;
  step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  // stage 2: recombine the butterfly outputs.
  output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
  output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
  output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
  output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
}

// High-bitdepth 4x4 inverse DCT: rows then columns, with a final
// round-shift by 4 and a clipped add into the 16-bit destination.
void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest,
                                 int stride, int bd) {
  int i, j;
  tran_low_t out[4 * 4];
  tran_low_t *outptr = out;
  tran_low_t temp_in[4], temp_out[4];

  // Rows
  for (i = 0; i < 4; ++i) {
    vpx_highbd_idct4_c(input, outptr, bd);
    input += 4;
    outptr += 4;
  }

  // Columns
  for (i = 0; i < 4; ++i)
{
    for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
    vpx_highbd_idct4_c(temp_in, temp_out, bd);
    for (j = 0; j < 4; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
    }
  }
}

// DC-only 4x4 path: both 1-D passes collapse to a cospi_16_64 scaling of
// input[0]; the resulting constant is added to every pixel with clipping.
void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest,
                                int stride, int bd) {
  int i;
  tran_high_t a1;
  tran_low_t out = HIGHBD_WRAPLOW(
      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);

  out =
      HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
  a1 = ROUND_POWER_OF_TWO(out, 4);

  for (i = 0; i < 4; i++) {
    dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
    dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
    dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
    dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
    dest += stride;
  }
}

// High-bitdepth 8-point inverse ADST (1-D). Inputs are consumed in the
// interleaved order below; invalid-range or all-zero input decodes as zeros.
void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  tran_low_t x0 = input[7];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[5];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[3];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[1];
  tran_low_t x7 = input[6];
  (void)bd;  // NOTE(review): bd is also referenced via HIGHBD_WRAPLOW below.

  if (detect_invalid_highbd_input(input, 8)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 8);
    return;
  }

  // Fast path: an all-zero input row yields an all-zero output row.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1
  s0 = (tran_high_t)cospi_2_64 * x0 + (tran_high_t)cospi_30_64 * x1;
  s1 = (tran_high_t)cospi_30_64 * x0 - (tran_high_t)cospi_2_64 * x1;
  s2 = (tran_high_t)cospi_10_64 * x2 + (tran_high_t)cospi_22_64 * x3;
  s3 =
(tran_high_t)cospi_22_64 * x2 - (tran_high_t)cospi_10_64 * x3;
  s4 = (tran_high_t)cospi_18_64 * x4 + (tran_high_t)cospi_14_64 * x5;
  s5 = (tran_high_t)cospi_14_64 * x4 - (tran_high_t)cospi_18_64 * x5;
  s6 = (tran_high_t)cospi_26_64 * x6 + (tran_high_t)cospi_6_64 * x7;
  s7 = (tran_high_t)cospi_6_64 * x6 - (tran_high_t)cospi_26_64 * x7;

  // Round each stage-1 product back into the pipeline's working precision.
  x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);
  x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);
  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd);

  // stage 2: the first four terms pass through; only x4..x7 are rotated.
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = (tran_high_t)cospi_8_64 * x4 + (tran_high_t)cospi_24_64 * x5;
  s5 = (tran_high_t)cospi_24_64 * x4 - (tran_high_t)cospi_8_64 * x5;
  s6 = (tran_high_t)(-cospi_24_64) * x6 + (tran_high_t)cospi_8_64 * x7;
  s7 = (tran_high_t)cospi_8_64 * x6 + (tran_high_t)cospi_24_64 * x7;

  x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
  x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
  x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
  x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
  x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
  x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
  x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);

  // stage 3: final cospi_16_64 rotations on the (x2, x3) and (x6, x7) pairs.
  s2 = (tran_high_t)cospi_16_64 * (x2 + x3);
  s3 = (tran_high_t)cospi_16_64 * (x2 - x3);
  s6 = (tran_high_t)cospi_16_64 * (x6 + x7);
  s7 = (tran_high_t)cospi_16_64 * (x6 - x7);

  x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
  x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
  x6 =
HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
  x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);

  // Output permutation with alternating sign flips.
  output[0] = HIGHBD_WRAPLOW(x0, bd);
  output[1] = HIGHBD_WRAPLOW(-x4, bd);
  output[2] = HIGHBD_WRAPLOW(x6, bd);
  output[3] = HIGHBD_WRAPLOW(-x2, bd);
  output[4] = HIGHBD_WRAPLOW(x3, bd);
  output[5] = HIGHBD_WRAPLOW(-x7, bd);
  output[6] = HIGHBD_WRAPLOW(x5, bd);
  output[7] = HIGHBD_WRAPLOW(-x1, bd);
}

// High-bitdepth 8-point inverse DCT (1-D). The even half is computed by
// reusing vpx_highbd_idct4_c on the even-indexed inputs; out-of-range input
// decodes as all zeros.
void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;

  if (detect_invalid_highbd_input(input, 8)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 8);
    return;
  }

  // stage 1
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 =
      input[1] * (tran_high_t)cospi_28_64 - input[7] * (tran_high_t)cospi_4_64;
  temp2 =
      input[1] * (tran_high_t)cospi_4_64 + input[7] * (tran_high_t)cospi_28_64;
  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 =
      input[5] * (tran_high_t)cospi_12_64 - input[3] * (tran_high_t)cospi_20_64;
  temp2 =
      input[5] * (tran_high_t)cospi_20_64 + input[3] * (tran_high_t)cospi_12_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  // stage 2 & stage 3 - even half
  vpx_highbd_idct4_c(step1, step1, bd);

  // stage 2 - odd half
  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);

  // stage 3 - odd
half
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64;
  temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[7] = step2[7];

  // stage 4: combine even and odd halves.
  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
  output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
  output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
  output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
  output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
}

// High-bitdepth 8x8 inverse DCT: rows then columns, with a final
// round-shift by 5 and a clipped add into the 16-bit destination.
void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest,
                                 int stride, int bd) {
  int i, j;
  tran_low_t out[8 * 8];
  tran_low_t *outptr = out;
  tran_low_t temp_in[8], temp_out[8];

  // First transform rows
  for (i = 0; i < 8; ++i) {
    vpx_highbd_idct8_c(input, outptr, bd);
    input += 8;
    outptr += 8;
  }

  // Then transform columns
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
    vpx_highbd_idct8_c(temp_in, temp_out, bd);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
    }
  }
}

// Sparse (<= 12 coefficient) 8x8 variant: only the first 4 rows can carry
// non-zero coefficients, so the remaining rows of |out| stay zeroed.
void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest,
                                 int stride, int bd) {
  int i, j;
  tran_low_t out[8 * 8] = { 0 };
  tran_low_t *outptr = out;
  tran_low_t temp_in[8], temp_out[8];

  // First transform rows
  // Only the first 4 rows have non-zero coefficients.
  for (i = 0; i < 4; ++i) {
    vpx_highbd_idct8_c(input, outptr, bd);
    input += 8;
    outptr += 8;
  }

  // Then transform columns
for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
    vpx_highbd_idct8_c(temp_in, temp_out, bd);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
    }
  }
}

// DC-only 8x8 path: both 1-D passes collapse to a cospi_16_64 scaling of
// input[0]; the resulting constant is added to every pixel with clipping.
void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest,
                                int stride, int bd) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = HIGHBD_WRAPLOW(
      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);

  out =
      HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
  a1 = ROUND_POWER_OF_TWO(out, 5);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
    dest += stride;
  }
}

// High-bitdepth 16-point inverse ADST (1-D). Inputs are consumed in the
// interleaved order below; invalid-range or all-zero input decodes as zeros.
void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;
  tran_low_t x0 = input[15];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[13];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[11];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[9];
  tran_low_t x7 = input[6];
  tran_low_t x8 = input[7];
  tran_low_t x9 = input[8];
  tran_low_t x10 = input[5];
  tran_low_t x11 = input[10];
  tran_low_t x12 = input[3];
  tran_low_t x13 = input[12];
  tran_low_t x14 = input[1];
  tran_low_t x15 = input[14];
  (void)bd;  // NOTE(review): bd is also referenced via HIGHBD_WRAPLOW below.

  if (detect_invalid_highbd_input(input, 16)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 16);
    return;
  }

  // Fast path: an all-zero input row yields an all-zero output row.
  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
        x13 | x14 | x15)) {
    memset(output, 0, 16 * sizeof(*output));
    return;
  }

// stage 1 1744 s0 = x0 * (tran_high_t)cospi_1_64 + x1 * (tran_high_t)cospi_31_64; 1745 s1 = x0 * (tran_high_t)cospi_31_64 - x1 * (tran_high_t)cospi_1_64; 1746 s2 = x2 * (tran_high_t)cospi_5_64 + x3 * (tran_high_t)cospi_27_64; 1747 s3 = x2 * (tran_high_t)cospi_27_64 - x3 * (tran_high_t)cospi_5_64; 1748 s4 = x4 * (tran_high_t)cospi_9_64 + x5 * (tran_high_t)cospi_23_64; 1749 s5 = x4 * (tran_high_t)cospi_23_64 - x5 * (tran_high_t)cospi_9_64; 1750 s6 = x6 * (tran_high_t)cospi_13_64 + x7 * (tran_high_t)cospi_19_64; 1751 s7 = x6 * (tran_high_t)cospi_19_64 - x7 * (tran_high_t)cospi_13_64; 1752 s8 = x8 * (tran_high_t)cospi_17_64 + x9 * (tran_high_t)cospi_15_64; 1753 s9 = x8 * (tran_high_t)cospi_15_64 - x9 * (tran_high_t)cospi_17_64; 1754 s10 = x10 * (tran_high_t)cospi_21_64 + x11 * (tran_high_t)cospi_11_64; 1755 s11 = x10 * (tran_high_t)cospi_11_64 - x11 * (tran_high_t)cospi_21_64; 1756 s12 = x12 * (tran_high_t)cospi_25_64 + x13 * (tran_high_t)cospi_7_64; 1757 s13 = x12 * (tran_high_t)cospi_7_64 - x13 * (tran_high_t)cospi_25_64; 1758 s14 = x14 * (tran_high_t)cospi_29_64 + x15 * (tran_high_t)cospi_3_64; 1759 s15 = x14 * (tran_high_t)cospi_3_64 - x15 * (tran_high_t)cospi_29_64; 1760 1761 x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd); 1762 x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd); 1763 x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd); 1764 x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd); 1765 x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd); 1766 x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd); 1767 x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd); 1768 x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd); 1769 x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd); 1770 x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd); 1771 x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd); 1772 x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd); 1773 x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - 
s12), bd); 1774 x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd); 1775 x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd); 1776 x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd); 1777 1778 // stage 2 1779 s0 = x0; 1780 s1 = x1; 1781 s2 = x2; 1782 s3 = x3; 1783 s4 = x4; 1784 s5 = x5; 1785 s6 = x6; 1786 s7 = x7; 1787 s8 = x8 * (tran_high_t)cospi_4_64 + x9 * (tran_high_t)cospi_28_64; 1788 s9 = x8 * (tran_high_t)cospi_28_64 - x9 * (tran_high_t)cospi_4_64; 1789 s10 = x10 * (tran_high_t)cospi_20_64 + x11 * (tran_high_t)cospi_12_64; 1790 s11 = x10 * (tran_high_t)cospi_12_64 - x11 * (tran_high_t)cospi_20_64; 1791 s12 = -x12 * (tran_high_t)cospi_28_64 + x13 * (tran_high_t)cospi_4_64; 1792 s13 = x12 * (tran_high_t)cospi_4_64 + x13 * (tran_high_t)cospi_28_64; 1793 s14 = -x14 * (tran_high_t)cospi_12_64 + x15 * (tran_high_t)cospi_20_64; 1794 s15 = x14 * (tran_high_t)cospi_20_64 + x15 * (tran_high_t)cospi_12_64; 1795 1796 x0 = HIGHBD_WRAPLOW(s0 + s4, bd); 1797 x1 = HIGHBD_WRAPLOW(s1 + s5, bd); 1798 x2 = HIGHBD_WRAPLOW(s2 + s6, bd); 1799 x3 = HIGHBD_WRAPLOW(s3 + s7, bd); 1800 x4 = HIGHBD_WRAPLOW(s0 - s4, bd); 1801 x5 = HIGHBD_WRAPLOW(s1 - s5, bd); 1802 x6 = HIGHBD_WRAPLOW(s2 - s6, bd); 1803 x7 = HIGHBD_WRAPLOW(s3 - s7, bd); 1804 x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd); 1805 x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd); 1806 x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd); 1807 x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd); 1808 x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd); 1809 x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd); 1810 x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd); 1811 x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd); 1812 1813 // stage 3 1814 s0 = x0; 1815 s1 = x1; 1816 s2 = x2; 1817 s3 = x3; 1818 s4 = x4 * (tran_high_t)cospi_8_64 + x5 * (tran_high_t)cospi_24_64; 1819 s5 = x4 * (tran_high_t)cospi_24_64 - x5 * (tran_high_t)cospi_8_64; 1820 s6 = 
-x6 * (tran_high_t)cospi_24_64 + x7 * (tran_high_t)cospi_8_64; 1821 s7 = x6 * (tran_high_t)cospi_8_64 + x7 * (tran_high_t)cospi_24_64; 1822 s8 = x8; 1823 s9 = x9; 1824 s10 = x10; 1825 s11 = x11; 1826 s12 = x12 * (tran_high_t)cospi_8_64 + x13 * (tran_high_t)cospi_24_64; 1827 s13 = x12 * (tran_high_t)cospi_24_64 - x13 * (tran_high_t)cospi_8_64; 1828 s14 = -x14 * (tran_high_t)cospi_24_64 + x15 * (tran_high_t)cospi_8_64; 1829 s15 = x14 * (tran_high_t)cospi_8_64 + x15 * (tran_high_t)cospi_24_64; 1830 1831 x0 = HIGHBD_WRAPLOW(s0 + s2, bd); 1832 x1 = HIGHBD_WRAPLOW(s1 + s3, bd); 1833 x2 = HIGHBD_WRAPLOW(s0 - s2, bd); 1834 x3 = HIGHBD_WRAPLOW(s1 - s3, bd); 1835 x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd); 1836 x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd); 1837 x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd); 1838 x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd); 1839 x8 = HIGHBD_WRAPLOW(s8 + s10, bd); 1840 x9 = HIGHBD_WRAPLOW(s9 + s11, bd); 1841 x10 = HIGHBD_WRAPLOW(s8 - s10, bd); 1842 x11 = HIGHBD_WRAPLOW(s9 - s11, bd); 1843 x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd); 1844 x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd); 1845 x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd); 1846 x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd); 1847 1848 // stage 4 1849 s2 = (tran_high_t)(-cospi_16_64) * (x2 + x3); 1850 s3 = (tran_high_t)cospi_16_64 * (x2 - x3); 1851 s6 = (tran_high_t)cospi_16_64 * (x6 + x7); 1852 s7 = (tran_high_t)cospi_16_64 * (-x6 + x7); 1853 s10 = (tran_high_t)cospi_16_64 * (x10 + x11); 1854 s11 = (tran_high_t)cospi_16_64 * (-x10 + x11); 1855 s14 = (tran_high_t)(-cospi_16_64) * (x14 + x15); 1856 s15 = (tran_high_t)cospi_16_64 * (x14 - x15); 1857 1858 x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); 1859 x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd); 1860 x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd); 1861 x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd); 1862 x10 = 
HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd); 1863 x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd); 1864 x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd); 1865 x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd); 1866 1867 output[0] = HIGHBD_WRAPLOW(x0, bd); 1868 output[1] = HIGHBD_WRAPLOW(-x8, bd); 1869 output[2] = HIGHBD_WRAPLOW(x12, bd); 1870 output[3] = HIGHBD_WRAPLOW(-x4, bd); 1871 output[4] = HIGHBD_WRAPLOW(x6, bd); 1872 output[5] = HIGHBD_WRAPLOW(x14, bd); 1873 output[6] = HIGHBD_WRAPLOW(x10, bd); 1874 output[7] = HIGHBD_WRAPLOW(x2, bd); 1875 output[8] = HIGHBD_WRAPLOW(x3, bd); 1876 output[9] = HIGHBD_WRAPLOW(x11, bd); 1877 output[10] = HIGHBD_WRAPLOW(x15, bd); 1878 output[11] = HIGHBD_WRAPLOW(x7, bd); 1879 output[12] = HIGHBD_WRAPLOW(x5, bd); 1880 output[13] = HIGHBD_WRAPLOW(-x13, bd); 1881 output[14] = HIGHBD_WRAPLOW(x9, bd); 1882 output[15] = HIGHBD_WRAPLOW(-x1, bd); 1883 } 1884 1885 void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { 1886 tran_low_t step1[16], step2[16]; 1887 tran_high_t temp1, temp2; 1888 (void)bd; 1889 1890 if (detect_invalid_highbd_input(input, 16)) { 1891 #if CONFIG_COEFFICIENT_RANGE_CHECKING 1892 assert(0 && "invalid highbd txfm input"); 1893 #endif // CONFIG_COEFFICIENT_RANGE_CHECKING 1894 memset(output, 0, sizeof(*output) * 16); 1895 return; 1896 } 1897 1898 // stage 1 1899 step1[0] = input[0 / 2]; 1900 step1[1] = input[16 / 2]; 1901 step1[2] = input[8 / 2]; 1902 step1[3] = input[24 / 2]; 1903 step1[4] = input[4 / 2]; 1904 step1[5] = input[20 / 2]; 1905 step1[6] = input[12 / 2]; 1906 step1[7] = input[28 / 2]; 1907 step1[8] = input[2 / 2]; 1908 step1[9] = input[18 / 2]; 1909 step1[10] = input[10 / 2]; 1910 step1[11] = input[26 / 2]; 1911 step1[12] = input[6 / 2]; 1912 step1[13] = input[22 / 2]; 1913 step1[14] = input[14 / 2]; 1914 step1[15] = input[30 / 2]; 1915 1916 // stage 2 1917 step2[0] = step1[0]; 1918 step2[1] = step1[1]; 1919 step2[2] = step1[2]; 1920 step2[3] = step1[3]; 1921 
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  // stage 2 (continued): rotations of the odd-coefficient half by odd
  // multiples of 2*pi/64.
  temp1 =
      step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64;
  temp2 =
      step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64;
  step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[9] * (tran_high_t)cospi_14_64 -
          step1[14] * (tran_high_t)cospi_18_64;
  temp2 = step1[9] * (tran_high_t)cospi_18_64 +
          step1[14] * (tran_high_t)cospi_14_64;
  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[10] * (tran_high_t)cospi_22_64 -
          step1[13] * (tran_high_t)cospi_10_64;
  temp2 = step1[10] * (tran_high_t)cospi_10_64 +
          step1[13] * (tran_high_t)cospi_22_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  temp1 = step1[11] * (tran_high_t)cospi_6_64 -
          step1[12] * (tran_high_t)cospi_26_64;
  temp2 = step1[11] * (tran_high_t)cospi_26_64 +
          step1[12] * (tran_high_t)cospi_6_64;
  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  // stage 3: rotate the 4..7 quarter; butterfly the 8..15 half.
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 =
      step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64;
  temp2 =
      step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64;
  step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 =
      step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64;
  temp2 =
      step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);

  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
  step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
  step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
  step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);

  // stage 4: DC/Nyquist pair scaled by cospi_16_64 (1/sqrt(2)); remaining
  // pairs rotated by +/-8/64 and +/-24/64.
  temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64;
  temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64;
  step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 =
      step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64;
  temp2 =
      step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64;
  step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * (tran_high_t)cospi_8_64 +
          step1[14] * (tran_high_t)cospi_24_64;
  temp2 =
      step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64;
  step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = -step1[10] * (tran_high_t)cospi_24_64 -
          step1[13] * (tran_high_t)cospi_8_64;
  temp2 = -step1[10] * (tran_high_t)cospi_8_64 +
          step1[13] * (tran_high_t)cospi_24_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5: butterflies plus the 45-degree rotation of the 5/6 pair.
  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64;
  temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[7] = step2[7];

  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);

  // stage 6
  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
  temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
  temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final mirrored butterfly producing the 16 outputs.
  output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
  output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
  output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
  output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
  output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
  output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
  output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
  output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
  output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
  output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
  output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
  output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
  output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
  output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
  output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
  output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
}

// Full 16x16 inverse DCT: 16 row transforms into a scratch buffer, then 16
// column transforms, adding the result (rounded by >> 6, the 16x16
// transform's fixed scaling) into the destination, clipped to bd.
void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
  int i, j;
  tran_low_t out[16 * 16];
  tran_low_t *outptr = out;
  tran_low_t temp_in[16], temp_out[16];

  // First transform rows
  for (i = 0; i < 16; ++i) {
    vpx_highbd_idct16_c(input, outptr, bd);
    input += 16;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
    vpx_highbd_idct16_c(temp_in, temp_out, bd);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] =
          highbd_clip_pixel_add(
              dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
    }
  }
}

// 16x16 inverse DCT for blocks with at most 38 non-zero coefficients, all
// known to lie in the upper-left 8x8 quadrant: the row pass only needs the
// first 8 rows (the scratch buffer is zero-initialized for the rest), but
// the column pass is still full.
void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest,
                                   int stride, int bd) {
  int i, j;
  tran_low_t out[16 * 16] = { 0 };
  tran_low_t *outptr = out;
  tran_low_t temp_in[16], temp_out[16];

  // First transform rows. Since all non-zero dct coefficients are in
  // upper-left 8x8 area, we only need to calculate first 8 rows here.
  for (i = 0; i < 8; ++i) {
    vpx_highbd_idct16_c(input, outptr, bd);
    input += 16;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    // Walk the column with a strided pointer instead of j * stride.
    uint16_t *destT = dest;
    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
    vpx_highbd_idct16_c(temp_in, temp_out, bd);
    for (j = 0; j < 16; ++j) {
      destT[i] = highbd_clip_pixel_add(destT[i],
                                       ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
      destT += stride;
    }
  }
}

// 16x16 inverse DCT for blocks with at most 10 non-zero coefficients, all
// in the upper-left 4x4 corner: only the first 4 rows are transformed.
void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest,
                                   int stride, int bd) {
  int i, j;
  tran_low_t out[16 * 16] = { 0 };
  tran_low_t *outptr = out;
  tran_low_t temp_in[16], temp_out[16];

  // First transform rows. Since all non-zero dct coefficients are in
  // upper-left 4x4 area, we only need to calculate first 4 rows here.
  for (i = 0; i < 4; ++i) {
    vpx_highbd_idct16_c(input, outptr, bd);
    input += 16;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
    vpx_highbd_idct16_c(temp_in, temp_out, bd);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
    }
  }
}

// 16x16 DC-only shortcut: the residual collapses to one constant a1 that is
// added to all 256 destination pixels (>> 6 matches the full transform's
// scaling), clipped to bit depth bd.
void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest,
                                  int stride, int bd) {
  int i, j;
  tran_high_t a1;
  // Two scalings by cospi_16_64 stand in for the row and column passes.
  tran_low_t out = HIGHBD_WRAPLOW(
      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);

  out =
      HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
  a1 = ROUND_POWER_OF_TWO(out, 6);
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
    dest += stride;
  }
}

// 1-D 32-point inverse DCT (high bit depth). Continues on the lines below
// this block.
static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output,
                            int bd) {
  tran_low_t step1[32], step2[32];
  tran_high_t temp1, temp2;
  // bd is only consumed through HIGHBD_WRAPLOW; see note in iadst16.
  (void)bd;

  if (detect_invalid_highbd_input(input, 32)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
    assert(0 && "invalid highbd txfm input");
#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
    memset(output, 0, sizeof(*output) * 32);
    return;
  }

  // stage 1: even coefficients reordered for the embedded 16-point part.
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];

  // Odd coefficients: rotations by odd multiples of pi/64.
  temp1 =
      input[1] * (tran_high_t)cospi_31_64 - input[31] *
(tran_high_t)cospi_1_64; 2206 temp2 = 2207 input[1] * (tran_high_t)cospi_1_64 + input[31] * (tran_high_t)cospi_31_64; 2208 step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2209 step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2210 2211 temp1 = input[17] * (tran_high_t)cospi_15_64 - 2212 input[15] * (tran_high_t)cospi_17_64; 2213 temp2 = input[17] * (tran_high_t)cospi_17_64 + 2214 input[15] * (tran_high_t)cospi_15_64; 2215 step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2216 step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2217 2218 temp1 = 2219 input[9] * (tran_high_t)cospi_23_64 - input[23] * (tran_high_t)cospi_9_64; 2220 temp2 = 2221 input[9] * (tran_high_t)cospi_9_64 + input[23] * (tran_high_t)cospi_23_64; 2222 step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2223 step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2224 2225 temp1 = 2226 input[25] * (tran_high_t)cospi_7_64 - input[7] * (tran_high_t)cospi_25_64; 2227 temp2 = 2228 input[25] * (tran_high_t)cospi_25_64 + input[7] * (tran_high_t)cospi_7_64; 2229 step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2230 step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2231 2232 temp1 = 2233 input[5] * (tran_high_t)cospi_27_64 - input[27] * (tran_high_t)cospi_5_64; 2234 temp2 = 2235 input[5] * (tran_high_t)cospi_5_64 + input[27] * (tran_high_t)cospi_27_64; 2236 step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2237 step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2238 2239 temp1 = input[21] * (tran_high_t)cospi_11_64 - 2240 input[11] * (tran_high_t)cospi_21_64; 2241 temp2 = input[21] * (tran_high_t)cospi_21_64 + 2242 input[11] * (tran_high_t)cospi_11_64; 2243 step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2244 step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2245 2246 temp1 = input[13] * (tran_high_t)cospi_19_64 - 2247 input[19] * (tran_high_t)cospi_13_64; 2248 temp2 = 
input[13] * (tran_high_t)cospi_13_64 + 2249 input[19] * (tran_high_t)cospi_19_64; 2250 step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2251 step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2252 2253 temp1 = 2254 input[29] * (tran_high_t)cospi_3_64 - input[3] * (tran_high_t)cospi_29_64; 2255 temp2 = 2256 input[29] * (tran_high_t)cospi_29_64 + input[3] * (tran_high_t)cospi_3_64; 2257 step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2258 step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2259 2260 // stage 2 2261 step2[0] = step1[0]; 2262 step2[1] = step1[1]; 2263 step2[2] = step1[2]; 2264 step2[3] = step1[3]; 2265 step2[4] = step1[4]; 2266 step2[5] = step1[5]; 2267 step2[6] = step1[6]; 2268 step2[7] = step1[7]; 2269 2270 temp1 = 2271 step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64; 2272 temp2 = 2273 step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64; 2274 step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2275 step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2276 2277 temp1 = step1[9] * (tran_high_t)cospi_14_64 - 2278 step1[14] * (tran_high_t)cospi_18_64; 2279 temp2 = step1[9] * (tran_high_t)cospi_18_64 + 2280 step1[14] * (tran_high_t)cospi_14_64; 2281 step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2282 step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2283 2284 temp1 = step1[10] * (tran_high_t)cospi_22_64 - 2285 step1[13] * (tran_high_t)cospi_10_64; 2286 temp2 = step1[10] * (tran_high_t)cospi_10_64 + 2287 step1[13] * (tran_high_t)cospi_22_64; 2288 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2289 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2290 2291 temp1 = step1[11] * (tran_high_t)cospi_6_64 - 2292 step1[12] * (tran_high_t)cospi_26_64; 2293 temp2 = step1[11] * (tran_high_t)cospi_26_64 + 2294 step1[12] * (tran_high_t)cospi_6_64; 2295 step2[11] = 
HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2296 step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2297 2298 step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd); 2299 step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd); 2300 step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd); 2301 step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd); 2302 step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd); 2303 step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd); 2304 step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd); 2305 step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd); 2306 step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd); 2307 step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd); 2308 step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd); 2309 step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd); 2310 step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd); 2311 step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd); 2312 step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd); 2313 step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd); 2314 2315 // stage 3 2316 step1[0] = step2[0]; 2317 step1[1] = step2[1]; 2318 step1[2] = step2[2]; 2319 step1[3] = step2[3]; 2320 2321 temp1 = 2322 step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64; 2323 temp2 = 2324 step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64; 2325 step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2326 step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2327 temp1 = 2328 step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64; 2329 temp2 = 2330 step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64; 2331 step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2332 step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2333 2334 step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd); 2335 step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd); 2336 step1[10] = 
HIGHBD_WRAPLOW(-step2[10] + step2[11], bd); 2337 step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd); 2338 step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd); 2339 step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd); 2340 step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd); 2341 step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd); 2342 2343 step1[16] = step2[16]; 2344 step1[31] = step2[31]; 2345 temp1 = -step2[17] * (tran_high_t)cospi_4_64 + 2346 step2[30] * (tran_high_t)cospi_28_64; 2347 temp2 = step2[17] * (tran_high_t)cospi_28_64 + 2348 step2[30] * (tran_high_t)cospi_4_64; 2349 step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2350 step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2351 temp1 = -step2[18] * (tran_high_t)cospi_28_64 - 2352 step2[29] * (tran_high_t)cospi_4_64; 2353 temp2 = -step2[18] * (tran_high_t)cospi_4_64 + 2354 step2[29] * (tran_high_t)cospi_28_64; 2355 step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2356 step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2357 step1[19] = step2[19]; 2358 step1[20] = step2[20]; 2359 temp1 = -step2[21] * (tran_high_t)cospi_20_64 + 2360 step2[26] * (tran_high_t)cospi_12_64; 2361 temp2 = step2[21] * (tran_high_t)cospi_12_64 + 2362 step2[26] * (tran_high_t)cospi_20_64; 2363 step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2364 step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2365 temp1 = -step2[22] * (tran_high_t)cospi_12_64 - 2366 step2[25] * (tran_high_t)cospi_20_64; 2367 temp2 = -step2[22] * (tran_high_t)cospi_20_64 + 2368 step2[25] * (tran_high_t)cospi_12_64; 2369 step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2370 step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2371 step1[23] = step2[23]; 2372 step1[24] = step2[24]; 2373 step1[27] = step2[27]; 2374 step1[28] = step2[28]; 2375 2376 // stage 4 2377 temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64; 2378 temp2 = (step1[0] - step1[1]) * 
(tran_high_t)cospi_16_64; 2379 step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2380 step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2381 temp1 = 2382 step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64; 2383 temp2 = 2384 step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64; 2385 step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2386 step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2387 step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); 2388 step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); 2389 step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); 2390 step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); 2391 2392 step2[8] = step1[8]; 2393 step2[15] = step1[15]; 2394 temp1 = -step1[9] * (tran_high_t)cospi_8_64 + 2395 step1[14] * (tran_high_t)cospi_24_64; 2396 temp2 = 2397 step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64; 2398 step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2399 step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2400 temp1 = -step1[10] * (tran_high_t)cospi_24_64 - 2401 step1[13] * (tran_high_t)cospi_8_64; 2402 temp2 = -step1[10] * (tran_high_t)cospi_8_64 + 2403 step1[13] * (tran_high_t)cospi_24_64; 2404 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2405 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2406 step2[11] = step1[11]; 2407 step2[12] = step1[12]; 2408 2409 step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd); 2410 step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd); 2411 step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd); 2412 step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd); 2413 step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd); 2414 step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd); 2415 step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd); 2416 step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd); 2417 2418 step2[24] = HIGHBD_WRAPLOW(step1[24] + 
step1[27], bd);
  step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
  step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
  step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
  step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
  step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
  step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
  step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);

  // stage 5
  // Even half: 4-point butterfly output plus the cospi_16_64 rotation of
  // elements 5/6.  Each intermediate is wrapped to the bit depth via
  // HIGHBD_WRAPLOW, matching the rest of the lattice.
  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
  step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
  step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64;
  temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64;
  step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[7] = step2[7];

  step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
  step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
  step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
  step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);

  // Odd half (16..31): pairs rotated by the cospi_8_64 / cospi_24_64 angle;
  // each temp1/temp2 pair is one 2x2 rotation, rounded by
  // dct_const_round_shift before wrapping.
  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * (tran_high_t)cospi_8_64 +
          step2[29] * (tran_high_t)cospi_24_64;
  temp2 = step2[18] * (tran_high_t)cospi_24_64 +
          step2[29] * (tran_high_t)cospi_8_64;
  step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = -step2[19] * (tran_high_t)cospi_8_64 +
          step2[28] * (tran_high_t)cospi_24_64;
  temp2 = step2[19] * (tran_high_t)cospi_24_64 +
          step2[28] * (tran_high_t)cospi_8_64;
  step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = -step2[20] * (tran_high_t)cospi_24_64 -
          step2[27] * (tran_high_t)cospi_8_64;
  temp2 = -step2[20] * (tran_high_t)cospi_8_64 +
          step2[27] * (tran_high_t)cospi_24_64;
  step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = -step2[21] * (tran_high_t)cospi_24_64 -
          step2[26] * (tran_high_t)cospi_8_64;
  temp2 = -step2[21] * (tran_high_t)cospi_8_64 +
          step2[26] * (tran_high_t)cospi_24_64;
  step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6
  // Even half becomes an 8-point add/subtract butterfly; 10/13 and 11/12 get
  // the cospi_16_64 rotation.
  step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
  step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
  step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
  step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
  step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
  step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
  step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
  step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
  temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64;
  step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
  temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64;
  step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // Odd half: 8-wide mirrored add/subtract pairs (16..23 sums, 24..31 with
  // sign flips on the low index).
  step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
  step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
  step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
  step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
  step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
  step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
  step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
  step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);

  step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
  step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
  step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
  step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
  step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
  step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
  step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
  step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);

  // stage 7
  // 16-wide butterfly on the even half; 20..27 rotated by cospi_16_64.
  step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
  step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
  step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
  step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
  step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
  step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
  step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
  step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
  step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
  step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
  step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
  step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
  step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
  step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
  step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
  step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);

  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  temp1 = (-step2[20] + step2[27]) * (tran_high_t)cospi_16_64;
  temp2 = (step2[20] + step2[27]) * (tran_high_t)cospi_16_64;
  step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = (-step2[21] + step2[26]) * (tran_high_t)cospi_16_64;
  temp2 = (step2[21] + step2[26]) * (tran_high_t)cospi_16_64;
  step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = (-step2[22] + step2[25]) * (tran_high_t)cospi_16_64;
  temp2 = (step2[22] + step2[25]) * (tran_high_t)cospi_16_64;
  step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  temp1 = (-step2[23] + step2[24]) * (tran_high_t)cospi_16_64;
  temp2 = (step2[23] + step2[24]) * (tran_high_t)cospi_16_64;
  step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
  step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage
  // Full 32-wide butterfly: output[k] = step1[k] + step1[31-k] for the first
  // half, and the mirrored differences for the second half.
  output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
  output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
  output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
  output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
  output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
  output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
  output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
  output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
  output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
  output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
  output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
  output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
  output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
  output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
  output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
  output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
  output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
  output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
  output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
  output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
  output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
  output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
  output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
  output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
  output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
  output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
  output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
  output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
  output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
  output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
  output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
  output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
}

// High-bitdepth 32x32 inverse transform, full (up to 1024 nonzero
// coefficients) case.  Runs the 1-D 32-point inverse transform over every row
// of 'input', then over every column of the intermediate, rounds each result
// by 1/64 (ROUND_POWER_OF_TWO(..., 6)) and adds it to the prediction in
// 'dest' (stride 'stride'), clipping to the 'bd'-bit pixel range via
// highbd_clip_pixel_add.
void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest,
                                     int stride, int bd) {
  int i, j;
  tran_low_t out[32 * 32];
  tran_low_t *outptr = out;
  tran_low_t temp_in[32], temp_out[32];

  // Rows
  for (i = 0; i < 32; ++i) {
    // OR all 32 coefficients together so an all-zero row can skip the
    // transform entirely and just be cleared.
    tran_low_t zero_coeff = 0;
    for (j = 0; j < 32; ++j) zero_coeff |= input[j];

    if (zero_coeff)
      highbd_idct32_c(input, outptr, bd);
    else
      memset(outptr, 0, sizeof(tran_low_t) * 32);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    // Gather column i, transform it, then scatter the rounded result back
    // onto the destination column.
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    highbd_idct32_c(temp_in, temp_out, bd);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
    }
  }
}

// High-bitdepth 32x32 inverse transform for the sparse case where all nonzero
// coefficients lie in the upper-left 16x16 (at most 135 nonzero by the
// caller's eob contract -- hence the name).  Rows 16..31 of 'out' stay zero
// from the array initializer, so only the first 16 rows are transformed.
void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest,
                                    int stride, int bd) {
  int i, j;
  tran_low_t out[32 * 32] = { 0 };
  tran_low_t *outptr = out;
  tran_low_t temp_in[32], temp_out[32];

  // Rows
  // Only upper-left 16x16 has non-zero coeff
  for (i = 0; i < 16; ++i) {
    highbd_idct32_c(input, outptr, bd);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    // destT walks down column i one row (stride) at a time.
    uint16_t *destT = dest;
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    highbd_idct32_c(temp_in, temp_out, bd);
    for (j = 0; j < 32; ++j) {
      destT[i] = highbd_clip_pixel_add(destT[i],
                                       ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
      destT += stride;
    }
  }
}

// High-bitdepth 32x32 inverse transform for the sparser case where all
// nonzero coefficients lie in the upper-left 8x8 (at most 34 nonzero by the
// caller's eob contract).  Only the first 8 rows are transformed; the rest of
// 'out' remains zero from the initializer.
void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest,
                                   int stride, int bd) {
  int i, j;
  tran_low_t out[32 * 32] = { 0 };
  tran_low_t *outptr = out;
  tran_low_t temp_in[32], temp_out[32];

  // Rows
  // Only upper-left 8x8 has non-zero coeff
  for (i = 0; i < 8; ++i) {
    highbd_idct32_c(input, outptr, bd);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    highbd_idct32_c(temp_in, temp_out, bd);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
    }
  }
}

// High-bitdepth 32x32 inverse transform, DC-only case: only input[0] is
// nonzero.  The DC value is scaled by cospi_16_64 twice (once per 1-D pass),
// rounded by 1/64, and the resulting constant a1 is added to all 32x32
// destination pixels with clipping to the 'bd'-bit range.
void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest,
                                  int stride, int bd) {
  int i, j;
  int a1;
  tran_low_t out = HIGHBD_WRAPLOW(
      dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd);

  out =
      HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd);
  a1 = ROUND_POWER_OF_TWO(out, 6);

  for (j = 0; j < 32; ++j) {
    for (i = 0; i < 32; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
    dest += stride;
  }
}

#endif  // CONFIG_VP9_HIGHBITDEPTH