1 /* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <assert.h> 12 #include <math.h> 13 14 #include "./vpx_config.h" 15 #include "./vp9_rtcd.h" 16 #include "vp9/common/vp9_systemdependent.h" 17 #include "vp9/common/vp9_blockd.h" 18 #include "vp9/common/vp9_common.h" 19 #include "vp9/common/vp9_idct.h" 20 21 #if CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH 22 // When CONFIG_EMULATE_HW_HIGHBITDEPTH is 1 the transform performs strict 23 // overflow wrapping to match expected hardware implementations. 24 // bd of 8 uses trans_low with 16bits, need to remove 16bits 25 // bd of 10 uses trans_low with 18bits, need to remove 14bits 26 // bd of 12 uses trans_low with 20bits, need to remove 12bits 27 // bd of x uses trans_low with 8+x bits, need to remove 24-x bits 28 #define WRAPLOW(x) ((((int32_t)x) << (24 - bd)) >> (24 - bd)) 29 #else 30 #define WRAPLOW(x) (x) 31 #endif // CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH 32 33 #if CONFIG_VP9_HIGHBITDEPTH 34 static INLINE tran_low_t clamp_high(tran_high_t value, tran_low_t low, 35 tran_low_t high) { 36 return value < low ? low : (value > high ? high : value); 37 } 38 39 static INLINE tran_low_t clip_pixel_bd_high(tran_high_t dest, 40 tran_high_t trans, int bd) { 41 trans = WRAPLOW(trans); 42 switch (bd) { 43 case 8: 44 default: 45 return clamp_high(WRAPLOW(dest + trans), 0, 255); 46 case 10: 47 return clamp_high(WRAPLOW(dest + trans), 0, 1023); 48 case 12: 49 return clamp_high(WRAPLOW(dest + trans), 0, 4095); 50 } 51 } 52 #endif // CONFIG_VP9_HIGHBITDEPTH 53 54 void vp9_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 55 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 56 0.5 shifts per pixel. */ 57 int i; 58 tran_low_t output[16]; 59 tran_high_t a1, b1, c1, d1, e1; 60 const tran_low_t *ip = input; 61 tran_low_t *op = output; 62 63 for (i = 0; i < 4; i++) { 64 a1 = ip[0] >> UNIT_QUANT_SHIFT; 65 c1 = ip[1] >> UNIT_QUANT_SHIFT; 66 d1 = ip[2] >> UNIT_QUANT_SHIFT; 67 b1 = ip[3] >> UNIT_QUANT_SHIFT; 68 a1 += c1; 69 d1 -= b1; 70 e1 = (a1 - d1) >> 1; 71 b1 = e1 - b1; 72 c1 = e1 - c1; 73 a1 -= b1; 74 d1 += c1; 75 op[0] = a1; 76 op[1] = b1; 77 op[2] = c1; 78 op[3] = d1; 79 ip += 4; 80 op += 4; 81 } 82 83 ip = output; 84 for (i = 0; i < 4; i++) { 85 a1 = ip[4 * 0]; 86 c1 = ip[4 * 1]; 87 d1 = ip[4 * 2]; 88 b1 = ip[4 * 3]; 89 a1 += c1; 90 d1 -= b1; 91 e1 = (a1 - d1) >> 1; 92 b1 = e1 - b1; 93 c1 = e1 - c1; 94 a1 -= b1; 95 d1 += c1; 96 dest[stride * 0] = clip_pixel(dest[stride * 0] + a1); 97 dest[stride * 1] = clip_pixel(dest[stride * 1] + b1); 98 dest[stride * 2] = clip_pixel(dest[stride * 2] + c1); 99 dest[stride * 3] = clip_pixel(dest[stride * 3] + d1); 100 101 ip++; 102 dest++; 103 } 104 } 105 106 void vp9_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { 107 int i; 108 tran_high_t a1, e1; 109 tran_low_t tmp[4]; 110 const tran_low_t *ip = in; 111 tran_low_t *op = tmp; 112 113 a1 = ip[0] >> UNIT_QUANT_SHIFT; 114 e1 = a1 >> 1; 115 a1 -= e1; 116 op[0] = a1; 117 op[1] = op[2] = op[3] = e1; 118 119 ip = tmp; 120 for (i = 0; i < 4; i++) { 121 e1 = ip[0] >> 1; 122 a1 = ip[0] - e1; 123 dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1); 124 dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1); 125 dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1); 126 dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1); 127 ip++; 128 dest++; 129 } 130 } 131 132 static void idct4(const tran_low_t *input, tran_low_t *output) { 133 tran_low_t step[4]; 134 tran_high_t temp1, temp2; 135 // stage 1 136 temp1 = (input[0] + input[2]) * cospi_16_64; 137 temp2 = (input[0] - input[2]) * cospi_16_64; 138 step[0] = dct_const_round_shift(temp1); 139 step[1] = dct_const_round_shift(temp2); 140 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; 141 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; 142 step[2] = dct_const_round_shift(temp1); 143 step[3] = dct_const_round_shift(temp2); 144 145 // stage 2 146 output[0] = step[0] + step[3]; 147 output[1] = step[1] + step[2]; 148 output[2] = step[1] - step[2]; 149 output[3] = step[0] - step[3]; 150 } 151 152 void vp9_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 153 tran_low_t out[4 * 4]; 154 tran_low_t *outptr = out; 155 int i, j; 156 tran_low_t temp_in[4], temp_out[4]; 157 158 // Rows 159 for (i = 0; i < 4; ++i) { 160 idct4(input, outptr); 161 input += 4; 162 outptr += 4; 163 } 164 165 // Columns 166 for (i = 0; i < 4; ++i) { 167 for (j = 0; j < 4; ++j) 168 temp_in[j] = out[j * 4 + i]; 169 idct4(temp_in, temp_out); 170 for (j = 0; j < 4; ++j) 171 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) 172 + dest[j * stride + i]); 173 } 174 } 175 176 void vp9_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, 177 int dest_stride) { 178 int i; 179 tran_high_t a1; 180 tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); 181 out = dct_const_round_shift(out * cospi_16_64); 182 a1 = ROUND_POWER_OF_TWO(out, 4); 183 184 for (i = 0; i < 4; i++) { 185 dest[0] = clip_pixel(dest[0] + a1); 186 dest[1] = clip_pixel(dest[1] + a1); 187 dest[2] = clip_pixel(dest[2] + a1); 188 dest[3] = clip_pixel(dest[3] + a1); 189 dest += dest_stride; 190 } 191 } 192 193 static void idct8(const tran_low_t *input, tran_low_t *output) { 194 tran_low_t step1[8], step2[8]; 195 tran_high_t temp1, temp2; 196 // stage 1 197 step1[0] = input[0]; 198 step1[2] = input[4]; 199 step1[1] = input[2]; 200 step1[3] = input[6]; 201 temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; 202 temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; 203 step1[4] = dct_const_round_shift(temp1); 204 step1[7] = dct_const_round_shift(temp2); 205 temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; 206 temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; 207 step1[5] = dct_const_round_shift(temp1); 208 step1[6] = dct_const_round_shift(temp2); 209 210 // stage 2 & stage 3 - even half 211 idct4(step1, step1); 212 213 // stage 2 - odd half 214 step2[4] = step1[4] + step1[5]; 215 step2[5] = step1[4] - step1[5]; 216 step2[6] = -step1[6] + step1[7]; 217 step2[7] = step1[6] + step1[7]; 218 219 // stage 3 -odd half 220 step1[4] = step2[4]; 221 temp1 = (step2[6] - step2[5]) * cospi_16_64; 222 temp2 = (step2[5] + step2[6]) * cospi_16_64; 223 step1[5] = dct_const_round_shift(temp1); 224 step1[6] = dct_const_round_shift(temp2); 225 step1[7] = step2[7]; 226 227 // stage 4 228 output[0] = step1[0] + step1[7]; 229 output[1] = step1[1] + step1[6]; 230 output[2] = step1[2] + step1[5]; 231 output[3] = step1[3] + step1[4]; 232 output[4] = step1[3] - step1[4]; 233 output[5] = step1[2] - step1[5]; 234 output[6] = step1[1] - step1[6]; 235 output[7] = step1[0] - step1[7]; 236 } 237 238 void vp9_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 239 tran_low_t out[8 * 8]; 240 tran_low_t *outptr = out; 241 int i, j; 242 tran_low_t temp_in[8], temp_out[8]; 243 244 // First transform rows 245 for (i = 0; i < 8; ++i) { 246 idct8(input, outptr); 247 input += 8; 248 outptr += 8; 249 } 250 251 // Then transform columns 252 for (i = 0; i < 8; ++i) { 253 for (j = 0; j < 8; ++j) 254 temp_in[j] = out[j * 8 + i]; 255 idct8(temp_in, temp_out); 256 for (j = 0; j < 8; ++j) 257 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) 258 + dest[j * stride + i]); 259 } 260 } 261 262 void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 263 int i, j; 264 tran_high_t a1; 265 tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); 266 out = dct_const_round_shift(out * cospi_16_64); 267 a1 = ROUND_POWER_OF_TWO(out, 5); 268 for (j = 0; j < 8; ++j) { 269 for (i = 0; i < 8; ++i) 270 dest[i] = clip_pixel(dest[i] + a1); 271 dest += stride; 272 } 273 } 274 275 static void iadst4(const tran_low_t *input, tran_low_t *output) { 276 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; 277 278 tran_high_t x0 = input[0]; 279 tran_high_t x1 = input[1]; 280 tran_high_t x2 = input[2]; 281 tran_high_t x3 = input[3]; 282 283 if (!(x0 | x1 | x2 | x3)) { 284 output[0] = output[1] = output[2] = output[3] = 0; 285 return; 286 } 287 288 s0 = sinpi_1_9 * x0; 289 s1 = sinpi_2_9 * x0; 290 s2 = sinpi_3_9 * x1; 291 s3 = sinpi_4_9 * x2; 292 s4 = sinpi_1_9 * x2; 293 s5 = sinpi_2_9 * x3; 294 s6 = sinpi_4_9 * x3; 295 s7 = x0 - x2 + x3; 296 297 x0 = s0 + s3 + s5; 298 x1 = s1 - s4 - s6; 299 x2 = sinpi_3_9 * s7; 300 x3 = s2; 301 302 s0 = x0 + x3; 303 s1 = x1 + x3; 304 s2 = x2; 305 s3 = x0 + x1 - x3; 306 307 // 1-D transform scaling factor is sqrt(2). 308 // The overall dynamic range is 14b (input) + 14b (multiplication scaling) 309 // + 1b (addition) = 29b. 310 // Hence the output bit depth is 15b. 311 output[0] = dct_const_round_shift(s0); 312 output[1] = dct_const_round_shift(s1); 313 output[2] = dct_const_round_shift(s2); 314 output[3] = dct_const_round_shift(s3); 315 } 316 317 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, 318 int tx_type) { 319 const transform_2d IHT_4[] = { 320 { idct4, idct4 }, // DCT_DCT = 0 321 { iadst4, idct4 }, // ADST_DCT = 1 322 { idct4, iadst4 }, // DCT_ADST = 2 323 { iadst4, iadst4 } // ADST_ADST = 3 324 }; 325 326 int i, j; 327 tran_low_t out[4 * 4]; 328 tran_low_t *outptr = out; 329 tran_low_t temp_in[4], temp_out[4]; 330 331 // inverse transform row vectors 332 for (i = 0; i < 4; ++i) { 333 IHT_4[tx_type].rows(input, outptr); 334 input += 4; 335 outptr += 4; 336 } 337 338 // inverse transform column vectors 339 for (i = 0; i < 4; ++i) { 340 for (j = 0; j < 4; ++j) 341 temp_in[j] = out[j * 4 + i]; 342 IHT_4[tx_type].cols(temp_in, temp_out); 343 for (j = 0; j < 4; ++j) 344 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) 345 + dest[j * stride + i]); 346 } 347 } 348 static void iadst8(const tran_low_t *input, tran_low_t *output) { 349 int s0, s1, s2, s3, s4, s5, s6, s7; 350 351 tran_high_t x0 = input[7]; 352 tran_high_t x1 = input[0]; 353 tran_high_t x2 = input[5]; 354 tran_high_t x3 = input[2]; 355 tran_high_t x4 = input[3]; 356 tran_high_t x5 = input[4]; 357 tran_high_t x6 = input[1]; 358 tran_high_t x7 = input[6]; 359 360 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { 361 output[0] = output[1] = output[2] = output[3] = output[4] 362 = output[5] = output[6] = output[7] = 0; 363 return; 364 } 365 366 // stage 1 367 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; 368 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; 369 s2 = cospi_10_64 * x2 + cospi_22_64 * x3; 370 s3 = cospi_22_64 * x2 - cospi_10_64 * x3; 371 s4 = cospi_18_64 * x4 + cospi_14_64 * x5; 372 s5 = cospi_14_64 * x4 - cospi_18_64 * x5; 373 s6 = cospi_26_64 * x6 + cospi_6_64 * x7; 374 s7 = cospi_6_64 * x6 - cospi_26_64 * x7; 375 376 x0 = dct_const_round_shift(s0 + s4); 377 x1 = dct_const_round_shift(s1 + s5); 378 x2 = dct_const_round_shift(s2 + s6); 379 x3 = dct_const_round_shift(s3 + s7); 380 x4 = dct_const_round_shift(s0 - s4); 381 x5 = dct_const_round_shift(s1 - s5); 382 x6 = dct_const_round_shift(s2 - s6); 383 x7 = dct_const_round_shift(s3 - s7); 384 385 // stage 2 386 s0 = x0; 387 s1 = x1; 388 s2 = x2; 389 s3 = x3; 390 s4 = cospi_8_64 * x4 + cospi_24_64 * x5; 391 s5 = cospi_24_64 * x4 - cospi_8_64 * x5; 392 s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; 393 s7 = cospi_8_64 * x6 + cospi_24_64 * x7; 394 395 x0 = s0 + s2; 396 x1 = s1 + s3; 397 x2 = s0 - s2; 398 x3 = s1 - s3; 399 x4 = dct_const_round_shift(s4 + s6); 400 x5 = dct_const_round_shift(s5 + s7); 401 x6 = dct_const_round_shift(s4 - s6); 402 x7 = dct_const_round_shift(s5 - s7); 403 404 // stage 3 405 s2 = cospi_16_64 * (x2 + x3); 406 s3 = cospi_16_64 * (x2 - x3); 407 s6 = cospi_16_64 * (x6 + x7); 408 s7 = cospi_16_64 * (x6 - x7); 409 410 x2 = dct_const_round_shift(s2); 411 x3 = dct_const_round_shift(s3); 412 x6 = dct_const_round_shift(s6); 413 x7 = dct_const_round_shift(s7); 414 415 output[0] = x0; 416 output[1] = -x4; 417 output[2] = x6; 418 output[3] = -x2; 419 output[4] = x3; 420 output[5] = -x7; 421 output[6] = x5; 422 output[7] = -x1; 423 } 424 425 static const transform_2d IHT_8[] = { 426 { idct8, idct8 }, // DCT_DCT = 0 427 { iadst8, idct8 }, // ADST_DCT = 1 428 { idct8, iadst8 }, // DCT_ADST = 2 429 { iadst8, iadst8 } // ADST_ADST = 3 430 }; 431 432 void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, 433 int tx_type) { 434 int i, j; 435 tran_low_t out[8 * 8]; 436 tran_low_t *outptr = out; 437 tran_low_t temp_in[8], temp_out[8]; 438 const transform_2d ht = IHT_8[tx_type]; 439 440 // inverse transform row vectors 441 for (i = 0; i < 8; ++i) { 442 ht.rows(input, outptr); 443 input += 8; 444 outptr += 8; 445 } 446 447 // inverse transform column vectors 448 for (i = 0; i < 8; ++i) { 449 for (j = 0; j < 8; ++j) 450 temp_in[j] = out[j * 8 + i]; 451 ht.cols(temp_in, temp_out); 452 for (j = 0; j < 8; ++j) 453 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) 454 + dest[j * stride + i]); 455 } 456 } 457 458 void vp9_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 459 tran_low_t out[8 * 8] = { 0 }; 460 tran_low_t *outptr = out; 461 int i, j; 462 tran_low_t temp_in[8], temp_out[8]; 463 464 // First transform rows 465 // only first 4 row has non-zero coefs 466 for (i = 0; i < 4; ++i) { 467 idct8(input, outptr); 468 input += 8; 469 outptr += 8; 470 } 471 472 // Then transform columns 473 for (i = 0; i < 8; ++i) { 474 for (j = 0; j < 8; ++j) 475 temp_in[j] = out[j * 8 + i]; 476 idct8(temp_in, temp_out); 477 for (j = 0; j < 8; ++j) 478 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) 479 + dest[j * stride + i]); 480 } 481 } 482 483 static void idct16(const tran_low_t *input, tran_low_t *output) { 484 tran_low_t step1[16], step2[16]; 485 tran_high_t temp1, temp2; 486 487 // stage 1 488 step1[0] = input[0/2]; 489 step1[1] = input[16/2]; 490 step1[2] = input[8/2]; 491 step1[3] = input[24/2]; 492 step1[4] = input[4/2]; 493 step1[5] = input[20/2]; 494 step1[6] = input[12/2]; 495 step1[7] = input[28/2]; 496 step1[8] = input[2/2]; 497 step1[9] = input[18/2]; 498 step1[10] = input[10/2]; 499 step1[11] = input[26/2]; 500 step1[12] = input[6/2]; 501 step1[13] = input[22/2]; 502 step1[14] = input[14/2]; 503 step1[15] = input[30/2]; 504 505 // stage 2 506 step2[0] = step1[0]; 507 step2[1] = step1[1]; 508 step2[2] = step1[2]; 509 step2[3] = step1[3]; 510 step2[4] = step1[4]; 511 step2[5] = step1[5]; 512 step2[6] = step1[6]; 513 step2[7] = step1[7]; 514 515 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; 516 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; 517 step2[8] = dct_const_round_shift(temp1); 518 step2[15] = dct_const_round_shift(temp2); 519 520 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; 521 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; 522 step2[9] = dct_const_round_shift(temp1); 523 step2[14] = dct_const_round_shift(temp2); 524 525 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; 526 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; 527 step2[10] = dct_const_round_shift(temp1); 528 step2[13] = dct_const_round_shift(temp2); 529 530 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; 531 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; 532 step2[11] = dct_const_round_shift(temp1); 533 step2[12] = dct_const_round_shift(temp2); 534 535 // stage 3 536 step1[0] = step2[0]; 537 step1[1] = step2[1]; 538 step1[2] = step2[2]; 539 step1[3] = step2[3]; 540 541 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; 542 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; 543 step1[4] = dct_const_round_shift(temp1); 544 step1[7] = dct_const_round_shift(temp2); 545 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; 546 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; 547 step1[5] = dct_const_round_shift(temp1); 548 step1[6] = dct_const_round_shift(temp2); 549 550 step1[8] = step2[8] + step2[9]; 551 step1[9] = step2[8] - step2[9]; 552 step1[10] = -step2[10] + step2[11]; 553 step1[11] = step2[10] + step2[11]; 554 step1[12] = step2[12] + step2[13]; 555 step1[13] = step2[12] - step2[13]; 556 step1[14] = -step2[14] + step2[15]; 557 step1[15] = step2[14] + step2[15]; 558 559 // stage 4 560 temp1 = (step1[0] + step1[1]) * cospi_16_64; 561 temp2 = (step1[0] - step1[1]) * cospi_16_64; 562 step2[0] = dct_const_round_shift(temp1); 563 step2[1] = dct_const_round_shift(temp2); 564 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; 565 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; 566 step2[2] = dct_const_round_shift(temp1); 567 step2[3] = dct_const_round_shift(temp2); 568 step2[4] = step1[4] + step1[5]; 569 step2[5] = step1[4] - step1[5]; 570 step2[6] = -step1[6] + step1[7]; 571 step2[7] = step1[6] + step1[7]; 572 573 step2[8] = step1[8]; 574 step2[15] = step1[15]; 575 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; 576 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; 577 step2[9] = dct_const_round_shift(temp1); 578 step2[14] = dct_const_round_shift(temp2); 579 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; 580 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; 581 step2[10] = dct_const_round_shift(temp1); 582 step2[13] = dct_const_round_shift(temp2); 583 step2[11] = step1[11]; 584 step2[12] = step1[12]; 585 586 // stage 5 587 step1[0] = step2[0] + step2[3]; 588 step1[1] = step2[1] + step2[2]; 589 step1[2] = step2[1] - step2[2]; 590 step1[3] = step2[0] - step2[3]; 591 step1[4] = step2[4]; 592 temp1 = (step2[6] - step2[5]) * cospi_16_64; 593 temp2 = (step2[5] + step2[6]) * cospi_16_64; 594 step1[5] = dct_const_round_shift(temp1); 595 step1[6] = dct_const_round_shift(temp2); 596 step1[7] = step2[7]; 597 598 step1[8] = step2[8] + step2[11]; 599 step1[9] = step2[9] + step2[10]; 600 step1[10] = step2[9] - step2[10]; 601 step1[11] = step2[8] - step2[11]; 602 step1[12] = -step2[12] + step2[15]; 603 step1[13] = -step2[13] + step2[14]; 604 step1[14] = step2[13] + step2[14]; 605 step1[15] = step2[12] + step2[15]; 606 607 // stage 6 608 step2[0] = step1[0] + step1[7]; 609 step2[1] = step1[1] + step1[6]; 610 step2[2] = step1[2] + step1[5]; 611 step2[3] = step1[3] + step1[4]; 612 step2[4] = step1[3] - step1[4]; 613 step2[5] = step1[2] - step1[5]; 614 step2[6] = step1[1] - step1[6]; 615 step2[7] = step1[0] - step1[7]; 616 step2[8] = step1[8]; 617 step2[9] = step1[9]; 618 temp1 = (-step1[10] + step1[13]) * cospi_16_64; 619 temp2 = (step1[10] + step1[13]) * cospi_16_64; 620 step2[10] = dct_const_round_shift(temp1); 621 step2[13] = dct_const_round_shift(temp2); 622 temp1 = (-step1[11] + step1[12]) * cospi_16_64; 623 temp2 = (step1[11] + step1[12]) * cospi_16_64; 624 step2[11] = dct_const_round_shift(temp1); 625 step2[12] = dct_const_round_shift(temp2); 626 step2[14] = step1[14]; 627 step2[15] = step1[15]; 628 629 // stage 7 630 output[0] = step2[0] + step2[15]; 631 output[1] = step2[1] + step2[14]; 632 output[2] = step2[2] + step2[13]; 633 output[3] = step2[3] + step2[12]; 634 output[4] = step2[4] + step2[11]; 635 output[5] = step2[5] + step2[10]; 636 output[6] = step2[6] + step2[9]; 637 output[7] = step2[7] + step2[8]; 638 output[8] = step2[7] - step2[8]; 639 output[9] = step2[6] - step2[9]; 640 output[10] = step2[5] - step2[10]; 641 output[11] = step2[4] - step2[11]; 642 output[12] = step2[3] - step2[12]; 643 output[13] = step2[2] - step2[13]; 644 output[14] = step2[1] - step2[14]; 645 output[15] = step2[0] - step2[15]; 646 } 647 648 void vp9_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, 649 int stride) { 650 tran_low_t out[16 * 16]; 651 tran_low_t *outptr = out; 652 int i, j; 653 tran_low_t temp_in[16], temp_out[16]; 654 655 // First transform rows 656 for (i = 0; i < 16; ++i) { 657 idct16(input, outptr); 658 input += 16; 659 outptr += 16; 660 } 661 662 // Then transform columns 663 for (i = 0; i < 16; ++i) { 664 for (j = 0; j < 16; ++j) 665 temp_in[j] = out[j * 16 + i]; 666 idct16(temp_in, temp_out); 667 for (j = 0; j < 16; ++j) 668 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 669 + dest[j * stride + i]); 670 } 671 } 672 673 static void iadst16(const tran_low_t *input, tran_low_t *output) { 674 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; 675 tran_high_t s9, s10, s11, s12, s13, s14, s15; 676 677 tran_high_t x0 = input[15]; 678 tran_high_t x1 = input[0]; 679 tran_high_t x2 = input[13]; 680 tran_high_t x3 = input[2]; 681 tran_high_t x4 = input[11]; 682 tran_high_t x5 = input[4]; 683 tran_high_t x6 = input[9]; 684 tran_high_t x7 = input[6]; 685 tran_high_t x8 = input[7]; 686 tran_high_t x9 = input[8]; 687 tran_high_t x10 = input[5]; 688 tran_high_t x11 = input[10]; 689 tran_high_t x12 = input[3]; 690 tran_high_t x13 = input[12]; 691 tran_high_t x14 = input[1]; 692 tran_high_t x15 = input[14]; 693 694 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 695 | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { 696 output[0] = output[1] = output[2] = output[3] = output[4] 697 = output[5] = output[6] = output[7] = output[8] 698 = output[9] = output[10] = output[11] = output[12] 699 = output[13] = output[14] = output[15] = 0; 700 return; 701 } 702 703 // stage 1 704 s0 = x0 * cospi_1_64 + x1 * cospi_31_64; 705 s1 = x0 * cospi_31_64 - x1 * cospi_1_64; 706 s2 = x2 * cospi_5_64 + x3 * cospi_27_64; 707 s3 = x2 * cospi_27_64 - x3 * cospi_5_64; 708 s4 = x4 * cospi_9_64 + x5 * cospi_23_64; 709 s5 = x4 * cospi_23_64 - x5 * cospi_9_64; 710 s6 = x6 * cospi_13_64 + x7 * cospi_19_64; 711 s7 = x6 * cospi_19_64 - x7 * cospi_13_64; 712 s8 = x8 * cospi_17_64 + x9 * cospi_15_64; 713 s9 = x8 * cospi_15_64 - x9 * cospi_17_64; 714 s10 = x10 * cospi_21_64 + x11 * cospi_11_64; 715 s11 = x10 * cospi_11_64 - x11 * cospi_21_64; 716 s12 = x12 * cospi_25_64 + x13 * cospi_7_64; 717 s13 = x12 * cospi_7_64 - x13 * cospi_25_64; 718 s14 = x14 * cospi_29_64 + x15 * cospi_3_64; 719 s15 = x14 * cospi_3_64 - x15 * cospi_29_64; 720 721 x0 = dct_const_round_shift(s0 + s8); 722 x1 = dct_const_round_shift(s1 + s9); 723 x2 = dct_const_round_shift(s2 + s10); 724 x3 = dct_const_round_shift(s3 + s11); 725 x4 = dct_const_round_shift(s4 + s12); 726 x5 = dct_const_round_shift(s5 + s13); 727 x6 = dct_const_round_shift(s6 + s14); 728 x7 = dct_const_round_shift(s7 + s15); 729 x8 = dct_const_round_shift(s0 - s8); 730 x9 = dct_const_round_shift(s1 - s9); 731 x10 = dct_const_round_shift(s2 - s10); 732 x11 = dct_const_round_shift(s3 - s11); 733 x12 = dct_const_round_shift(s4 - s12); 734 x13 = dct_const_round_shift(s5 - s13); 735 x14 = dct_const_round_shift(s6 - s14); 736 x15 = dct_const_round_shift(s7 - s15); 737 738 // stage 2 739 s0 = x0; 740 s1 = x1; 741 s2 = x2; 742 s3 = x3; 743 s4 = x4; 744 s5 = x5; 745 s6 = x6; 746 s7 = x7; 747 s8 = x8 * cospi_4_64 + x9 * cospi_28_64; 748 s9 = x8 * cospi_28_64 - x9 * cospi_4_64; 749 s10 = x10 * cospi_20_64 + x11 * cospi_12_64; 750 s11 = x10 * cospi_12_64 - x11 * cospi_20_64; 751 s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; 752 s13 = x12 * cospi_4_64 + x13 * cospi_28_64; 753 s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; 754 s15 = x14 * cospi_20_64 + x15 * cospi_12_64; 755 756 x0 = s0 + s4; 757 x1 = s1 + s5; 758 x2 = s2 + s6; 759 x3 = s3 + s7; 760 x4 = s0 - s4; 761 x5 = s1 - s5; 762 x6 = s2 - s6; 763 x7 = s3 - s7; 764 x8 = dct_const_round_shift(s8 + s12); 765 x9 = dct_const_round_shift(s9 + s13); 766 x10 = dct_const_round_shift(s10 + s14); 767 x11 = dct_const_round_shift(s11 + s15); 768 x12 = dct_const_round_shift(s8 - s12); 769 x13 = dct_const_round_shift(s9 - s13); 770 x14 = dct_const_round_shift(s10 - s14); 771 x15 = dct_const_round_shift(s11 - s15); 772 773 // stage 3 774 s0 = x0; 775 s1 = x1; 776 s2 = x2; 777 s3 = x3; 778 s4 = x4 * cospi_8_64 + x5 * cospi_24_64; 779 s5 = x4 * cospi_24_64 - x5 * cospi_8_64; 780 s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; 781 s7 = x6 * cospi_8_64 + x7 * cospi_24_64; 782 s8 = x8; 783 s9 = x9; 784 s10 = x10; 785 s11 = x11; 786 s12 = x12 * cospi_8_64 + x13 * cospi_24_64; 787 s13 = x12 * cospi_24_64 - x13 * cospi_8_64; 788 s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; 789 s15 = x14 * cospi_8_64 + x15 * cospi_24_64; 790 791 x0 = s0 + s2; 792 x1 = s1 + s3; 793 x2 = s0 - s2; 794 x3 = s1 - s3; 795 x4 = dct_const_round_shift(s4 + s6); 796 x5 = dct_const_round_shift(s5 + s7); 797 x6 = dct_const_round_shift(s4 - s6); 798 x7 = dct_const_round_shift(s5 - s7); 799 x8 = s8 + s10; 800 x9 = s9 + s11; 801 x10 = s8 - s10; 802 x11 = s9 - s11; 803 x12 = dct_const_round_shift(s12 + s14); 804 x13 = dct_const_round_shift(s13 + s15); 805 x14 = dct_const_round_shift(s12 - s14); 806 x15 = dct_const_round_shift(s13 - s15); 807 808 // stage 4 809 s2 = (- cospi_16_64) * (x2 + x3); 810 s3 = cospi_16_64 * (x2 - x3); 811 s6 = cospi_16_64 * (x6 + x7); 812 s7 = cospi_16_64 * (- x6 + x7); 813 s10 = cospi_16_64 * (x10 + x11); 814 s11 = cospi_16_64 * (- x10 + x11); 815 s14 = (- cospi_16_64) * (x14 + x15); 816 s15 = cospi_16_64 * (x14 - x15); 817 818 x2 = dct_const_round_shift(s2); 819 x3 = dct_const_round_shift(s3); 820 x6 = dct_const_round_shift(s6); 821 x7 = dct_const_round_shift(s7); 822 x10 = dct_const_round_shift(s10); 823 x11 = dct_const_round_shift(s11); 824 x14 = dct_const_round_shift(s14); 825 x15 = dct_const_round_shift(s15); 826 827 output[0] = x0; 828 output[1] = -x8; 829 output[2] = x12; 830 output[3] = -x4; 831 output[4] = x6; 832 output[5] = x14; 833 output[6] = x10; 834 output[7] = x2; 835 output[8] = x3; 836 output[9] = x11; 837 output[10] = x15; 838 output[11] = x7; 839 output[12] = x5; 840 output[13] = -x13; 841 output[14] = x9; 842 output[15] = -x1; 843 } 844 845 static const transform_2d IHT_16[] = { 846 { idct16, idct16 }, // DCT_DCT = 0 847 { iadst16, idct16 }, // ADST_DCT = 1 848 { idct16, iadst16 }, // DCT_ADST = 2 849 { iadst16, iadst16 } // ADST_ADST = 3 850 }; 851 852 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, 853 int tx_type) { 854 int i, j; 855 tran_low_t out[16 * 16]; 856 tran_low_t *outptr = out; 857 tran_low_t temp_in[16], temp_out[16]; 858 const transform_2d ht = IHT_16[tx_type]; 859 860 // Rows 861 for (i = 0; i < 16; ++i) { 862 ht.rows(input, outptr); 863 input += 16; 864 outptr += 16; 865 } 866 867 // Columns 868 for (i = 0; i < 16; ++i) { 869 for (j = 0; j < 16; ++j) 870 temp_in[j] = out[j * 16 + i]; 871 ht.cols(temp_in, temp_out); 872 for (j = 0; j < 16; ++j) 873 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 874 + dest[j * stride + i]); 875 } 876 } 877 878 void vp9_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, 879 int stride) { 880 tran_low_t out[16 * 16] = { 0 }; 881 tran_low_t *outptr = out; 882 int i, j; 883 tran_low_t temp_in[16], temp_out[16]; 884 885 // First transform rows. Since all non-zero dct coefficients are in 886 // upper-left 4x4 area, we only need to calculate first 4 rows here. 887 for (i = 0; i < 4; ++i) { 888 idct16(input, outptr); 889 input += 16; 890 outptr += 16; 891 } 892 893 // Then transform columns 894 for (i = 0; i < 16; ++i) { 895 for (j = 0; j < 16; ++j) 896 temp_in[j] = out[j*16 + i]; 897 idct16(temp_in, temp_out); 898 for (j = 0; j < 16; ++j) 899 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 900 + dest[j * stride + i]); 901 } 902 } 903 904 void vp9_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 905 int i, j; 906 tran_high_t a1; 907 tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); 908 out = dct_const_round_shift(out * cospi_16_64); 909 a1 = ROUND_POWER_OF_TWO(out, 6); 910 for (j = 0; j < 16; ++j) { 911 for (i = 0; i < 16; ++i) 912 dest[i] = clip_pixel(dest[i] + a1); 913 dest += stride; 914 } 915 } 916 917 static void idct32(const tran_low_t *input, tran_low_t *output) { 918 tran_low_t step1[32], step2[32]; 919 tran_high_t temp1, temp2; 920 921 // stage 1 922 step1[0] = input[0]; 923 step1[1] = input[16]; 924 step1[2] = input[8]; 925 step1[3] = input[24]; 926 step1[4] = input[4]; 927 step1[5] = input[20]; 928 step1[6] = input[12]; 929 step1[7] = input[28]; 930 step1[8] = input[2]; 931 step1[9] = input[18]; 932 step1[10] = input[10]; 933 step1[11] = input[26]; 934 step1[12] = input[6]; 935 step1[13] = input[22]; 936 step1[14] = input[14]; 937 step1[15] = input[30]; 938 939 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; 940 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; 941 step1[16] = dct_const_round_shift(temp1); 942 step1[31] = dct_const_round_shift(temp2); 943 944 temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; 945 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; 946 step1[17] = dct_const_round_shift(temp1); 947 step1[30] = dct_const_round_shift(temp2); 948 949 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; 950 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; 951 step1[18] = dct_const_round_shift(temp1); 952 step1[29] = dct_const_round_shift(temp2); 953 954 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; 955 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; 956 step1[19] = dct_const_round_shift(temp1); 957 step1[28] = dct_const_round_shift(temp2); 958 959 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; 960 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; 961 step1[20] = dct_const_round_shift(temp1); 962 step1[27] = dct_const_round_shift(temp2); 963 964 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; 965 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; 966 step1[21] = dct_const_round_shift(temp1); 967 step1[26] = dct_const_round_shift(temp2); 968 969 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; 970 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; 971 step1[22] = dct_const_round_shift(temp1); 972 step1[25] = dct_const_round_shift(temp2); 973 974 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; 975 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; 976 step1[23] = dct_const_round_shift(temp1); 977 step1[24] = dct_const_round_shift(temp2); 978 979 // stage 2 980 step2[0] = step1[0]; 981 step2[1] = step1[1]; 982 step2[2] = step1[2]; 983 step2[3] = step1[3]; 984 step2[4] = step1[4]; 985 step2[5] = step1[5]; 986 step2[6] = step1[6]; 987 step2[7] = step1[7]; 988 989 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; 990 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; 991 step2[8] = dct_const_round_shift(temp1); 992 step2[15] = dct_const_round_shift(temp2); 993 994 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; 995 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; 996 step2[9] = dct_const_round_shift(temp1); 997 step2[14] = dct_const_round_shift(temp2); 998 999 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; 1000 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; 1001 step2[10] = dct_const_round_shift(temp1); 1002 step2[13] = dct_const_round_shift(temp2); 1003 1004 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; 1005 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; 1006 step2[11] = dct_const_round_shift(temp1); 1007 step2[12] = dct_const_round_shift(temp2); 1008 1009 step2[16] = step1[16] + step1[17]; 1010 step2[17] = step1[16] - step1[17]; 1011 step2[18] = -step1[18] + step1[19]; 1012 step2[19] = step1[18] + step1[19]; 1013 step2[20] = step1[20] + step1[21]; 1014 step2[21] = step1[20] - step1[21]; 1015 step2[22] = -step1[22] + step1[23]; 1016 step2[23] = step1[22] + step1[23]; 1017 step2[24] = step1[24] + step1[25]; 1018 step2[25] = step1[24] - step1[25]; 1019 step2[26] = -step1[26] + step1[27]; 1020 step2[27] = step1[26] + step1[27]; 1021 step2[28] = step1[28] + step1[29]; 1022 step2[29] = step1[28] - step1[29]; 1023 step2[30] = -step1[30] + step1[31]; 1024 step2[31] = step1[30] + step1[31]; 1025 1026 // stage 3 1027 step1[0] = step2[0]; 1028 step1[1] = step2[1]; 1029 step1[2] = step2[2]; 1030 step1[3] = step2[3]; 1031 1032 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; 1033 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; 1034 step1[4] = dct_const_round_shift(temp1); 1035 step1[7] = dct_const_round_shift(temp2); 1036 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; 1037 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; 1038 step1[5] = dct_const_round_shift(temp1); 1039 step1[6] = dct_const_round_shift(temp2); 1040 1041 step1[8] = step2[8] + step2[9]; 1042 step1[9] = step2[8] - step2[9]; 1043 step1[10] = -step2[10] + step2[11]; 1044 step1[11] = step2[10] + step2[11]; 1045 step1[12] = step2[12] + step2[13]; 1046 step1[13] = step2[12] - step2[13]; 1047 step1[14] = -step2[14] + step2[15]; 1048 step1[15] = step2[14] + step2[15]; 1049 1050 step1[16] = step2[16]; 1051 step1[31] = step2[31]; 1052 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; 1053 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; 1054 step1[17] = dct_const_round_shift(temp1); 1055 step1[30] = dct_const_round_shift(temp2); 1056 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; 1057 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; 1058 step1[18] = dct_const_round_shift(temp1); 1059 step1[29] = dct_const_round_shift(temp2); 1060 step1[19] = step2[19]; 1061 step1[20] = step2[20]; 1062 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; 1063 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; 1064 step1[21] = dct_const_round_shift(temp1); 1065 step1[26] = dct_const_round_shift(temp2); 1066 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; 1067 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; 1068 step1[22] = dct_const_round_shift(temp1); 1069 step1[25] = dct_const_round_shift(temp2); 1070 step1[23] = step2[23]; 1071 step1[24] = step2[24]; 1072 step1[27] = step2[27]; 1073 step1[28] = step2[28]; 1074 1075 // stage 4 1076 temp1 = (step1[0] + step1[1]) * cospi_16_64; 1077 temp2 = (step1[0] - step1[1]) * cospi_16_64; 1078 step2[0] = dct_const_round_shift(temp1); 1079 step2[1] = dct_const_round_shift(temp2); 1080 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; 1081 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; 1082 step2[2] = dct_const_round_shift(temp1); 1083 step2[3] = dct_const_round_shift(temp2); 1084 step2[4] = step1[4] + step1[5]; 1085 step2[5] = step1[4] - step1[5]; 1086 step2[6] = -step1[6] + step1[7]; 1087 step2[7] = step1[6] + step1[7]; 1088 1089 step2[8] = step1[8]; 1090 step2[15] = step1[15]; 1091 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; 1092 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; 1093 step2[9] = dct_const_round_shift(temp1); 1094 step2[14] = dct_const_round_shift(temp2); 1095 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; 1096 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; 1097 step2[10] = dct_const_round_shift(temp1); 1098 step2[13] = dct_const_round_shift(temp2); 1099 step2[11] = step1[11]; 1100 step2[12] = step1[12]; 1101 1102 step2[16] = step1[16] + step1[19]; 1103 step2[17] = step1[17] + step1[18]; 1104 step2[18] = step1[17] - step1[18]; 1105 step2[19] = step1[16] - step1[19]; 1106 step2[20] = -step1[20] + step1[23]; 1107 step2[21] = -step1[21] + step1[22]; 1108 step2[22] = step1[21] + step1[22]; 1109 step2[23] = step1[20] + step1[23]; 1110 1111 step2[24] = step1[24] + step1[27]; 1112 step2[25] = step1[25] + step1[26]; 1113 step2[26] = step1[25] - step1[26]; 1114 step2[27] = step1[24] - step1[27]; 1115 step2[28] = -step1[28] + step1[31]; 1116 step2[29] = -step1[29] + step1[30]; 1117 step2[30] = step1[29] + step1[30]; 1118 step2[31] = step1[28] + step1[31]; 1119 1120 // stage 5 1121 step1[0] = step2[0] + step2[3]; 1122 step1[1] = step2[1] + step2[2]; 1123 step1[2] = step2[1] - step2[2]; 1124 step1[3] = step2[0] - step2[3]; 1125 step1[4] = step2[4]; 1126 temp1 = (step2[6] - step2[5]) * cospi_16_64; 1127 temp2 = (step2[5] + step2[6]) * cospi_16_64; 1128 step1[5] = dct_const_round_shift(temp1); 1129 step1[6] = dct_const_round_shift(temp2); 1130 step1[7] = step2[7]; 1131 1132 step1[8] = step2[8] + step2[11]; 1133 step1[9] = step2[9] + step2[10]; 1134 step1[10] = step2[9] - step2[10]; 1135 step1[11] = step2[8] - step2[11]; 1136 step1[12] = -step2[12] + step2[15]; 1137 step1[13] = -step2[13] + step2[14]; 1138 step1[14] = step2[13] + step2[14]; 1139 step1[15] = step2[12] + step2[15]; 1140 1141 step1[16] = step2[16]; 1142 step1[17] = step2[17]; 1143 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; 1144 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; 1145 step1[18] = dct_const_round_shift(temp1); 1146 step1[29] = dct_const_round_shift(temp2); 1147 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; 1148 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; 1149 step1[19] = dct_const_round_shift(temp1); 1150 step1[28] = dct_const_round_shift(temp2); 1151 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; 1152 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; 1153 step1[20] = dct_const_round_shift(temp1); 1154 step1[27] = dct_const_round_shift(temp2); 1155 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; 1156 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; 1157 step1[21] = dct_const_round_shift(temp1); 1158 step1[26] = dct_const_round_shift(temp2); 1159 step1[22] = step2[22]; 1160 step1[23] = step2[23]; 1161 step1[24] = step2[24]; 1162 step1[25] = step2[25]; 1163 step1[30] = step2[30]; 1164 step1[31] = step2[31]; 1165 1166 // stage 6 1167 step2[0] = step1[0] + step1[7]; 1168 step2[1] = step1[1] + step1[6]; 1169 step2[2] = step1[2] + step1[5]; 1170 step2[3] = step1[3] + step1[4]; 1171 step2[4] = step1[3] - step1[4]; 1172 step2[5] = step1[2] - step1[5]; 1173 step2[6] = step1[1] - step1[6]; 1174 step2[7] = step1[0] - step1[7]; 1175 step2[8] = step1[8]; 1176 step2[9] = step1[9]; 1177 temp1 = (-step1[10] + step1[13]) * cospi_16_64; 1178 temp2 = (step1[10] + step1[13]) * cospi_16_64; 1179 step2[10] = dct_const_round_shift(temp1); 1180 step2[13] = dct_const_round_shift(temp2); 1181 temp1 = (-step1[11] + step1[12]) * cospi_16_64; 1182 temp2 = (step1[11] + step1[12]) * cospi_16_64; 1183 step2[11] = dct_const_round_shift(temp1); 1184 step2[12] = dct_const_round_shift(temp2); 1185 step2[14] = step1[14]; 1186 step2[15] = step1[15]; 1187 1188 step2[16] = step1[16] + step1[23]; 1189 step2[17] = step1[17] + step1[22]; 1190 step2[18] = step1[18] + step1[21]; 1191 step2[19] = step1[19] + step1[20]; 1192 step2[20] = step1[19] - step1[20]; 1193 step2[21] = step1[18] - step1[21]; 1194 step2[22] = step1[17] - step1[22]; 1195 step2[23] = step1[16] - step1[23]; 1196 1197 step2[24] = -step1[24] + step1[31]; 1198 step2[25] = -step1[25] + step1[30]; 1199 step2[26] = -step1[26] + step1[29]; 1200 step2[27] = -step1[27] + step1[28]; 1201 step2[28] = step1[27] + step1[28]; 1202 step2[29] = step1[26] + step1[29]; 1203 step2[30] = step1[25] + step1[30]; 1204 step2[31] = step1[24] + step1[31]; 1205 1206 // stage 7 1207 step1[0] = step2[0] + step2[15]; 1208 step1[1] = step2[1] + step2[14]; 1209 step1[2] = step2[2] + step2[13]; 1210 step1[3] = step2[3] + step2[12]; 1211 step1[4] = step2[4] + step2[11]; 1212 step1[5] = step2[5] + step2[10]; 1213 step1[6] = step2[6] + step2[9]; 1214 step1[7] = step2[7] + step2[8]; 1215 step1[8] = step2[7] - step2[8]; 1216 step1[9] = step2[6] - step2[9]; 1217 step1[10] = step2[5] - step2[10]; 1218 step1[11] = step2[4] - step2[11]; 1219 step1[12] = step2[3] - step2[12]; 1220 step1[13] = step2[2] - step2[13]; 1221 step1[14] = step2[1] - step2[14]; 1222 step1[15] = step2[0] - step2[15]; 1223 1224 step1[16] = step2[16]; 1225 step1[17] = step2[17]; 1226 step1[18] = step2[18]; 1227 step1[19] = step2[19]; 1228 temp1 = (-step2[20] + step2[27]) * cospi_16_64; 1229 temp2 = (step2[20] + step2[27]) * cospi_16_64; 1230 step1[20] = dct_const_round_shift(temp1); 1231 step1[27] = dct_const_round_shift(temp2); 1232 temp1 = (-step2[21] + step2[26]) * cospi_16_64; 1233 temp2 = (step2[21] + step2[26]) * cospi_16_64; 1234 step1[21] = dct_const_round_shift(temp1); 1235 step1[26] = dct_const_round_shift(temp2); 1236 temp1 = (-step2[22] + step2[25]) * cospi_16_64; 1237 temp2 = (step2[22] + step2[25]) * cospi_16_64; 1238 step1[22] = dct_const_round_shift(temp1); 1239 step1[25] = dct_const_round_shift(temp2); 1240 temp1 = (-step2[23] + step2[24]) * cospi_16_64; 1241 temp2 = (step2[23] + step2[24]) * cospi_16_64; 1242 step1[23] = dct_const_round_shift(temp1); 1243 step1[24] = dct_const_round_shift(temp2); 1244 step1[28] = step2[28]; 1245 step1[29] = step2[29]; 1246 step1[30] = step2[30]; 1247 step1[31] = step2[31]; 1248 1249 // final stage 1250 output[0] = step1[0] + step1[31]; 1251 output[1] = step1[1] + step1[30]; 1252 output[2] = step1[2] + step1[29]; 1253 output[3] = step1[3] + step1[28]; 1254 output[4] = step1[4] + step1[27]; 1255 output[5] = step1[5] + step1[26]; 1256 output[6] = step1[6] + step1[25]; 1257 output[7] = step1[7] + step1[24]; 1258 output[8] = step1[8] + step1[23]; 1259 output[9] = step1[9] + step1[22]; 1260 output[10] = step1[10] + step1[21]; 1261 output[11] = step1[11] + step1[20]; 1262 output[12] = step1[12] + step1[19]; 1263 output[13] = step1[13] + step1[18]; 1264 output[14] = step1[14] + step1[17]; 1265 output[15] = step1[15] + step1[16]; 1266 output[16] = step1[15] - step1[16]; 1267 output[17] = step1[14] - step1[17]; 1268 output[18] = step1[13] - step1[18]; 1269 output[19] = step1[12] - step1[19]; 1270 output[20] = step1[11] - step1[20]; 1271 output[21] = step1[10] - step1[21]; 1272 output[22] = step1[9] - step1[22]; 1273 output[23] = step1[8] - step1[23]; 1274 output[24] = step1[7] - step1[24]; 1275 output[25] = step1[6] - step1[25]; 1276 output[26] = step1[5] - step1[26]; 1277 output[27] = step1[4] - step1[27]; 1278 output[28] = step1[3] - step1[28]; 1279 output[29] = step1[2] - step1[29]; 1280 output[30] = step1[1] - step1[30]; 1281 output[31] = step1[0] - step1[31]; 1282 } 1283 1284 void vp9_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, 1285 int stride) { 1286 tran_low_t out[32 * 32]; 1287 tran_low_t *outptr = out; 1288 int i, j; 1289 tran_low_t temp_in[32], temp_out[32]; 1290 1291 // Rows 1292 for (i = 0; i < 32; ++i) { 1293 int16_t zero_coeff[16]; 1294 for (j = 0; j < 16; ++j) 1295 zero_coeff[j] = input[2 * j] | input[2 * j + 1]; 1296 for (j = 0; j < 8; ++j) 1297 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; 1298 for (j = 0; j < 4; ++j) 1299 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; 1300 for (j = 0; j < 2; ++j) 1301 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; 1302 1303 if (zero_coeff[0] | zero_coeff[1]) 1304 idct32(input, outptr); 1305 else 1306 vpx_memset(outptr, 0, sizeof(tran_low_t) * 32); 1307 input += 32; 1308 outptr += 32; 1309 } 1310 1311 // Columns 1312 for (i = 0; i < 32; ++i) { 1313 for (j = 0; j < 32; ++j) 1314 temp_in[j] = out[j * 32 + i]; 1315 idct32(temp_in, temp_out); 1316 for (j = 0; j < 32; ++j) 1317 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 1318 + dest[j * stride + i]); 1319 } 1320 } 1321 1322 void vp9_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, 1323 int stride) { 1324 tran_low_t out[32 * 32] = {0}; 1325 tran_low_t *outptr = out; 1326 int i, j; 1327 tran_low_t temp_in[32], temp_out[32]; 1328 1329 // Rows 1330 // only upper-left 8x8 has non-zero coeff 1331 for (i = 0; i < 8; ++i) { 1332 idct32(input, outptr); 1333 input += 32; 1334 outptr += 32; 1335 } 1336 1337 // Columns 1338 for (i = 0; i < 32; ++i) { 1339 for (j = 0; j < 32; ++j) 1340 temp_in[j] = out[j * 32 + i]; 1341 idct32(temp_in, temp_out); 1342 for (j = 0; j < 32; ++j) 1343 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 1344 + dest[j * stride + i]); 1345 } 1346 } 1347 1348 void vp9_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 1349 int i, j; 1350 tran_high_t a1; 1351 1352 tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64); 1353 out = dct_const_round_shift(out * cospi_16_64); 1354 a1 = ROUND_POWER_OF_TWO(out, 6); 1355 1356 for (j = 0; j < 32; ++j) { 1357 for (i = 0; i < 32; ++i) 1358 dest[i] = clip_pixel(dest[i] + a1); 1359 dest += stride; 1360 } 1361 } 1362 1363 // idct 1364 void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, 1365 int eob) { 1366 if (eob > 1) 1367 vp9_idct4x4_16_add(input, dest, stride); 1368 else 1369 vp9_idct4x4_1_add(input, dest, stride); 1370 } 1371 1372 1373 void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, 1374 int eob) { 1375 if (eob > 1) 1376 vp9_iwht4x4_16_add(input, dest, stride); 1377 else 1378 vp9_iwht4x4_1_add(input, dest, stride); 1379 } 1380 1381 void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, 1382 int eob) { 1383 // If dc is 1, then input[0] is the reconstructed value, do not need 1384 // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. 1385 1386 // The calculation can be simplified if there are not many non-zero dct 1387 // coefficients. Use eobs to decide what to do. 1388 // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c. 1389 // Combine that with code here. 1390 if (eob == 1) 1391 // DC only DCT coefficient 1392 vp9_idct8x8_1_add(input, dest, stride); 1393 else if (eob <= 12) 1394 vp9_idct8x8_12_add(input, dest, stride); 1395 else 1396 vp9_idct8x8_64_add(input, dest, stride); 1397 } 1398 1399 void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, 1400 int eob) { 1401 /* The calculation can be simplified if there are not many non-zero dct 1402 * coefficients. Use eobs to separate different cases. */ 1403 if (eob == 1) 1404 /* DC only DCT coefficient. */ 1405 vp9_idct16x16_1_add(input, dest, stride); 1406 else if (eob <= 10) 1407 vp9_idct16x16_10_add(input, dest, stride); 1408 else 1409 vp9_idct16x16_256_add(input, dest, stride); 1410 } 1411 1412 void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride, 1413 int eob) { 1414 if (eob == 1) 1415 vp9_idct32x32_1_add(input, dest, stride); 1416 else if (eob <= 34) 1417 // non-zero coeff only in upper-left 8x8 1418 vp9_idct32x32_34_add(input, dest, stride); 1419 else 1420 vp9_idct32x32_1024_add(input, dest, stride); 1421 } 1422 1423 // iht 1424 void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, 1425 int stride, int eob) { 1426 if (tx_type == DCT_DCT) 1427 vp9_idct4x4_add(input, dest, stride, eob); 1428 else 1429 vp9_iht4x4_16_add(input, dest, stride, tx_type); 1430 } 1431 1432 void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, 1433 int stride, int eob) { 1434 if (tx_type == DCT_DCT) { 1435 vp9_idct8x8_add(input, dest, stride, eob); 1436 } else { 1437 vp9_iht8x8_64_add(input, dest, stride, tx_type); 1438 } 1439 } 1440 1441 void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest, 1442 int stride, int eob) { 1443 if (tx_type == DCT_DCT) { 1444 vp9_idct16x16_add(input, dest, stride, eob); 1445 } else { 1446 vp9_iht16x16_256_add(input, dest, stride, tx_type); 1447 } 1448 } 1449 1450 #if CONFIG_VP9_HIGHBITDEPTH 1451 void vp9_high_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, 1452 int stride, int bd) { 1453 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 1454 0.5 shifts per pixel. */ 1455 int i; 1456 tran_low_t output[16]; 1457 tran_high_t a1, b1, c1, d1, e1; 1458 const tran_low_t *ip = input; 1459 tran_low_t *op = output; 1460 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1461 1462 for (i = 0; i < 4; i++) { 1463 a1 = ip[0] >> UNIT_QUANT_SHIFT; 1464 c1 = ip[1] >> UNIT_QUANT_SHIFT; 1465 d1 = ip[2] >> UNIT_QUANT_SHIFT; 1466 b1 = ip[3] >> UNIT_QUANT_SHIFT; 1467 a1 += c1; 1468 d1 -= b1; 1469 e1 = (a1 - d1) >> 1; 1470 b1 = e1 - b1; 1471 c1 = e1 - c1; 1472 a1 -= b1; 1473 d1 += c1; 1474 op[0] = WRAPLOW(a1); 1475 op[1] = WRAPLOW(b1); 1476 op[2] = WRAPLOW(c1); 1477 op[3] = WRAPLOW(d1); 1478 ip += 4; 1479 op += 4; 1480 } 1481 1482 ip = output; 1483 for (i = 0; i < 4; i++) { 1484 a1 = ip[4 * 0]; 1485 c1 = ip[4 * 1]; 1486 d1 = ip[4 * 2]; 1487 b1 = ip[4 * 3]; 1488 a1 += c1; 1489 d1 -= b1; 1490 e1 = (a1 - d1) >> 1; 1491 b1 = e1 - b1; 1492 c1 = e1 - c1; 1493 a1 -= b1; 1494 d1 += c1; 1495 dest[stride * 0] = clip_pixel_bd_high(dest[stride * 0], a1, bd); 1496 dest[stride * 1] = clip_pixel_bd_high(dest[stride * 1], b1, bd); 1497 dest[stride * 2] = clip_pixel_bd_high(dest[stride * 2], c1, bd); 1498 dest[stride * 3] = clip_pixel_bd_high(dest[stride * 3], d1, bd); 1499 1500 ip++; 1501 dest++; 1502 } 1503 } 1504 1505 static void high_idct4(const tran_low_t *input, tran_low_t *output, int bd) { 1506 tran_low_t step[4]; 1507 tran_high_t temp1, temp2; 1508 (void) bd; 1509 // stage 1 1510 temp1 = (input[0] + input[2]) * cospi_16_64; 1511 temp2 = (input[0] - input[2]) * cospi_16_64; 1512 step[0] = WRAPLOW(dct_const_round_shift(temp1)); 1513 step[1] = WRAPLOW(dct_const_round_shift(temp2)); 1514 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; 1515 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; 1516 step[2] = WRAPLOW(dct_const_round_shift(temp1)); 1517 step[3] = WRAPLOW(dct_const_round_shift(temp2)); 1518 1519 // stage 2 1520 output[0] = WRAPLOW(step[0] + step[3]); 1521 output[1] = WRAPLOW(step[1] + step[2]); 1522 output[2] = WRAPLOW(step[1] - step[2]); 1523 output[3] = WRAPLOW(step[0] - step[3]); 1524 } 1525 1526 void vp9_high_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, 1527 int dest_stride, int bd) { 1528 int i; 1529 tran_high_t a1, e1; 1530 tran_low_t tmp[4]; 1531 const tran_low_t *ip = in; 1532 tran_low_t *op = tmp; 1533 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1534 (void) bd; 1535 1536 a1 = ip[0] >> UNIT_QUANT_SHIFT; 1537 e1 = a1 >> 1; 1538 a1 -= e1; 1539 op[0] = WRAPLOW(a1); 1540 op[1] = op[2] = op[3] = WRAPLOW(e1); 1541 1542 ip = tmp; 1543 for (i = 0; i < 4; i++) { 1544 e1 = ip[0] >> 1; 1545 a1 = ip[0] - e1; 1546 dest[dest_stride * 0] = clip_pixel_bd_high(dest[dest_stride * 0], a1, bd); 1547 dest[dest_stride * 1] = clip_pixel_bd_high(dest[dest_stride * 1], e1, bd); 1548 dest[dest_stride * 2] = clip_pixel_bd_high(dest[dest_stride * 2], e1, bd); 1549 dest[dest_stride * 3] = clip_pixel_bd_high(dest[dest_stride * 3], e1, bd); 1550 ip++; 1551 dest++; 1552 } 1553 } 1554 1555 void vp9_high_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, 1556 int stride, int bd) { 1557 tran_low_t out[4 * 4]; 1558 tran_low_t *outptr = out; 1559 int i, j; 1560 tran_low_t temp_in[4], temp_out[4]; 1561 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1562 1563 // Rows 1564 for (i = 0; i < 4; ++i) { 1565 high_idct4(input, outptr, bd); 1566 input += 4; 1567 outptr += 4; 1568 } 1569 1570 // Columns 1571 for (i = 0; i < 4; ++i) { 1572 for (j = 0; j < 4; ++j) 1573 temp_in[j] = out[j * 4 + i]; 1574 high_idct4(temp_in, temp_out, bd); 1575 for (j = 0; j < 4; ++j) 1576 dest[j * stride + i] = clip_pixel_bd_high( 1577 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); 1578 } 1579 } 1580 1581 void vp9_high_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, 1582 int dest_stride, int bd) { 1583 int i; 1584 tran_high_t a1; 1585 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); 1586 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1587 1588 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); 1589 a1 = ROUND_POWER_OF_TWO(out, 4); 1590 1591 for (i = 0; i < 4; i++) { 1592 dest[0] = clip_pixel_bd_high(dest[0], a1, bd); 1593 dest[1] = clip_pixel_bd_high(dest[1], a1, bd); 1594 dest[2] = clip_pixel_bd_high(dest[2], a1, bd); 1595 dest[3] = clip_pixel_bd_high(dest[3], a1, bd); 1596 dest += dest_stride; 1597 } 1598 } 1599 1600 static void high_idct8(const tran_low_t *input, tran_low_t *output, int bd) { 1601 tran_low_t step1[8], step2[8]; 1602 tran_high_t temp1, temp2; 1603 // stage 1 1604 step1[0] = input[0]; 1605 step1[2] = input[4]; 1606 step1[1] = input[2]; 1607 step1[3] = input[6]; 1608 temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; 1609 temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; 1610 step1[4] = WRAPLOW(dct_const_round_shift(temp1)); 1611 step1[7] = WRAPLOW(dct_const_round_shift(temp2)); 1612 temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; 1613 temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; 1614 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); 1615 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); 1616 1617 // stage 2 & stage 3 - even half 1618 high_idct4(step1, step1, bd); 1619 1620 // stage 2 - odd half 1621 step2[4] = WRAPLOW(step1[4] + step1[5]); 1622 step2[5] = WRAPLOW(step1[4] - step1[5]); 1623 step2[6] = WRAPLOW(-step1[6] + step1[7]); 1624 step2[7] = WRAPLOW(step1[6] + step1[7]); 1625 1626 // stage 3 - odd half 1627 step1[4] = step2[4]; 1628 temp1 = (step2[6] - step2[5]) * cospi_16_64; 1629 temp2 = (step2[5] + step2[6]) * cospi_16_64; 1630 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); 1631 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); 1632 step1[7] = step2[7]; 1633 1634 // stage 4 1635 output[0] = WRAPLOW(step1[0] + step1[7]); 1636 output[1] = WRAPLOW(step1[1] + step1[6]); 1637 output[2] = WRAPLOW(step1[2] + step1[5]); 1638 output[3] = WRAPLOW(step1[3] + step1[4]); 1639 output[4] = WRAPLOW(step1[3] - step1[4]); 1640 output[5] = WRAPLOW(step1[2] - step1[5]); 1641 output[6] = WRAPLOW(step1[1] - step1[6]); 1642 output[7] = WRAPLOW(step1[0] - step1[7]); 1643 } 1644 1645 void vp9_high_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, 1646 int stride, int bd) { 1647 tran_low_t out[8 * 8]; 1648 tran_low_t *outptr = out; 1649 int i, j; 1650 tran_low_t temp_in[8], temp_out[8]; 1651 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1652 1653 // First transform rows. 1654 for (i = 0; i < 8; ++i) { 1655 high_idct8(input, outptr, bd); 1656 input += 8; 1657 outptr += 8; 1658 } 1659 1660 // Then transform columns. 1661 for (i = 0; i < 8; ++i) { 1662 for (j = 0; j < 8; ++j) 1663 temp_in[j] = out[j * 8 + i]; 1664 high_idct8(temp_in, temp_out, bd); 1665 for (j = 0; j < 8; ++j) 1666 dest[j * stride + i] = clip_pixel_bd_high(dest[j * stride + i], 1667 ROUND_POWER_OF_TWO(temp_out[j], 5), 1668 bd); 1669 } 1670 } 1671 1672 void vp9_high_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, 1673 int stride, int bd) { 1674 int i, j; 1675 tran_high_t a1; 1676 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); 1677 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1678 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); 1679 a1 = ROUND_POWER_OF_TWO(out, 5); 1680 for (j = 0; j < 8; ++j) { 1681 for (i = 0; i < 8; ++i) 1682 dest[i] = clip_pixel_bd_high(dest[i], a1, bd); 1683 dest += stride; 1684 } 1685 } 1686 1687 static void high_iadst4(const tran_low_t *input, tran_low_t *output, int bd) { 1688 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; 1689 1690 tran_high_t x0 = input[0]; 1691 tran_high_t x1 = input[1]; 1692 tran_high_t x2 = input[2]; 1693 tran_high_t x3 = input[3]; 1694 (void) bd; 1695 1696 if (!(x0 | x1 | x2 | x3)) { 1697 vpx_memset(output, 0, 4 * sizeof(*output)); 1698 return; 1699 } 1700 1701 s0 = sinpi_1_9 * x0; 1702 s1 = sinpi_2_9 * x0; 1703 s2 = sinpi_3_9 * x1; 1704 s3 = sinpi_4_9 * x2; 1705 s4 = sinpi_1_9 * x2; 1706 s5 = sinpi_2_9 * x3; 1707 s6 = sinpi_4_9 * x3; 1708 s7 = x0 - x2 + x3; 1709 1710 x0 = s0 + s3 + s5; 1711 x1 = s1 - s4 - s6; 1712 x2 = sinpi_3_9 * s7; 1713 x3 = s2; 1714 1715 s0 = x0 + x3; 1716 s1 = x1 + x3; 1717 s2 = x2; 1718 s3 = x0 + x1 - x3; 1719 1720 // 1-D transform scaling factor is sqrt(2). 1721 // The overall dynamic range is 14b (input) + 14b (multiplication scaling) 1722 // + 1b (addition) = 29b. 1723 // Hence the output bit depth is 15b. 1724 output[0] = WRAPLOW(dct_const_round_shift(s0)); 1725 output[1] = WRAPLOW(dct_const_round_shift(s1)); 1726 output[2] = WRAPLOW(dct_const_round_shift(s2)); 1727 output[3] = WRAPLOW(dct_const_round_shift(s3)); 1728 } 1729 1730 void vp9_high_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, 1731 int stride, int tx_type, int bd) { 1732 const high_transform_2d IHT_4[] = { 1733 { high_idct4, high_idct4 }, // DCT_DCT = 0 1734 { high_iadst4, high_idct4 }, // ADST_DCT = 1 1735 { high_idct4, high_iadst4 }, // DCT_ADST = 2 1736 { high_iadst4, high_iadst4 } // ADST_ADST = 3 1737 }; 1738 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1739 1740 int i, j; 1741 tran_low_t out[4 * 4]; 1742 tran_low_t *outptr = out; 1743 tran_low_t temp_in[4], temp_out[4]; 1744 1745 // Inverse transform row vectors. 1746 for (i = 0; i < 4; ++i) { 1747 IHT_4[tx_type].rows(input, outptr, bd); 1748 input += 4; 1749 outptr += 4; 1750 } 1751 1752 // Inverse transform column vectors. 1753 for (i = 0; i < 4; ++i) { 1754 for (j = 0; j < 4; ++j) 1755 temp_in[j] = out[j * 4 + i]; 1756 IHT_4[tx_type].cols(temp_in, temp_out, bd); 1757 for (j = 0; j < 4; ++j) 1758 dest[j * stride + i] = clip_pixel_bd_high( 1759 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); 1760 } 1761 } 1762 1763 static void high_iadst8(const tran_low_t *input, tran_low_t *output, int bd) { 1764 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; 1765 1766 tran_high_t x0 = input[7]; 1767 tran_high_t x1 = input[0]; 1768 tran_high_t x2 = input[5]; 1769 tran_high_t x3 = input[2]; 1770 tran_high_t x4 = input[3]; 1771 tran_high_t x5 = input[4]; 1772 tran_high_t x6 = input[1]; 1773 tran_high_t x7 = input[6]; 1774 (void) bd; 1775 1776 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { 1777 vpx_memset(output, 0, 8 * sizeof(*output)); 1778 return; 1779 } 1780 1781 // stage 1 1782 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; 1783 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; 1784 s2 = cospi_10_64 * x2 + cospi_22_64 * x3; 1785 s3 = cospi_22_64 * x2 - cospi_10_64 * x3; 1786 s4 = cospi_18_64 * x4 + cospi_14_64 * x5; 1787 s5 = cospi_14_64 * x4 - cospi_18_64 * x5; 1788 s6 = cospi_26_64 * x6 + cospi_6_64 * x7; 1789 s7 = cospi_6_64 * x6 - cospi_26_64 * x7; 1790 1791 x0 = WRAPLOW(dct_const_round_shift(s0 + s4)); 1792 x1 = WRAPLOW(dct_const_round_shift(s1 + s5)); 1793 x2 = WRAPLOW(dct_const_round_shift(s2 + s6)); 1794 x3 = WRAPLOW(dct_const_round_shift(s3 + s7)); 1795 x4 = WRAPLOW(dct_const_round_shift(s0 - s4)); 1796 x5 = WRAPLOW(dct_const_round_shift(s1 - s5)); 1797 x6 = WRAPLOW(dct_const_round_shift(s2 - s6)); 1798 x7 = WRAPLOW(dct_const_round_shift(s3 - s7)); 1799 1800 // stage 2 1801 s0 = x0; 1802 s1 = x1; 1803 s2 = x2; 1804 s3 = x3; 1805 s4 = cospi_8_64 * x4 + cospi_24_64 * x5; 1806 s5 = cospi_24_64 * x4 - cospi_8_64 * x5; 1807 s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; 1808 s7 = cospi_8_64 * x6 + cospi_24_64 * x7; 1809 1810 x0 = s0 + s2; 1811 x1 = s1 + s3; 1812 x2 = s0 - s2; 1813 x3 = s1 - s3; 1814 x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); 1815 x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); 1816 x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); 1817 x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); 1818 1819 // stage 3 1820 s2 = cospi_16_64 * (x2 + x3); 1821 s3 = cospi_16_64 * (x2 - x3); 1822 s6 = cospi_16_64 * (x6 + x7); 1823 s7 = cospi_16_64 * (x6 - x7); 1824 1825 x2 = WRAPLOW(dct_const_round_shift(s2)); 1826 x3 = WRAPLOW(dct_const_round_shift(s3)); 1827 x6 = WRAPLOW(dct_const_round_shift(s6)); 1828 x7 = WRAPLOW(dct_const_round_shift(s7)); 1829 1830 output[0] = WRAPLOW(x0); 1831 output[1] = WRAPLOW(-x4); 1832 output[2] = WRAPLOW(x6); 1833 output[3] = WRAPLOW(-x2); 1834 output[4] = WRAPLOW(x3); 1835 output[5] = WRAPLOW(-x7); 1836 output[6] = WRAPLOW(x5); 1837 output[7] = WRAPLOW(-x1); 1838 } 1839 1840 static const high_transform_2d HIGH_IHT_8[] = { 1841 { high_idct8, high_idct8 }, // DCT_DCT = 0 1842 { high_iadst8, high_idct8 }, // ADST_DCT = 1 1843 { high_idct8, high_iadst8 }, // DCT_ADST = 2 1844 { high_iadst8, high_iadst8 } // ADST_ADST = 3 1845 }; 1846 1847 void vp9_high_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, 1848 int stride, int tx_type, int bd) { 1849 int i, j; 1850 tran_low_t out[8 * 8]; 1851 tran_low_t *outptr = out; 1852 tran_low_t temp_in[8], temp_out[8]; 1853 const high_transform_2d ht = HIGH_IHT_8[tx_type]; 1854 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1855 1856 // Inverse transform row vectors. 1857 for (i = 0; i < 8; ++i) { 1858 ht.rows(input, outptr, bd); 1859 input += 8; 1860 outptr += 8; 1861 } 1862 1863 // Inverse transform column vectors. 1864 for (i = 0; i < 8; ++i) { 1865 for (j = 0; j < 8; ++j) 1866 temp_in[j] = out[j * 8 + i]; 1867 ht.cols(temp_in, temp_out, bd); 1868 for (j = 0; j < 8; ++j) 1869 dest[j * stride + i] = clip_pixel_bd_high( 1870 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); 1871 } 1872 } 1873 1874 void vp9_high_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8, 1875 int stride, int bd) { 1876 tran_low_t out[8 * 8] = { 0 }; 1877 tran_low_t *outptr = out; 1878 int i, j; 1879 tran_low_t temp_in[8], temp_out[8]; 1880 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 1881 1882 // First transform rows. 1883 // Only first 4 row has non-zero coefs. 1884 for (i = 0; i < 4; ++i) { 1885 high_idct8(input, outptr, bd); 1886 input += 8; 1887 outptr += 8; 1888 } 1889 // Then transform columns. 1890 for (i = 0; i < 8; ++i) { 1891 for (j = 0; j < 8; ++j) 1892 temp_in[j] = out[j * 8 + i]; 1893 high_idct8(temp_in, temp_out, bd); 1894 for (j = 0; j < 8; ++j) 1895 dest[j * stride + i] = clip_pixel_bd_high( 1896 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); 1897 } 1898 } 1899 1900 static void high_idct16(const tran_low_t *input, tran_low_t *output, int bd) { 1901 tran_low_t step1[16], step2[16]; 1902 tran_high_t temp1, temp2; 1903 (void) bd; 1904 1905 // stage 1 1906 step1[0] = input[0/2]; 1907 step1[1] = input[16/2]; 1908 step1[2] = input[8/2]; 1909 step1[3] = input[24/2]; 1910 step1[4] = input[4/2]; 1911 step1[5] = input[20/2]; 1912 step1[6] = input[12/2]; 1913 step1[7] = input[28/2]; 1914 step1[8] = input[2/2]; 1915 step1[9] = input[18/2]; 1916 step1[10] = input[10/2]; 1917 step1[11] = input[26/2]; 1918 step1[12] = input[6/2]; 1919 step1[13] = input[22/2]; 1920 step1[14] = input[14/2]; 1921 step1[15] = input[30/2]; 1922 1923 // stage 2 1924 step2[0] = step1[0]; 1925 step2[1] = step1[1]; 1926 step2[2] = step1[2]; 1927 step2[3] = step1[3]; 1928 step2[4] = step1[4]; 1929 step2[5] = step1[5]; 1930 step2[6] = step1[6]; 1931 step2[7] = step1[7]; 1932 1933 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; 1934 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; 1935 step2[8] = WRAPLOW(dct_const_round_shift(temp1)); 1936 step2[15] = WRAPLOW(dct_const_round_shift(temp2)); 1937 1938 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; 1939 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; 1940 step2[9] = WRAPLOW(dct_const_round_shift(temp1)); 1941 step2[14] = WRAPLOW(dct_const_round_shift(temp2)); 1942 1943 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; 1944 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; 1945 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); 1946 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); 1947 1948 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; 1949 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; 1950 step2[11] = WRAPLOW(dct_const_round_shift(temp1)); 1951 step2[12] = WRAPLOW(dct_const_round_shift(temp2)); 1952 1953 // stage 3 1954 step1[0] = step2[0]; 1955 step1[1] = step2[1]; 1956 step1[2] = step2[2]; 1957 step1[3] = step2[3]; 1958 1959 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; 1960 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; 1961 step1[4] = WRAPLOW(dct_const_round_shift(temp1)); 1962 step1[7] = WRAPLOW(dct_const_round_shift(temp2)); 1963 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; 1964 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; 1965 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); 1966 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); 1967 1968 step1[8] = WRAPLOW(step2[8] + step2[9]); 1969 step1[9] = WRAPLOW(step2[8] - step2[9]); 1970 step1[10] = WRAPLOW(-step2[10] + step2[11]); 1971 step1[11] = WRAPLOW(step2[10] + step2[11]); 1972 step1[12] = WRAPLOW(step2[12] + step2[13]); 1973 step1[13] = WRAPLOW(step2[12] - step2[13]); 1974 step1[14] = WRAPLOW(-step2[14] + step2[15]); 1975 step1[15] = WRAPLOW(step2[14] + step2[15]); 1976 1977 // stage 4 1978 temp1 = (step1[0] + step1[1]) * cospi_16_64; 1979 temp2 = (step1[0] - step1[1]) * cospi_16_64; 1980 step2[0] = WRAPLOW(dct_const_round_shift(temp1)); 1981 step2[1] = WRAPLOW(dct_const_round_shift(temp2)); 1982 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; 1983 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; 1984 step2[2] = WRAPLOW(dct_const_round_shift(temp1)); 1985 step2[3] = WRAPLOW(dct_const_round_shift(temp2)); 1986 step2[4] = WRAPLOW(step1[4] + step1[5]); 1987 step2[5] = WRAPLOW(step1[4] - step1[5]); 1988 step2[6] = WRAPLOW(-step1[6] + step1[7]); 1989 step2[7] = WRAPLOW(step1[6] + step1[7]); 1990 1991 step2[8] = step1[8]; 1992 step2[15] = step1[15]; 1993 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; 1994 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; 1995 step2[9] = WRAPLOW(dct_const_round_shift(temp1)); 1996 step2[14] = WRAPLOW(dct_const_round_shift(temp2)); 1997 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; 1998 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; 1999 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); 2000 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); 2001 step2[11] = step1[11]; 2002 step2[12] = step1[12]; 2003 2004 // stage 5 2005 step1[0] = WRAPLOW(step2[0] + step2[3]); 2006 step1[1] = WRAPLOW(step2[1] + step2[2]); 2007 step1[2] = WRAPLOW(step2[1] - step2[2]); 2008 step1[3] = WRAPLOW(step2[0] - step2[3]); 2009 step1[4] = step2[4]; 2010 temp1 = (step2[6] - step2[5]) * cospi_16_64; 2011 temp2 = (step2[5] + step2[6]) * cospi_16_64; 2012 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); 2013 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); 2014 step1[7] = step2[7]; 2015 2016 step1[8] = WRAPLOW(step2[8] + step2[11]); 2017 step1[9] = WRAPLOW(step2[9] + step2[10]); 2018 step1[10] = WRAPLOW(step2[9] - step2[10]); 2019 step1[11] = WRAPLOW(step2[8] - step2[11]); 2020 step1[12] = WRAPLOW(-step2[12] + step2[15]); 2021 step1[13] = WRAPLOW(-step2[13] + step2[14]); 2022 step1[14] = WRAPLOW(step2[13] + step2[14]); 2023 step1[15] = WRAPLOW(step2[12] + step2[15]); 2024 2025 // stage 6 2026 step2[0] = WRAPLOW(step1[0] + step1[7]); 2027 step2[1] = WRAPLOW(step1[1] + step1[6]); 2028 step2[2] = WRAPLOW(step1[2] + step1[5]); 2029 step2[3] = WRAPLOW(step1[3] + step1[4]); 2030 step2[4] = WRAPLOW(step1[3] - step1[4]); 2031 step2[5] = WRAPLOW(step1[2] - step1[5]); 2032 step2[6] = WRAPLOW(step1[1] - step1[6]); 2033 step2[7] = WRAPLOW(step1[0] - step1[7]); 2034 step2[8] = step1[8]; 2035 step2[9] = step1[9]; 2036 temp1 = (-step1[10] + step1[13]) * cospi_16_64; 2037 temp2 = (step1[10] + step1[13]) * cospi_16_64; 2038 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); 2039 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); 2040 temp1 = (-step1[11] + step1[12]) * cospi_16_64; 2041 temp2 = (step1[11] + step1[12]) * cospi_16_64; 2042 step2[11] = WRAPLOW(dct_const_round_shift(temp1)); 2043 step2[12] = WRAPLOW(dct_const_round_shift(temp2)); 2044 step2[14] = step1[14]; 2045 step2[15] = step1[15]; 2046 2047 // stage 7 2048 output[0] = WRAPLOW(step2[0] + step2[15]); 2049 output[1] = WRAPLOW(step2[1] + step2[14]); 2050 output[2] = WRAPLOW(step2[2] + step2[13]); 2051 output[3] = WRAPLOW(step2[3] + step2[12]); 2052 output[4] = WRAPLOW(step2[4] + step2[11]); 2053 output[5] = WRAPLOW(step2[5] + step2[10]); 2054 output[6] = WRAPLOW(step2[6] + step2[9]); 2055 output[7] = WRAPLOW(step2[7] + step2[8]); 2056 output[8] = WRAPLOW(step2[7] - step2[8]); 2057 output[9] = WRAPLOW(step2[6] - step2[9]); 2058 output[10] = WRAPLOW(step2[5] - step2[10]); 2059 output[11] = WRAPLOW(step2[4] - step2[11]); 2060 output[12] = WRAPLOW(step2[3] - step2[12]); 2061 output[13] = WRAPLOW(step2[2] - step2[13]); 2062 output[14] = WRAPLOW(step2[1] - step2[14]); 2063 output[15] = WRAPLOW(step2[0] - step2[15]); 2064 } 2065 2066 void vp9_high_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, 2067 int stride, int bd) { 2068 tran_low_t out[16 * 16]; 2069 tran_low_t *outptr = out; 2070 int i, j; 2071 tran_low_t temp_in[16], temp_out[16]; 2072 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 2073 2074 // First transform rows. 2075 for (i = 0; i < 16; ++i) { 2076 high_idct16(input, outptr, bd); 2077 input += 16; 2078 outptr += 16; 2079 } 2080 2081 // Then transform columns. 2082 for (i = 0; i < 16; ++i) { 2083 for (j = 0; j < 16; ++j) 2084 temp_in[j] = out[j * 16 + i]; 2085 high_idct16(temp_in, temp_out, bd); 2086 for (j = 0; j < 16; ++j) 2087 dest[j * stride + i] = clip_pixel_bd_high( 2088 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 2089 } 2090 } 2091 2092 static void high_iadst16(const tran_low_t *input, tran_low_t *output, int bd) { 2093 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; 2094 tran_high_t s9, s10, s11, s12, s13, s14, s15; 2095 2096 tran_high_t x0 = input[15]; 2097 tran_high_t x1 = input[0]; 2098 tran_high_t x2 = input[13]; 2099 tran_high_t x3 = input[2]; 2100 tran_high_t x4 = input[11]; 2101 tran_high_t x5 = input[4]; 2102 tran_high_t x6 = input[9]; 2103 tran_high_t x7 = input[6]; 2104 tran_high_t x8 = input[7]; 2105 tran_high_t x9 = input[8]; 2106 tran_high_t x10 = input[5]; 2107 tran_high_t x11 = input[10]; 2108 tran_high_t x12 = input[3]; 2109 tran_high_t x13 = input[12]; 2110 tran_high_t x14 = input[1]; 2111 tran_high_t x15 = input[14]; 2112 (void) bd; 2113 2114 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 2115 | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { 2116 vpx_memset(output, 0, 16 * sizeof(*output)); 2117 return; 2118 } 2119 2120 // stage 1 2121 s0 = x0 * cospi_1_64 + x1 * cospi_31_64; 2122 s1 = x0 * cospi_31_64 - x1 * cospi_1_64; 2123 s2 = x2 * cospi_5_64 + x3 * cospi_27_64; 2124 s3 = x2 * cospi_27_64 - x3 * cospi_5_64; 2125 s4 = x4 * cospi_9_64 + x5 * cospi_23_64; 2126 s5 = x4 * cospi_23_64 - x5 * cospi_9_64; 2127 s6 = x6 * cospi_13_64 + x7 * cospi_19_64; 2128 s7 = x6 * cospi_19_64 - x7 * cospi_13_64; 2129 s8 = x8 * cospi_17_64 + x9 * cospi_15_64; 2130 s9 = x8 * cospi_15_64 - x9 * cospi_17_64; 2131 s10 = x10 * cospi_21_64 + x11 * cospi_11_64; 2132 s11 = x10 * cospi_11_64 - x11 * cospi_21_64; 2133 s12 = x12 * cospi_25_64 + x13 * cospi_7_64; 2134 s13 = x12 * cospi_7_64 - x13 * cospi_25_64; 2135 s14 = x14 * cospi_29_64 + x15 * cospi_3_64; 2136 s15 = x14 * cospi_3_64 - x15 * cospi_29_64; 2137 2138 x0 = WRAPLOW(dct_const_round_shift(s0 + s8)); 2139 x1 = WRAPLOW(dct_const_round_shift(s1 + s9)); 2140 x2 = WRAPLOW(dct_const_round_shift(s2 + s10)); 2141 x3 = WRAPLOW(dct_const_round_shift(s3 + s11)); 2142 x4 = WRAPLOW(dct_const_round_shift(s4 + s12)); 2143 x5 = WRAPLOW(dct_const_round_shift(s5 + s13)); 2144 x6 = WRAPLOW(dct_const_round_shift(s6 + s14)); 2145 x7 = WRAPLOW(dct_const_round_shift(s7 + s15)); 2146 x8 = WRAPLOW(dct_const_round_shift(s0 - s8)); 2147 x9 = WRAPLOW(dct_const_round_shift(s1 - s9)); 2148 x10 = WRAPLOW(dct_const_round_shift(s2 - s10)); 2149 x11 = WRAPLOW(dct_const_round_shift(s3 - s11)); 2150 x12 = WRAPLOW(dct_const_round_shift(s4 - s12)); 2151 x13 = WRAPLOW(dct_const_round_shift(s5 - s13)); 2152 x14 = WRAPLOW(dct_const_round_shift(s6 - s14)); 2153 x15 = WRAPLOW(dct_const_round_shift(s7 - s15)); 2154 2155 // stage 2 2156 s0 = x0; 2157 s1 = x1; 2158 s2 = x2; 2159 s3 = x3; 2160 s4 = x4; 2161 s5 = x5; 2162 s6 = x6; 2163 s7 = x7; 2164 s8 = x8 * cospi_4_64 + x9 * cospi_28_64; 2165 s9 = x8 * cospi_28_64 - x9 * cospi_4_64; 2166 s10 = x10 * cospi_20_64 + x11 * cospi_12_64; 2167 s11 = x10 * cospi_12_64 - x11 * cospi_20_64; 2168 s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; 2169 s13 = x12 * cospi_4_64 + x13 * cospi_28_64; 2170 s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; 2171 s15 = x14 * cospi_20_64 + x15 * cospi_12_64; 2172 2173 x0 = WRAPLOW(s0 + s4); 2174 x1 = WRAPLOW(s1 + s5); 2175 x2 = WRAPLOW(s2 + s6); 2176 x3 = WRAPLOW(s3 + s7); 2177 x4 = WRAPLOW(s0 - s4); 2178 x5 = WRAPLOW(s1 - s5); 2179 x6 = WRAPLOW(s2 - s6); 2180 x7 = WRAPLOW(s3 - s7); 2181 x8 = WRAPLOW(dct_const_round_shift(s8 + s12)); 2182 x9 = WRAPLOW(dct_const_round_shift(s9 + s13)); 2183 x10 = WRAPLOW(dct_const_round_shift(s10 + s14)); 2184 x11 = WRAPLOW(dct_const_round_shift(s11 + s15)); 2185 x12 = WRAPLOW(dct_const_round_shift(s8 - s12)); 2186 x13 = WRAPLOW(dct_const_round_shift(s9 - s13)); 2187 x14 = WRAPLOW(dct_const_round_shift(s10 - s14)); 2188 x15 = WRAPLOW(dct_const_round_shift(s11 - s15)); 2189 2190 // stage 3 2191 s0 = x0; 2192 s1 = x1; 2193 s2 = x2; 2194 s3 = x3; 2195 s4 = x4 * cospi_8_64 + x5 * cospi_24_64; 2196 s5 = x4 * cospi_24_64 - x5 * cospi_8_64; 2197 s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; 2198 s7 = x6 * cospi_8_64 + x7 * cospi_24_64; 2199 s8 = x8; 2200 s9 = x9; 2201 s10 = x10; 2202 s11 = x11; 2203 s12 = x12 * cospi_8_64 + x13 * cospi_24_64; 2204 s13 = x12 * cospi_24_64 - x13 * cospi_8_64; 2205 s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; 2206 s15 = x14 * cospi_8_64 + x15 * cospi_24_64; 2207 2208 x0 = WRAPLOW(s0 + s2); 2209 x1 = WRAPLOW(s1 + s3); 2210 x2 = WRAPLOW(s0 - s2); 2211 x3 = WRAPLOW(s1 - s3); 2212 x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); 2213 x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); 2214 x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); 2215 x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); 2216 x8 = WRAPLOW(s8 + s10); 2217 x9 = WRAPLOW(s9 + s11); 2218 x10 = WRAPLOW(s8 - s10); 2219 x11 = WRAPLOW(s9 - s11); 2220 x12 = WRAPLOW(dct_const_round_shift(s12 + s14)); 2221 x13 = WRAPLOW(dct_const_round_shift(s13 + s15)); 2222 x14 = WRAPLOW(dct_const_round_shift(s12 - s14)); 2223 x15 = WRAPLOW(dct_const_round_shift(s13 - s15)); 2224 2225 // stage 4 2226 s2 = (- cospi_16_64) * (x2 + x3); 2227 s3 = cospi_16_64 * (x2 - x3); 2228 s6 = cospi_16_64 * (x6 + x7); 2229 s7 = cospi_16_64 * (-x6 + x7); 2230 s10 = cospi_16_64 * (x10 + x11); 2231 s11 = cospi_16_64 * (-x10 + x11); 2232 s14 = (- cospi_16_64) * (x14 + x15); 2233 s15 = cospi_16_64 * (x14 - x15); 2234 2235 x2 = WRAPLOW(dct_const_round_shift(s2)); 2236 x3 = WRAPLOW(dct_const_round_shift(s3)); 2237 x6 = WRAPLOW(dct_const_round_shift(s6)); 2238 x7 = WRAPLOW(dct_const_round_shift(s7)); 2239 x10 = WRAPLOW(dct_const_round_shift(s10)); 2240 x11 = WRAPLOW(dct_const_round_shift(s11)); 2241 x14 = WRAPLOW(dct_const_round_shift(s14)); 2242 x15 = WRAPLOW(dct_const_round_shift(s15)); 2243 2244 output[0] = WRAPLOW(x0); 2245 output[1] = WRAPLOW(-x8); 2246 output[2] = WRAPLOW(x12); 2247 output[3] = WRAPLOW(-x4); 2248 output[4] = WRAPLOW(x6); 2249 output[5] = WRAPLOW(x14); 2250 output[6] = WRAPLOW(x10); 2251 output[7] = WRAPLOW(x2); 2252 output[8] = WRAPLOW(x3); 2253 output[9] = WRAPLOW(x11); 2254 output[10] = WRAPLOW(x15); 2255 output[11] = WRAPLOW(x7); 2256 output[12] = WRAPLOW(x5); 2257 output[13] = WRAPLOW(-x13); 2258 output[14] = WRAPLOW(x9); 2259 output[15] = WRAPLOW(-x1); 2260 } 2261 2262 static const high_transform_2d HIGH_IHT_16[] = { 2263 { high_idct16, high_idct16 }, // DCT_DCT = 0 2264 { high_iadst16, high_idct16 }, // ADST_DCT = 1 2265 { high_idct16, high_iadst16 }, // DCT_ADST = 2 2266 { high_iadst16, high_iadst16 } // ADST_ADST = 3 2267 }; 2268 2269 void vp9_high_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, 2270 int stride, int tx_type, int bd) { 2271 int i, j; 2272 tran_low_t out[16 * 16]; 2273 tran_low_t *outptr = out; 2274 tran_low_t temp_in[16], temp_out[16]; 2275 const high_transform_2d ht = HIGH_IHT_16[tx_type]; 2276 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 2277 2278 // Rows 2279 for (i = 0; i < 16; ++i) { 2280 ht.rows(input, outptr, bd); 2281 input += 16; 2282 outptr += 16; 2283 } 2284 2285 // Columns 2286 for (i = 0; i < 16; ++i) { 2287 for (j = 0; j < 16; ++j) 2288 temp_in[j] = out[j * 16 + i]; 2289 ht.cols(temp_in, temp_out, bd); 2290 for (j = 0; j < 16; ++j) 2291 dest[j * stride + i] = clip_pixel_bd_high( 2292 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 2293 } 2294 } 2295 2296 void vp9_high_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, 2297 int stride, int bd) { 2298 tran_low_t out[16 * 16] = { 0 }; 2299 tran_low_t *outptr = out; 2300 int i, j; 2301 tran_low_t temp_in[16], temp_out[16]; 2302 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 2303 2304 // First transform rows. Since all non-zero dct coefficients are in 2305 // upper-left 4x4 area, we only need to calculate first 4 rows here. 2306 for (i = 0; i < 4; ++i) { 2307 high_idct16(input, outptr, bd); 2308 input += 16; 2309 outptr += 16; 2310 } 2311 2312 // Then transform columns. 2313 for (i = 0; i < 16; ++i) { 2314 for (j = 0; j < 16; ++j) 2315 temp_in[j] = out[j*16 + i]; 2316 high_idct16(temp_in, temp_out, bd); 2317 for (j = 0; j < 16; ++j) 2318 dest[j * stride + i] = clip_pixel_bd_high( 2319 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 2320 } 2321 } 2322 2323 void vp9_high_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8, 2324 int stride, int bd) { 2325 int i, j; 2326 tran_high_t a1; 2327 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); 2328 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 2329 2330 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); 2331 a1 = ROUND_POWER_OF_TWO(out, 6); 2332 for (j = 0; j < 16; ++j) { 2333 for (i = 0; i < 16; ++i) 2334 dest[i] = clip_pixel_bd_high(dest[i], a1, bd); 2335 dest += stride; 2336 } 2337 } 2338 2339 static void high_idct32(const tran_low_t *input, tran_low_t *output, int bd) { 2340 tran_low_t step1[32], step2[32]; 2341 tran_high_t temp1, temp2; 2342 (void) bd; 2343 2344 // stage 1 2345 step1[0] = input[0]; 2346 step1[1] = input[16]; 2347 step1[2] = input[8]; 2348 step1[3] = input[24]; 2349 step1[4] = input[4]; 2350 step1[5] = input[20]; 2351 step1[6] = input[12]; 2352 step1[7] = input[28]; 2353 step1[8] = input[2]; 2354 step1[9] = input[18]; 2355 step1[10] = input[10]; 2356 step1[11] = input[26]; 2357 step1[12] = input[6]; 2358 step1[13] = input[22]; 2359 step1[14] = input[14]; 2360 step1[15] = input[30]; 2361 2362 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; 2363 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; 2364 step1[16] = WRAPLOW(dct_const_round_shift(temp1)); 2365 step1[31] = WRAPLOW(dct_const_round_shift(temp2)); 2366 2367 temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; 2368 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; 2369 step1[17] = WRAPLOW(dct_const_round_shift(temp1)); 2370 step1[30] = WRAPLOW(dct_const_round_shift(temp2)); 2371 2372 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; 2373 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; 2374 step1[18] = WRAPLOW(dct_const_round_shift(temp1)); 2375 step1[29] = WRAPLOW(dct_const_round_shift(temp2)); 2376 2377 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; 2378 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; 2379 step1[19] = WRAPLOW(dct_const_round_shift(temp1)); 2380 step1[28] = WRAPLOW(dct_const_round_shift(temp2)); 2381 2382 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; 2383 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; 2384 step1[20] = WRAPLOW(dct_const_round_shift(temp1)); 2385 step1[27] = WRAPLOW(dct_const_round_shift(temp2)); 2386 2387 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; 2388 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; 2389 step1[21] = WRAPLOW(dct_const_round_shift(temp1)); 2390 step1[26] = WRAPLOW(dct_const_round_shift(temp2)); 2391 2392 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; 2393 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; 2394 step1[22] = WRAPLOW(dct_const_round_shift(temp1)); 2395 step1[25] = WRAPLOW(dct_const_round_shift(temp2)); 2396 2397 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; 2398 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; 2399 step1[23] = WRAPLOW(dct_const_round_shift(temp1)); 2400 step1[24] = WRAPLOW(dct_const_round_shift(temp2)); 2401 2402 // stage 2 2403 step2[0] = step1[0]; 2404 step2[1] = step1[1]; 2405 step2[2] = step1[2]; 2406 step2[3] = step1[3]; 2407 step2[4] = step1[4]; 2408 step2[5] = step1[5]; 2409 step2[6] = step1[6]; 2410 step2[7] = step1[7]; 2411 2412 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; 2413 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; 2414 step2[8] = WRAPLOW(dct_const_round_shift(temp1)); 2415 step2[15] = WRAPLOW(dct_const_round_shift(temp2)); 2416 2417 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; 2418 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; 2419 step2[9] = WRAPLOW(dct_const_round_shift(temp1)); 2420 step2[14] = WRAPLOW(dct_const_round_shift(temp2)); 2421 2422 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; 2423 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; 2424 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); 2425 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); 2426 2427 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; 2428 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; 2429 step2[11] = WRAPLOW(dct_const_round_shift(temp1)); 2430 step2[12] = WRAPLOW(dct_const_round_shift(temp2)); 2431 2432 step2[16] = WRAPLOW(step1[16] + step1[17]); 2433 step2[17] = WRAPLOW(step1[16] - step1[17]); 2434 step2[18] = WRAPLOW(-step1[18] + step1[19]); 2435 step2[19] = WRAPLOW(step1[18] + step1[19]); 2436 step2[20] = WRAPLOW(step1[20] + step1[21]); 2437 step2[21] = WRAPLOW(step1[20] - step1[21]); 2438 step2[22] = WRAPLOW(-step1[22] + step1[23]); 2439 step2[23] = WRAPLOW(step1[22] + step1[23]); 2440 step2[24] = WRAPLOW(step1[24] + step1[25]); 2441 step2[25] = WRAPLOW(step1[24] - step1[25]); 2442 step2[26] = WRAPLOW(-step1[26] + step1[27]); 2443 step2[27] = WRAPLOW(step1[26] + step1[27]); 2444 step2[28] = WRAPLOW(step1[28] + step1[29]); 2445 step2[29] = WRAPLOW(step1[28] - step1[29]); 2446 step2[30] = WRAPLOW(-step1[30] + step1[31]); 2447 step2[31] = WRAPLOW(step1[30] + step1[31]); 2448 2449 // stage 3 2450 step1[0] = step2[0]; 2451 step1[1] = step2[1]; 2452 step1[2] = step2[2]; 2453 step1[3] = step2[3]; 2454 2455 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; 2456 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; 2457 step1[4] = WRAPLOW(dct_const_round_shift(temp1)); 2458 step1[7] = WRAPLOW(dct_const_round_shift(temp2)); 2459 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; 2460 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; 2461 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); 2462 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); 2463 2464 step1[8] = WRAPLOW(step2[8] + step2[9]); 2465 step1[9] = WRAPLOW(step2[8] - step2[9]); 2466 step1[10] = WRAPLOW(-step2[10] + step2[11]); 2467 step1[11] = WRAPLOW(step2[10] + step2[11]); 2468 step1[12] = WRAPLOW(step2[12] + step2[13]); 2469 step1[13] = WRAPLOW(step2[12] - step2[13]); 2470 step1[14] = WRAPLOW(-step2[14] + step2[15]); 2471 step1[15] = WRAPLOW(step2[14] + step2[15]); 2472 2473 step1[16] = step2[16]; 2474 step1[31] = step2[31]; 2475 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; 2476 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; 2477 step1[17] = WRAPLOW(dct_const_round_shift(temp1)); 2478 step1[30] = WRAPLOW(dct_const_round_shift(temp2)); 2479 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; 2480 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; 2481 step1[18] = WRAPLOW(dct_const_round_shift(temp1)); 2482 step1[29] = WRAPLOW(dct_const_round_shift(temp2)); 2483 step1[19] = step2[19]; 2484 step1[20] = step2[20]; 2485 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; 2486 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; 2487 step1[21] = WRAPLOW(dct_const_round_shift(temp1)); 2488 step1[26] = WRAPLOW(dct_const_round_shift(temp2)); 2489 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; 2490 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; 2491 step1[22] = WRAPLOW(dct_const_round_shift(temp1)); 2492 step1[25] = WRAPLOW(dct_const_round_shift(temp2)); 2493 step1[23] = step2[23]; 2494 step1[24] = step2[24]; 2495 step1[27] = step2[27]; 2496 step1[28] = step2[28]; 2497 2498 // stage 4 2499 temp1 = (step1[0] + step1[1]) * cospi_16_64; 2500 temp2 = (step1[0] - step1[1]) * cospi_16_64; 2501 step2[0] = WRAPLOW(dct_const_round_shift(temp1)); 2502 step2[1] = WRAPLOW(dct_const_round_shift(temp2)); 2503 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; 2504 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; 2505 step2[2] = WRAPLOW(dct_const_round_shift(temp1)); 2506 step2[3] = WRAPLOW(dct_const_round_shift(temp2)); 2507 step2[4] = WRAPLOW(step1[4] + step1[5]); 2508 step2[5] = WRAPLOW(step1[4] - step1[5]); 2509 step2[6] = WRAPLOW(-step1[6] + step1[7]); 2510 step2[7] = WRAPLOW(step1[6] + step1[7]); 2511 2512 step2[8] = step1[8]; 2513 step2[15] = step1[15]; 2514 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; 2515 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; 2516 step2[9] = WRAPLOW(dct_const_round_shift(temp1)); 2517 step2[14] = WRAPLOW(dct_const_round_shift(temp2)); 2518 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; 2519 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; 2520 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); 2521 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); 2522 step2[11] = step1[11]; 2523 step2[12] = step1[12]; 2524 2525 step2[16] = WRAPLOW(step1[16] + step1[19]); 2526 step2[17] = WRAPLOW(step1[17] + step1[18]); 2527 step2[18] = WRAPLOW(step1[17] - step1[18]); 2528 step2[19] = WRAPLOW(step1[16] - step1[19]); 2529 step2[20] = WRAPLOW(-step1[20] + step1[23]); 2530 step2[21] = WRAPLOW(-step1[21] + step1[22]); 2531 step2[22] = WRAPLOW(step1[21] + step1[22]); 2532 step2[23] = WRAPLOW(step1[20] + step1[23]); 2533 2534 step2[24] = WRAPLOW(step1[24] + step1[27]); 2535 step2[25] = WRAPLOW(step1[25] + step1[26]); 2536 step2[26] = WRAPLOW(step1[25] - step1[26]); 2537 step2[27] = WRAPLOW(step1[24] - step1[27]); 2538 step2[28] = WRAPLOW(-step1[28] + step1[31]); 2539 step2[29] = WRAPLOW(-step1[29] + step1[30]); 2540 step2[30] = WRAPLOW(step1[29] + step1[30]); 2541 step2[31] = WRAPLOW(step1[28] + step1[31]); 2542 2543 // stage 5 2544 step1[0] = WRAPLOW(step2[0] + step2[3]); 2545 step1[1] = WRAPLOW(step2[1] + step2[2]); 2546 step1[2] = WRAPLOW(step2[1] - step2[2]); 2547 step1[3] = WRAPLOW(step2[0] - step2[3]); 2548 step1[4] = step2[4]; 2549 temp1 = (step2[6] - step2[5]) * cospi_16_64; 2550 temp2 = (step2[5] + step2[6]) * cospi_16_64; 2551 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); 2552 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); 2553 step1[7] = step2[7]; 2554 2555 step1[8] = WRAPLOW(step2[8] + step2[11]); 2556 step1[9] = WRAPLOW(step2[9] + step2[10]); 2557 step1[10] = WRAPLOW(step2[9] - step2[10]); 2558 step1[11] = WRAPLOW(step2[8] - step2[11]); 2559 step1[12] = WRAPLOW(-step2[12] + step2[15]); 2560 step1[13] = WRAPLOW(-step2[13] + step2[14]); 2561 step1[14] = WRAPLOW(step2[13] + step2[14]); 2562 step1[15] = WRAPLOW(step2[12] + step2[15]); 2563 2564 step1[16] = step2[16]; 2565 step1[17] = step2[17]; 2566 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; 2567 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; 2568 step1[18] = WRAPLOW(dct_const_round_shift(temp1)); 2569 step1[29] = WRAPLOW(dct_const_round_shift(temp2)); 2570 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; 2571 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; 2572 step1[19] = WRAPLOW(dct_const_round_shift(temp1)); 2573 step1[28] = WRAPLOW(dct_const_round_shift(temp2)); 2574 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; 2575 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; 2576 step1[20] = WRAPLOW(dct_const_round_shift(temp1)); 2577 step1[27] = WRAPLOW(dct_const_round_shift(temp2)); 2578 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; 2579 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; 2580 step1[21] = WRAPLOW(dct_const_round_shift(temp1)); 2581 step1[26] = WRAPLOW(dct_const_round_shift(temp2)); 2582 step1[22] = step2[22]; 2583 step1[23] = step2[23]; 2584 step1[24] = step2[24]; 2585 step1[25] = step2[25]; 2586 step1[30] = step2[30]; 2587 step1[31] = step2[31]; 2588 2589 // stage 6 2590 step2[0] = WRAPLOW(step1[0] + step1[7]); 2591 step2[1] = WRAPLOW(step1[1] + step1[6]); 2592 step2[2] = WRAPLOW(step1[2] + step1[5]); 2593 step2[3] = WRAPLOW(step1[3] + step1[4]); 2594 step2[4] = WRAPLOW(step1[3] - step1[4]); 2595 step2[5] = WRAPLOW(step1[2] - step1[5]); 2596 step2[6] = WRAPLOW(step1[1] - step1[6]); 2597 step2[7] = WRAPLOW(step1[0] - step1[7]); 2598 step2[8] = step1[8]; 2599 step2[9] = step1[9]; 2600 temp1 = (-step1[10] + step1[13]) * cospi_16_64; 2601 temp2 = (step1[10] + step1[13]) * cospi_16_64; 2602 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); 2603 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); 2604 temp1 = (-step1[11] + step1[12]) * cospi_16_64; 2605 temp2 = (step1[11] + step1[12]) * cospi_16_64; 2606 step2[11] = WRAPLOW(dct_const_round_shift(temp1)); 2607 step2[12] = WRAPLOW(dct_const_round_shift(temp2)); 2608 step2[14] = WRAPLOW(step1[14]); 2609 step2[15] = WRAPLOW(step1[15]); 2610 2611 step2[16] = WRAPLOW(step1[16] + step1[23]); 2612 step2[17] = WRAPLOW(step1[17] + step1[22]); 2613 step2[18] = WRAPLOW(step1[18] + step1[21]); 2614 step2[19] = WRAPLOW(step1[19] + step1[20]); 2615 step2[20] = WRAPLOW(step1[19] - step1[20]); 2616 step2[21] = WRAPLOW(step1[18] - step1[21]); 2617 step2[22] = WRAPLOW(step1[17] - step1[22]); 2618 step2[23] = WRAPLOW(step1[16] - step1[23]); 2619 2620 step2[24] = WRAPLOW(-step1[24] + step1[31]); 2621 step2[25] = WRAPLOW(-step1[25] + step1[30]); 2622 step2[26] = WRAPLOW(-step1[26] + step1[29]); 2623 step2[27] = WRAPLOW(-step1[27] + step1[28]); 2624 step2[28] = WRAPLOW(step1[27] + step1[28]); 2625 step2[29] = WRAPLOW(step1[26] + step1[29]); 2626 step2[30] = WRAPLOW(step1[25] + step1[30]); 2627 step2[31] = WRAPLOW(step1[24] + step1[31]); 2628 2629 // stage 7 2630 step1[0] = WRAPLOW(step2[0] + step2[15]); 2631 step1[1] = WRAPLOW(step2[1] + step2[14]); 2632 step1[2] = WRAPLOW(step2[2] + step2[13]); 2633 step1[3] = WRAPLOW(step2[3] + step2[12]); 2634 step1[4] = WRAPLOW(step2[4] + step2[11]); 2635 step1[5] = WRAPLOW(step2[5] + step2[10]); 2636 step1[6] = WRAPLOW(step2[6] + step2[9]); 2637 step1[7] = WRAPLOW(step2[7] + step2[8]); 2638 step1[8] = WRAPLOW(step2[7] - step2[8]); 2639 step1[9] = WRAPLOW(step2[6] - step2[9]); 2640 step1[10] = WRAPLOW(step2[5] - step2[10]); 2641 step1[11] = WRAPLOW(step2[4] - step2[11]); 2642 step1[12] = WRAPLOW(step2[3] - step2[12]); 2643 step1[13] = WRAPLOW(step2[2] - step2[13]); 2644 step1[14] = WRAPLOW(step2[1] - step2[14]); 2645 step1[15] = WRAPLOW(step2[0] - step2[15]); 2646 2647 step1[16] = step2[16]; 2648 step1[17] = step2[17]; 2649 step1[18] = step2[18]; 2650 step1[19] = step2[19]; 2651 temp1 = (-step2[20] + step2[27]) * cospi_16_64; 2652 temp2 = (step2[20] + step2[27]) * cospi_16_64; 2653 step1[20] = WRAPLOW(dct_const_round_shift(temp1)); 2654 step1[27] = WRAPLOW(dct_const_round_shift(temp2)); 2655 temp1 = (-step2[21] + step2[26]) * cospi_16_64; 2656 temp2 = (step2[21] + step2[26]) * cospi_16_64; 2657 step1[21] = WRAPLOW(dct_const_round_shift(temp1)); 2658 step1[26] = WRAPLOW(dct_const_round_shift(temp2)); 2659 temp1 = (-step2[22] + step2[25]) * cospi_16_64; 2660 temp2 = (step2[22] + step2[25]) * cospi_16_64; 2661 step1[22] = WRAPLOW(dct_const_round_shift(temp1)); 2662 step1[25] = WRAPLOW(dct_const_round_shift(temp2)); 2663 temp1 = (-step2[23] + step2[24]) * cospi_16_64; 2664 temp2 = (step2[23] + step2[24]) * cospi_16_64; 2665 step1[23] = WRAPLOW(dct_const_round_shift(temp1)); 2666 step1[24] = WRAPLOW(dct_const_round_shift(temp2)); 2667 step1[28] = step2[28]; 2668 step1[29] = step2[29]; 2669 step1[30] = step2[30]; 2670 step1[31] = step2[31]; 2671 2672 // final stage 2673 output[0] = WRAPLOW(step1[0] + step1[31]); 2674 output[1] = WRAPLOW(step1[1] + step1[30]); 2675 output[2] = WRAPLOW(step1[2] + step1[29]); 2676 output[3] = WRAPLOW(step1[3] + step1[28]); 2677 output[4] = WRAPLOW(step1[4] + step1[27]); 2678 output[5] = WRAPLOW(step1[5] + step1[26]); 2679 output[6] = WRAPLOW(step1[6] + step1[25]); 2680 output[7] = WRAPLOW(step1[7] + step1[24]); 2681 output[8] = WRAPLOW(step1[8] + step1[23]); 2682 output[9] = WRAPLOW(step1[9] + step1[22]); 2683 output[10] = WRAPLOW(step1[10] + step1[21]); 2684 output[11] = WRAPLOW(step1[11] + step1[20]); 2685 output[12] = WRAPLOW(step1[12] + step1[19]); 2686 output[13] = WRAPLOW(step1[13] + step1[18]); 2687 output[14] = WRAPLOW(step1[14] + step1[17]); 2688 output[15] = WRAPLOW(step1[15] + step1[16]); 2689 output[16] = WRAPLOW(step1[15] - step1[16]); 2690 output[17] = WRAPLOW(step1[14] - step1[17]); 2691 output[18] = WRAPLOW(step1[13] - step1[18]); 2692 output[19] = WRAPLOW(step1[12] - step1[19]); 2693 output[20] = WRAPLOW(step1[11] - step1[20]); 2694 output[21] = WRAPLOW(step1[10] - step1[21]); 2695 output[22] = WRAPLOW(step1[9] - step1[22]); 2696 output[23] = WRAPLOW(step1[8] - step1[23]); 2697 output[24] = WRAPLOW(step1[7] - step1[24]); 2698 output[25] = WRAPLOW(step1[6] - step1[25]); 2699 output[26] = WRAPLOW(step1[5] - step1[26]); 2700 output[27] = WRAPLOW(step1[4] - step1[27]); 2701 output[28] = WRAPLOW(step1[3] - step1[28]); 2702 output[29] = WRAPLOW(step1[2] - step1[29]); 2703 output[30] = WRAPLOW(step1[1] - step1[30]); 2704 output[31] = WRAPLOW(step1[0] - step1[31]); 2705 } 2706 2707 void vp9_high_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, 2708 int stride, int bd) { 2709 tran_low_t out[32 * 32]; 2710 tran_low_t *outptr = out; 2711 int i, j; 2712 tran_low_t temp_in[32], temp_out[32]; 2713 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 2714 2715 // Rows 2716 for (i = 0; i < 32; ++i) { 2717 tran_low_t zero_coeff[16]; 2718 for (j = 0; j < 16; ++j) 2719 zero_coeff[j] = input[2 * j] | input[2 * j + 1]; 2720 for (j = 0; j < 8; ++j) 2721 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; 2722 for (j = 0; j < 4; ++j) 2723 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; 2724 for (j = 0; j < 2; ++j) 2725 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; 2726 2727 if (zero_coeff[0] | zero_coeff[1]) 2728 high_idct32(input, outptr, bd); 2729 else 2730 vpx_memset(outptr, 0, sizeof(tran_low_t) * 32); 2731 input += 32; 2732 outptr += 32; 2733 } 2734 2735 // Columns 2736 for (i = 0; i < 32; ++i) { 2737 for (j = 0; j < 32; ++j) 2738 temp_in[j] = out[j * 32 + i]; 2739 high_idct32(temp_in, temp_out, bd); 2740 for (j = 0; j < 32; ++j) 2741 dest[j * stride + i] = clip_pixel_bd_high( 2742 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 2743 } 2744 } 2745 2746 void vp9_high_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, 2747 int stride, int bd) { 2748 tran_low_t out[32 * 32] = {0}; 2749 tran_low_t *outptr = out; 2750 int i, j; 2751 tran_low_t temp_in[32], temp_out[32]; 2752 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 2753 2754 // Rows 2755 // Only upper-left 8x8 has non-zero coeff. 2756 for (i = 0; i < 8; ++i) { 2757 high_idct32(input, outptr, bd); 2758 input += 32; 2759 outptr += 32; 2760 } 2761 // Columns 2762 for (i = 0; i < 32; ++i) { 2763 for (j = 0; j < 32; ++j) 2764 temp_in[j] = out[j * 32 + i]; 2765 high_idct32(temp_in, temp_out, bd); 2766 for (j = 0; j < 32; ++j) 2767 dest[j * stride + i] = clip_pixel_bd_high( 2768 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 2769 } 2770 } 2771 2772 void vp9_high_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, 2773 int stride, int bd) { 2774 int i, j; 2775 int a1; 2776 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); 2777 2778 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); 2779 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); 2780 a1 = ROUND_POWER_OF_TWO(out, 6); 2781 2782 for (j = 0; j < 32; ++j) { 2783 for (i = 0; i < 32; ++i) 2784 dest[i] = clip_pixel_bd_high(dest[i], a1, bd); 2785 dest += stride; 2786 } 2787 } 2788 2789 // idct 2790 void vp9_high_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride, 2791 int eob, int bd) { 2792 if (eob > 1) 2793 vp9_high_idct4x4_16_add(input, dest, stride, bd); 2794 else 2795 vp9_high_idct4x4_1_add(input, dest, stride, bd); 2796 } 2797 2798 2799 void vp9_high_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride, 2800 int eob, int bd) { 2801 if (eob > 1) 2802 vp9_high_iwht4x4_16_add(input, dest, stride, bd); 2803 else 2804 vp9_high_iwht4x4_1_add(input, dest, stride, bd); 2805 } 2806 2807 void vp9_high_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, 2808 int eob, int bd) { 2809 // If dc is 1, then input[0] is the reconstructed value, do not need 2810 // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. 2811 2812 // The calculation can be simplified if there are not many non-zero dct 2813 // coefficients. Use eobs to decide what to do. 2814 // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c. 2815 // Combine that with code here. 2816 // DC only DCT coefficient 2817 if (eob == 1) { 2818 vp9_high_idct8x8_1_add(input, dest, stride, bd); 2819 } else if (eob <= 10) { 2820 vp9_high_idct8x8_10_add(input, dest, stride, bd); 2821 } else { 2822 vp9_high_idct8x8_64_add(input, dest, stride, bd); 2823 } 2824 } 2825 2826 void vp9_high_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, 2827 int eob, int bd) { 2828 // The calculation can be simplified if there are not many non-zero dct 2829 // coefficients. Use eobs to separate different cases. 2830 // DC only DCT coefficient. 2831 if (eob == 1) { 2832 vp9_high_idct16x16_1_add(input, dest, stride, bd); 2833 } else if (eob <= 10) { 2834 vp9_high_idct16x16_10_add(input, dest, stride, bd); 2835 } else { 2836 vp9_high_idct16x16_256_add(input, dest, stride, bd); 2837 } 2838 } 2839 2840 void vp9_high_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride, 2841 int eob, int bd) { 2842 // Non-zero coeff only in upper-left 8x8 2843 if (eob == 1) { 2844 vp9_high_idct32x32_1_add(input, dest, stride, bd); 2845 } else if (eob <= 34) { 2846 vp9_high_idct32x32_34_add(input, dest, stride, bd); 2847 } else { 2848 vp9_high_idct32x32_1024_add(input, dest, stride, bd); 2849 } 2850 } 2851 2852 // iht 2853 void vp9_high_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, 2854 uint8_t *dest, int stride, int eob, int bd) { 2855 if (tx_type == DCT_DCT) 2856 vp9_high_idct4x4_add(input, dest, stride, eob, bd); 2857 else 2858 vp9_high_iht4x4_16_add(input, dest, stride, tx_type, bd); 2859 } 2860 2861 void vp9_high_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, 2862 uint8_t *dest, int stride, int eob, int bd) { 2863 if (tx_type == DCT_DCT) { 2864 vp9_high_idct8x8_add(input, dest, stride, eob, bd); 2865 } else { 2866 vp9_high_iht8x8_64_add(input, dest, stride, tx_type, bd); 2867 } 2868 } 2869 2870 void vp9_high_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, 2871 uint8_t *dest, int stride, int eob, int bd) { 2872 if (tx_type == DCT_DCT) { 2873 vp9_high_idct16x16_add(input, dest, stride, eob, bd); 2874 } else { 2875 vp9_high_iht16x16_256_add(input, dest, stride, tx_type, bd); 2876 } 2877 } 2878 #endif // CONFIG_VP9_HIGHBITDEPTH 2879