1 /* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <assert.h> 12 #include <math.h> 13 14 #include "./vpx_config.h" 15 #include "./vp9_rtcd.h" 16 #include "vp9/common/vp9_systemdependent.h" 17 #include "vp9/common/vp9_blockd.h" 18 #include "vp9/common/vp9_common.h" 19 #include "vp9/common/vp9_idct.h" 20 21 void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) { 22 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 23 0.5 shifts per pixel. */ 24 int i; 25 int16_t output[16]; 26 int a1, b1, c1, d1, e1; 27 const int16_t *ip = input; 28 int16_t *op = output; 29 30 for (i = 0; i < 4; i++) { 31 a1 = ip[0] >> UNIT_QUANT_SHIFT; 32 c1 = ip[1] >> UNIT_QUANT_SHIFT; 33 d1 = ip[2] >> UNIT_QUANT_SHIFT; 34 b1 = ip[3] >> UNIT_QUANT_SHIFT; 35 a1 += c1; 36 d1 -= b1; 37 e1 = (a1 - d1) >> 1; 38 b1 = e1 - b1; 39 c1 = e1 - c1; 40 a1 -= b1; 41 d1 += c1; 42 op[0] = a1; 43 op[1] = b1; 44 op[2] = c1; 45 op[3] = d1; 46 ip += 4; 47 op += 4; 48 } 49 50 ip = output; 51 for (i = 0; i < 4; i++) { 52 a1 = ip[4 * 0]; 53 c1 = ip[4 * 1]; 54 d1 = ip[4 * 2]; 55 b1 = ip[4 * 3]; 56 a1 += c1; 57 d1 -= b1; 58 e1 = (a1 - d1) >> 1; 59 b1 = e1 - b1; 60 c1 = e1 - c1; 61 a1 -= b1; 62 d1 += c1; 63 dest[stride * 0] = clip_pixel(dest[stride * 0] + a1); 64 dest[stride * 1] = clip_pixel(dest[stride * 1] + b1); 65 dest[stride * 2] = clip_pixel(dest[stride * 2] + c1); 66 dest[stride * 3] = clip_pixel(dest[stride * 3] + d1); 67 68 ip++; 69 dest++; 70 } 71 } 72 73 void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) { 74 int i; 75 int a1, e1; 76 int16_t tmp[4]; 77 const int16_t *ip = in; 78 int16_t *op = tmp; 79 80 a1 = ip[0] >> UNIT_QUANT_SHIFT; 81 e1 = a1 >> 1; 82 a1 -= e1; 83 op[0] = a1; 84 op[1] = op[2] = op[3] = e1; 85 86 ip = tmp; 87 for (i = 0; i < 4; i++) { 88 e1 = ip[0] >> 1; 89 a1 = ip[0] - e1; 90 dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1); 91 dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + e1); 92 dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + e1); 93 dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1); 94 ip++; 95 dest++; 96 } 97 } 98 99 static void idct4(const int16_t *input, int16_t *output) { 100 int16_t step[4]; 101 int temp1, temp2; 102 // stage 1 103 temp1 = (input[0] + input[2]) * cospi_16_64; 104 temp2 = (input[0] - input[2]) * cospi_16_64; 105 step[0] = dct_const_round_shift(temp1); 106 step[1] = dct_const_round_shift(temp2); 107 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; 108 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; 109 step[2] = dct_const_round_shift(temp1); 110 step[3] = dct_const_round_shift(temp2); 111 112 // stage 2 113 output[0] = step[0] + step[3]; 114 output[1] = step[1] + step[2]; 115 output[2] = step[1] - step[2]; 116 output[3] = step[0] - step[3]; 117 } 118 119 void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) { 120 int16_t out[4 * 4]; 121 int16_t *outptr = out; 122 int i, j; 123 int16_t temp_in[4], temp_out[4]; 124 125 // Rows 126 for (i = 0; i < 4; ++i) { 127 idct4(input, outptr); 128 input += 4; 129 outptr += 4; 130 } 131 132 // Columns 133 for (i = 0; i < 4; ++i) { 134 for (j = 0; j < 4; ++j) 135 temp_in[j] = out[j * 4 + i]; 136 idct4(temp_in, temp_out); 137 for (j = 0; j < 4; ++j) 138 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) 139 + dest[j * stride + i]); 140 } 141 } 142 143 void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) { 144 int i; 145 int a1; 146 int16_t out = dct_const_round_shift(input[0] * cospi_16_64); 147 out = dct_const_round_shift(out * cospi_16_64); 148 a1 = ROUND_POWER_OF_TWO(out, 4); 149 150 for (i = 0; i < 4; i++) { 151 dest[0] = clip_pixel(dest[0] + a1); 152 dest[1] = clip_pixel(dest[1] + a1); 153 dest[2] = clip_pixel(dest[2] + a1); 154 dest[3] = clip_pixel(dest[3] + a1); 155 dest += dest_stride; 156 } 157 } 158 159 static void idct8(const int16_t *input, int16_t *output) { 160 int16_t step1[8], step2[8]; 161 int temp1, temp2; 162 // stage 1 163 step1[0] = input[0]; 164 step1[2] = input[4]; 165 step1[1] = input[2]; 166 step1[3] = input[6]; 167 temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; 168 temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; 169 step1[4] = dct_const_round_shift(temp1); 170 step1[7] = dct_const_round_shift(temp2); 171 temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; 172 temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; 173 step1[5] = dct_const_round_shift(temp1); 174 step1[6] = dct_const_round_shift(temp2); 175 176 // stage 2 & stage 3 - even half 177 idct4(step1, step1); 178 179 // stage 2 - odd half 180 step2[4] = step1[4] + step1[5]; 181 step2[5] = step1[4] - step1[5]; 182 step2[6] = -step1[6] + step1[7]; 183 step2[7] = step1[6] + step1[7]; 184 185 // stage 3 -odd half 186 step1[4] = step2[4]; 187 temp1 = (step2[6] - step2[5]) * cospi_16_64; 188 temp2 = (step2[5] + step2[6]) * cospi_16_64; 189 step1[5] = dct_const_round_shift(temp1); 190 step1[6] = dct_const_round_shift(temp2); 191 step1[7] = step2[7]; 192 193 // stage 4 194 output[0] = step1[0] + step1[7]; 195 output[1] = step1[1] + step1[6]; 196 output[2] = step1[2] + step1[5]; 197 output[3] = step1[3] + step1[4]; 198 output[4] = step1[3] - step1[4]; 199 output[5] = step1[2] - step1[5]; 200 output[6] = step1[1] - step1[6]; 201 output[7] = step1[0] - step1[7]; 202 } 203 204 void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) { 205 int16_t out[8 * 8]; 206 int16_t *outptr = out; 207 int i, j; 208 int16_t temp_in[8], temp_out[8]; 209 210 // First transform rows 211 for (i = 0; i < 8; ++i) { 212 idct8(input, outptr); 213 input += 8; 214 outptr += 8; 215 } 216 217 // Then transform columns 218 for (i = 0; i < 8; ++i) { 219 for (j = 0; j < 8; ++j) 220 temp_in[j] = out[j * 8 + i]; 221 idct8(temp_in, temp_out); 222 for (j = 0; j < 8; ++j) 223 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) 224 + dest[j * stride + i]); 225 } 226 } 227 228 void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) { 229 int i, j; 230 int a1; 231 int16_t out = dct_const_round_shift(input[0] * cospi_16_64); 232 out = dct_const_round_shift(out * cospi_16_64); 233 a1 = ROUND_POWER_OF_TWO(out, 5); 234 for (j = 0; j < 8; ++j) { 235 for (i = 0; i < 8; ++i) 236 dest[i] = clip_pixel(dest[i] + a1); 237 dest += stride; 238 } 239 } 240 241 static void iadst4(const int16_t *input, int16_t *output) { 242 int s0, s1, s2, s3, s4, s5, s6, s7; 243 244 int x0 = input[0]; 245 int x1 = input[1]; 246 int x2 = input[2]; 247 int x3 = input[3]; 248 249 if (!(x0 | x1 | x2 | x3)) { 250 output[0] = output[1] = output[2] = output[3] = 0; 251 return; 252 } 253 254 s0 = sinpi_1_9 * x0; 255 s1 = sinpi_2_9 * x0; 256 s2 = sinpi_3_9 * x1; 257 s3 = sinpi_4_9 * x2; 258 s4 = sinpi_1_9 * x2; 259 s5 = sinpi_2_9 * x3; 260 s6 = sinpi_4_9 * x3; 261 s7 = x0 - x2 + x3; 262 263 x0 = s0 + s3 + s5; 264 x1 = s1 - s4 - s6; 265 x2 = sinpi_3_9 * s7; 266 x3 = s2; 267 268 s0 = x0 + x3; 269 s1 = x1 + x3; 270 s2 = x2; 271 s3 = x0 + x1 - x3; 272 273 // 1-D transform scaling factor is sqrt(2). 274 // The overall dynamic range is 14b (input) + 14b (multiplication scaling) 275 // + 1b (addition) = 29b. 276 // Hence the output bit depth is 15b. 277 output[0] = dct_const_round_shift(s0); 278 output[1] = dct_const_round_shift(s1); 279 output[2] = dct_const_round_shift(s2); 280 output[3] = dct_const_round_shift(s3); 281 } 282 283 void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride, 284 int tx_type) { 285 const transform_2d IHT_4[] = { 286 { idct4, idct4 }, // DCT_DCT = 0 287 { iadst4, idct4 }, // ADST_DCT = 1 288 { idct4, iadst4 }, // DCT_ADST = 2 289 { iadst4, iadst4 } // ADST_ADST = 3 290 }; 291 292 int i, j; 293 int16_t out[4 * 4]; 294 int16_t *outptr = out; 295 int16_t temp_in[4], temp_out[4]; 296 297 // inverse transform row vectors 298 for (i = 0; i < 4; ++i) { 299 IHT_4[tx_type].rows(input, outptr); 300 input += 4; 301 outptr += 4; 302 } 303 304 // inverse transform column vectors 305 for (i = 0; i < 4; ++i) { 306 for (j = 0; j < 4; ++j) 307 temp_in[j] = out[j * 4 + i]; 308 IHT_4[tx_type].cols(temp_in, temp_out); 309 for (j = 0; j < 4; ++j) 310 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) 311 + dest[j * stride + i]); 312 } 313 } 314 static void iadst8(const int16_t *input, int16_t *output) { 315 int s0, s1, s2, s3, s4, s5, s6, s7; 316 317 int x0 = input[7]; 318 int x1 = input[0]; 319 int x2 = input[5]; 320 int x3 = input[2]; 321 int x4 = input[3]; 322 int x5 = input[4]; 323 int x6 = input[1]; 324 int x7 = input[6]; 325 326 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { 327 output[0] = output[1] = output[2] = output[3] = output[4] 328 = output[5] = output[6] = output[7] = 0; 329 return; 330 } 331 332 // stage 1 333 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; 334 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; 335 s2 = cospi_10_64 * x2 + cospi_22_64 * x3; 336 s3 = cospi_22_64 * x2 - cospi_10_64 * x3; 337 s4 = cospi_18_64 * x4 + cospi_14_64 * x5; 338 s5 = cospi_14_64 * x4 - cospi_18_64 * x5; 339 s6 = cospi_26_64 * x6 + cospi_6_64 * x7; 340 s7 = cospi_6_64 * x6 - cospi_26_64 * x7; 341 342 x0 = dct_const_round_shift(s0 + s4); 343 x1 = dct_const_round_shift(s1 + s5); 344 x2 = dct_const_round_shift(s2 + s6); 345 x3 = dct_const_round_shift(s3 + s7); 346 x4 = dct_const_round_shift(s0 - s4); 347 x5 = dct_const_round_shift(s1 - s5); 348 x6 = dct_const_round_shift(s2 - s6); 349 x7 = dct_const_round_shift(s3 - s7); 350 351 // stage 2 352 s0 = x0; 353 s1 = x1; 354 s2 = x2; 355 s3 = x3; 356 s4 = cospi_8_64 * x4 + cospi_24_64 * x5; 357 s5 = cospi_24_64 * x4 - cospi_8_64 * x5; 358 s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; 359 s7 = cospi_8_64 * x6 + cospi_24_64 * x7; 360 361 x0 = s0 + s2; 362 x1 = s1 + s3; 363 x2 = s0 - s2; 364 x3 = s1 - s3; 365 x4 = dct_const_round_shift(s4 + s6); 366 x5 = dct_const_round_shift(s5 + s7); 367 x6 = dct_const_round_shift(s4 - s6); 368 x7 = dct_const_round_shift(s5 - s7); 369 370 // stage 3 371 s2 = cospi_16_64 * (x2 + x3); 372 s3 = cospi_16_64 * (x2 - x3); 373 s6 = cospi_16_64 * (x6 + x7); 374 s7 = cospi_16_64 * (x6 - x7); 375 376 x2 = dct_const_round_shift(s2); 377 x3 = dct_const_round_shift(s3); 378 x6 = dct_const_round_shift(s6); 379 x7 = dct_const_round_shift(s7); 380 381 output[0] = x0; 382 output[1] = -x4; 383 output[2] = x6; 384 output[3] = -x2; 385 output[4] = x3; 386 output[5] = -x7; 387 output[6] = x5; 388 output[7] = -x1; 389 } 390 391 static const transform_2d IHT_8[] = { 392 { idct8, idct8 }, // DCT_DCT = 0 393 { iadst8, idct8 }, // ADST_DCT = 1 394 { idct8, iadst8 }, // DCT_ADST = 2 395 { iadst8, iadst8 } // ADST_ADST = 3 396 }; 397 398 void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride, 399 int tx_type) { 400 int i, j; 401 int16_t out[8 * 8]; 402 int16_t *outptr = out; 403 int16_t temp_in[8], temp_out[8]; 404 const transform_2d ht = IHT_8[tx_type]; 405 406 // inverse transform row vectors 407 for (i = 0; i < 8; ++i) { 408 ht.rows(input, outptr); 409 input += 8; 410 outptr += 8; 411 } 412 413 // inverse transform column vectors 414 for (i = 0; i < 8; ++i) { 415 for (j = 0; j < 8; ++j) 416 temp_in[j] = out[j * 8 + i]; 417 ht.cols(temp_in, temp_out); 418 for (j = 0; j < 8; ++j) 419 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) 420 + dest[j * stride + i]); 421 } 422 } 423 424 void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) { 425 int16_t out[8 * 8] = { 0 }; 426 int16_t *outptr = out; 427 int i, j; 428 int16_t temp_in[8], temp_out[8]; 429 430 // First transform rows 431 // only first 4 row has non-zero coefs 432 for (i = 0; i < 4; ++i) { 433 idct8(input, outptr); 434 input += 8; 435 outptr += 8; 436 } 437 438 // Then transform columns 439 for (i = 0; i < 8; ++i) { 440 for (j = 0; j < 8; ++j) 441 temp_in[j] = out[j * 8 + i]; 442 idct8(temp_in, temp_out); 443 for (j = 0; j < 8; ++j) 444 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) 445 + dest[j * stride + i]); 446 } 447 } 448 449 static void idct16(const int16_t *input, int16_t *output) { 450 int16_t step1[16], step2[16]; 451 int temp1, temp2; 452 453 // stage 1 454 step1[0] = input[0/2]; 455 step1[1] = input[16/2]; 456 step1[2] = input[8/2]; 457 step1[3] = input[24/2]; 458 step1[4] = input[4/2]; 459 step1[5] = input[20/2]; 460 step1[6] = input[12/2]; 461 step1[7] = input[28/2]; 462 step1[8] = input[2/2]; 463 step1[9] = input[18/2]; 464 step1[10] = input[10/2]; 465 step1[11] = input[26/2]; 466 step1[12] = input[6/2]; 467 step1[13] = input[22/2]; 468 step1[14] = input[14/2]; 469 step1[15] = input[30/2]; 470 471 // stage 2 472 step2[0] = step1[0]; 473 step2[1] = step1[1]; 474 step2[2] = step1[2]; 475 step2[3] = step1[3]; 476 step2[4] = step1[4]; 477 step2[5] = step1[5]; 478 step2[6] = step1[6]; 479 step2[7] = step1[7]; 480 481 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; 482 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; 483 step2[8] = dct_const_round_shift(temp1); 484 step2[15] = dct_const_round_shift(temp2); 485 486 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; 487 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; 488 step2[9] = dct_const_round_shift(temp1); 489 step2[14] = dct_const_round_shift(temp2); 490 491 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; 492 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; 493 step2[10] = dct_const_round_shift(temp1); 494 step2[13] = dct_const_round_shift(temp2); 495 496 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; 497 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; 498 step2[11] = dct_const_round_shift(temp1); 499 step2[12] = dct_const_round_shift(temp2); 500 501 // stage 3 502 step1[0] = step2[0]; 503 step1[1] = step2[1]; 504 step1[2] = step2[2]; 505 step1[3] = step2[3]; 506 507 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; 508 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; 509 step1[4] = dct_const_round_shift(temp1); 510 step1[7] = dct_const_round_shift(temp2); 511 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; 512 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; 513 step1[5] = dct_const_round_shift(temp1); 514 step1[6] = dct_const_round_shift(temp2); 515 516 step1[8] = step2[8] + step2[9]; 517 step1[9] = step2[8] - step2[9]; 518 step1[10] = -step2[10] + step2[11]; 519 step1[11] = step2[10] + step2[11]; 520 step1[12] = step2[12] + step2[13]; 521 step1[13] = step2[12] - step2[13]; 522 step1[14] = -step2[14] + step2[15]; 523 step1[15] = step2[14] + step2[15]; 524 525 // stage 4 526 temp1 = (step1[0] + step1[1]) * cospi_16_64; 527 temp2 = (step1[0] - step1[1]) * cospi_16_64; 528 step2[0] = dct_const_round_shift(temp1); 529 step2[1] = dct_const_round_shift(temp2); 530 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; 531 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; 532 step2[2] = dct_const_round_shift(temp1); 533 step2[3] = dct_const_round_shift(temp2); 534 step2[4] = step1[4] + step1[5]; 535 step2[5] = step1[4] - step1[5]; 536 step2[6] = -step1[6] + step1[7]; 537 step2[7] = step1[6] + step1[7]; 538 539 step2[8] = step1[8]; 540 step2[15] = step1[15]; 541 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; 542 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; 543 step2[9] = dct_const_round_shift(temp1); 544 step2[14] = dct_const_round_shift(temp2); 545 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; 546 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; 547 step2[10] = dct_const_round_shift(temp1); 548 step2[13] = dct_const_round_shift(temp2); 549 step2[11] = step1[11]; 550 step2[12] = step1[12]; 551 552 // stage 5 553 step1[0] = step2[0] + step2[3]; 554 step1[1] = step2[1] + step2[2]; 555 step1[2] = step2[1] - step2[2]; 556 step1[3] = step2[0] - step2[3]; 557 step1[4] = step2[4]; 558 temp1 = (step2[6] - step2[5]) * cospi_16_64; 559 temp2 = (step2[5] + step2[6]) * cospi_16_64; 560 step1[5] = dct_const_round_shift(temp1); 561 step1[6] = dct_const_round_shift(temp2); 562 step1[7] = step2[7]; 563 564 step1[8] = step2[8] + step2[11]; 565 step1[9] = step2[9] + step2[10]; 566 step1[10] = step2[9] - step2[10]; 567 step1[11] = step2[8] - step2[11]; 568 step1[12] = -step2[12] + step2[15]; 569 step1[13] = -step2[13] + step2[14]; 570 step1[14] = step2[13] + step2[14]; 571 step1[15] = step2[12] + step2[15]; 572 573 // stage 6 574 step2[0] = step1[0] + step1[7]; 575 step2[1] = step1[1] + step1[6]; 576 step2[2] = step1[2] + step1[5]; 577 step2[3] = step1[3] + step1[4]; 578 step2[4] = step1[3] - step1[4]; 579 step2[5] = step1[2] - step1[5]; 580 step2[6] = step1[1] - step1[6]; 581 step2[7] = step1[0] - step1[7]; 582 step2[8] = step1[8]; 583 step2[9] = step1[9]; 584 temp1 = (-step1[10] + step1[13]) * cospi_16_64; 585 temp2 = (step1[10] + step1[13]) * cospi_16_64; 586 step2[10] = dct_const_round_shift(temp1); 587 step2[13] = dct_const_round_shift(temp2); 588 temp1 = (-step1[11] + step1[12]) * cospi_16_64; 589 temp2 = (step1[11] + step1[12]) * cospi_16_64; 590 step2[11] = dct_const_round_shift(temp1); 591 step2[12] = dct_const_round_shift(temp2); 592 step2[14] = step1[14]; 593 step2[15] = step1[15]; 594 595 // stage 7 596 output[0] = step2[0] + step2[15]; 597 output[1] = step2[1] + step2[14]; 598 output[2] = step2[2] + step2[13]; 599 output[3] = step2[3] + step2[12]; 600 output[4] = step2[4] + step2[11]; 601 output[5] = step2[5] + step2[10]; 602 output[6] = step2[6] + step2[9]; 603 output[7] = step2[7] + step2[8]; 604 output[8] = step2[7] - step2[8]; 605 output[9] = step2[6] - step2[9]; 606 output[10] = step2[5] - step2[10]; 607 output[11] = step2[4] - step2[11]; 608 output[12] = step2[3] - step2[12]; 609 output[13] = step2[2] - step2[13]; 610 output[14] = step2[1] - step2[14]; 611 output[15] = step2[0] - step2[15]; 612 } 613 614 void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) { 615 int16_t out[16 * 16]; 616 int16_t *outptr = out; 617 int i, j; 618 int16_t temp_in[16], temp_out[16]; 619 620 // First transform rows 621 for (i = 0; i < 16; ++i) { 622 idct16(input, outptr); 623 input += 16; 624 outptr += 16; 625 } 626 627 // Then transform columns 628 for (i = 0; i < 16; ++i) { 629 for (j = 0; j < 16; ++j) 630 temp_in[j] = out[j * 16 + i]; 631 idct16(temp_in, temp_out); 632 for (j = 0; j < 16; ++j) 633 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 634 + dest[j * stride + i]); 635 } 636 } 637 638 static void iadst16(const int16_t *input, int16_t *output) { 639 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; 640 641 int x0 = input[15]; 642 int x1 = input[0]; 643 int x2 = input[13]; 644 int x3 = input[2]; 645 int x4 = input[11]; 646 int x5 = input[4]; 647 int x6 = input[9]; 648 int x7 = input[6]; 649 int x8 = input[7]; 650 int x9 = input[8]; 651 int x10 = input[5]; 652 int x11 = input[10]; 653 int x12 = input[3]; 654 int x13 = input[12]; 655 int x14 = input[1]; 656 int x15 = input[14]; 657 658 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 659 | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { 660 output[0] = output[1] = output[2] = output[3] = output[4] 661 = output[5] = output[6] = output[7] = output[8] 662 = output[9] = output[10] = output[11] = output[12] 663 = output[13] = output[14] = output[15] = 0; 664 return; 665 } 666 667 // stage 1 668 s0 = x0 * cospi_1_64 + x1 * cospi_31_64; 669 s1 = x0 * cospi_31_64 - x1 * cospi_1_64; 670 s2 = x2 * cospi_5_64 + x3 * cospi_27_64; 671 s3 = x2 * cospi_27_64 - x3 * cospi_5_64; 672 s4 = x4 * cospi_9_64 + x5 * cospi_23_64; 673 s5 = x4 * cospi_23_64 - x5 * cospi_9_64; 674 s6 = x6 * cospi_13_64 + x7 * cospi_19_64; 675 s7 = x6 * cospi_19_64 - x7 * cospi_13_64; 676 s8 = x8 * cospi_17_64 + x9 * cospi_15_64; 677 s9 = x8 * cospi_15_64 - x9 * cospi_17_64; 678 s10 = x10 * cospi_21_64 + x11 * cospi_11_64; 679 s11 = x10 * cospi_11_64 - x11 * cospi_21_64; 680 s12 = x12 * cospi_25_64 + x13 * cospi_7_64; 681 s13 = x12 * cospi_7_64 - x13 * cospi_25_64; 682 s14 = x14 * cospi_29_64 + x15 * cospi_3_64; 683 s15 = x14 * cospi_3_64 - x15 * cospi_29_64; 684 685 x0 = dct_const_round_shift(s0 + s8); 686 x1 = dct_const_round_shift(s1 + s9); 687 x2 = dct_const_round_shift(s2 + s10); 688 x3 = dct_const_round_shift(s3 + s11); 689 x4 = dct_const_round_shift(s4 + s12); 690 x5 = dct_const_round_shift(s5 + s13); 691 x6 = dct_const_round_shift(s6 + s14); 692 x7 = dct_const_round_shift(s7 + s15); 693 x8 = dct_const_round_shift(s0 - s8); 694 x9 = dct_const_round_shift(s1 - s9); 695 x10 = dct_const_round_shift(s2 - s10); 696 x11 = dct_const_round_shift(s3 - s11); 697 x12 = dct_const_round_shift(s4 - s12); 698 x13 = dct_const_round_shift(s5 - s13); 699 x14 = dct_const_round_shift(s6 - s14); 700 x15 = dct_const_round_shift(s7 - s15); 701 702 // stage 2 703 s0 = x0; 704 s1 = x1; 705 s2 = x2; 706 s3 = x3; 707 s4 = x4; 708 s5 = x5; 709 s6 = x6; 710 s7 = x7; 711 s8 = x8 * cospi_4_64 + x9 * cospi_28_64; 712 s9 = x8 * cospi_28_64 - x9 * cospi_4_64; 713 s10 = x10 * cospi_20_64 + x11 * cospi_12_64; 714 s11 = x10 * cospi_12_64 - x11 * cospi_20_64; 715 s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; 716 s13 = x12 * cospi_4_64 + x13 * cospi_28_64; 717 s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; 718 s15 = x14 * cospi_20_64 + x15 * cospi_12_64; 719 720 x0 = s0 + s4; 721 x1 = s1 + s5; 722 x2 = s2 + s6; 723 x3 = s3 + s7; 724 x4 = s0 - s4; 725 x5 = s1 - s5; 726 x6 = s2 - s6; 727 x7 = s3 - s7; 728 x8 = dct_const_round_shift(s8 + s12); 729 x9 = dct_const_round_shift(s9 + s13); 730 x10 = dct_const_round_shift(s10 + s14); 731 x11 = dct_const_round_shift(s11 + s15); 732 x12 = dct_const_round_shift(s8 - s12); 733 x13 = dct_const_round_shift(s9 - s13); 734 x14 = dct_const_round_shift(s10 - s14); 735 x15 = dct_const_round_shift(s11 - s15); 736 737 // stage 3 738 s0 = x0; 739 s1 = x1; 740 s2 = x2; 741 s3 = x3; 742 s4 = x4 * cospi_8_64 + x5 * cospi_24_64; 743 s5 = x4 * cospi_24_64 - x5 * cospi_8_64; 744 s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; 745 s7 = x6 * cospi_8_64 + x7 * cospi_24_64; 746 s8 = x8; 747 s9 = x9; 748 s10 = x10; 749 s11 = x11; 750 s12 = x12 * cospi_8_64 + x13 * cospi_24_64; 751 s13 = x12 * cospi_24_64 - x13 * cospi_8_64; 752 s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; 753 s15 = x14 * cospi_8_64 + x15 * cospi_24_64; 754 755 x0 = s0 + s2; 756 x1 = s1 + s3; 757 x2 = s0 - s2; 758 x3 = s1 - s3; 759 x4 = dct_const_round_shift(s4 + s6); 760 x5 = dct_const_round_shift(s5 + s7); 761 x6 = dct_const_round_shift(s4 - s6); 762 x7 = dct_const_round_shift(s5 - s7); 763 x8 = s8 + s10; 764 x9 = s9 + s11; 765 x10 = s8 - s10; 766 x11 = s9 - s11; 767 x12 = dct_const_round_shift(s12 + s14); 768 x13 = dct_const_round_shift(s13 + s15); 769 x14 = dct_const_round_shift(s12 - s14); 770 x15 = dct_const_round_shift(s13 - s15); 771 772 // stage 4 773 s2 = (- cospi_16_64) * (x2 + x3); 774 s3 = cospi_16_64 * (x2 - x3); 775 s6 = cospi_16_64 * (x6 + x7); 776 s7 = cospi_16_64 * (- x6 + x7); 777 s10 = cospi_16_64 * (x10 + x11); 778 s11 = cospi_16_64 * (- x10 + x11); 779 s14 = (- cospi_16_64) * (x14 + x15); 780 s15 = cospi_16_64 * (x14 - x15); 781 782 x2 = dct_const_round_shift(s2); 783 x3 = dct_const_round_shift(s3); 784 x6 = dct_const_round_shift(s6); 785 x7 = dct_const_round_shift(s7); 786 x10 = dct_const_round_shift(s10); 787 x11 = dct_const_round_shift(s11); 788 x14 = dct_const_round_shift(s14); 789 x15 = dct_const_round_shift(s15); 790 791 output[0] = x0; 792 output[1] = -x8; 793 output[2] = x12; 794 output[3] = -x4; 795 output[4] = x6; 796 output[5] = x14; 797 output[6] = x10; 798 output[7] = x2; 799 output[8] = x3; 800 output[9] = x11; 801 output[10] = x15; 802 output[11] = x7; 803 output[12] = x5; 804 output[13] = -x13; 805 output[14] = x9; 806 output[15] = -x1; 807 } 808 809 static const transform_2d IHT_16[] = { 810 { idct16, idct16 }, // DCT_DCT = 0 811 { iadst16, idct16 }, // ADST_DCT = 1 812 { idct16, iadst16 }, // DCT_ADST = 2 813 { iadst16, iadst16 } // ADST_ADST = 3 814 }; 815 816 void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride, 817 int tx_type) { 818 int i, j; 819 int16_t out[16 * 16]; 820 int16_t *outptr = out; 821 int16_t temp_in[16], temp_out[16]; 822 const transform_2d ht = IHT_16[tx_type]; 823 824 // Rows 825 for (i = 0; i < 16; ++i) { 826 ht.rows(input, outptr); 827 input += 16; 828 outptr += 16; 829 } 830 831 // Columns 832 for (i = 0; i < 16; ++i) { 833 for (j = 0; j < 16; ++j) 834 temp_in[j] = out[j * 16 + i]; 835 ht.cols(temp_in, temp_out); 836 for (j = 0; j < 16; ++j) 837 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 838 + dest[j * stride + i]); 839 } 840 } 841 842 void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) { 843 int16_t out[16 * 16] = { 0 }; 844 int16_t *outptr = out; 845 int i, j; 846 int16_t temp_in[16], temp_out[16]; 847 848 // First transform rows. Since all non-zero dct coefficients are in 849 // upper-left 4x4 area, we only need to calculate first 4 rows here. 850 for (i = 0; i < 4; ++i) { 851 idct16(input, outptr); 852 input += 16; 853 outptr += 16; 854 } 855 856 // Then transform columns 857 for (i = 0; i < 16; ++i) { 858 for (j = 0; j < 16; ++j) 859 temp_in[j] = out[j*16 + i]; 860 idct16(temp_in, temp_out); 861 for (j = 0; j < 16; ++j) 862 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 863 + dest[j * stride + i]); 864 } 865 } 866 867 void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) { 868 int i, j; 869 int a1; 870 int16_t out = dct_const_round_shift(input[0] * cospi_16_64); 871 out = dct_const_round_shift(out * cospi_16_64); 872 a1 = ROUND_POWER_OF_TWO(out, 6); 873 for (j = 0; j < 16; ++j) { 874 for (i = 0; i < 16; ++i) 875 dest[i] = clip_pixel(dest[i] + a1); 876 dest += stride; 877 } 878 } 879 880 static void idct32(const int16_t *input, int16_t *output) { 881 int16_t step1[32], step2[32]; 882 int temp1, temp2; 883 884 // stage 1 885 step1[0] = input[0]; 886 step1[1] = input[16]; 887 step1[2] = input[8]; 888 step1[3] = input[24]; 889 step1[4] = input[4]; 890 step1[5] = input[20]; 891 step1[6] = input[12]; 892 step1[7] = input[28]; 893 step1[8] = input[2]; 894 step1[9] = input[18]; 895 step1[10] = input[10]; 896 step1[11] = input[26]; 897 step1[12] = input[6]; 898 step1[13] = input[22]; 899 step1[14] = input[14]; 900 step1[15] = input[30]; 901 902 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; 903 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; 904 step1[16] = dct_const_round_shift(temp1); 905 step1[31] = dct_const_round_shift(temp2); 906 907 temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; 908 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; 909 step1[17] = dct_const_round_shift(temp1); 910 step1[30] = dct_const_round_shift(temp2); 911 912 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; 913 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; 914 step1[18] = dct_const_round_shift(temp1); 915 step1[29] = dct_const_round_shift(temp2); 916 917 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; 918 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; 919 step1[19] = dct_const_round_shift(temp1); 920 step1[28] = dct_const_round_shift(temp2); 921 922 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; 923 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; 924 step1[20] = dct_const_round_shift(temp1); 925 step1[27] = dct_const_round_shift(temp2); 926 927 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; 928 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; 929 step1[21] = dct_const_round_shift(temp1); 930 step1[26] = dct_const_round_shift(temp2); 931 932 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; 933 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; 934 step1[22] = dct_const_round_shift(temp1); 935 step1[25] = dct_const_round_shift(temp2); 936 937 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; 938 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; 939 step1[23] = dct_const_round_shift(temp1); 940 step1[24] = dct_const_round_shift(temp2); 941 942 // stage 2 943 step2[0] = step1[0]; 944 step2[1] = step1[1]; 945 step2[2] = step1[2]; 946 step2[3] = step1[3]; 947 step2[4] = step1[4]; 948 step2[5] = step1[5]; 949 step2[6] = step1[6]; 950 step2[7] = step1[7]; 951 952 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; 953 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; 954 step2[8] = dct_const_round_shift(temp1); 955 step2[15] = dct_const_round_shift(temp2); 956 957 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; 958 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; 959 step2[9] = dct_const_round_shift(temp1); 960 step2[14] = dct_const_round_shift(temp2); 961 962 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; 963 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; 964 step2[10] = dct_const_round_shift(temp1); 965 step2[13] = dct_const_round_shift(temp2); 966 967 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; 968 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; 969 step2[11] = dct_const_round_shift(temp1); 970 step2[12] = dct_const_round_shift(temp2); 971 972 step2[16] = step1[16] + step1[17]; 973 step2[17] = step1[16] - step1[17]; 974 step2[18] = -step1[18] + step1[19]; 975 step2[19] = step1[18] + step1[19]; 976 step2[20] = step1[20] + step1[21]; 977 step2[21] = step1[20] - step1[21]; 978 step2[22] = -step1[22] + step1[23]; 979 step2[23] = step1[22] + step1[23]; 980 step2[24] = step1[24] + step1[25]; 981 step2[25] = step1[24] - step1[25]; 982 step2[26] = -step1[26] + step1[27]; 983 step2[27] = step1[26] + step1[27]; 984 step2[28] = step1[28] + step1[29]; 985 step2[29] = step1[28] - step1[29]; 986 step2[30] = -step1[30] + step1[31]; 987 step2[31] = step1[30] + step1[31]; 988 989 // stage 3 990 step1[0] = step2[0]; 991 step1[1] = step2[1]; 992 step1[2] = step2[2]; 993 step1[3] = step2[3]; 994 995 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; 996 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; 997 step1[4] = dct_const_round_shift(temp1); 998 step1[7] = dct_const_round_shift(temp2); 999 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; 1000 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; 1001 step1[5] = dct_const_round_shift(temp1); 1002 step1[6] = dct_const_round_shift(temp2); 1003 1004 step1[8] = step2[8] + step2[9]; 1005 step1[9] = step2[8] - step2[9]; 1006 step1[10] = -step2[10] + step2[11]; 1007 step1[11] = step2[10] + step2[11]; 1008 step1[12] = step2[12] + step2[13]; 1009 step1[13] = step2[12] - step2[13]; 1010 step1[14] = -step2[14] + step2[15]; 1011 step1[15] = step2[14] + step2[15]; 1012 1013 step1[16] = step2[16]; 1014 step1[31] = step2[31]; 1015 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; 1016 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; 1017 step1[17] = dct_const_round_shift(temp1); 1018 step1[30] = dct_const_round_shift(temp2); 1019 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; 1020 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; 1021 step1[18] = dct_const_round_shift(temp1); 1022 step1[29] = dct_const_round_shift(temp2); 1023 step1[19] = step2[19]; 1024 step1[20] = step2[20]; 1025 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; 1026 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; 1027 step1[21] = dct_const_round_shift(temp1); 1028 step1[26] = dct_const_round_shift(temp2); 1029 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; 1030 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; 1031 step1[22] = dct_const_round_shift(temp1); 1032 step1[25] = dct_const_round_shift(temp2); 1033 step1[23] = step2[23]; 1034 step1[24] = step2[24]; 1035 step1[27] = step2[27]; 1036 step1[28] = step2[28]; 1037 1038 // stage 4 1039 temp1 = (step1[0] + step1[1]) * cospi_16_64; 1040 temp2 = (step1[0] - step1[1]) * cospi_16_64; 1041 step2[0] = dct_const_round_shift(temp1); 1042 step2[1] = dct_const_round_shift(temp2); 1043 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; 1044 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; 1045 step2[2] = dct_const_round_shift(temp1); 1046 step2[3] = dct_const_round_shift(temp2); 1047 step2[4] = step1[4] + step1[5]; 1048 step2[5] = step1[4] - step1[5]; 1049 step2[6] = -step1[6] + step1[7]; 1050 step2[7] = step1[6] + step1[7]; 1051 1052 step2[8] = step1[8]; 1053 step2[15] = step1[15]; 1054 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; 1055 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; 1056 step2[9] = dct_const_round_shift(temp1); 1057 step2[14] = dct_const_round_shift(temp2); 1058 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; 1059 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; 1060 step2[10] = dct_const_round_shift(temp1); 1061 step2[13] = dct_const_round_shift(temp2); 1062 step2[11] = step1[11]; 1063 step2[12] = step1[12]; 1064 1065 step2[16] = step1[16] + step1[19]; 1066 step2[17] = step1[17] + step1[18]; 1067 step2[18] = step1[17] - step1[18]; 1068 step2[19] = step1[16] - step1[19]; 1069 step2[20] = -step1[20] + step1[23]; 1070 step2[21] = -step1[21] + step1[22]; 1071 step2[22] = step1[21] + step1[22]; 1072 step2[23] = step1[20] + step1[23]; 1073 1074 step2[24] = step1[24] + step1[27]; 1075 step2[25] = step1[25] + step1[26]; 1076 step2[26] = step1[25] - step1[26]; 1077 step2[27] = step1[24] - step1[27]; 1078 step2[28] = -step1[28] + step1[31]; 1079 step2[29] = -step1[29] + step1[30]; 1080 step2[30] = step1[29] + step1[30]; 1081 step2[31] = step1[28] + step1[31]; 1082 1083 // stage 5 1084 step1[0] = step2[0] + step2[3]; 1085 step1[1] = step2[1] + step2[2]; 1086 step1[2] = step2[1] - step2[2]; 1087 step1[3] = step2[0] - step2[3]; 1088 step1[4] = step2[4]; 1089 temp1 = (step2[6] - step2[5]) * cospi_16_64; 1090 temp2 = (step2[5] + step2[6]) * cospi_16_64; 1091 step1[5] = dct_const_round_shift(temp1); 1092 step1[6] = dct_const_round_shift(temp2); 1093 step1[7] = step2[7]; 1094 1095 step1[8] = step2[8] + step2[11]; 1096 step1[9] = step2[9] + step2[10]; 1097 step1[10] = step2[9] - step2[10]; 1098 step1[11] = step2[8] - step2[11]; 1099 step1[12] = -step2[12] + step2[15]; 1100 step1[13] = -step2[13] + step2[14]; 1101 step1[14] = step2[13] + step2[14]; 1102 step1[15] = step2[12] + step2[15]; 1103 1104 step1[16] = step2[16]; 1105 step1[17] = step2[17]; 1106 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; 1107 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; 1108 step1[18] = dct_const_round_shift(temp1); 1109 step1[29] = dct_const_round_shift(temp2); 1110 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; 1111 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; 1112 step1[19] = dct_const_round_shift(temp1); 1113 step1[28] = dct_const_round_shift(temp2); 1114 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; 1115 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; 1116 step1[20] = dct_const_round_shift(temp1); 1117 step1[27] = dct_const_round_shift(temp2); 1118 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; 1119 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; 1120 step1[21] = dct_const_round_shift(temp1); 1121 step1[26] = dct_const_round_shift(temp2); 1122 step1[22] = step2[22]; 1123 step1[23] = step2[23]; 1124 step1[24] = step2[24]; 1125 step1[25] = step2[25]; 1126 step1[30] = step2[30]; 1127 step1[31] = step2[31]; 1128 1129 // stage 6 1130 step2[0] = step1[0] + step1[7]; 1131 step2[1] = step1[1] + step1[6]; 1132 step2[2] = step1[2] + step1[5]; 1133 step2[3] = step1[3] + step1[4]; 1134 step2[4] = step1[3] - step1[4]; 1135 step2[5] = step1[2] - step1[5]; 1136 step2[6] = step1[1] - step1[6]; 1137 step2[7] = step1[0] - step1[7]; 1138 step2[8] = step1[8]; 1139 step2[9] = step1[9]; 1140 temp1 = (-step1[10] + step1[13]) * cospi_16_64; 1141 temp2 = (step1[10] + step1[13]) * cospi_16_64; 1142 step2[10] = dct_const_round_shift(temp1); 1143 step2[13] = dct_const_round_shift(temp2); 1144 temp1 = (-step1[11] + step1[12]) * cospi_16_64; 1145 temp2 = (step1[11] + step1[12]) * cospi_16_64; 1146 step2[11] = dct_const_round_shift(temp1); 1147 step2[12] = dct_const_round_shift(temp2); 1148 step2[14] = step1[14]; 1149 step2[15] = step1[15]; 1150 1151 step2[16] = step1[16] + step1[23]; 1152 step2[17] = step1[17] + step1[22]; 1153 step2[18] = step1[18] + step1[21]; 1154 step2[19] = step1[19] + step1[20]; 1155 step2[20] = step1[19] - step1[20]; 1156 step2[21] = step1[18] - step1[21]; 1157 step2[22] = step1[17] - step1[22]; 1158 step2[23] = step1[16] - step1[23]; 1159 1160 step2[24] = -step1[24] + step1[31]; 1161 step2[25] = -step1[25] + step1[30]; 1162 step2[26] = -step1[26] + step1[29]; 1163 step2[27] = -step1[27] + step1[28]; 1164 step2[28] = step1[27] + step1[28]; 1165 step2[29] = step1[26] + step1[29]; 1166 step2[30] = step1[25] + step1[30]; 1167 step2[31] = step1[24] + step1[31]; 1168 1169 // stage 7 1170 step1[0] = step2[0] + step2[15]; 1171 step1[1] = step2[1] + step2[14]; 1172 step1[2] = step2[2] + step2[13]; 1173 step1[3] = step2[3] + step2[12]; 1174 step1[4] = step2[4] + step2[11]; 1175 step1[5] = step2[5] + step2[10]; 1176 step1[6] = step2[6] + step2[9]; 1177 step1[7] = step2[7] + step2[8]; 1178 step1[8] = step2[7] - step2[8]; 1179 step1[9] = step2[6] - step2[9]; 1180 step1[10] = step2[5] - step2[10]; 1181 step1[11] = step2[4] - step2[11]; 1182 step1[12] = step2[3] - step2[12]; 1183 step1[13] = step2[2] - step2[13]; 1184 step1[14] = step2[1] - step2[14]; 1185 step1[15] = step2[0] - step2[15]; 1186 1187 step1[16] = step2[16]; 1188 step1[17] = step2[17]; 1189 step1[18] = step2[18]; 1190 step1[19] = step2[19]; 1191 temp1 = (-step2[20] + step2[27]) * cospi_16_64; 1192 temp2 = (step2[20] + step2[27]) * cospi_16_64; 1193 step1[20] = dct_const_round_shift(temp1); 1194 step1[27] = dct_const_round_shift(temp2); 1195 temp1 = (-step2[21] + step2[26]) * cospi_16_64; 1196 temp2 = (step2[21] + step2[26]) * cospi_16_64; 1197 step1[21] = dct_const_round_shift(temp1); 1198 step1[26] = dct_const_round_shift(temp2); 1199 temp1 = (-step2[22] + step2[25]) * cospi_16_64; 1200 temp2 = (step2[22] + step2[25]) * cospi_16_64; 1201 step1[22] = dct_const_round_shift(temp1); 1202 step1[25] = dct_const_round_shift(temp2); 1203 temp1 = (-step2[23] + step2[24]) * cospi_16_64; 1204 temp2 = (step2[23] + step2[24]) * cospi_16_64; 1205 step1[23] = dct_const_round_shift(temp1); 1206 step1[24] = dct_const_round_shift(temp2); 1207 step1[28] = step2[28]; 1208 step1[29] = step2[29]; 1209 step1[30] = step2[30]; 1210 step1[31] = step2[31]; 1211 1212 // final stage 1213 output[0] = step1[0] + step1[31]; 1214 output[1] = step1[1] + step1[30]; 1215 output[2] = step1[2] + step1[29]; 1216 output[3] = step1[3] + step1[28]; 1217 output[4] = step1[4] + step1[27]; 1218 output[5] = step1[5] + step1[26]; 1219 output[6] = step1[6] + step1[25]; 1220 output[7] = step1[7] + step1[24]; 1221 output[8] = step1[8] + step1[23]; 1222 output[9] = step1[9] + step1[22]; 1223 output[10] = step1[10] + step1[21]; 1224 output[11] = step1[11] + step1[20]; 1225 output[12] = step1[12] + step1[19]; 1226 output[13] = step1[13] + step1[18]; 1227 output[14] = step1[14] + step1[17]; 1228 output[15] = step1[15] + step1[16]; 1229 output[16] = step1[15] - step1[16]; 1230 output[17] = step1[14] - step1[17]; 1231 output[18] = step1[13] - step1[18]; 1232 output[19] = step1[12] - step1[19]; 1233 output[20] = step1[11] - step1[20]; 1234 output[21] = step1[10] - step1[21]; 1235 output[22] = step1[9] - step1[22]; 1236 output[23] = step1[8] - step1[23]; 1237 output[24] = step1[7] - step1[24]; 1238 output[25] = step1[6] - step1[25]; 1239 output[26] = step1[5] - step1[26]; 1240 output[27] = step1[4] - step1[27]; 1241 output[28] = step1[3] - step1[28]; 1242 output[29] = step1[2] - step1[29]; 1243 output[30] = step1[1] - step1[30]; 1244 output[31] = step1[0] - step1[31]; 1245 } 1246 1247 void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) { 1248 int16_t out[32 * 32]; 1249 int16_t *outptr = out; 1250 int i, j; 1251 int16_t temp_in[32], temp_out[32]; 1252 1253 // Rows 1254 for (i = 0; i < 32; ++i) { 1255 int16_t zero_coeff[16]; 1256 for (j = 0; j < 16; ++j) 1257 zero_coeff[j] = input[2 * j] | input[2 * j + 1]; 1258 for (j = 0; j < 8; ++j) 1259 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; 1260 for (j = 0; j < 4; ++j) 1261 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; 1262 for (j = 0; j < 2; ++j) 1263 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1]; 1264 1265 if (zero_coeff[0] | zero_coeff[1]) 1266 idct32(input, outptr); 1267 else 1268 vpx_memset(outptr, 0, sizeof(int16_t) * 32); 1269 input += 32; 1270 outptr += 32; 1271 } 1272 1273 // Columns 1274 for (i = 0; i < 32; ++i) { 1275 for (j = 0; j < 32; ++j) 1276 temp_in[j] = out[j * 32 + i]; 1277 idct32(temp_in, temp_out); 1278 for (j = 0; j < 32; ++j) 1279 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 1280 + dest[j * stride + i]); 1281 } 1282 } 1283 1284 void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) { 1285 int16_t out[32 * 32] = {0}; 1286 int16_t *outptr = out; 1287 int i, j; 1288 int16_t temp_in[32], temp_out[32]; 1289 1290 // Rows 1291 // only upper-left 8x8 has non-zero coeff 1292 for (i = 0; i < 8; ++i) { 1293 idct32(input, outptr); 1294 input += 32; 1295 outptr += 32; 1296 } 1297 1298 // Columns 1299 for (i = 0; i < 32; ++i) { 1300 for (j = 0; j < 32; ++j) 1301 temp_in[j] = out[j * 32 + i]; 1302 idct32(temp_in, temp_out); 1303 for (j = 0; j < 32; ++j) 1304 dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) 1305 + dest[j * stride + i]); 1306 } 1307 } 1308 1309 void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) { 1310 int i, j; 1311 int a1; 1312 1313 int16_t out = dct_const_round_shift(input[0] * cospi_16_64); 1314 out = dct_const_round_shift(out * cospi_16_64); 1315 a1 = ROUND_POWER_OF_TWO(out, 6); 1316 1317 for (j = 0; j < 32; ++j) { 1318 for (i = 0; i < 32; ++i) 1319 dest[i] = clip_pixel(dest[i] + a1); 1320 dest += stride; 1321 } 1322 } 1323 1324 // idct 1325 void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) { 1326 if (eob > 1) 1327 vp9_idct4x4_16_add(input, dest, stride); 1328 else 1329 vp9_idct4x4_1_add(input, dest, stride); 1330 } 1331 1332 1333 void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) { 1334 if (eob > 1) 1335 vp9_iwht4x4_16_add(input, dest, stride); 1336 else 1337 vp9_iwht4x4_1_add(input, dest, stride); 1338 } 1339 1340 void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) { 1341 // If dc is 1, then input[0] is the reconstructed value, do not need 1342 // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. 1343 1344 // The calculation can be simplified if there are not many non-zero dct 1345 // coefficients. Use eobs to decide what to do. 1346 // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c. 1347 // Combine that with code here. 1348 if (eob == 1) 1349 // DC only DCT coefficient 1350 vp9_idct8x8_1_add(input, dest, stride); 1351 else if (eob <= 10) 1352 vp9_idct8x8_10_add(input, dest, stride); 1353 else 1354 vp9_idct8x8_64_add(input, dest, stride); 1355 } 1356 1357 void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride, 1358 int eob) { 1359 /* The calculation can be simplified if there are not many non-zero dct 1360 * coefficients. Use eobs to separate different cases. */ 1361 if (eob == 1) 1362 /* DC only DCT coefficient. */ 1363 vp9_idct16x16_1_add(input, dest, stride); 1364 else if (eob <= 10) 1365 vp9_idct16x16_10_add(input, dest, stride); 1366 else 1367 vp9_idct16x16_256_add(input, dest, stride); 1368 } 1369 1370 void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride, 1371 int eob) { 1372 if (eob == 1) 1373 vp9_idct32x32_1_add(input, dest, stride); 1374 else if (eob <= 34) 1375 // non-zero coeff only in upper-left 8x8 1376 vp9_idct32x32_34_add(input, dest, stride); 1377 else 1378 vp9_idct32x32_1024_add(input, dest, stride); 1379 } 1380 1381 // iht 1382 void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, 1383 int stride, int eob) { 1384 if (tx_type == DCT_DCT) 1385 vp9_idct4x4_add(input, dest, stride, eob); 1386 else 1387 vp9_iht4x4_16_add(input, dest, stride, tx_type); 1388 } 1389 1390 void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, 1391 int stride, int eob) { 1392 if (tx_type == DCT_DCT) { 1393 vp9_idct8x8_add(input, dest, stride, eob); 1394 } else { 1395 vp9_iht8x8_64_add(input, dest, stride, tx_type); 1396 } 1397 } 1398 1399 void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest, 1400 int stride, int eob) { 1401 if (tx_type == DCT_DCT) { 1402 vp9_idct16x16_add(input, dest, stride, eob); 1403 } else { 1404 vp9_iht16x16_256_add(input, dest, stride, tx_type); 1405 } 1406 } 1407