/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <math.h>
#include <string.h>

#include "vpx_dsp/inv_txfm.h"

void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;

  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = WRAPLOW(a1, 8);
    op[1] = WRAPLOW(b1, 8);
    op[2] = WRAPLOW(c1, 8);
    op[3] = WRAPLOW(d1, 8);
    ip += 4;
    op += 4;
  }

  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1);
    dest[stride * 1] = clip_pixel_add(dest[stride * 1], b1);
    dest[stride * 2] = clip_pixel_add(dest[stride * 2], c1);
    dest[stride * 3] = clip_pixel_add(dest[stride * 3], d1);

    ip++;
    dest++;
  }
}

void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
  int i;
  tran_high_t a1, e1;
  tran_low_t tmp[4];
  const tran_low_t *ip = in;
  tran_low_t *op = tmp;

  a1 = ip[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  op[0] = WRAPLOW(a1, 8);
  op[1] = op[2] = op[3] = WRAPLOW(e1, 8);

  ip = tmp;
  for (i = 0; i < 4; i++) {
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
    dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
    dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
    dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
    dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
    ip++;
    dest++;
  }
}

void idct4_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;
  // stage 1
  temp1 = (input[0] + input[2]) * cospi_16_64;
  temp2 = (input[0] - input[2]) * cospi_16_64;
  step[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  step[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step[3] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 2
  output[0] = WRAPLOW(step[0] + step[3], 8);
  output[1] = WRAPLOW(step[1] + step[2], 8);
  output[2] = WRAPLOW(step[1] - step[2], 8);
  output[3] = WRAPLOW(step[0] - step[3], 8);
}
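// Editorial note (not from the original source): the cospi_*_64 constants
// used throughout this file come from vpx_dsp/txfm_common.h and hold
// cos(k*pi/64) scaled to Q14 fixed point (e.g. cospi_16_64 == 11585, i.e.
// roughly 2^14 * cos(pi/4)). dct_const_round_shift() rounds such a Q14
// product back down, approximately (x + (1 << 13)) >> 14, and WRAPLOW()
// narrows the result to the tran_low_t range; see inv_txfm.h for the exact
// definitions. Worked example for idct4_c at 8-bit: input[0] = input[2] = 64
// gives temp1 = 128 * 11585 = 1482880, and (1482880 + 8192) >> 14 = 91.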
void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[4 * 4];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[4], temp_out[4];

  // Rows
  for (i = 0; i < 4; ++i) {
    idct4_c(input, outptr);
    input += 4;
    outptr += 4;
  }

  // Columns
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j)
      temp_in[j] = out[j * 4 + i];
    idct4_c(temp_in, temp_out);
    for (j = 0; j < 4; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
    }
  }
}

void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
                         int dest_stride) {
  int i;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
  a1 = ROUND_POWER_OF_TWO(out, 4);

  for (i = 0; i < 4; i++) {
    dest[0] = clip_pixel_add(dest[0], a1);
    dest[1] = clip_pixel_add(dest[1], a1);
    dest[2] = clip_pixel_add(dest[2], a1);
    dest[3] = clip_pixel_add(dest[3], a1);
    dest += dest_stride;
  }
}

void idct8_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;
  // stage 1
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 2
  temp1 = (step1[0] + step1[2]) * cospi_16_64;
  temp2 = (step1[0] - step1[2]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
  step2[7] = WRAPLOW(step1[6] + step1[7], 8);

  // stage 3
  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[7] = step2[7];

  // stage 4
  output[0] = WRAPLOW(step1[0] + step1[7], 8);
  output[1] = WRAPLOW(step1[1] + step1[6], 8);
  output[2] = WRAPLOW(step1[2] + step1[5], 8);
  output[3] = WRAPLOW(step1[3] + step1[4], 8);
  output[4] = WRAPLOW(step1[3] - step1[4], 8);
  output[5] = WRAPLOW(step1[2] - step1[5], 8);
  output[6] = WRAPLOW(step1[1] - step1[6], 8);
  output[7] = WRAPLOW(step1[0] - step1[7], 8);
}
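// Editorial note: every NxN *_add function below follows the same separable
// pattern as vpx_idct4x4_16_add_c above: run the 1-D inverse transform over
// each row into a scratch buffer, run it again over each column, then round,
// shift and add the residual into the destination. The final
// ROUND_POWER_OF_TWO shift grows with the transform size (4 for 4x4, 5 for
// 8x8, 6 for 16x16 and 32x32), matching the extra scaling accumulated by the
// two passes.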
void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[8 * 8];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[8], temp_out[8];

  // First transform rows
  for (i = 0; i < 8; ++i) {
    idct8_c(input, outptr);
    input += 8;
    outptr += 8;
  }

  // Then transform columns
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    idct8_c(temp_in, temp_out);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
    }
  }
}

void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
  a1 = ROUND_POWER_OF_TWO(out, 5);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i)
      dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}

void iadst4_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];

  if (!(x0 | x1 | x2 | x3)) {
    output[0] = output[1] = output[2] = output[3] = 0;
    return;
  }

  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = x0 - x2 + x3;

  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3), 8);
  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3), 8);
  output[2] = WRAPLOW(dct_const_round_shift(s2), 8);
  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3), 8);
}
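// Editorial note: iadst8_c below reads its inputs through a fixed
// permutation (x0 = input[7], x1 = input[0], ...) and negates the
// odd-indexed outputs; both are part of the butterfly factorization that
// lets the ADST reuse the same cospi rotations as the DCT paths above.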
void iadst8_c(const tran_low_t *input, tran_low_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;

  tran_high_t x0 = input[7];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[5];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[3];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[1];
  tran_high_t x7 = input[6];

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = 0;
    return;
  }

  // stage 1
  s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
  s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
  s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
  s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);

  x0 = WRAPLOW(dct_const_round_shift(s0 + s4), 8);
  x1 = WRAPLOW(dct_const_round_shift(s1 + s5), 8);
  x2 = WRAPLOW(dct_const_round_shift(s2 + s6), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3 + s7), 8);
  x4 = WRAPLOW(dct_const_round_shift(s0 - s4), 8);
  x5 = WRAPLOW(dct_const_round_shift(s1 - s5), 8);
  x6 = WRAPLOW(dct_const_round_shift(s2 - s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s3 - s7), 8);

  // stage 2
  s0 = (int)x0;
  s1 = (int)x1;
  s2 = (int)x2;
  s3 = (int)x3;
  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);

  x0 = WRAPLOW(s0 + s2, 8);
  x1 = WRAPLOW(s1 + s3, 8);
  x2 = WRAPLOW(s0 - s2, 8);
  x3 = WRAPLOW(s1 - s3, 8);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);

  // stage 3
  s2 = (int)(cospi_16_64 * (x2 + x3));
  s3 = (int)(cospi_16_64 * (x2 - x3));
  s6 = (int)(cospi_16_64 * (x6 + x7));
  s7 = (int)(cospi_16_64 * (x6 - x7));

  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s7), 8);

  output[0] = WRAPLOW(x0, 8);
  output[1] = WRAPLOW(-x4, 8);
  output[2] = WRAPLOW(x6, 8);
  output[3] = WRAPLOW(-x2, 8);
  output[4] = WRAPLOW(x3, 8);
  output[5] = WRAPLOW(-x7, 8);
  output[6] = WRAPLOW(x5, 8);
  output[7] = WRAPLOW(-x1, 8);
}

void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[8 * 8] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[8], temp_out[8];

  // First transform rows
  // Only the first 4 rows have non-zero coefficients.
  for (i = 0; i < 4; ++i) {
    idct8_c(input, outptr);
    input += 8;
    outptr += 8;
  }

  // Then transform columns
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    idct8_c(temp_in, temp_out);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
    }
  }
}
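// Editorial note: the input[0/2], input[16/2], ... indexing in idct16_c's
// stage 1 is just a compile-time-constant spelling of input[0], input[8],
// input[4], ...; it keeps the loads written in the same 0..30 even-index
// notation as the matching stage of idct32_c further down, where the odd
// indices feed the extra half of the butterfly.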
void idct16_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;

  // stage 1
  step1[0] = input[0/2];
  step1[1] = input[16/2];
  step1[2] = input[8/2];
  step1[3] = input[24/2];
  step1[4] = input[4/2];
  step1[5] = input[20/2];
  step1[6] = input[12/2];
  step1[7] = input[28/2];
  step1[8] = input[2/2];
  step1[9] = input[18/2];
  step1[10] = input[10/2];
  step1[11] = input[26/2];
  step1[12] = input[6/2];
  step1[13] = input[22/2];
  step1[14] = input[14/2];
  step1[15] = input[30/2];

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);

  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
  step1[15] = WRAPLOW(step2[14] + step2[15], 8);

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
  step2[7] = WRAPLOW(step1[6] + step1[7], 8);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
  step1[15] = WRAPLOW(step2[12] + step2[15], 8);

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  output[0] = WRAPLOW(step2[0] + step2[15], 8);
  output[1] = WRAPLOW(step2[1] + step2[14], 8);
  output[2] = WRAPLOW(step2[2] + step2[13], 8);
  output[3] = WRAPLOW(step2[3] + step2[12], 8);
  output[4] = WRAPLOW(step2[4] + step2[11], 8);
  output[5] = WRAPLOW(step2[5] + step2[10], 8);
  output[6] = WRAPLOW(step2[6] + step2[9], 8);
  output[7] = WRAPLOW(step2[7] + step2[8], 8);
  output[8] = WRAPLOW(step2[7] - step2[8], 8);
  output[9] = WRAPLOW(step2[6] - step2[9], 8);
  output[10] = WRAPLOW(step2[5] - step2[10], 8);
  output[11] = WRAPLOW(step2[4] - step2[11], 8);
  output[12] = WRAPLOW(step2[3] - step2[12], 8);
  output[13] = WRAPLOW(step2[2] - step2[13], 8);
  output[14] = WRAPLOW(step2[1] - step2[14], 8);
  output[15] = WRAPLOW(step2[0] - step2[15], 8);
}
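// Editorial note: the numeric suffix on these *_add variants (_256 here,
// _10 and _1 below) appears to encode the maximum number of non-zero
// coefficients the variant is expected to handle, which is how a caller
// can pick a cheaper kernel when the end-of-block position is small.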
void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  tran_low_t out[16 * 16];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[16], temp_out[16];

  // First transform rows
  for (i = 0; i < 16; ++i) {
    idct16_c(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    idct16_c(temp_in, temp_out);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}
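// Editorial note: iadst16_c below is the only function in this file that
// routes plain additions through check_range() before WRAPLOW().
// check_range() is a range-validation helper from inv_txfm.h (believed to
// be a no-op unless coefficient range checking is enabled at build time),
// so the behaviour matches the unchecked WRAPLOW(s0 + s2, 8) form used in
// iadst8_c.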
void iadst16_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;

  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
        | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = output[8]
              = output[9] = output[10] = output[11] = output[12]
              = output[13] = output[14] = output[15] = 0;
    return;
  }

  // stage 1
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  x0 = WRAPLOW(dct_const_round_shift(s0 + s8), 8);
  x1 = WRAPLOW(dct_const_round_shift(s1 + s9), 8);
  x2 = WRAPLOW(dct_const_round_shift(s2 + s10), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3 + s11), 8);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s12), 8);
  x5 = WRAPLOW(dct_const_round_shift(s5 + s13), 8);
  x6 = WRAPLOW(dct_const_round_shift(s6 + s14), 8);
  x7 = WRAPLOW(dct_const_round_shift(s7 + s15), 8);
  x8 = WRAPLOW(dct_const_round_shift(s0 - s8), 8);
  x9 = WRAPLOW(dct_const_round_shift(s1 - s9), 8);
  x10 = WRAPLOW(dct_const_round_shift(s2 - s10), 8);
  x11 = WRAPLOW(dct_const_round_shift(s3 - s11), 8);
  x12 = WRAPLOW(dct_const_round_shift(s4 - s12), 8);
  x13 = WRAPLOW(dct_const_round_shift(s5 - s13), 8);
  x14 = WRAPLOW(dct_const_round_shift(s6 - s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s7 - s15), 8);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = WRAPLOW(s0 + s4, 8);
  x1 = WRAPLOW(s1 + s5, 8);
  x2 = WRAPLOW(s2 + s6, 8);
  x3 = WRAPLOW(s3 + s7, 8);
  x4 = WRAPLOW(s0 - s4, 8);
  x5 = WRAPLOW(s1 - s5, 8);
  x6 = WRAPLOW(s2 - s6, 8);
  x7 = WRAPLOW(s3 - s7, 8);
  x8 = WRAPLOW(dct_const_round_shift(s8 + s12), 8);
  x9 = WRAPLOW(dct_const_round_shift(s9 + s13), 8);
  x10 = WRAPLOW(dct_const_round_shift(s10 + s14), 8);
  x11 = WRAPLOW(dct_const_round_shift(s11 + s15), 8);
  x12 = WRAPLOW(dct_const_round_shift(s8 - s12), 8);
  x13 = WRAPLOW(dct_const_round_shift(s9 - s13), 8);
  x14 = WRAPLOW(dct_const_round_shift(s10 - s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s11 - s15), 8);

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = WRAPLOW(check_range(s0 + s2), 8);
  x1 = WRAPLOW(check_range(s1 + s3), 8);
  x2 = WRAPLOW(check_range(s0 - s2), 8);
  x3 = WRAPLOW(check_range(s1 - s3), 8);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6), 8);
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7), 8);
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7), 8);
  x8 = WRAPLOW(check_range(s8 + s10), 8);
  x9 = WRAPLOW(check_range(s9 + s11), 8);
  x10 = WRAPLOW(check_range(s8 - s10), 8);
  x11 = WRAPLOW(check_range(s9 - s11), 8);
  x12 = WRAPLOW(dct_const_round_shift(s12 + s14), 8);
  x13 = WRAPLOW(dct_const_round_shift(s13 + s15), 8);
  x14 = WRAPLOW(dct_const_round_shift(s12 - s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s13 - s15), 8);

  // stage 4
  s2 = (-cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (-cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = WRAPLOW(dct_const_round_shift(s2), 8);
  x3 = WRAPLOW(dct_const_round_shift(s3), 8);
  x6 = WRAPLOW(dct_const_round_shift(s6), 8);
  x7 = WRAPLOW(dct_const_round_shift(s7), 8);
  x10 = WRAPLOW(dct_const_round_shift(s10), 8);
  x11 = WRAPLOW(dct_const_round_shift(s11), 8);
  x14 = WRAPLOW(dct_const_round_shift(s14), 8);
  x15 = WRAPLOW(dct_const_round_shift(s15), 8);

  output[0] = WRAPLOW(x0, 8);
  output[1] = WRAPLOW(-x8, 8);
  output[2] = WRAPLOW(x12, 8);
  output[3] = WRAPLOW(-x4, 8);
  output[4] = WRAPLOW(x6, 8);
  output[5] = WRAPLOW(x14, 8);
  output[6] = WRAPLOW(x10, 8);
  output[7] = WRAPLOW(x2, 8);
  output[8] = WRAPLOW(x3, 8);
  output[9] = WRAPLOW(x11, 8);
  output[10] = WRAPLOW(x15, 8);
  output[11] = WRAPLOW(x7, 8);
  output[12] = WRAPLOW(x5, 8);
  output[13] = WRAPLOW(-x13, 8);
  output[14] = WRAPLOW(x9, 8);
  output[15] = WRAPLOW(-x1, 8);
}
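// Editorial note: the pruned variant below relies on the zero initializer
// of its intermediate buffer for everything it skips: only the first four
// rows are transformed, so rows 4..15 of `out` stay zero and the column
// pass still reads a fully defined 16x16 block.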
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t out[16 * 16] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[16], temp_out[16];

  // First transform rows. Since all non-zero dct coefficients are in the
  // upper-left 4x4 area, we only need to calculate the first 4 rows here.
  for (i = 0; i < 4; ++i) {
    idct16_c(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    idct16_c(temp_in, temp_out);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}

void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
  a1 = ROUND_POWER_OF_TWO(out, 6);
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i)
      dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}
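// Editorial note, with a worked example of the DC-only path above: two
// passes of multiply-by-cospi_16_64 plus dct_const_round_shift scale the
// DC coefficient by (11585 / 2^14)^2, i.e. roughly 1/2, and
// ROUND_POWER_OF_TWO(out, 6) applies the final 16x16 downscale. For
// instance, input[0] = 512 yields out = 362 after the first rounding pass,
// out = 256 after the second, and a1 = 4 added to every pixel.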
void idct32_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[32], step2[32];
  tran_high_t temp1, temp2;

  // stage 1
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];

  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
  step1[16] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[31] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[15] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);

  step2[16] = WRAPLOW(step1[16] + step1[17], 8);
  step2[17] = WRAPLOW(step1[16] - step1[17], 8);
  step2[18] = WRAPLOW(-step1[18] + step1[19], 8);
  step2[19] = WRAPLOW(step1[18] + step1[19], 8);
  step2[20] = WRAPLOW(step1[20] + step1[21], 8);
  step2[21] = WRAPLOW(step1[20] - step1[21], 8);
  step2[22] = WRAPLOW(-step1[22] + step1[23], 8);
  step2[23] = WRAPLOW(step1[22] + step1[23], 8);
  step2[24] = WRAPLOW(step1[24] + step1[25], 8);
  step2[25] = WRAPLOW(step1[24] - step1[25], 8);
  step2[26] = WRAPLOW(-step1[26] + step1[27], 8);
  step2[27] = WRAPLOW(step1[26] + step1[27], 8);
  step2[28] = WRAPLOW(step1[28] + step1[29], 8);
  step2[29] = WRAPLOW(step1[28] - step1[29], 8);
  step2[30] = WRAPLOW(-step1[30] + step1[31], 8);
  step2[31] = WRAPLOW(step1[30] + step1[31], 8);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[7] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);

  step1[8] = WRAPLOW(step2[8] + step2[9], 8);
  step1[9] = WRAPLOW(step2[8] - step2[9], 8);
  step1[10] = WRAPLOW(-step2[10] + step2[11], 8);
  step1[11] = WRAPLOW(step2[10] + step2[11], 8);
  step1[12] = WRAPLOW(step2[12] + step2[13], 8);
  step1[13] = WRAPLOW(step2[12] - step2[13], 8);
  step1[14] = WRAPLOW(-step2[14] + step2[15], 8);
  step1[15] = WRAPLOW(step2[14] + step2[15], 8);

  step1[16] = step2[16];
  step1[31] = step2[31];
  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[30] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[19] = step2[19];
  step1[20] = step2[20];
  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[1] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[3] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[4] = WRAPLOW(step1[4] + step1[5], 8);
  step2[5] = WRAPLOW(step1[4] - step1[5], 8);
  step2[6] = WRAPLOW(-step1[6] + step1[7], 8);
  step2[7] = WRAPLOW(step1[6] + step1[7], 8);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[14] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[11] = step1[11];
  step2[12] = step1[12];

  step2[16] = WRAPLOW(step1[16] + step1[19], 8);
  step2[17] = WRAPLOW(step1[17] + step1[18], 8);
  step2[18] = WRAPLOW(step1[17] - step1[18], 8);
  step2[19] = WRAPLOW(step1[16] - step1[19], 8);
  step2[20] = WRAPLOW(-step1[20] + step1[23], 8);
  step2[21] = WRAPLOW(-step1[21] + step1[22], 8);
  step2[22] = WRAPLOW(step1[21] + step1[22], 8);
  step2[23] = WRAPLOW(step1[20] + step1[23], 8);

  step2[24] = WRAPLOW(step1[24] + step1[27], 8);
  step2[25] = WRAPLOW(step1[25] + step1[26], 8);
  step2[26] = WRAPLOW(step1[25] - step1[26], 8);
  step2[27] = WRAPLOW(step1[24] - step1[27], 8);
  step2[28] = WRAPLOW(-step1[28] + step1[31], 8);
  step2[29] = WRAPLOW(-step1[29] + step1[30], 8);
  step2[30] = WRAPLOW(step1[29] + step1[30], 8);
  step2[31] = WRAPLOW(step1[28] + step1[31], 8);

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3], 8);
  step1[1] = WRAPLOW(step2[1] + step2[2], 8);
  step1[2] = WRAPLOW(step2[1] - step2[2], 8);
  step1[3] = WRAPLOW(step2[0] - step2[3], 8);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[6] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11], 8);
  step1[9] = WRAPLOW(step2[9] + step2[10], 8);
  step1[10] = WRAPLOW(step2[9] - step2[10], 8);
  step1[11] = WRAPLOW(step2[8] - step2[11], 8);
  step1[12] = WRAPLOW(-step2[12] + step2[15], 8);
  step1[13] = WRAPLOW(-step2[13] + step2[14], 8);
  step1[14] = WRAPLOW(step2[13] + step2[14], 8);
  step1[15] = WRAPLOW(step2[12] + step2[15], 8);

  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[29] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[28] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7], 8);
  step2[1] = WRAPLOW(step1[1] + step1[6], 8);
  step2[2] = WRAPLOW(step1[2] + step1[5], 8);
  step2[3] = WRAPLOW(step1[3] + step1[4], 8);
  step2[4] = WRAPLOW(step1[3] - step1[4], 8);
  step2[5] = WRAPLOW(step1[2] - step1[5], 8);
  step2[6] = WRAPLOW(step1[1] - step1[6], 8);
  step2[7] = WRAPLOW(step1[0] - step1[7], 8);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[13] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step2[12] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step2[14] = step1[14];
  step2[15] = step1[15];

  step2[16] = WRAPLOW(step1[16] + step1[23], 8);
  step2[17] = WRAPLOW(step1[17] + step1[22], 8);
  step2[18] = WRAPLOW(step1[18] + step1[21], 8);
  step2[19] = WRAPLOW(step1[19] + step1[20], 8);
  step2[20] = WRAPLOW(step1[19] - step1[20], 8);
  step2[21] = WRAPLOW(step1[18] - step1[21], 8);
  step2[22] = WRAPLOW(step1[17] - step1[22], 8);
  step2[23] = WRAPLOW(step1[16] - step1[23], 8);

  step2[24] = WRAPLOW(-step1[24] + step1[31], 8);
  step2[25] = WRAPLOW(-step1[25] + step1[30], 8);
  step2[26] = WRAPLOW(-step1[26] + step1[29], 8);
  step2[27] = WRAPLOW(-step1[27] + step1[28], 8);
  step2[28] = WRAPLOW(step1[27] + step1[28], 8);
  step2[29] = WRAPLOW(step1[26] + step1[29], 8);
  step2[30] = WRAPLOW(step1[25] + step1[30], 8);
  step2[31] = WRAPLOW(step1[24] + step1[31], 8);

  // stage 7
  step1[0] = WRAPLOW(step2[0] + step2[15], 8);
  step1[1] = WRAPLOW(step2[1] + step2[14], 8);
  step1[2] = WRAPLOW(step2[2] + step2[13], 8);
  step1[3] = WRAPLOW(step2[3] + step2[12], 8);
  step1[4] = WRAPLOW(step2[4] + step2[11], 8);
  step1[5] = WRAPLOW(step2[5] + step2[10], 8);
  step1[6] = WRAPLOW(step2[6] + step2[9], 8);
  step1[7] = WRAPLOW(step2[7] + step2[8], 8);
  step1[8] = WRAPLOW(step2[7] - step2[8], 8);
  step1[9] = WRAPLOW(step2[6] - step2[9], 8);
  step1[10] = WRAPLOW(step2[5] - step2[10], 8);
  step1[11] = WRAPLOW(step2[4] - step2[11], 8);
  step1[12] = WRAPLOW(step2[3] - step2[12], 8);
  step1[13] = WRAPLOW(step2[2] - step2[13], 8);
  step1[14] = WRAPLOW(step2[1] - step2[14], 8);
  step1[15] = WRAPLOW(step2[0] - step2[15], 8);

  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
  temp2 = (step2[20] + step2[27]) * cospi_16_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[27] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
  temp2 = (step2[21] + step2[26]) * cospi_16_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[26] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
  temp2 = (step2[22] + step2[25]) * cospi_16_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[25] = WRAPLOW(dct_const_round_shift(temp2), 8);
  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
  temp2 = (step2[23] + step2[24]) * cospi_16_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1), 8);
  step1[24] = WRAPLOW(dct_const_round_shift(temp2), 8);
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage
  output[0] = WRAPLOW(step1[0] + step1[31], 8);
  output[1] = WRAPLOW(step1[1] + step1[30], 8);
  output[2] = WRAPLOW(step1[2] + step1[29], 8);
  output[3] = WRAPLOW(step1[3] + step1[28], 8);
  output[4] = WRAPLOW(step1[4] + step1[27], 8);
  output[5] = WRAPLOW(step1[5] + step1[26], 8);
  output[6] = WRAPLOW(step1[6] + step1[25], 8);
  output[7] = WRAPLOW(step1[7] + step1[24], 8);
  output[8] = WRAPLOW(step1[8] + step1[23], 8);
  output[9] = WRAPLOW(step1[9] + step1[22], 8);
  output[10] = WRAPLOW(step1[10] + step1[21], 8);
  output[11] = WRAPLOW(step1[11] + step1[20], 8);
  output[12] = WRAPLOW(step1[12] + step1[19], 8);
  output[13] = WRAPLOW(step1[13] + step1[18], 8);
  output[14] = WRAPLOW(step1[14] + step1[17], 8);
  output[15] = WRAPLOW(step1[15] + step1[16], 8);
  output[16] = WRAPLOW(step1[15] - step1[16], 8);
  output[17] = WRAPLOW(step1[14] - step1[17], 8);
  output[18] = WRAPLOW(step1[13] - step1[18], 8);
  output[19] = WRAPLOW(step1[12] - step1[19], 8);
  output[20] = WRAPLOW(step1[11] - step1[20], 8);
  output[21] = WRAPLOW(step1[10] - step1[21], 8);
  output[22] = WRAPLOW(step1[9] - step1[22], 8);
  output[23] = WRAPLOW(step1[8] - step1[23], 8);
  output[24] = WRAPLOW(step1[7] - step1[24], 8);
  output[25] = WRAPLOW(step1[6] - step1[25], 8);
  output[26] = WRAPLOW(step1[5] - step1[26], 8);
  output[27] = WRAPLOW(step1[4] - step1[27], 8);
  output[28] = WRAPLOW(step1[3] - step1[28], 8);
  output[29] = WRAPLOW(step1[2] - step1[29], 8);
  output[30] = WRAPLOW(step1[1] - step1[30], 8);
  output[31] = WRAPLOW(step1[0] - step1[31], 8);
}
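// Editorial note: the row loop below ORs coefficient pairs into a shrinking
// zero_coeff[] tree (16 -> 8 -> 4 -> 2 entries) so that an all-zero row is
// detected with a handful of ORs and skipped with a memset instead of a
// full 32-point transform; in a typical quantized block many of the lower
// rows are zero, so this saves most of the work in the row pass.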
void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  tran_low_t out[32 * 32];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];

  // Rows
  for (i = 0; i < 32; ++i) {
    int16_t zero_coeff[16];
    for (j = 0; j < 16; ++j)
      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
    for (j = 0; j < 8; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 4; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 2; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];

    if (zero_coeff[0] | zero_coeff[1])
      idct32_c(input, outptr);
    else
      memset(outptr, 0, sizeof(tran_low_t) * 32);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    idct32_c(temp_in, temp_out);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}

void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t out[32 * 32] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];

  // Rows
  // Only the upper-left 8x8 area has non-zero coefficients.
  for (i = 0; i < 8; ++i) {
    idct32_c(input, outptr);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    idct32_c(temp_in, temp_out);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}

void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i, j;
  tran_high_t a1;

  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), 8);
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64), 8);
  a1 = ROUND_POWER_OF_TWO(out, 6);

  for (j = 0; j < 32; ++j) {
    for (i = 0; i < 32; ++i)
      dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}

#if CONFIG_VP9_HIGHBITDEPTH
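// The functions below are the high bit-depth (10/12-bit) counterparts of
// the 8-bit kernels above: `dest8` actually carries a uint16_t pixel
// buffer (recovered via CONVERT_TO_SHORTPTR), and the bit depth `bd` is
// threaded into the rounding and clamping helpers in place of the
// hard-coded 8. (Editorial summary, not from the original source.)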
void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = WRAPLOW(a1, bd);
    op[1] = WRAPLOW(b1, bd);
    op[2] = WRAPLOW(c1, bd);
    op[3] = WRAPLOW(d1, bd);
    ip += 4;
    op += 4;
  }

  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd);
    dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], b1, bd);
    dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], c1, bd);
    dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], d1, bd);

    ip++;
    dest++;
  }
}

void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
                                int dest_stride, int bd) {
  int i;
  tran_high_t a1, e1;
  tran_low_t tmp[4];
  const tran_low_t *ip = in;
  tran_low_t *op = tmp;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  (void) bd;

  a1 = ip[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  op[0] = WRAPLOW(a1, bd);
  op[1] = op[2] = op[3] = WRAPLOW(e1, bd);

  ip = tmp;
  for (i = 0; i < 4; i++) {
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
    dest[dest_stride * 0] = highbd_clip_pixel_add(
        dest[dest_stride * 0], a1, bd);
    dest[dest_stride * 1] = highbd_clip_pixel_add(
        dest[dest_stride * 1], e1, bd);
    dest[dest_stride * 2] = highbd_clip_pixel_add(
        dest[dest_stride * 2], e1, bd);
    dest[dest_stride * 3] = highbd_clip_pixel_add(
        dest[dest_stride * 3], e1, bd);
    ip++;
    dest++;
  }
}
void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;
  (void) bd;
  // stage 1
  temp1 = (input[0] + input[2]) * cospi_16_64;
  temp2 = (input[0] - input[2]) * cospi_16_64;
  step[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  step[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  // stage 2
  output[0] = WRAPLOW(step[0] + step[3], bd);
  output[1] = WRAPLOW(step[1] + step[2], bd);
  output[2] = WRAPLOW(step[1] - step[2], bd);
  output[3] = WRAPLOW(step[0] - step[3], bd);
}

void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  tran_low_t out[4 * 4];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[4], temp_out[4];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Rows
  for (i = 0; i < 4; ++i) {
    vpx_highbd_idct4_c(input, outptr, bd);
    input += 4;
    outptr += 4;
  }

  // Columns
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j)
      temp_in[j] = out[j * 4 + i];
    vpx_highbd_idct4_c(temp_in, temp_out, bd);
    for (j = 0; j < 4; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
    }
  }
}

void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                int dest_stride, int bd) {
  int i;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(
      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
  a1 = ROUND_POWER_OF_TWO(out, 4);

  for (i = 0; i < 4; i++) {
    dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
    dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
    dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
    dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
    dest += dest_stride;
  }
}
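// Editorial note: unlike the 8-bit idct8_c above, vpx_highbd_idct8_c folds
// stages 2 and 3 of its even half into a call to vpx_highbd_idct4_c (the
// two computations are identical) and only writes out the odd half
// long-hand.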
void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;
  // stage 1
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  // stage 2 & stage 3 - even half
  vpx_highbd_idct4_c(step1, step1, bd);

  // stage 2 - odd half
  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = WRAPLOW(step1[6] + step1[7], bd);

  // stage 3 - odd half
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[7] = step2[7];

  // stage 4
  output[0] = WRAPLOW(step1[0] + step1[7], bd);
  output[1] = WRAPLOW(step1[1] + step1[6], bd);
  output[2] = WRAPLOW(step1[2] + step1[5], bd);
  output[3] = WRAPLOW(step1[3] + step1[4], bd);
  output[4] = WRAPLOW(step1[3] - step1[4], bd);
  output[5] = WRAPLOW(step1[2] - step1[5], bd);
  output[6] = WRAPLOW(step1[1] - step1[6], bd);
  output[7] = WRAPLOW(step1[0] - step1[7], bd);
}

void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  tran_low_t out[8 * 8];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[8], temp_out[8];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // First transform rows.
  for (i = 0; i < 8; ++i) {
    vpx_highbd_idct8_c(input, outptr, bd);
    input += 8;
    outptr += 8;
  }

  // Then transform columns.
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    vpx_highbd_idct8_c(temp_in, temp_out, bd);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
    }
  }
}

void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                int stride, int bd) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(
      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
  a1 = ROUND_POWER_OF_TWO(out, 5);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i)
      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
    dest += stride;
  }
}
void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];
  (void) bd;

  if (!(x0 | x1 | x2 | x3)) {
    memset(output, 0, 4 * sizeof(*output));
    return;
  }

  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = (tran_high_t)(x0 - x2 + x3);

  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(highbd_dct_const_round_shift(s0 + s3, bd), bd);
  output[1] = WRAPLOW(highbd_dct_const_round_shift(s1 + s3, bd), bd);
  output[2] = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
  output[3] = WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3, bd), bd);
}

void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  tran_low_t x0 = input[7];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[5];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[3];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[1];
  tran_low_t x7 = input[6];
  (void) bd;

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    memset(output, 0, 8 * sizeof(*output));
    return;
  }

  // stage 1
  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;

  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s4, bd), bd);
  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s5, bd), bd);
  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s6, bd), bd);
  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s7, bd), bd);
  x4 = WRAPLOW(highbd_dct_const_round_shift(s0 - s4, bd), bd);
  x5 = WRAPLOW(highbd_dct_const_round_shift(s1 - s5, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s2 - s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s3 - s7, bd), bd);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;

  x0 = WRAPLOW(s0 + s2, bd);
  x1 = WRAPLOW(s1 + s3, bd);
  x2 = WRAPLOW(s0 - s2, bd);
  x3 = WRAPLOW(s1 - s3, bd);
  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);

  // stage 3
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);

  output[0] = WRAPLOW(x0, bd);
  output[1] = WRAPLOW(-x4, bd);
  output[2] = WRAPLOW(x6, bd);
  output[3] = WRAPLOW(-x2, bd);
  output[4] = WRAPLOW(x3, bd);
  output[5] = WRAPLOW(-x7, bd);
  output[6] = WRAPLOW(x5, bd);
  output[7] = WRAPLOW(-x1, bd);
}
void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int bd) {
  tran_low_t out[8 * 8] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[8], temp_out[8];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // First transform rows.
  // Only the first 4 rows have non-zero coefficients.
  for (i = 0; i < 4; ++i) {
    vpx_highbd_idct8_c(input, outptr, bd);
    input += 8;
    outptr += 8;
  }
  // Then transform columns.
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j * 8 + i];
    vpx_highbd_idct8_c(temp_in, temp_out, bd);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
    }
  }
}
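
/* 16-point 1-D inverse DCT: a seven-stage butterfly network. The inputs are
 * gathered in even/odd decomposition order (the input[k/2] indexing halves
 * the coefficient positions of the 32-point table below), and every rotation
 * by cospi_k_64, i.e. round(16384 * cos(k * pi / 64)), is renormalized with
 * highbd_dct_const_round_shift(). */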
void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;
  (void) bd;

  // stage 1
  step1[0] = input[0/2];
  step1[1] = input[16/2];
  step1[2] = input[8/2];
  step1[3] = input[24/2];
  step1[4] = input[4/2];
  step1[5] = input[20/2];
  step1[6] = input[12/2];
  step1[7] = input[28/2];
  step1[8] = input[2/2];
  step1[9] = input[18/2];
  step1[10] = input[10/2];
  step1[11] = input[26/2];
  step1[12] = input[6/2];
  step1[13] = input[22/2];
  step1[14] = input[14/2];
  step1[15] = input[30/2];

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
  step1[15] = WRAPLOW(step2[14] + step2[15], bd);

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = WRAPLOW(step1[6] + step1[7], bd);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
  step1[15] = WRAPLOW(step2[12] + step2[15], bd);

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  output[0] = WRAPLOW(step2[0] + step2[15], bd);
  output[1] = WRAPLOW(step2[1] + step2[14], bd);
  output[2] = WRAPLOW(step2[2] + step2[13], bd);
  output[3] = WRAPLOW(step2[3] + step2[12], bd);
  output[4] = WRAPLOW(step2[4] + step2[11], bd);
  output[5] = WRAPLOW(step2[5] + step2[10], bd);
  output[6] = WRAPLOW(step2[6] + step2[9], bd);
  output[7] = WRAPLOW(step2[7] + step2[8], bd);
  output[8] = WRAPLOW(step2[7] - step2[8], bd);
  output[9] = WRAPLOW(step2[6] - step2[9], bd);
  output[10] = WRAPLOW(step2[5] - step2[10], bd);
  output[11] = WRAPLOW(step2[4] - step2[11], bd);
  output[12] = WRAPLOW(step2[3] - step2[12], bd);
  output[13] = WRAPLOW(step2[2] - step2[13], bd);
  output[14] = WRAPLOW(step2[1] - step2[14], bd);
  output[15] = WRAPLOW(step2[0] - step2[15], bd);
}
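
/* Full 16x16 inverse transform: one 1-D pass over the rows into 'out', then
 * one 1-D pass over each column, with the result added into the prediction
 * in 'dest'. The final ROUND_POWER_OF_TWO(., 6) removes the transform gain;
 * note the shift grows with block size: 4 for 4x4, 5 for 8x8 and 6 for
 * 16x16 and 32x32. */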
void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int bd) {
  tran_low_t out[16 * 16];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[16], temp_out[16];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // First transform rows.
  for (i = 0; i < 16; ++i) {
    vpx_highbd_idct16_c(input, outptr, bd);
    input += 16;
    outptr += 16;
  }

  // Then transform columns.
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    vpx_highbd_idct16_c(temp_in, temp_out, bd);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
    }
  }
}
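
/* 16-point inverse ADST. As in the 4- and 8-point variants, the inputs are
 * consumed in a fixed interleaved order, an all-zero input short-circuits to
 * an all-zero output, and four butterfly stages of cospi_k_64 rotations
 * produce the sign-alternated outputs. */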
void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;

  tran_low_t x0 = input[15];
  tran_low_t x1 = input[0];
  tran_low_t x2 = input[13];
  tran_low_t x3 = input[2];
  tran_low_t x4 = input[11];
  tran_low_t x5 = input[4];
  tran_low_t x6 = input[9];
  tran_low_t x7 = input[6];
  tran_low_t x8 = input[7];
  tran_low_t x9 = input[8];
  tran_low_t x10 = input[5];
  tran_low_t x11 = input[10];
  tran_low_t x12 = input[3];
  tran_low_t x13 = input[12];
  tran_low_t x14 = input[1];
  tran_low_t x15 = input[14];
  (void) bd;

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
        | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
    memset(output, 0, 16 * sizeof(*output));
    return;
  }

  // stage 1
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  x0 = WRAPLOW(highbd_dct_const_round_shift(s0 + s8, bd), bd);
  x1 = WRAPLOW(highbd_dct_const_round_shift(s1 + s9, bd), bd);
  x2 = WRAPLOW(highbd_dct_const_round_shift(s2 + s10, bd), bd);
  x3 = WRAPLOW(highbd_dct_const_round_shift(s3 + s11, bd), bd);
  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s12, bd), bd);
  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s13, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s6 + s14, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s7 + s15, bd), bd);
  x8 = WRAPLOW(highbd_dct_const_round_shift(s0 - s8, bd), bd);
  x9 = WRAPLOW(highbd_dct_const_round_shift(s1 - s9, bd), bd);
  x10 = WRAPLOW(highbd_dct_const_round_shift(s2 - s10, bd), bd);
  x11 = WRAPLOW(highbd_dct_const_round_shift(s3 - s11, bd), bd);
  x12 = WRAPLOW(highbd_dct_const_round_shift(s4 - s12, bd), bd);
  x13 = WRAPLOW(highbd_dct_const_round_shift(s5 - s13, bd), bd);
  x14 = WRAPLOW(highbd_dct_const_round_shift(s6 - s14, bd), bd);
  x15 = WRAPLOW(highbd_dct_const_round_shift(s7 - s15, bd), bd);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = WRAPLOW(s0 + s4, bd);
  x1 = WRAPLOW(s1 + s5, bd);
  x2 = WRAPLOW(s2 + s6, bd);
  x3 = WRAPLOW(s3 + s7, bd);
  x4 = WRAPLOW(s0 - s4, bd);
  x5 = WRAPLOW(s1 - s5, bd);
  x6 = WRAPLOW(s2 - s6, bd);
  x7 = WRAPLOW(s3 - s7, bd);
  x8 = WRAPLOW(highbd_dct_const_round_shift(s8 + s12, bd), bd);
  x9 = WRAPLOW(highbd_dct_const_round_shift(s9 + s13, bd), bd);
  x10 = WRAPLOW(highbd_dct_const_round_shift(s10 + s14, bd), bd);
  x11 = WRAPLOW(highbd_dct_const_round_shift(s11 + s15, bd), bd);
  x12 = WRAPLOW(highbd_dct_const_round_shift(s8 - s12, bd), bd);
  x13 = WRAPLOW(highbd_dct_const_round_shift(s9 - s13, bd), bd);
  x14 = WRAPLOW(highbd_dct_const_round_shift(s10 - s14, bd), bd);
  x15 = WRAPLOW(highbd_dct_const_round_shift(s11 - s15, bd), bd);

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = WRAPLOW(s0 + s2, bd);
  x1 = WRAPLOW(s1 + s3, bd);
  x2 = WRAPLOW(s0 - s2, bd);
  x3 = WRAPLOW(s1 - s3, bd);
  x4 = WRAPLOW(highbd_dct_const_round_shift(s4 + s6, bd), bd);
  x5 = WRAPLOW(highbd_dct_const_round_shift(s5 + s7, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s4 - s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s5 - s7, bd), bd);
  x8 = WRAPLOW(s8 + s10, bd);
  x9 = WRAPLOW(s9 + s11, bd);
  x10 = WRAPLOW(s8 - s10, bd);
  x11 = WRAPLOW(s9 - s11, bd);
  x12 = WRAPLOW(highbd_dct_const_round_shift(s12 + s14, bd), bd);
  x13 = WRAPLOW(highbd_dct_const_round_shift(s13 + s15, bd), bd);
  x14 = WRAPLOW(highbd_dct_const_round_shift(s12 - s14, bd), bd);
  x15 = WRAPLOW(highbd_dct_const_round_shift(s13 - s15, bd), bd);

  // stage 4
  s2 = (-cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (-cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = WRAPLOW(highbd_dct_const_round_shift(s2, bd), bd);
  x3 = WRAPLOW(highbd_dct_const_round_shift(s3, bd), bd);
  x6 = WRAPLOW(highbd_dct_const_round_shift(s6, bd), bd);
  x7 = WRAPLOW(highbd_dct_const_round_shift(s7, bd), bd);
  x10 = WRAPLOW(highbd_dct_const_round_shift(s10, bd), bd);
  x11 = WRAPLOW(highbd_dct_const_round_shift(s11, bd), bd);
  x14 = WRAPLOW(highbd_dct_const_round_shift(s14, bd), bd);
  x15 = WRAPLOW(highbd_dct_const_round_shift(s15, bd), bd);

  output[0] = WRAPLOW(x0, bd);
  output[1] = WRAPLOW(-x8, bd);
  output[2] = WRAPLOW(x12, bd);
  output[3] = WRAPLOW(-x4, bd);
  output[4] = WRAPLOW(x6, bd);
  output[5] = WRAPLOW(x14, bd);
  output[6] = WRAPLOW(x10, bd);
  output[7] = WRAPLOW(x2, bd);
  output[8] = WRAPLOW(x3, bd);
  output[9] = WRAPLOW(x11, bd);
  output[10] = WRAPLOW(x15, bd);
  output[11] = WRAPLOW(x7, bd);
  output[12] = WRAPLOW(x5, bd);
  output[13] = WRAPLOW(-x13, bd);
  output[14] = WRAPLOW(x9, bd);
  output[15] = WRAPLOW(-x1, bd);
}
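
/* Reduced 16x16 inverse DCT for blocks with eob <= 10. Only the first four
 * rows carry non-zero coefficients, but the column pass must still cover all
 * 16 columns: a single non-zero coefficient spreads across its entire row
 * after the first 1-D pass. */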
void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
                                   int stride, int bd) {
  tran_low_t out[16 * 16] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[16], temp_out[16];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // First transform rows. Since all non-zero DCT coefficients are in the
  // upper-left 4x4 area, we only need to calculate the first 4 rows here.
  for (i = 0; i < 4; ++i) {
    vpx_highbd_idct16_c(input, outptr, bd);
    input += 16;
    outptr += 16;
  }

  // Then transform columns.
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j * 16 + i];
    vpx_highbd_idct16_c(temp_in, temp_out, bd);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
    }
  }
}

void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                  int stride, int bd) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(
      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
  a1 = ROUND_POWER_OF_TWO(out, 6);
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i)
      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
    dest += stride;
  }
}
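
/* 32-point 1-D inverse DCT; static because it is only reached through the
 * 32x32 _add wrappers below. Same construction as vpx_highbd_idct16_c with
 * one extra butterfly stage: stage 1 rotates the odd-indexed inputs into
 * step1[16..31], and the following stages fold the halves back together. */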
static void highbd_idct32_c(const tran_low_t *input,
                            tran_low_t *output, int bd) {
  tran_low_t step1[32], step2[32];
  tran_high_t temp1, temp2;
  (void) bd;

  // stage 1
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];

  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
  step1[16] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[31] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[15] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  step2[16] = WRAPLOW(step1[16] + step1[17], bd);
  step2[17] = WRAPLOW(step1[16] - step1[17], bd);
  step2[18] = WRAPLOW(-step1[18] + step1[19], bd);
  step2[19] = WRAPLOW(step1[18] + step1[19], bd);
  step2[20] = WRAPLOW(step1[20] + step1[21], bd);
  step2[21] = WRAPLOW(step1[20] - step1[21], bd);
  step2[22] = WRAPLOW(-step1[22] + step1[23], bd);
  step2[23] = WRAPLOW(step1[22] + step1[23], bd);
  step2[24] = WRAPLOW(step1[24] + step1[25], bd);
  step2[25] = WRAPLOW(step1[24] - step1[25], bd);
  step2[26] = WRAPLOW(-step1[26] + step1[27], bd);
  step2[27] = WRAPLOW(step1[26] + step1[27], bd);
  step2[28] = WRAPLOW(step1[28] + step1[29], bd);
  step2[29] = WRAPLOW(step1[28] - step1[29], bd);
  step2[30] = WRAPLOW(-step1[30] + step1[31], bd);
  step2[31] = WRAPLOW(step1[30] + step1[31], bd);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[7] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);

  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
  step1[15] = WRAPLOW(step2[14] + step2[15], bd);

  step1[16] = step2[16];
  step1[31] = step2[31];
  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
  step1[17] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[30] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[19] = step2[19];
  step1[20] = step2[20];
  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[1] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[3] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
  step2[7] = WRAPLOW(step1[6] + step1[7], bd);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[14] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step2[11] = step1[11];
  step2[12] = step1[12];

  step2[16] = WRAPLOW(step1[16] + step1[19], bd);
  step2[17] = WRAPLOW(step1[17] + step1[18], bd);
  step2[18] = WRAPLOW(step1[17] - step1[18], bd);
  step2[19] = WRAPLOW(step1[16] - step1[19], bd);
  step2[20] = WRAPLOW(-step1[20] + step1[23], bd);
  step2[21] = WRAPLOW(-step1[21] + step1[22], bd);
  step2[22] = WRAPLOW(step1[21] + step1[22], bd);
  step2[23] = WRAPLOW(step1[20] + step1[23], bd);

  step2[24] = WRAPLOW(step1[24] + step1[27], bd);
  step2[25] = WRAPLOW(step1[25] + step1[26], bd);
  step2[26] = WRAPLOW(step1[25] - step1[26], bd);
  step2[27] = WRAPLOW(step1[24] - step1[27], bd);
  step2[28] = WRAPLOW(-step1[28] + step1[31], bd);
  step2[29] = WRAPLOW(-step1[29] + step1[30], bd);
  step2[30] = WRAPLOW(step1[29] + step1[30], bd);
  step2[31] = WRAPLOW(step1[28] + step1[31], bd);

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[6] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
  step1[15] = WRAPLOW(step2[12] + step2[15], bd);

  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
  step1[18] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[29] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
  step1[19] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[28] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[13] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step2[12] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step2[14] = step1[14];
  step2[15] = step1[15];

  step2[16] = WRAPLOW(step1[16] + step1[23], bd);
  step2[17] = WRAPLOW(step1[17] + step1[22], bd);
  step2[18] = WRAPLOW(step1[18] + step1[21], bd);
  step2[19] = WRAPLOW(step1[19] + step1[20], bd);
  step2[20] = WRAPLOW(step1[19] - step1[20], bd);
  step2[21] = WRAPLOW(step1[18] - step1[21], bd);
  step2[22] = WRAPLOW(step1[17] - step1[22], bd);
  step2[23] = WRAPLOW(step1[16] - step1[23], bd);

  step2[24] = WRAPLOW(-step1[24] + step1[31], bd);
  step2[25] = WRAPLOW(-step1[25] + step1[30], bd);
  step2[26] = WRAPLOW(-step1[26] + step1[29], bd);
  step2[27] = WRAPLOW(-step1[27] + step1[28], bd);
  step2[28] = WRAPLOW(step1[27] + step1[28], bd);
  step2[29] = WRAPLOW(step1[26] + step1[29], bd);
  step2[30] = WRAPLOW(step1[25] + step1[30], bd);
  step2[31] = WRAPLOW(step1[24] + step1[31], bd);

  // stage 7
  step1[0] = WRAPLOW(step2[0] + step2[15], bd);
  step1[1] = WRAPLOW(step2[1] + step2[14], bd);
  step1[2] = WRAPLOW(step2[2] + step2[13], bd);
  step1[3] = WRAPLOW(step2[3] + step2[12], bd);
  step1[4] = WRAPLOW(step2[4] + step2[11], bd);
  step1[5] = WRAPLOW(step2[5] + step2[10], bd);
  step1[6] = WRAPLOW(step2[6] + step2[9], bd);
  step1[7] = WRAPLOW(step2[7] + step2[8], bd);
  step1[8] = WRAPLOW(step2[7] - step2[8], bd);
  step1[9] = WRAPLOW(step2[6] - step2[9], bd);
  step1[10] = WRAPLOW(step2[5] - step2[10], bd);
  step1[11] = WRAPLOW(step2[4] - step2[11], bd);
  step1[12] = WRAPLOW(step2[3] - step2[12], bd);
  step1[13] = WRAPLOW(step2[2] - step2[13], bd);
  step1[14] = WRAPLOW(step2[1] - step2[14], bd);
  step1[15] = WRAPLOW(step2[0] - step2[15], bd);

  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
  temp2 = (step2[20] + step2[27]) * cospi_16_64;
  step1[20] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[27] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
  temp2 = (step2[21] + step2[26]) * cospi_16_64;
  step1[21] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[26] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
  temp2 = (step2[22] + step2[25]) * cospi_16_64;
  step1[22] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[25] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
  temp2 = (step2[23] + step2[24]) * cospi_16_64;
  step1[23] = WRAPLOW(highbd_dct_const_round_shift(temp1, bd), bd);
  step1[24] = WRAPLOW(highbd_dct_const_round_shift(temp2, bd), bd);
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage
  output[0] = WRAPLOW(step1[0] + step1[31], bd);
  output[1] = WRAPLOW(step1[1] + step1[30], bd);
  output[2] = WRAPLOW(step1[2] + step1[29], bd);
  output[3] = WRAPLOW(step1[3] + step1[28], bd);
  output[4] = WRAPLOW(step1[4] + step1[27], bd);
  output[5] = WRAPLOW(step1[5] + step1[26], bd);
  output[6] = WRAPLOW(step1[6] + step1[25], bd);
  output[7] = WRAPLOW(step1[7] + step1[24], bd);
  output[8] = WRAPLOW(step1[8] + step1[23], bd);
  output[9] = WRAPLOW(step1[9] + step1[22], bd);
  output[10] = WRAPLOW(step1[10] + step1[21], bd);
  output[11] = WRAPLOW(step1[11] + step1[20], bd);
  output[12] = WRAPLOW(step1[12] + step1[19], bd);
  output[13] = WRAPLOW(step1[13] + step1[18], bd);
  output[14] = WRAPLOW(step1[14] + step1[17], bd);
  output[15] = WRAPLOW(step1[15] + step1[16], bd);
  output[16] = WRAPLOW(step1[15] - step1[16], bd);
  output[17] = WRAPLOW(step1[14] - step1[17], bd);
  output[18] = WRAPLOW(step1[13] - step1[18], bd);
  output[19] = WRAPLOW(step1[12] - step1[19], bd);
  output[20] = WRAPLOW(step1[11] - step1[20], bd);
  output[21] = WRAPLOW(step1[10] - step1[21], bd);
  output[22] = WRAPLOW(step1[9] - step1[22], bd);
  output[23] = WRAPLOW(step1[8] - step1[23], bd);
  output[24] = WRAPLOW(step1[7] - step1[24], bd);
  output[25] = WRAPLOW(step1[6] - step1[25], bd);
  output[26] = WRAPLOW(step1[5] - step1[26], bd);
  output[27] = WRAPLOW(step1[4] - step1[27], bd);
  output[28] = WRAPLOW(step1[3] - step1[28], bd);
  output[29] = WRAPLOW(step1[2] - step1[29], bd);
  output[30] = WRAPLOW(step1[1] - step1[30], bd);
  output[31] = WRAPLOW(step1[0] - step1[31], bd);
}
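
/* Full 32x32 inverse transform. Before each row pass the 32 coefficients are
 * OR-reduced pairwise (32 -> 16 -> 8 -> 4 -> 2), so an all-zero row, common
 * in real bitstreams, is detected with a handful of operations and replaced
 * by a memset instead of a full 1-D transform. */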
void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
                                     int stride, int bd) {
  tran_low_t out[32 * 32];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Rows
  for (i = 0; i < 32; ++i) {
    tran_low_t zero_coeff[16];
    for (j = 0; j < 16; ++j)
      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
    for (j = 0; j < 8; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 4; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 2; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];

    if (zero_coeff[0] | zero_coeff[1])
      highbd_idct32_c(input, outptr, bd);
    else
      memset(outptr, 0, sizeof(tran_low_t) * 32);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    highbd_idct32_c(temp_in, temp_out, bd);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
    }
  }
}

void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
                                   int stride, int bd) {
  tran_low_t out[32 * 32] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Rows
  // Only the upper-left 8x8 block has non-zero coefficients (eob <= 34).
  for (i = 0; i < 8; ++i) {
    highbd_idct32_c(input, outptr, bd);
    input += 32;
    outptr += 32;
  }
  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j)
      temp_in[j] = out[j * 32 + i];
    highbd_idct32_c(temp_in, temp_out, bd);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
    }
  }
}

void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
                                  int stride, int bd) {
  int i, j;
  int a1;
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // DC only: each multiply by cospi_16_64 followed by the rounding shift
  // scales by roughly 1/sqrt(2), so 'out' is roughly input[0] / 2; the final
  // shift by 6 removes the remaining transform gain, giving
  // a1 ~= input[0] / 128.
  tran_low_t out = WRAPLOW(
      highbd_dct_const_round_shift(input[0] * cospi_16_64, bd), bd);
  out = WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64, bd), bd);
  a1 = ROUND_POWER_OF_TWO(out, 6);

  for (j = 0; j < 32; ++j) {
    for (i = 0; i < 32; ++i)
      dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
    dest += stride;
  }
}
#endif  // CONFIG_VP9_HIGHBITDEPTH