1 /* 2 * Copyright (c) 2015 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <math.h> 12 #include <stdlib.h> 13 #include <string.h> 14 15 #include "./vpx_dsp_rtcd.h" 16 #include "vpx_dsp/inv_txfm.h" 17 18 void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 19 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 20 0.5 shifts per pixel. */ 21 int i; 22 tran_low_t output[16]; 23 tran_high_t a1, b1, c1, d1, e1; 24 const tran_low_t *ip = input; 25 tran_low_t *op = output; 26 27 for (i = 0; i < 4; i++) { 28 a1 = ip[0] >> UNIT_QUANT_SHIFT; 29 c1 = ip[1] >> UNIT_QUANT_SHIFT; 30 d1 = ip[2] >> UNIT_QUANT_SHIFT; 31 b1 = ip[3] >> UNIT_QUANT_SHIFT; 32 a1 += c1; 33 d1 -= b1; 34 e1 = (a1 - d1) >> 1; 35 b1 = e1 - b1; 36 c1 = e1 - c1; 37 a1 -= b1; 38 d1 += c1; 39 op[0] = WRAPLOW(a1); 40 op[1] = WRAPLOW(b1); 41 op[2] = WRAPLOW(c1); 42 op[3] = WRAPLOW(d1); 43 ip += 4; 44 op += 4; 45 } 46 47 ip = output; 48 for (i = 0; i < 4; i++) { 49 a1 = ip[4 * 0]; 50 c1 = ip[4 * 1]; 51 d1 = ip[4 * 2]; 52 b1 = ip[4 * 3]; 53 a1 += c1; 54 d1 -= b1; 55 e1 = (a1 - d1) >> 1; 56 b1 = e1 - b1; 57 c1 = e1 - c1; 58 a1 -= b1; 59 d1 += c1; 60 dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1)); 61 dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1)); 62 dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1)); 63 dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1)); 64 65 ip++; 66 dest++; 67 } 68 } 69 70 void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) { 71 int i; 72 tran_high_t a1, e1; 73 tran_low_t tmp[4]; 74 const tran_low_t *ip = in; 75 tran_low_t *op = tmp; 76 77 a1 = ip[0] >> UNIT_QUANT_SHIFT; 78 e1 = a1 >> 1; 79 a1 -= e1; 80 op[0] = WRAPLOW(a1); 81 op[1] = op[2] = op[3] = WRAPLOW(e1); 82 83 ip = tmp; 84 for (i = 0; i < 4; i++) { 85 e1 = ip[0] >> 1; 86 a1 = ip[0] - e1; 87 dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1); 88 dest[stride * 1] = clip_pixel_add(dest[stride * 1], e1); 89 dest[stride * 2] = clip_pixel_add(dest[stride * 2], e1); 90 dest[stride * 3] = clip_pixel_add(dest[stride * 3], e1); 91 ip++; 92 dest++; 93 } 94 } 95 96 void iadst4_c(const tran_low_t *input, tran_low_t *output) { 97 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; 98 tran_low_t x0 = input[0]; 99 tran_low_t x1 = input[1]; 100 tran_low_t x2 = input[2]; 101 tran_low_t x3 = input[3]; 102 103 if (!(x0 | x1 | x2 | x3)) { 104 memset(output, 0, 4 * sizeof(*output)); 105 return; 106 } 107 108 s0 = sinpi_1_9 * x0; 109 s1 = sinpi_2_9 * x0; 110 s2 = sinpi_3_9 * x1; 111 s3 = sinpi_4_9 * x2; 112 s4 = sinpi_1_9 * x2; 113 s5 = sinpi_2_9 * x3; 114 s6 = sinpi_4_9 * x3; 115 s7 = WRAPLOW(x0 - x2 + x3); 116 117 s0 = s0 + s3 + s5; 118 s1 = s1 - s4 - s6; 119 s3 = s2; 120 s2 = sinpi_3_9 * s7; 121 122 // 1-D transform scaling factor is sqrt(2). 123 // The overall dynamic range is 14b (input) + 14b (multiplication scaling) 124 // + 1b (addition) = 29b. 125 // Hence the output bit depth is 15b. 126 output[0] = WRAPLOW(dct_const_round_shift(s0 + s3)); 127 output[1] = WRAPLOW(dct_const_round_shift(s1 + s3)); 128 output[2] = WRAPLOW(dct_const_round_shift(s2)); 129 output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3)); 130 } 131 132 void idct4_c(const tran_low_t *input, tran_low_t *output) { 133 tran_low_t step[4]; 134 tran_high_t temp1, temp2; 135 136 // stage 1 137 temp1 = (input[0] + input[2]) * cospi_16_64; 138 temp2 = (input[0] - input[2]) * cospi_16_64; 139 step[0] = WRAPLOW(dct_const_round_shift(temp1)); 140 step[1] = WRAPLOW(dct_const_round_shift(temp2)); 141 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; 142 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; 143 step[2] = WRAPLOW(dct_const_round_shift(temp1)); 144 step[3] = WRAPLOW(dct_const_round_shift(temp2)); 145 146 // stage 2 147 output[0] = WRAPLOW(step[0] + step[3]); 148 output[1] = WRAPLOW(step[1] + step[2]); 149 output[2] = WRAPLOW(step[1] - step[2]); 150 output[3] = WRAPLOW(step[0] - step[3]); 151 } 152 153 void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 154 int i, j; 155 tran_low_t out[4 * 4]; 156 tran_low_t *outptr = out; 157 tran_low_t temp_in[4], temp_out[4]; 158 159 // Rows 160 for (i = 0; i < 4; ++i) { 161 idct4_c(input, outptr); 162 input += 4; 163 outptr += 4; 164 } 165 166 // Columns 167 for (i = 0; i < 4; ++i) { 168 for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; 169 idct4_c(temp_in, temp_out); 170 for (j = 0; j < 4; ++j) { 171 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], 172 ROUND_POWER_OF_TWO(temp_out[j], 4)); 173 } 174 } 175 } 176 177 void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 178 int i; 179 tran_high_t a1; 180 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); 181 182 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); 183 a1 = ROUND_POWER_OF_TWO(out, 4); 184 185 for (i = 0; i < 4; i++) { 186 dest[0] = clip_pixel_add(dest[0], a1); 187 dest[1] = clip_pixel_add(dest[1], a1); 188 dest[2] = clip_pixel_add(dest[2], a1); 189 dest[3] = clip_pixel_add(dest[3], a1); 190 dest += stride; 191 } 192 } 193 194 void iadst8_c(const tran_low_t *input, tran_low_t *output) { 195 int s0, s1, s2, s3, s4, s5, s6, s7; 196 tran_high_t x0 = input[7]; 197 tran_high_t x1 = input[0]; 198 tran_high_t x2 = input[5]; 199 tran_high_t x3 = input[2]; 200 tran_high_t x4 = input[3]; 201 tran_high_t x5 = input[4]; 202 tran_high_t x6 = input[1]; 203 tran_high_t x7 = input[6]; 204 205 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { 206 memset(output, 0, 8 * sizeof(*output)); 207 return; 208 } 209 210 // stage 1 211 s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1); 212 s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1); 213 s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3); 214 s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3); 215 s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5); 216 s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5); 217 s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7); 218 s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7); 219 220 x0 = WRAPLOW(dct_const_round_shift(s0 + s4)); 221 x1 = WRAPLOW(dct_const_round_shift(s1 + s5)); 222 x2 = WRAPLOW(dct_const_round_shift(s2 + s6)); 223 x3 = WRAPLOW(dct_const_round_shift(s3 + s7)); 224 x4 = WRAPLOW(dct_const_round_shift(s0 - s4)); 225 x5 = WRAPLOW(dct_const_round_shift(s1 - s5)); 226 x6 = WRAPLOW(dct_const_round_shift(s2 - s6)); 227 x7 = WRAPLOW(dct_const_round_shift(s3 - s7)); 228 229 // stage 2 230 s0 = (int)x0; 231 s1 = (int)x1; 232 s2 = (int)x2; 233 s3 = (int)x3; 234 s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5); 235 s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5); 236 s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7); 237 s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7); 238 239 x0 = WRAPLOW(s0 + s2); 240 x1 = WRAPLOW(s1 + s3); 241 x2 = WRAPLOW(s0 - s2); 242 x3 = WRAPLOW(s1 - s3); 243 x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); 244 x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); 245 x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); 246 x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); 247 248 // stage 3 249 s2 = (int)(cospi_16_64 * (x2 + x3)); 250 s3 = (int)(cospi_16_64 * (x2 - x3)); 251 s6 = (int)(cospi_16_64 * (x6 + x7)); 252 s7 = (int)(cospi_16_64 * (x6 - x7)); 253 254 x2 = WRAPLOW(dct_const_round_shift(s2)); 255 x3 = WRAPLOW(dct_const_round_shift(s3)); 256 x6 = WRAPLOW(dct_const_round_shift(s6)); 257 x7 = WRAPLOW(dct_const_round_shift(s7)); 258 259 output[0] = WRAPLOW(x0); 260 output[1] = WRAPLOW(-x4); 261 output[2] = WRAPLOW(x6); 262 output[3] = WRAPLOW(-x2); 263 output[4] = WRAPLOW(x3); 264 output[5] = WRAPLOW(-x7); 265 output[6] = WRAPLOW(x5); 266 output[7] = WRAPLOW(-x1); 267 } 268 269 void idct8_c(const tran_low_t *input, tran_low_t *output) { 270 tran_low_t step1[8], step2[8]; 271 tran_high_t temp1, temp2; 272 273 // stage 1 274 step1[0] = input[0]; 275 step1[2] = input[4]; 276 step1[1] = input[2]; 277 step1[3] = input[6]; 278 temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; 279 temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; 280 step1[4] = WRAPLOW(dct_const_round_shift(temp1)); 281 step1[7] = WRAPLOW(dct_const_round_shift(temp2)); 282 temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; 283 temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; 284 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); 285 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); 286 287 // stage 2 288 temp1 = (step1[0] + step1[2]) * cospi_16_64; 289 temp2 = (step1[0] - step1[2]) * cospi_16_64; 290 step2[0] = WRAPLOW(dct_const_round_shift(temp1)); 291 step2[1] = WRAPLOW(dct_const_round_shift(temp2)); 292 temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64; 293 temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64; 294 step2[2] = WRAPLOW(dct_const_round_shift(temp1)); 295 step2[3] = WRAPLOW(dct_const_round_shift(temp2)); 296 step2[4] = WRAPLOW(step1[4] + step1[5]); 297 step2[5] = WRAPLOW(step1[4] - step1[5]); 298 step2[6] = WRAPLOW(-step1[6] + step1[7]); 299 step2[7] = WRAPLOW(step1[6] + step1[7]); 300 301 // stage 3 302 step1[0] = WRAPLOW(step2[0] + step2[3]); 303 step1[1] = WRAPLOW(step2[1] + step2[2]); 304 step1[2] = WRAPLOW(step2[1] - step2[2]); 305 step1[3] = WRAPLOW(step2[0] - step2[3]); 306 step1[4] = step2[4]; 307 temp1 = (step2[6] - step2[5]) * cospi_16_64; 308 temp2 = (step2[5] + step2[6]) * cospi_16_64; 309 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); 310 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); 311 step1[7] = step2[7]; 312 313 // stage 4 314 output[0] = WRAPLOW(step1[0] + step1[7]); 315 output[1] = WRAPLOW(step1[1] + step1[6]); 316 output[2] = WRAPLOW(step1[2] + step1[5]); 317 output[3] = WRAPLOW(step1[3] + step1[4]); 318 output[4] = WRAPLOW(step1[3] - step1[4]); 319 output[5] = WRAPLOW(step1[2] - step1[5]); 320 output[6] = WRAPLOW(step1[1] - step1[6]); 321 output[7] = WRAPLOW(step1[0] - step1[7]); 322 } 323 324 void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 325 int i, j; 326 tran_low_t out[8 * 8]; 327 tran_low_t *outptr = out; 328 tran_low_t temp_in[8], temp_out[8]; 329 330 // First transform rows 331 for (i = 0; i < 8; ++i) { 332 idct8_c(input, outptr); 333 input += 8; 334 outptr += 8; 335 } 336 337 // Then transform columns 338 for (i = 0; i < 8; ++i) { 339 for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; 340 idct8_c(temp_in, temp_out); 341 for (j = 0; j < 8; ++j) { 342 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], 343 ROUND_POWER_OF_TWO(temp_out[j], 5)); 344 } 345 } 346 } 347 348 void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 349 int i, j; 350 tran_low_t out[8 * 8] = { 0 }; 351 tran_low_t *outptr = out; 352 tran_low_t temp_in[8], temp_out[8]; 353 354 // First transform rows 355 // Only first 4 row has non-zero coefs 356 for (i = 0; i < 4; ++i) { 357 idct8_c(input, outptr); 358 input += 8; 359 outptr += 8; 360 } 361 362 // Then transform columns 363 for (i = 0; i < 8; ++i) { 364 for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; 365 idct8_c(temp_in, temp_out); 366 for (j = 0; j < 8; ++j) { 367 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], 368 ROUND_POWER_OF_TWO(temp_out[j], 5)); 369 } 370 } 371 } 372 373 void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 374 int i, j; 375 tran_high_t a1; 376 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); 377 378 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); 379 a1 = ROUND_POWER_OF_TWO(out, 5); 380 for (j = 0; j < 8; ++j) { 381 for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1); 382 dest += stride; 383 } 384 } 385 386 void iadst16_c(const tran_low_t *input, tran_low_t *output) { 387 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; 388 tran_high_t s9, s10, s11, s12, s13, s14, s15; 389 tran_high_t x0 = input[15]; 390 tran_high_t x1 = input[0]; 391 tran_high_t x2 = input[13]; 392 tran_high_t x3 = input[2]; 393 tran_high_t x4 = input[11]; 394 tran_high_t x5 = input[4]; 395 tran_high_t x6 = input[9]; 396 tran_high_t x7 = input[6]; 397 tran_high_t x8 = input[7]; 398 tran_high_t x9 = input[8]; 399 tran_high_t x10 = input[5]; 400 tran_high_t x11 = input[10]; 401 tran_high_t x12 = input[3]; 402 tran_high_t x13 = input[12]; 403 tran_high_t x14 = input[1]; 404 tran_high_t x15 = input[14]; 405 406 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | 407 x13 | x14 | x15)) { 408 memset(output, 0, 16 * sizeof(*output)); 409 return; 410 } 411 412 // stage 1 413 s0 = x0 * cospi_1_64 + x1 * cospi_31_64; 414 s1 = x0 * cospi_31_64 - x1 * cospi_1_64; 415 s2 = x2 * cospi_5_64 + x3 * cospi_27_64; 416 s3 = x2 * cospi_27_64 - x3 * cospi_5_64; 417 s4 = x4 * cospi_9_64 + x5 * cospi_23_64; 418 s5 = x4 * cospi_23_64 - x5 * cospi_9_64; 419 s6 = x6 * cospi_13_64 + x7 * cospi_19_64; 420 s7 = x6 * cospi_19_64 - x7 * cospi_13_64; 421 s8 = x8 * cospi_17_64 + x9 * cospi_15_64; 422 s9 = x8 * cospi_15_64 - x9 * cospi_17_64; 423 s10 = x10 * cospi_21_64 + x11 * cospi_11_64; 424 s11 = x10 * cospi_11_64 - x11 * cospi_21_64; 425 s12 = x12 * cospi_25_64 + x13 * cospi_7_64; 426 s13 = x12 * cospi_7_64 - x13 * cospi_25_64; 427 s14 = x14 * cospi_29_64 + x15 * cospi_3_64; 428 s15 = x14 * cospi_3_64 - x15 * cospi_29_64; 429 430 x0 = WRAPLOW(dct_const_round_shift(s0 + s8)); 431 x1 = WRAPLOW(dct_const_round_shift(s1 + s9)); 432 x2 = WRAPLOW(dct_const_round_shift(s2 + s10)); 433 x3 = WRAPLOW(dct_const_round_shift(s3 + s11)); 434 x4 = WRAPLOW(dct_const_round_shift(s4 + s12)); 435 x5 = WRAPLOW(dct_const_round_shift(s5 + s13)); 436 x6 = WRAPLOW(dct_const_round_shift(s6 + s14)); 437 x7 = WRAPLOW(dct_const_round_shift(s7 + s15)); 438 x8 = WRAPLOW(dct_const_round_shift(s0 - s8)); 439 x9 = WRAPLOW(dct_const_round_shift(s1 - s9)); 440 x10 = WRAPLOW(dct_const_round_shift(s2 - s10)); 441 x11 = WRAPLOW(dct_const_round_shift(s3 - s11)); 442 x12 = WRAPLOW(dct_const_round_shift(s4 - s12)); 443 x13 = WRAPLOW(dct_const_round_shift(s5 - s13)); 444 x14 = WRAPLOW(dct_const_round_shift(s6 - s14)); 445 x15 = WRAPLOW(dct_const_round_shift(s7 - s15)); 446 447 // stage 2 448 s0 = x0; 449 s1 = x1; 450 s2 = x2; 451 s3 = x3; 452 s4 = x4; 453 s5 = x5; 454 s6 = x6; 455 s7 = x7; 456 s8 = x8 * cospi_4_64 + x9 * cospi_28_64; 457 s9 = x8 * cospi_28_64 - x9 * cospi_4_64; 458 s10 = x10 * cospi_20_64 + x11 * cospi_12_64; 459 s11 = x10 * cospi_12_64 - x11 * cospi_20_64; 460 s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; 461 s13 = x12 * cospi_4_64 + x13 * cospi_28_64; 462 s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; 463 s15 = x14 * cospi_20_64 + x15 * cospi_12_64; 464 465 x0 = WRAPLOW(s0 + s4); 466 x1 = WRAPLOW(s1 + s5); 467 x2 = WRAPLOW(s2 + s6); 468 x3 = WRAPLOW(s3 + s7); 469 x4 = WRAPLOW(s0 - s4); 470 x5 = WRAPLOW(s1 - s5); 471 x6 = WRAPLOW(s2 - s6); 472 x7 = WRAPLOW(s3 - s7); 473 x8 = WRAPLOW(dct_const_round_shift(s8 + s12)); 474 x9 = WRAPLOW(dct_const_round_shift(s9 + s13)); 475 x10 = WRAPLOW(dct_const_round_shift(s10 + s14)); 476 x11 = WRAPLOW(dct_const_round_shift(s11 + s15)); 477 x12 = WRAPLOW(dct_const_round_shift(s8 - s12)); 478 x13 = WRAPLOW(dct_const_round_shift(s9 - s13)); 479 x14 = WRAPLOW(dct_const_round_shift(s10 - s14)); 480 x15 = WRAPLOW(dct_const_round_shift(s11 - s15)); 481 482 // stage 3 483 s0 = x0; 484 s1 = x1; 485 s2 = x2; 486 s3 = x3; 487 s4 = x4 * cospi_8_64 + x5 * cospi_24_64; 488 s5 = x4 * cospi_24_64 - x5 * cospi_8_64; 489 s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; 490 s7 = x6 * cospi_8_64 + x7 * cospi_24_64; 491 s8 = x8; 492 s9 = x9; 493 s10 = x10; 494 s11 = x11; 495 s12 = x12 * cospi_8_64 + x13 * cospi_24_64; 496 s13 = x12 * cospi_24_64 - x13 * cospi_8_64; 497 s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; 498 s15 = x14 * cospi_8_64 + x15 * cospi_24_64; 499 500 x0 = WRAPLOW(s0 + s2); 501 x1 = WRAPLOW(s1 + s3); 502 x2 = WRAPLOW(s0 - s2); 503 x3 = WRAPLOW(s1 - s3); 504 x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); 505 x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); 506 x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); 507 x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); 508 x8 = WRAPLOW(s8 + s10); 509 x9 = WRAPLOW(s9 + s11); 510 x10 = WRAPLOW(s8 - s10); 511 x11 = WRAPLOW(s9 - s11); 512 x12 = WRAPLOW(dct_const_round_shift(s12 + s14)); 513 x13 = WRAPLOW(dct_const_round_shift(s13 + s15)); 514 x14 = WRAPLOW(dct_const_round_shift(s12 - s14)); 515 x15 = WRAPLOW(dct_const_round_shift(s13 - s15)); 516 517 // stage 4 518 s2 = (-cospi_16_64) * (x2 + x3); 519 s3 = cospi_16_64 * (x2 - x3); 520 s6 = cospi_16_64 * (x6 + x7); 521 s7 = cospi_16_64 * (-x6 + x7); 522 s10 = cospi_16_64 * (x10 + x11); 523 s11 = cospi_16_64 * (-x10 + x11); 524 s14 = (-cospi_16_64) * (x14 + x15); 525 s15 = cospi_16_64 * (x14 - x15); 526 527 x2 = WRAPLOW(dct_const_round_shift(s2)); 528 x3 = WRAPLOW(dct_const_round_shift(s3)); 529 x6 = WRAPLOW(dct_const_round_shift(s6)); 530 x7 = WRAPLOW(dct_const_round_shift(s7)); 531 x10 = WRAPLOW(dct_const_round_shift(s10)); 532 x11 = WRAPLOW(dct_const_round_shift(s11)); 533 x14 = WRAPLOW(dct_const_round_shift(s14)); 534 x15 = WRAPLOW(dct_const_round_shift(s15)); 535 536 output[0] = WRAPLOW(x0); 537 output[1] = WRAPLOW(-x8); 538 output[2] = WRAPLOW(x12); 539 output[3] = WRAPLOW(-x4); 540 output[4] = WRAPLOW(x6); 541 output[5] = WRAPLOW(x14); 542 output[6] = WRAPLOW(x10); 543 output[7] = WRAPLOW(x2); 544 output[8] = WRAPLOW(x3); 545 output[9] = WRAPLOW(x11); 546 output[10] = WRAPLOW(x15); 547 output[11] = WRAPLOW(x7); 548 output[12] = WRAPLOW(x5); 549 output[13] = WRAPLOW(-x13); 550 output[14] = WRAPLOW(x9); 551 output[15] = WRAPLOW(-x1); 552 } 553 554 void idct16_c(const tran_low_t *input, tran_low_t *output) { 555 tran_low_t step1[16], step2[16]; 556 tran_high_t temp1, temp2; 557 558 // stage 1 559 step1[0] = input[0 / 2]; 560 step1[1] = input[16 / 2]; 561 step1[2] = input[8 / 2]; 562 step1[3] = input[24 / 2]; 563 step1[4] = input[4 / 2]; 564 step1[5] = input[20 / 2]; 565 step1[6] = input[12 / 2]; 566 step1[7] = input[28 / 2]; 567 step1[8] = input[2 / 2]; 568 step1[9] = input[18 / 2]; 569 step1[10] = input[10 / 2]; 570 step1[11] = input[26 / 2]; 571 step1[12] = input[6 / 2]; 572 step1[13] = input[22 / 2]; 573 step1[14] = input[14 / 2]; 574 step1[15] = input[30 / 2]; 575 576 // stage 2 577 step2[0] = step1[0]; 578 step2[1] = step1[1]; 579 step2[2] = step1[2]; 580 step2[3] = step1[3]; 581 step2[4] = step1[4]; 582 step2[5] = step1[5]; 583 step2[6] = step1[6]; 584 step2[7] = step1[7]; 585 586 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; 587 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; 588 step2[8] = WRAPLOW(dct_const_round_shift(temp1)); 589 step2[15] = WRAPLOW(dct_const_round_shift(temp2)); 590 591 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; 592 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; 593 step2[9] = WRAPLOW(dct_const_round_shift(temp1)); 594 step2[14] = WRAPLOW(dct_const_round_shift(temp2)); 595 596 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; 597 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; 598 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); 599 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); 600 601 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; 602 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; 603 step2[11] = WRAPLOW(dct_const_round_shift(temp1)); 604 step2[12] = WRAPLOW(dct_const_round_shift(temp2)); 605 606 // stage 3 607 step1[0] = step2[0]; 608 step1[1] = step2[1]; 609 step1[2] = step2[2]; 610 step1[3] = step2[3]; 611 612 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; 613 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; 614 step1[4] = WRAPLOW(dct_const_round_shift(temp1)); 615 step1[7] = WRAPLOW(dct_const_round_shift(temp2)); 616 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; 617 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; 618 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); 619 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); 620 621 step1[8] = WRAPLOW(step2[8] + step2[9]); 622 step1[9] = WRAPLOW(step2[8] - step2[9]); 623 step1[10] = WRAPLOW(-step2[10] + step2[11]); 624 step1[11] = WRAPLOW(step2[10] + step2[11]); 625 step1[12] = WRAPLOW(step2[12] + step2[13]); 626 step1[13] = WRAPLOW(step2[12] - step2[13]); 627 step1[14] = WRAPLOW(-step2[14] + step2[15]); 628 step1[15] = WRAPLOW(step2[14] + step2[15]); 629 630 // stage 4 631 temp1 = (step1[0] + step1[1]) * cospi_16_64; 632 temp2 = (step1[0] - step1[1]) * cospi_16_64; 633 step2[0] = WRAPLOW(dct_const_round_shift(temp1)); 634 step2[1] = WRAPLOW(dct_const_round_shift(temp2)); 635 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; 636 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; 637 step2[2] = WRAPLOW(dct_const_round_shift(temp1)); 638 step2[3] = WRAPLOW(dct_const_round_shift(temp2)); 639 step2[4] = WRAPLOW(step1[4] + step1[5]); 640 step2[5] = WRAPLOW(step1[4] - step1[5]); 641 step2[6] = WRAPLOW(-step1[6] + step1[7]); 642 step2[7] = WRAPLOW(step1[6] + step1[7]); 643 644 step2[8] = step1[8]; 645 step2[15] = step1[15]; 646 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; 647 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; 648 step2[9] = WRAPLOW(dct_const_round_shift(temp1)); 649 step2[14] = WRAPLOW(dct_const_round_shift(temp2)); 650 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; 651 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; 652 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); 653 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); 654 step2[11] = step1[11]; 655 step2[12] = step1[12]; 656 657 // stage 5 658 step1[0] = WRAPLOW(step2[0] + step2[3]); 659 step1[1] = WRAPLOW(step2[1] + step2[2]); 660 step1[2] = WRAPLOW(step2[1] - step2[2]); 661 step1[3] = WRAPLOW(step2[0] - step2[3]); 662 step1[4] = step2[4]; 663 temp1 = (step2[6] - step2[5]) * cospi_16_64; 664 temp2 = (step2[5] + step2[6]) * cospi_16_64; 665 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); 666 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); 667 step1[7] = step2[7]; 668 669 step1[8] = WRAPLOW(step2[8] + step2[11]); 670 step1[9] = WRAPLOW(step2[9] + step2[10]); 671 step1[10] = WRAPLOW(step2[9] - step2[10]); 672 step1[11] = WRAPLOW(step2[8] - step2[11]); 673 step1[12] = WRAPLOW(-step2[12] + step2[15]); 674 step1[13] = WRAPLOW(-step2[13] + step2[14]); 675 step1[14] = WRAPLOW(step2[13] + step2[14]); 676 step1[15] = WRAPLOW(step2[12] + step2[15]); 677 678 // stage 6 679 step2[0] = WRAPLOW(step1[0] + step1[7]); 680 step2[1] = WRAPLOW(step1[1] + step1[6]); 681 step2[2] = WRAPLOW(step1[2] + step1[5]); 682 step2[3] = WRAPLOW(step1[3] + step1[4]); 683 step2[4] = WRAPLOW(step1[3] - step1[4]); 684 step2[5] = WRAPLOW(step1[2] - step1[5]); 685 step2[6] = WRAPLOW(step1[1] - step1[6]); 686 step2[7] = WRAPLOW(step1[0] - step1[7]); 687 step2[8] = step1[8]; 688 step2[9] = step1[9]; 689 temp1 = (-step1[10] + step1[13]) * cospi_16_64; 690 temp2 = (step1[10] + step1[13]) * cospi_16_64; 691 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); 692 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); 693 temp1 = (-step1[11] + step1[12]) * cospi_16_64; 694 temp2 = (step1[11] + step1[12]) * cospi_16_64; 695 step2[11] = WRAPLOW(dct_const_round_shift(temp1)); 696 step2[12] = WRAPLOW(dct_const_round_shift(temp2)); 697 step2[14] = step1[14]; 698 step2[15] = step1[15]; 699 700 // stage 7 701 output[0] = WRAPLOW(step2[0] + step2[15]); 702 output[1] = WRAPLOW(step2[1] + step2[14]); 703 output[2] = WRAPLOW(step2[2] + step2[13]); 704 output[3] = WRAPLOW(step2[3] + step2[12]); 705 output[4] = WRAPLOW(step2[4] + step2[11]); 706 output[5] = WRAPLOW(step2[5] + step2[10]); 707 output[6] = WRAPLOW(step2[6] + step2[9]); 708 output[7] = WRAPLOW(step2[7] + step2[8]); 709 output[8] = WRAPLOW(step2[7] - step2[8]); 710 output[9] = WRAPLOW(step2[6] - step2[9]); 711 output[10] = WRAPLOW(step2[5] - step2[10]); 712 output[11] = WRAPLOW(step2[4] - step2[11]); 713 output[12] = WRAPLOW(step2[3] - step2[12]); 714 output[13] = WRAPLOW(step2[2] - step2[13]); 715 output[14] = WRAPLOW(step2[1] - step2[14]); 716 output[15] = WRAPLOW(step2[0] - step2[15]); 717 } 718 719 void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, 720 int stride) { 721 int i, j; 722 tran_low_t out[16 * 16]; 723 tran_low_t *outptr = out; 724 tran_low_t temp_in[16], temp_out[16]; 725 726 // First transform rows 727 for (i = 0; i < 16; ++i) { 728 idct16_c(input, outptr); 729 input += 16; 730 outptr += 16; 731 } 732 733 // Then transform columns 734 for (i = 0; i < 16; ++i) { 735 for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; 736 idct16_c(temp_in, temp_out); 737 for (j = 0; j < 16; ++j) { 738 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], 739 ROUND_POWER_OF_TWO(temp_out[j], 6)); 740 } 741 } 742 } 743 744 void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, 745 int stride) { 746 int i, j; 747 tran_low_t out[16 * 16] = { 0 }; 748 tran_low_t *outptr = out; 749 tran_low_t temp_in[16], temp_out[16]; 750 751 // First transform rows. Since all non-zero dct coefficients are in 752 // upper-left 8x8 area, we only need to calculate first 8 rows here. 753 for (i = 0; i < 8; ++i) { 754 idct16_c(input, outptr); 755 input += 16; 756 outptr += 16; 757 } 758 759 // Then transform columns 760 for (i = 0; i < 16; ++i) { 761 for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; 762 idct16_c(temp_in, temp_out); 763 for (j = 0; j < 16; ++j) { 764 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], 765 ROUND_POWER_OF_TWO(temp_out[j], 6)); 766 } 767 } 768 } 769 770 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, 771 int stride) { 772 int i, j; 773 tran_low_t out[16 * 16] = { 0 }; 774 tran_low_t *outptr = out; 775 tran_low_t temp_in[16], temp_out[16]; 776 777 // First transform rows. Since all non-zero dct coefficients are in 778 // upper-left 4x4 area, we only need to calculate first 4 rows here. 779 for (i = 0; i < 4; ++i) { 780 idct16_c(input, outptr); 781 input += 16; 782 outptr += 16; 783 } 784 785 // Then transform columns 786 for (i = 0; i < 16; ++i) { 787 for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; 788 idct16_c(temp_in, temp_out); 789 for (j = 0; j < 16; ++j) { 790 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], 791 ROUND_POWER_OF_TWO(temp_out[j], 6)); 792 } 793 } 794 } 795 796 void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 797 int i, j; 798 tran_high_t a1; 799 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); 800 801 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); 802 a1 = ROUND_POWER_OF_TWO(out, 6); 803 for (j = 0; j < 16; ++j) { 804 for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1); 805 dest += stride; 806 } 807 } 808 809 void idct32_c(const tran_low_t *input, tran_low_t *output) { 810 tran_low_t step1[32], step2[32]; 811 tran_high_t temp1, temp2; 812 813 // stage 1 814 step1[0] = input[0]; 815 step1[1] = input[16]; 816 step1[2] = input[8]; 817 step1[3] = input[24]; 818 step1[4] = input[4]; 819 step1[5] = input[20]; 820 step1[6] = input[12]; 821 step1[7] = input[28]; 822 step1[8] = input[2]; 823 step1[9] = input[18]; 824 step1[10] = input[10]; 825 step1[11] = input[26]; 826 step1[12] = input[6]; 827 step1[13] = input[22]; 828 step1[14] = input[14]; 829 step1[15] = input[30]; 830 831 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; 832 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; 833 step1[16] = WRAPLOW(dct_const_round_shift(temp1)); 834 step1[31] = WRAPLOW(dct_const_round_shift(temp2)); 835 836 temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; 837 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; 838 step1[17] = WRAPLOW(dct_const_round_shift(temp1)); 839 step1[30] = WRAPLOW(dct_const_round_shift(temp2)); 840 841 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; 842 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; 843 step1[18] = WRAPLOW(dct_const_round_shift(temp1)); 844 step1[29] = WRAPLOW(dct_const_round_shift(temp2)); 845 846 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; 847 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; 848 step1[19] = WRAPLOW(dct_const_round_shift(temp1)); 849 step1[28] = WRAPLOW(dct_const_round_shift(temp2)); 850 851 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; 852 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; 853 step1[20] = WRAPLOW(dct_const_round_shift(temp1)); 854 step1[27] = WRAPLOW(dct_const_round_shift(temp2)); 855 856 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; 857 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; 858 step1[21] = WRAPLOW(dct_const_round_shift(temp1)); 859 step1[26] = WRAPLOW(dct_const_round_shift(temp2)); 860 861 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; 862 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; 863 step1[22] = WRAPLOW(dct_const_round_shift(temp1)); 864 step1[25] = WRAPLOW(dct_const_round_shift(temp2)); 865 866 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; 867 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; 868 step1[23] = WRAPLOW(dct_const_round_shift(temp1)); 869 step1[24] = WRAPLOW(dct_const_round_shift(temp2)); 870 871 // stage 2 872 step2[0] = step1[0]; 873 step2[1] = step1[1]; 874 step2[2] = step1[2]; 875 step2[3] = step1[3]; 876 step2[4] = step1[4]; 877 step2[5] = step1[5]; 878 step2[6] = step1[6]; 879 step2[7] = step1[7]; 880 881 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; 882 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; 883 step2[8] = WRAPLOW(dct_const_round_shift(temp1)); 884 step2[15] = WRAPLOW(dct_const_round_shift(temp2)); 885 886 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; 887 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; 888 step2[9] = WRAPLOW(dct_const_round_shift(temp1)); 889 step2[14] = WRAPLOW(dct_const_round_shift(temp2)); 890 891 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; 892 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; 893 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); 894 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); 895 896 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; 897 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; 898 step2[11] = WRAPLOW(dct_const_round_shift(temp1)); 899 step2[12] = WRAPLOW(dct_const_round_shift(temp2)); 900 901 step2[16] = WRAPLOW(step1[16] + step1[17]); 902 step2[17] = WRAPLOW(step1[16] - step1[17]); 903 step2[18] = WRAPLOW(-step1[18] + step1[19]); 904 step2[19] = WRAPLOW(step1[18] + step1[19]); 905 step2[20] = WRAPLOW(step1[20] + step1[21]); 906 step2[21] = WRAPLOW(step1[20] - step1[21]); 907 step2[22] = WRAPLOW(-step1[22] + step1[23]); 908 step2[23] = WRAPLOW(step1[22] + step1[23]); 909 step2[24] = WRAPLOW(step1[24] + step1[25]); 910 step2[25] = WRAPLOW(step1[24] - step1[25]); 911 step2[26] = WRAPLOW(-step1[26] + step1[27]); 912 step2[27] = WRAPLOW(step1[26] + step1[27]); 913 step2[28] = WRAPLOW(step1[28] + step1[29]); 914 step2[29] = WRAPLOW(step1[28] - step1[29]); 915 step2[30] = WRAPLOW(-step1[30] + step1[31]); 916 step2[31] = WRAPLOW(step1[30] + step1[31]); 917 918 // stage 3 919 step1[0] = step2[0]; 920 step1[1] = step2[1]; 921 step1[2] = step2[2]; 922 step1[3] = step2[3]; 923 924 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; 925 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; 926 step1[4] = WRAPLOW(dct_const_round_shift(temp1)); 927 step1[7] = WRAPLOW(dct_const_round_shift(temp2)); 928 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; 929 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; 930 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); 931 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); 932 933 step1[8] = WRAPLOW(step2[8] + step2[9]); 934 step1[9] = WRAPLOW(step2[8] - step2[9]); 935 step1[10] = WRAPLOW(-step2[10] + step2[11]); 936 step1[11] = WRAPLOW(step2[10] + step2[11]); 937 step1[12] = WRAPLOW(step2[12] + step2[13]); 938 step1[13] = WRAPLOW(step2[12] - step2[13]); 939 step1[14] = WRAPLOW(-step2[14] + step2[15]); 940 step1[15] = WRAPLOW(step2[14] + step2[15]); 941 942 step1[16] = step2[16]; 943 step1[31] = step2[31]; 944 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; 945 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; 946 step1[17] = WRAPLOW(dct_const_round_shift(temp1)); 947 step1[30] = WRAPLOW(dct_const_round_shift(temp2)); 948 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; 949 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; 950 step1[18] = WRAPLOW(dct_const_round_shift(temp1)); 951 step1[29] = WRAPLOW(dct_const_round_shift(temp2)); 952 step1[19] = step2[19]; 953 step1[20] = step2[20]; 954 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; 955 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; 956 step1[21] = WRAPLOW(dct_const_round_shift(temp1)); 957 step1[26] = WRAPLOW(dct_const_round_shift(temp2)); 958 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; 959 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; 960 step1[22] = WRAPLOW(dct_const_round_shift(temp1)); 961 step1[25] = WRAPLOW(dct_const_round_shift(temp2)); 962 step1[23] = step2[23]; 963 step1[24] = step2[24]; 964 step1[27] = step2[27]; 965 step1[28] = step2[28]; 966 967 // stage 4 968 temp1 = (step1[0] + step1[1]) * cospi_16_64; 969 temp2 = (step1[0] - step1[1]) * cospi_16_64; 970 step2[0] = WRAPLOW(dct_const_round_shift(temp1)); 971 step2[1] = WRAPLOW(dct_const_round_shift(temp2)); 972 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; 973 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; 974 step2[2] = WRAPLOW(dct_const_round_shift(temp1)); 975 step2[3] = WRAPLOW(dct_const_round_shift(temp2)); 976 step2[4] = WRAPLOW(step1[4] + step1[5]); 977 step2[5] = WRAPLOW(step1[4] - step1[5]); 978 step2[6] = WRAPLOW(-step1[6] + step1[7]); 979 step2[7] = WRAPLOW(step1[6] + step1[7]); 980 981 step2[8] = step1[8]; 982 step2[15] = step1[15]; 983 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; 984 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; 985 step2[9] = WRAPLOW(dct_const_round_shift(temp1)); 986 step2[14] = WRAPLOW(dct_const_round_shift(temp2)); 987 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; 988 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; 989 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); 990 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); 991 step2[11] = step1[11]; 992 step2[12] = step1[12]; 993 994 step2[16] = WRAPLOW(step1[16] + step1[19]); 995 step2[17] = WRAPLOW(step1[17] + step1[18]); 996 step2[18] = WRAPLOW(step1[17] - step1[18]); 997 step2[19] = WRAPLOW(step1[16] - step1[19]); 998 step2[20] = WRAPLOW(-step1[20] + step1[23]); 999 step2[21] = WRAPLOW(-step1[21] + step1[22]); 1000 step2[22] = WRAPLOW(step1[21] + step1[22]); 1001 step2[23] = WRAPLOW(step1[20] + step1[23]); 1002 1003 step2[24] = WRAPLOW(step1[24] + step1[27]); 1004 step2[25] = WRAPLOW(step1[25] + step1[26]); 1005 step2[26] = WRAPLOW(step1[25] - step1[26]); 1006 step2[27] = WRAPLOW(step1[24] - step1[27]); 1007 step2[28] = WRAPLOW(-step1[28] + step1[31]); 1008 step2[29] = WRAPLOW(-step1[29] + step1[30]); 1009 step2[30] = WRAPLOW(step1[29] + step1[30]); 1010 step2[31] = WRAPLOW(step1[28] + step1[31]); 1011 1012 // stage 5 1013 step1[0] = WRAPLOW(step2[0] + step2[3]); 1014 step1[1] = WRAPLOW(step2[1] + step2[2]); 1015 step1[2] = WRAPLOW(step2[1] - step2[2]); 1016 step1[3] = WRAPLOW(step2[0] - step2[3]); 1017 step1[4] = step2[4]; 1018 temp1 = (step2[6] - step2[5]) * cospi_16_64; 1019 temp2 = (step2[5] + step2[6]) * cospi_16_64; 1020 step1[5] = WRAPLOW(dct_const_round_shift(temp1)); 1021 step1[6] = WRAPLOW(dct_const_round_shift(temp2)); 1022 step1[7] = step2[7]; 1023 1024 step1[8] = WRAPLOW(step2[8] + step2[11]); 1025 step1[9] = WRAPLOW(step2[9] + step2[10]); 1026 step1[10] = WRAPLOW(step2[9] - step2[10]); 1027 step1[11] = WRAPLOW(step2[8] - step2[11]); 1028 step1[12] = WRAPLOW(-step2[12] + step2[15]); 1029 step1[13] = WRAPLOW(-step2[13] + step2[14]); 1030 step1[14] = WRAPLOW(step2[13] + step2[14]); 1031 step1[15] = WRAPLOW(step2[12] + step2[15]); 1032 1033 step1[16] = step2[16]; 1034 step1[17] = step2[17]; 1035 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; 1036 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; 1037 step1[18] = WRAPLOW(dct_const_round_shift(temp1)); 1038 step1[29] = WRAPLOW(dct_const_round_shift(temp2)); 1039 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; 1040 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; 1041 step1[19] = WRAPLOW(dct_const_round_shift(temp1)); 1042 step1[28] = WRAPLOW(dct_const_round_shift(temp2)); 1043 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; 1044 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; 1045 step1[20] = WRAPLOW(dct_const_round_shift(temp1)); 1046 step1[27] = WRAPLOW(dct_const_round_shift(temp2)); 1047 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; 1048 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; 1049 step1[21] = WRAPLOW(dct_const_round_shift(temp1)); 1050 step1[26] = WRAPLOW(dct_const_round_shift(temp2)); 1051 step1[22] = step2[22]; 1052 step1[23] = step2[23]; 1053 step1[24] = step2[24]; 1054 step1[25] = step2[25]; 1055 step1[30] = step2[30]; 1056 step1[31] = step2[31]; 1057 1058 // stage 6 1059 step2[0] = WRAPLOW(step1[0] + step1[7]); 1060 step2[1] = WRAPLOW(step1[1] + step1[6]); 1061 step2[2] = WRAPLOW(step1[2] + step1[5]); 1062 step2[3] = WRAPLOW(step1[3] + step1[4]); 1063 step2[4] = WRAPLOW(step1[3] - step1[4]); 1064 step2[5] = WRAPLOW(step1[2] - step1[5]); 1065 step2[6] = WRAPLOW(step1[1] - step1[6]); 1066 step2[7] = WRAPLOW(step1[0] - step1[7]); 1067 step2[8] = step1[8]; 1068 step2[9] = step1[9]; 1069 temp1 = (-step1[10] + step1[13]) * cospi_16_64; 1070 temp2 = (step1[10] + step1[13]) * cospi_16_64; 1071 step2[10] = WRAPLOW(dct_const_round_shift(temp1)); 1072 step2[13] = WRAPLOW(dct_const_round_shift(temp2)); 1073 temp1 = (-step1[11] + step1[12]) * cospi_16_64; 1074 temp2 = (step1[11] + step1[12]) * cospi_16_64; 1075 step2[11] = WRAPLOW(dct_const_round_shift(temp1)); 1076 step2[12] = WRAPLOW(dct_const_round_shift(temp2)); 1077 step2[14] = step1[14]; 1078 step2[15] = step1[15]; 1079 1080 step2[16] = WRAPLOW(step1[16] + step1[23]); 1081 step2[17] = WRAPLOW(step1[17] + step1[22]); 1082 step2[18] = WRAPLOW(step1[18] + step1[21]); 1083 step2[19] = WRAPLOW(step1[19] + step1[20]); 1084 step2[20] = WRAPLOW(step1[19] - step1[20]); 1085 step2[21] = WRAPLOW(step1[18] - step1[21]); 1086 step2[22] = WRAPLOW(step1[17] - step1[22]); 1087 step2[23] = WRAPLOW(step1[16] - step1[23]); 1088 1089 step2[24] = WRAPLOW(-step1[24] + step1[31]); 1090 step2[25] = WRAPLOW(-step1[25] + step1[30]); 1091 step2[26] = WRAPLOW(-step1[26] + step1[29]); 1092 step2[27] = WRAPLOW(-step1[27] + step1[28]); 1093 step2[28] = WRAPLOW(step1[27] + step1[28]); 1094 step2[29] = WRAPLOW(step1[26] + step1[29]); 1095 step2[30] = WRAPLOW(step1[25] + step1[30]); 1096 step2[31] = WRAPLOW(step1[24] + step1[31]); 1097 1098 // stage 7 1099 step1[0] = WRAPLOW(step2[0] + step2[15]); 1100 step1[1] = WRAPLOW(step2[1] + step2[14]); 1101 step1[2] = WRAPLOW(step2[2] + step2[13]); 1102 step1[3] = WRAPLOW(step2[3] + step2[12]); 1103 step1[4] = WRAPLOW(step2[4] + step2[11]); 1104 step1[5] = WRAPLOW(step2[5] + step2[10]); 1105 step1[6] = WRAPLOW(step2[6] + step2[9]); 1106 step1[7] = WRAPLOW(step2[7] + step2[8]); 1107 step1[8] = WRAPLOW(step2[7] - step2[8]); 1108 step1[9] = WRAPLOW(step2[6] - step2[9]); 1109 step1[10] = WRAPLOW(step2[5] - step2[10]); 1110 step1[11] = WRAPLOW(step2[4] - step2[11]); 1111 step1[12] = WRAPLOW(step2[3] - step2[12]); 1112 step1[13] = WRAPLOW(step2[2] - step2[13]); 1113 step1[14] = WRAPLOW(step2[1] - step2[14]); 1114 step1[15] = WRAPLOW(step2[0] - step2[15]); 1115 1116 step1[16] = step2[16]; 1117 step1[17] = step2[17]; 1118 step1[18] = step2[18]; 1119 step1[19] = step2[19]; 1120 temp1 = (-step2[20] + step2[27]) * cospi_16_64; 1121 temp2 = (step2[20] + step2[27]) * cospi_16_64; 1122 step1[20] = WRAPLOW(dct_const_round_shift(temp1)); 1123 step1[27] = WRAPLOW(dct_const_round_shift(temp2)); 1124 temp1 = (-step2[21] + step2[26]) * cospi_16_64; 1125 temp2 = (step2[21] + step2[26]) * cospi_16_64; 1126 step1[21] = WRAPLOW(dct_const_round_shift(temp1)); 1127 step1[26] = WRAPLOW(dct_const_round_shift(temp2)); 1128 temp1 = (-step2[22] + step2[25]) * cospi_16_64; 1129 temp2 = (step2[22] + step2[25]) * cospi_16_64; 1130 step1[22] = WRAPLOW(dct_const_round_shift(temp1)); 1131 step1[25] = WRAPLOW(dct_const_round_shift(temp2)); 1132 temp1 = (-step2[23] + step2[24]) * cospi_16_64; 1133 temp2 = (step2[23] + step2[24]) * cospi_16_64; 1134 step1[23] = WRAPLOW(dct_const_round_shift(temp1)); 1135 step1[24] = WRAPLOW(dct_const_round_shift(temp2)); 1136 step1[28] = step2[28]; 1137 step1[29] = step2[29]; 1138 step1[30] = step2[30]; 1139 step1[31] = step2[31]; 1140 1141 // final stage 1142 output[0] = WRAPLOW(step1[0] + step1[31]); 1143 output[1] = WRAPLOW(step1[1] + step1[30]); 1144 output[2] = WRAPLOW(step1[2] + step1[29]); 1145 output[3] = WRAPLOW(step1[3] + step1[28]); 1146 output[4] = WRAPLOW(step1[4] + step1[27]); 1147 output[5] = WRAPLOW(step1[5] + step1[26]); 1148 output[6] = WRAPLOW(step1[6] + step1[25]); 1149 output[7] = WRAPLOW(step1[7] + step1[24]); 1150 output[8] = WRAPLOW(step1[8] + step1[23]); 1151 output[9] = WRAPLOW(step1[9] + step1[22]); 1152 output[10] = WRAPLOW(step1[10] + step1[21]); 1153 output[11] = WRAPLOW(step1[11] + step1[20]); 1154 output[12] = WRAPLOW(step1[12] + step1[19]); 1155 output[13] = WRAPLOW(step1[13] + step1[18]); 1156 output[14] = WRAPLOW(step1[14] + step1[17]); 1157 output[15] = WRAPLOW(step1[15] + step1[16]); 1158 output[16] = WRAPLOW(step1[15] - step1[16]); 1159 output[17] = WRAPLOW(step1[14] - step1[17]); 1160 output[18] = WRAPLOW(step1[13] - step1[18]); 1161 output[19] = WRAPLOW(step1[12] - step1[19]); 1162 output[20] = WRAPLOW(step1[11] - step1[20]); 1163 output[21] = WRAPLOW(step1[10] - step1[21]); 1164 output[22] = WRAPLOW(step1[9] - step1[22]); 1165 output[23] = WRAPLOW(step1[8] - step1[23]); 1166 output[24] = WRAPLOW(step1[7] - step1[24]); 1167 output[25] = WRAPLOW(step1[6] - step1[25]); 1168 output[26] = WRAPLOW(step1[5] - step1[26]); 1169 output[27] = WRAPLOW(step1[4] - step1[27]); 1170 output[28] = WRAPLOW(step1[3] - step1[28]); 1171 output[29] = WRAPLOW(step1[2] - step1[29]); 1172 output[30] = WRAPLOW(step1[1] - step1[30]); 1173 output[31] = WRAPLOW(step1[0] - step1[31]); 1174 } 1175 1176 void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, 1177 int stride) { 1178 int i, j; 1179 tran_low_t out[32 * 32]; 1180 tran_low_t *outptr = out; 1181 tran_low_t temp_in[32], temp_out[32]; 1182 1183 // Rows 1184 for (i = 0; i < 32; ++i) { 1185 int16_t zero_coeff = 0; 1186 for (j = 0; j < 32; ++j) zero_coeff |= input[j]; 1187 1188 if (zero_coeff) 1189 idct32_c(input, outptr); 1190 else 1191 memset(outptr, 0, sizeof(tran_low_t) * 32); 1192 input += 32; 1193 outptr += 32; 1194 } 1195 1196 // Columns 1197 for (i = 0; i < 32; ++i) { 1198 for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; 1199 idct32_c(temp_in, temp_out); 1200 for (j = 0; j < 32; ++j) { 1201 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], 1202 ROUND_POWER_OF_TWO(temp_out[j], 6)); 1203 } 1204 } 1205 } 1206 1207 void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, 1208 int stride) { 1209 int i, j; 1210 tran_low_t out[32 * 32] = { 0 }; 1211 tran_low_t *outptr = out; 1212 tran_low_t temp_in[32], temp_out[32]; 1213 1214 // Rows 1215 // Only upper-left 16x16 has non-zero coeff 1216 for (i = 0; i < 16; ++i) { 1217 idct32_c(input, outptr); 1218 input += 32; 1219 outptr += 32; 1220 } 1221 1222 // Columns 1223 for (i = 0; i < 32; ++i) { 1224 for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; 1225 idct32_c(temp_in, temp_out); 1226 for (j = 0; j < 32; ++j) { 1227 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], 1228 ROUND_POWER_OF_TWO(temp_out[j], 6)); 1229 } 1230 } 1231 } 1232 1233 void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, 1234 int stride) { 1235 int i, j; 1236 tran_low_t out[32 * 32] = { 0 }; 1237 tran_low_t *outptr = out; 1238 tran_low_t temp_in[32], temp_out[32]; 1239 1240 // Rows 1241 // Only upper-left 8x8 has non-zero coeff 1242 for (i = 0; i < 8; ++i) { 1243 idct32_c(input, outptr); 1244 input += 32; 1245 outptr += 32; 1246 } 1247 1248 // Columns 1249 for (i = 0; i < 32; ++i) { 1250 for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; 1251 idct32_c(temp_in, temp_out); 1252 for (j = 0; j < 32; ++j) { 1253 dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], 1254 ROUND_POWER_OF_TWO(temp_out[j], 6)); 1255 } 1256 } 1257 } 1258 1259 void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { 1260 int i, j; 1261 tran_high_t a1; 1262 tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); 1263 1264 out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); 1265 a1 = ROUND_POWER_OF_TWO(out, 6); 1266 1267 for (j = 0; j < 32; ++j) { 1268 for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1); 1269 dest += stride; 1270 } 1271 } 1272 1273 #if CONFIG_VP9_HIGHBITDEPTH 1274 1275 // 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse 1276 // transform amplify bits + 1 bit for contingency in rounding and quantizing 1277 #define HIGHBD_VALID_TXFM_MAGNITUDE_RANGE (1 << 25) 1278 1279 static INLINE int detect_invalid_highbd_input(const tran_low_t *input, 1280 int size) { 1281 int i; 1282 for (i = 0; i < size; ++i) 1283 if (abs(input[i]) >= HIGHBD_VALID_TXFM_MAGNITUDE_RANGE) return 1; 1284 return 0; 1285 } 1286 1287 void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, 1288 int stride, int bd) { 1289 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 1290 0.5 shifts per pixel. */ 1291 int i; 1292 tran_low_t output[16]; 1293 tran_high_t a1, b1, c1, d1, e1; 1294 const tran_low_t *ip = input; 1295 tran_low_t *op = output; 1296 1297 for (i = 0; i < 4; i++) { 1298 a1 = ip[0] >> UNIT_QUANT_SHIFT; 1299 c1 = ip[1] >> UNIT_QUANT_SHIFT; 1300 d1 = ip[2] >> UNIT_QUANT_SHIFT; 1301 b1 = ip[3] >> UNIT_QUANT_SHIFT; 1302 a1 += c1; 1303 d1 -= b1; 1304 e1 = (a1 - d1) >> 1; 1305 b1 = e1 - b1; 1306 c1 = e1 - c1; 1307 a1 -= b1; 1308 d1 += c1; 1309 op[0] = HIGHBD_WRAPLOW(a1, bd); 1310 op[1] = HIGHBD_WRAPLOW(b1, bd); 1311 op[2] = HIGHBD_WRAPLOW(c1, bd); 1312 op[3] = HIGHBD_WRAPLOW(d1, bd); 1313 ip += 4; 1314 op += 4; 1315 } 1316 1317 ip = output; 1318 for (i = 0; i < 4; i++) { 1319 a1 = ip[4 * 0]; 1320 c1 = ip[4 * 1]; 1321 d1 = ip[4 * 2]; 1322 b1 = ip[4 * 3]; 1323 a1 += c1; 1324 d1 -= b1; 1325 e1 = (a1 - d1) >> 1; 1326 b1 = e1 - b1; 1327 c1 = e1 - c1; 1328 a1 -= b1; 1329 d1 += c1; 1330 dest[stride * 0] = 1331 highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd); 1332 dest[stride * 1] = 1333 highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd); 1334 dest[stride * 2] = 1335 highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd); 1336 dest[stride * 3] = 1337 highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd); 1338 1339 ip++; 1340 dest++; 1341 } 1342 } 1343 1344 void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint16_t *dest, 1345 int stride, int bd) { 1346 int i; 1347 tran_high_t a1, e1; 1348 tran_low_t tmp[4]; 1349 const tran_low_t *ip = in; 1350 tran_low_t *op = tmp; 1351 (void)bd; 1352 1353 a1 = ip[0] >> UNIT_QUANT_SHIFT; 1354 e1 = a1 >> 1; 1355 a1 -= e1; 1356 op[0] = HIGHBD_WRAPLOW(a1, bd); 1357 op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd); 1358 1359 ip = tmp; 1360 for (i = 0; i < 4; i++) { 1361 e1 = ip[0] >> 1; 1362 a1 = ip[0] - e1; 1363 dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd); 1364 dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], e1, bd); 1365 dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], e1, bd); 1366 dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], e1, bd); 1367 ip++; 1368 dest++; 1369 } 1370 } 1371 1372 void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) { 1373 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; 1374 tran_low_t x0 = input[0]; 1375 tran_low_t x1 = input[1]; 1376 tran_low_t x2 = input[2]; 1377 tran_low_t x3 = input[3]; 1378 (void)bd; 1379 1380 if (detect_invalid_highbd_input(input, 4)) { 1381 #if CONFIG_COEFFICIENT_RANGE_CHECKING 1382 assert(0 && "invalid highbd txfm input"); 1383 #endif // CONFIG_COEFFICIENT_RANGE_CHECKING 1384 memset(output, 0, sizeof(*output) * 4); 1385 return; 1386 } 1387 1388 if (!(x0 | x1 | x2 | x3)) { 1389 memset(output, 0, 4 * sizeof(*output)); 1390 return; 1391 } 1392 1393 s0 = sinpi_1_9 * x0; 1394 s1 = sinpi_2_9 * x0; 1395 s2 = sinpi_3_9 * x1; 1396 s3 = sinpi_4_9 * x2; 1397 s4 = sinpi_1_9 * x2; 1398 s5 = sinpi_2_9 * x3; 1399 s6 = sinpi_4_9 * x3; 1400 s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd); 1401 1402 s0 = s0 + s3 + s5; 1403 s1 = s1 - s4 - s6; 1404 s3 = s2; 1405 s2 = sinpi_3_9 * s7; 1406 1407 // 1-D transform scaling factor is sqrt(2). 1408 // The overall dynamic range is 14b (input) + 14b (multiplication scaling) 1409 // + 1b (addition) = 29b. 1410 // Hence the output bit depth is 15b. 1411 output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd); 1412 output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd); 1413 output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); 1414 output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd); 1415 } 1416 1417 void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { 1418 tran_low_t step[4]; 1419 tran_high_t temp1, temp2; 1420 (void)bd; 1421 1422 if (detect_invalid_highbd_input(input, 4)) { 1423 #if CONFIG_COEFFICIENT_RANGE_CHECKING 1424 assert(0 && "invalid highbd txfm input"); 1425 #endif // CONFIG_COEFFICIENT_RANGE_CHECKING 1426 memset(output, 0, sizeof(*output) * 4); 1427 return; 1428 } 1429 1430 // stage 1 1431 temp1 = (input[0] + input[2]) * cospi_16_64; 1432 temp2 = (input[0] - input[2]) * cospi_16_64; 1433 step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 1434 step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 1435 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; 1436 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; 1437 step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 1438 step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 1439 1440 // stage 2 1441 output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd); 1442 output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd); 1443 output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd); 1444 output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd); 1445 } 1446 1447 void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, 1448 int stride, int bd) { 1449 int i, j; 1450 tran_low_t out[4 * 4]; 1451 tran_low_t *outptr = out; 1452 tran_low_t temp_in[4], temp_out[4]; 1453 1454 // Rows 1455 for (i = 0; i < 4; ++i) { 1456 vpx_highbd_idct4_c(input, outptr, bd); 1457 input += 4; 1458 outptr += 4; 1459 } 1460 1461 // Columns 1462 for (i = 0; i < 4; ++i) { 1463 for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; 1464 vpx_highbd_idct4_c(temp_in, temp_out, bd); 1465 for (j = 0; j < 4; ++j) { 1466 dest[j * stride + i] = highbd_clip_pixel_add( 1467 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); 1468 } 1469 } 1470 } 1471 1472 void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, 1473 int stride, int bd) { 1474 int i; 1475 tran_high_t a1; 1476 tran_low_t out = 1477 HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); 1478 1479 out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); 1480 a1 = ROUND_POWER_OF_TWO(out, 4); 1481 1482 for (i = 0; i < 4; i++) { 1483 dest[0] = highbd_clip_pixel_add(dest[0], a1, bd); 1484 dest[1] = highbd_clip_pixel_add(dest[1], a1, bd); 1485 dest[2] = highbd_clip_pixel_add(dest[2], a1, bd); 1486 dest[3] = highbd_clip_pixel_add(dest[3], a1, bd); 1487 dest += stride; 1488 } 1489 } 1490 1491 void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { 1492 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; 1493 tran_low_t x0 = input[7]; 1494 tran_low_t x1 = input[0]; 1495 tran_low_t x2 = input[5]; 1496 tran_low_t x3 = input[2]; 1497 tran_low_t x4 = input[3]; 1498 tran_low_t x5 = input[4]; 1499 tran_low_t x6 = input[1]; 1500 tran_low_t x7 = input[6]; 1501 (void)bd; 1502 1503 if (detect_invalid_highbd_input(input, 8)) { 1504 #if CONFIG_COEFFICIENT_RANGE_CHECKING 1505 assert(0 && "invalid highbd txfm input"); 1506 #endif // CONFIG_COEFFICIENT_RANGE_CHECKING 1507 memset(output, 0, sizeof(*output) * 8); 1508 return; 1509 } 1510 1511 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { 1512 memset(output, 0, 8 * sizeof(*output)); 1513 return; 1514 } 1515 1516 // stage 1 1517 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; 1518 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; 1519 s2 = cospi_10_64 * x2 + cospi_22_64 * x3; 1520 s3 = cospi_22_64 * x2 - cospi_10_64 * x3; 1521 s4 = cospi_18_64 * x4 + cospi_14_64 * x5; 1522 s5 = cospi_14_64 * x4 - cospi_18_64 * x5; 1523 s6 = cospi_26_64 * x6 + cospi_6_64 * x7; 1524 s7 = cospi_6_64 * x6 - cospi_26_64 * x7; 1525 1526 x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd); 1527 x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd); 1528 x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd); 1529 x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd); 1530 x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd); 1531 x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd); 1532 x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd); 1533 x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd); 1534 1535 // stage 2 1536 s0 = x0; 1537 s1 = x1; 1538 s2 = x2; 1539 s3 = x3; 1540 s4 = cospi_8_64 * x4 + cospi_24_64 * x5; 1541 s5 = cospi_24_64 * x4 - cospi_8_64 * x5; 1542 s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; 1543 s7 = cospi_8_64 * x6 + cospi_24_64 * x7; 1544 1545 x0 = HIGHBD_WRAPLOW(s0 + s2, bd); 1546 x1 = HIGHBD_WRAPLOW(s1 + s3, bd); 1547 x2 = HIGHBD_WRAPLOW(s0 - s2, bd); 1548 x3 = HIGHBD_WRAPLOW(s1 - s3, bd); 1549 x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd); 1550 x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd); 1551 x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd); 1552 x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd); 1553 1554 // stage 3 1555 s2 = cospi_16_64 * (x2 + x3); 1556 s3 = cospi_16_64 * (x2 - x3); 1557 s6 = cospi_16_64 * (x6 + x7); 1558 s7 = cospi_16_64 * (x6 - x7); 1559 1560 x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); 1561 x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd); 1562 x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd); 1563 x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd); 1564 1565 output[0] = HIGHBD_WRAPLOW(x0, bd); 1566 output[1] = HIGHBD_WRAPLOW(-x4, bd); 1567 output[2] = HIGHBD_WRAPLOW(x6, bd); 1568 output[3] = HIGHBD_WRAPLOW(-x2, bd); 1569 output[4] = HIGHBD_WRAPLOW(x3, bd); 1570 output[5] = HIGHBD_WRAPLOW(-x7, bd); 1571 output[6] = HIGHBD_WRAPLOW(x5, bd); 1572 output[7] = HIGHBD_WRAPLOW(-x1, bd); 1573 } 1574 1575 void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { 1576 tran_low_t step1[8], step2[8]; 1577 tran_high_t temp1, temp2; 1578 1579 if (detect_invalid_highbd_input(input, 8)) { 1580 #if CONFIG_COEFFICIENT_RANGE_CHECKING 1581 assert(0 && "invalid highbd txfm input"); 1582 #endif // CONFIG_COEFFICIENT_RANGE_CHECKING 1583 memset(output, 0, sizeof(*output) * 8); 1584 return; 1585 } 1586 1587 // stage 1 1588 step1[0] = input[0]; 1589 step1[2] = input[4]; 1590 step1[1] = input[2]; 1591 step1[3] = input[6]; 1592 temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; 1593 temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; 1594 step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 1595 step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 1596 temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; 1597 temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; 1598 step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 1599 step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 1600 1601 // stage 2 & stage 3 - even half 1602 vpx_highbd_idct4_c(step1, step1, bd); 1603 1604 // stage 2 - odd half 1605 step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); 1606 step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); 1607 step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); 1608 step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); 1609 1610 // stage 3 - odd half 1611 step1[4] = step2[4]; 1612 temp1 = (step2[6] - step2[5]) * cospi_16_64; 1613 temp2 = (step2[5] + step2[6]) * cospi_16_64; 1614 step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 1615 step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 1616 step1[7] = step2[7]; 1617 1618 // stage 4 1619 output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); 1620 output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); 1621 output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); 1622 output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); 1623 output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); 1624 output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); 1625 output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); 1626 output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); 1627 } 1628 1629 void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, 1630 int stride, int bd) { 1631 int i, j; 1632 tran_low_t out[8 * 8]; 1633 tran_low_t *outptr = out; 1634 tran_low_t temp_in[8], temp_out[8]; 1635 1636 // First transform rows 1637 for (i = 0; i < 8; ++i) { 1638 vpx_highbd_idct8_c(input, outptr, bd); 1639 input += 8; 1640 outptr += 8; 1641 } 1642 1643 // Then transform columns 1644 for (i = 0; i < 8; ++i) { 1645 for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; 1646 vpx_highbd_idct8_c(temp_in, temp_out, bd); 1647 for (j = 0; j < 8; ++j) { 1648 dest[j * stride + i] = highbd_clip_pixel_add( 1649 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); 1650 } 1651 } 1652 } 1653 1654 void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, 1655 int stride, int bd) { 1656 int i, j; 1657 tran_low_t out[8 * 8] = { 0 }; 1658 tran_low_t *outptr = out; 1659 tran_low_t temp_in[8], temp_out[8]; 1660 1661 // First transform rows 1662 // Only first 4 row has non-zero coefs 1663 for (i = 0; i < 4; ++i) { 1664 vpx_highbd_idct8_c(input, outptr, bd); 1665 input += 8; 1666 outptr += 8; 1667 } 1668 1669 // Then transform columns 1670 for (i = 0; i < 8; ++i) { 1671 for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; 1672 vpx_highbd_idct8_c(temp_in, temp_out, bd); 1673 for (j = 0; j < 8; ++j) { 1674 dest[j * stride + i] = highbd_clip_pixel_add( 1675 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); 1676 } 1677 } 1678 } 1679 1680 void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, 1681 int stride, int bd) { 1682 int i, j; 1683 tran_high_t a1; 1684 tran_low_t out = 1685 HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); 1686 1687 out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); 1688 a1 = ROUND_POWER_OF_TWO(out, 5); 1689 for (j = 0; j < 8; ++j) { 1690 for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); 1691 dest += stride; 1692 } 1693 } 1694 1695 void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { 1696 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; 1697 tran_high_t s9, s10, s11, s12, s13, s14, s15; 1698 tran_low_t x0 = input[15]; 1699 tran_low_t x1 = input[0]; 1700 tran_low_t x2 = input[13]; 1701 tran_low_t x3 = input[2]; 1702 tran_low_t x4 = input[11]; 1703 tran_low_t x5 = input[4]; 1704 tran_low_t x6 = input[9]; 1705 tran_low_t x7 = input[6]; 1706 tran_low_t x8 = input[7]; 1707 tran_low_t x9 = input[8]; 1708 tran_low_t x10 = input[5]; 1709 tran_low_t x11 = input[10]; 1710 tran_low_t x12 = input[3]; 1711 tran_low_t x13 = input[12]; 1712 tran_low_t x14 = input[1]; 1713 tran_low_t x15 = input[14]; 1714 (void)bd; 1715 1716 if (detect_invalid_highbd_input(input, 16)) { 1717 #if CONFIG_COEFFICIENT_RANGE_CHECKING 1718 assert(0 && "invalid highbd txfm input"); 1719 #endif // CONFIG_COEFFICIENT_RANGE_CHECKING 1720 memset(output, 0, sizeof(*output) * 16); 1721 return; 1722 } 1723 1724 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | 1725 x13 | x14 | x15)) { 1726 memset(output, 0, 16 * sizeof(*output)); 1727 return; 1728 } 1729 1730 // stage 1 1731 s0 = x0 * cospi_1_64 + x1 * cospi_31_64; 1732 s1 = x0 * cospi_31_64 - x1 * cospi_1_64; 1733 s2 = x2 * cospi_5_64 + x3 * cospi_27_64; 1734 s3 = x2 * cospi_27_64 - x3 * cospi_5_64; 1735 s4 = x4 * cospi_9_64 + x5 * cospi_23_64; 1736 s5 = x4 * cospi_23_64 - x5 * cospi_9_64; 1737 s6 = x6 * cospi_13_64 + x7 * cospi_19_64; 1738 s7 = x6 * cospi_19_64 - x7 * cospi_13_64; 1739 s8 = x8 * cospi_17_64 + x9 * cospi_15_64; 1740 s9 = x8 * cospi_15_64 - x9 * cospi_17_64; 1741 s10 = x10 * cospi_21_64 + x11 * cospi_11_64; 1742 s11 = x10 * cospi_11_64 - x11 * cospi_21_64; 1743 s12 = x12 * cospi_25_64 + x13 * cospi_7_64; 1744 s13 = x12 * cospi_7_64 - x13 * cospi_25_64; 1745 s14 = x14 * cospi_29_64 + x15 * cospi_3_64; 1746 s15 = x14 * cospi_3_64 - x15 * cospi_29_64; 1747 1748 x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd); 1749 x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd); 1750 x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd); 1751 x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd); 1752 x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd); 1753 x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd); 1754 x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd); 1755 x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd); 1756 x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd); 1757 x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd); 1758 x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd); 1759 x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd); 1760 x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd); 1761 x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd); 1762 x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd); 1763 x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd); 1764 1765 // stage 2 1766 s0 = x0; 1767 s1 = x1; 1768 s2 = x2; 1769 s3 = x3; 1770 s4 = x4; 1771 s5 = x5; 1772 s6 = x6; 1773 s7 = x7; 1774 s8 = x8 * cospi_4_64 + x9 * cospi_28_64; 1775 s9 = x8 * cospi_28_64 - x9 * cospi_4_64; 1776 s10 = x10 * cospi_20_64 + x11 * cospi_12_64; 1777 s11 = x10 * cospi_12_64 - x11 * cospi_20_64; 1778 s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; 1779 s13 = x12 * cospi_4_64 + x13 * cospi_28_64; 1780 s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; 1781 s15 = x14 * cospi_20_64 + x15 * cospi_12_64; 1782 1783 x0 = HIGHBD_WRAPLOW(s0 + s4, bd); 1784 x1 = HIGHBD_WRAPLOW(s1 + s5, bd); 1785 x2 = HIGHBD_WRAPLOW(s2 + s6, bd); 1786 x3 = HIGHBD_WRAPLOW(s3 + s7, bd); 1787 x4 = HIGHBD_WRAPLOW(s0 - s4, bd); 1788 x5 = HIGHBD_WRAPLOW(s1 - s5, bd); 1789 x6 = HIGHBD_WRAPLOW(s2 - s6, bd); 1790 x7 = HIGHBD_WRAPLOW(s3 - s7, bd); 1791 x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd); 1792 x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd); 1793 x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd); 1794 x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd); 1795 x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd); 1796 x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd); 1797 x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd); 1798 x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd); 1799 1800 // stage 3 1801 s0 = x0; 1802 s1 = x1; 1803 s2 = x2; 1804 s3 = x3; 1805 s4 = x4 * cospi_8_64 + x5 * cospi_24_64; 1806 s5 = x4 * cospi_24_64 - x5 * cospi_8_64; 1807 s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; 1808 s7 = x6 * cospi_8_64 + x7 * cospi_24_64; 1809 s8 = x8; 1810 s9 = x9; 1811 s10 = x10; 1812 s11 = x11; 1813 s12 = x12 * cospi_8_64 + x13 * cospi_24_64; 1814 s13 = x12 * cospi_24_64 - x13 * cospi_8_64; 1815 s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; 1816 s15 = x14 * cospi_8_64 + x15 * cospi_24_64; 1817 1818 x0 = HIGHBD_WRAPLOW(s0 + s2, bd); 1819 x1 = HIGHBD_WRAPLOW(s1 + s3, bd); 1820 x2 = HIGHBD_WRAPLOW(s0 - s2, bd); 1821 x3 = HIGHBD_WRAPLOW(s1 - s3, bd); 1822 x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd); 1823 x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd); 1824 x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd); 1825 x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd); 1826 x8 = HIGHBD_WRAPLOW(s8 + s10, bd); 1827 x9 = HIGHBD_WRAPLOW(s9 + s11, bd); 1828 x10 = HIGHBD_WRAPLOW(s8 - s10, bd); 1829 x11 = HIGHBD_WRAPLOW(s9 - s11, bd); 1830 x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd); 1831 x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd); 1832 x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd); 1833 x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd); 1834 1835 // stage 4 1836 s2 = (-cospi_16_64) * (x2 + x3); 1837 s3 = cospi_16_64 * (x2 - x3); 1838 s6 = cospi_16_64 * (x6 + x7); 1839 s7 = cospi_16_64 * (-x6 + x7); 1840 s10 = cospi_16_64 * (x10 + x11); 1841 s11 = cospi_16_64 * (-x10 + x11); 1842 s14 = (-cospi_16_64) * (x14 + x15); 1843 s15 = cospi_16_64 * (x14 - x15); 1844 1845 x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); 1846 x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd); 1847 x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd); 1848 x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd); 1849 x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd); 1850 x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd); 1851 x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd); 1852 x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd); 1853 1854 output[0] = HIGHBD_WRAPLOW(x0, bd); 1855 output[1] = HIGHBD_WRAPLOW(-x8, bd); 1856 output[2] = HIGHBD_WRAPLOW(x12, bd); 1857 output[3] = HIGHBD_WRAPLOW(-x4, bd); 1858 output[4] = HIGHBD_WRAPLOW(x6, bd); 1859 output[5] = HIGHBD_WRAPLOW(x14, bd); 1860 output[6] = HIGHBD_WRAPLOW(x10, bd); 1861 output[7] = HIGHBD_WRAPLOW(x2, bd); 1862 output[8] = HIGHBD_WRAPLOW(x3, bd); 1863 output[9] = HIGHBD_WRAPLOW(x11, bd); 1864 output[10] = HIGHBD_WRAPLOW(x15, bd); 1865 output[11] = HIGHBD_WRAPLOW(x7, bd); 1866 output[12] = HIGHBD_WRAPLOW(x5, bd); 1867 output[13] = HIGHBD_WRAPLOW(-x13, bd); 1868 output[14] = HIGHBD_WRAPLOW(x9, bd); 1869 output[15] = HIGHBD_WRAPLOW(-x1, bd); 1870 } 1871 1872 void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { 1873 tran_low_t step1[16], step2[16]; 1874 tran_high_t temp1, temp2; 1875 (void)bd; 1876 1877 if (detect_invalid_highbd_input(input, 16)) { 1878 #if CONFIG_COEFFICIENT_RANGE_CHECKING 1879 assert(0 && "invalid highbd txfm input"); 1880 #endif // CONFIG_COEFFICIENT_RANGE_CHECKING 1881 memset(output, 0, sizeof(*output) * 16); 1882 return; 1883 } 1884 1885 // stage 1 1886 step1[0] = input[0 / 2]; 1887 step1[1] = input[16 / 2]; 1888 step1[2] = input[8 / 2]; 1889 step1[3] = input[24 / 2]; 1890 step1[4] = input[4 / 2]; 1891 step1[5] = input[20 / 2]; 1892 step1[6] = input[12 / 2]; 1893 step1[7] = input[28 / 2]; 1894 step1[8] = input[2 / 2]; 1895 step1[9] = input[18 / 2]; 1896 step1[10] = input[10 / 2]; 1897 step1[11] = input[26 / 2]; 1898 step1[12] = input[6 / 2]; 1899 step1[13] = input[22 / 2]; 1900 step1[14] = input[14 / 2]; 1901 step1[15] = input[30 / 2]; 1902 1903 // stage 2 1904 step2[0] = step1[0]; 1905 step2[1] = step1[1]; 1906 step2[2] = step1[2]; 1907 step2[3] = step1[3]; 1908 step2[4] = step1[4]; 1909 step2[5] = step1[5]; 1910 step2[6] = step1[6]; 1911 step2[7] = step1[7]; 1912 1913 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; 1914 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; 1915 step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 1916 step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 1917 1918 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; 1919 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; 1920 step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 1921 step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 1922 1923 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; 1924 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; 1925 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 1926 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 1927 1928 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; 1929 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; 1930 step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 1931 step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 1932 1933 // stage 3 1934 step1[0] = step2[0]; 1935 step1[1] = step2[1]; 1936 step1[2] = step2[2]; 1937 step1[3] = step2[3]; 1938 1939 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; 1940 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; 1941 step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 1942 step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 1943 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; 1944 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; 1945 step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 1946 step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 1947 1948 step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd); 1949 step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd); 1950 step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd); 1951 step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd); 1952 step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd); 1953 step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd); 1954 step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd); 1955 step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd); 1956 1957 // stage 4 1958 temp1 = (step1[0] + step1[1]) * cospi_16_64; 1959 temp2 = (step1[0] - step1[1]) * cospi_16_64; 1960 step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 1961 step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 1962 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; 1963 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; 1964 step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 1965 step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 1966 step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); 1967 step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); 1968 step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); 1969 step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); 1970 1971 step2[8] = step1[8]; 1972 step2[15] = step1[15]; 1973 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; 1974 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; 1975 step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 1976 step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 1977 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; 1978 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; 1979 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 1980 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 1981 step2[11] = step1[11]; 1982 step2[12] = step1[12]; 1983 1984 // stage 5 1985 step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd); 1986 step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd); 1987 step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); 1988 step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); 1989 step1[4] = step2[4]; 1990 temp1 = (step2[6] - step2[5]) * cospi_16_64; 1991 temp2 = (step2[5] + step2[6]) * cospi_16_64; 1992 step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 1993 step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 1994 step1[7] = step2[7]; 1995 1996 step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd); 1997 step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd); 1998 step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd); 1999 step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd); 2000 step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd); 2001 step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd); 2002 step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd); 2003 step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd); 2004 2005 // stage 6 2006 step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); 2007 step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); 2008 step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); 2009 step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); 2010 step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); 2011 step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); 2012 step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); 2013 step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); 2014 step2[8] = step1[8]; 2015 step2[9] = step1[9]; 2016 temp1 = (-step1[10] + step1[13]) * cospi_16_64; 2017 temp2 = (step1[10] + step1[13]) * cospi_16_64; 2018 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2019 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2020 temp1 = (-step1[11] + step1[12]) * cospi_16_64; 2021 temp2 = (step1[11] + step1[12]) * cospi_16_64; 2022 step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2023 step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2024 step2[14] = step1[14]; 2025 step2[15] = step1[15]; 2026 2027 // stage 7 2028 output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd); 2029 output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd); 2030 output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd); 2031 output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd); 2032 output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd); 2033 output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd); 2034 output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd); 2035 output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd); 2036 output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd); 2037 output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd); 2038 output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd); 2039 output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd); 2040 output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd); 2041 output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd); 2042 output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd); 2043 output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd); 2044 } 2045 2046 void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, 2047 int stride, int bd) { 2048 int i, j; 2049 tran_low_t out[16 * 16]; 2050 tran_low_t *outptr = out; 2051 tran_low_t temp_in[16], temp_out[16]; 2052 2053 // First transform rows 2054 for (i = 0; i < 16; ++i) { 2055 vpx_highbd_idct16_c(input, outptr, bd); 2056 input += 16; 2057 outptr += 16; 2058 } 2059 2060 // Then transform columns 2061 for (i = 0; i < 16; ++i) { 2062 for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; 2063 vpx_highbd_idct16_c(temp_in, temp_out, bd); 2064 for (j = 0; j < 16; ++j) { 2065 dest[j * stride + i] = highbd_clip_pixel_add( 2066 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 2067 } 2068 } 2069 } 2070 2071 void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, 2072 int stride, int bd) { 2073 int i, j; 2074 tran_low_t out[16 * 16] = { 0 }; 2075 tran_low_t *outptr = out; 2076 tran_low_t temp_in[16], temp_out[16]; 2077 2078 // First transform rows. Since all non-zero dct coefficients are in 2079 // upper-left 8x8 area, we only need to calculate first 8 rows here. 2080 for (i = 0; i < 8; ++i) { 2081 vpx_highbd_idct16_c(input, outptr, bd); 2082 input += 16; 2083 outptr += 16; 2084 } 2085 2086 // Then transform columns 2087 for (i = 0; i < 16; ++i) { 2088 uint16_t *destT = dest; 2089 for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; 2090 vpx_highbd_idct16_c(temp_in, temp_out, bd); 2091 for (j = 0; j < 16; ++j) { 2092 destT[i] = highbd_clip_pixel_add(destT[i], 2093 ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 2094 destT += stride; 2095 } 2096 } 2097 } 2098 2099 void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, 2100 int stride, int bd) { 2101 int i, j; 2102 tran_low_t out[16 * 16] = { 0 }; 2103 tran_low_t *outptr = out; 2104 tran_low_t temp_in[16], temp_out[16]; 2105 2106 // First transform rows. Since all non-zero dct coefficients are in 2107 // upper-left 4x4 area, we only need to calculate first 4 rows here. 2108 for (i = 0; i < 4; ++i) { 2109 vpx_highbd_idct16_c(input, outptr, bd); 2110 input += 16; 2111 outptr += 16; 2112 } 2113 2114 // Then transform columns 2115 for (i = 0; i < 16; ++i) { 2116 for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; 2117 vpx_highbd_idct16_c(temp_in, temp_out, bd); 2118 for (j = 0; j < 16; ++j) { 2119 dest[j * stride + i] = highbd_clip_pixel_add( 2120 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 2121 } 2122 } 2123 } 2124 2125 void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, 2126 int stride, int bd) { 2127 int i, j; 2128 tran_high_t a1; 2129 tran_low_t out = 2130 HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); 2131 2132 out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); 2133 a1 = ROUND_POWER_OF_TWO(out, 6); 2134 for (j = 0; j < 16; ++j) { 2135 for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); 2136 dest += stride; 2137 } 2138 } 2139 2140 static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, 2141 int bd) { 2142 tran_low_t step1[32], step2[32]; 2143 tran_high_t temp1, temp2; 2144 (void)bd; 2145 2146 if (detect_invalid_highbd_input(input, 32)) { 2147 #if CONFIG_COEFFICIENT_RANGE_CHECKING 2148 assert(0 && "invalid highbd txfm input"); 2149 #endif // CONFIG_COEFFICIENT_RANGE_CHECKING 2150 memset(output, 0, sizeof(*output) * 32); 2151 return; 2152 } 2153 2154 // stage 1 2155 step1[0] = input[0]; 2156 step1[1] = input[16]; 2157 step1[2] = input[8]; 2158 step1[3] = input[24]; 2159 step1[4] = input[4]; 2160 step1[5] = input[20]; 2161 step1[6] = input[12]; 2162 step1[7] = input[28]; 2163 step1[8] = input[2]; 2164 step1[9] = input[18]; 2165 step1[10] = input[10]; 2166 step1[11] = input[26]; 2167 step1[12] = input[6]; 2168 step1[13] = input[22]; 2169 step1[14] = input[14]; 2170 step1[15] = input[30]; 2171 2172 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; 2173 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; 2174 step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2175 step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2176 2177 temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; 2178 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; 2179 step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2180 step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2181 2182 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; 2183 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; 2184 step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2185 step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2186 2187 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; 2188 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; 2189 step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2190 step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2191 2192 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; 2193 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; 2194 step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2195 step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2196 2197 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; 2198 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; 2199 step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2200 step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2201 2202 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; 2203 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; 2204 step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2205 step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2206 2207 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; 2208 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; 2209 step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2210 step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2211 2212 // stage 2 2213 step2[0] = step1[0]; 2214 step2[1] = step1[1]; 2215 step2[2] = step1[2]; 2216 step2[3] = step1[3]; 2217 step2[4] = step1[4]; 2218 step2[5] = step1[5]; 2219 step2[6] = step1[6]; 2220 step2[7] = step1[7]; 2221 2222 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; 2223 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; 2224 step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2225 step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2226 2227 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; 2228 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; 2229 step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2230 step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2231 2232 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; 2233 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; 2234 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2235 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2236 2237 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; 2238 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; 2239 step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2240 step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2241 2242 step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd); 2243 step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd); 2244 step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd); 2245 step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd); 2246 step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd); 2247 step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd); 2248 step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd); 2249 step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd); 2250 step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd); 2251 step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd); 2252 step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd); 2253 step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd); 2254 step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd); 2255 step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd); 2256 step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd); 2257 step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd); 2258 2259 // stage 3 2260 step1[0] = step2[0]; 2261 step1[1] = step2[1]; 2262 step1[2] = step2[2]; 2263 step1[3] = step2[3]; 2264 2265 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; 2266 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; 2267 step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2268 step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2269 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; 2270 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; 2271 step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2272 step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2273 2274 step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd); 2275 step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd); 2276 step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd); 2277 step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd); 2278 step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd); 2279 step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd); 2280 step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd); 2281 step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd); 2282 2283 step1[16] = step2[16]; 2284 step1[31] = step2[31]; 2285 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; 2286 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; 2287 step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2288 step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2289 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; 2290 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; 2291 step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2292 step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2293 step1[19] = step2[19]; 2294 step1[20] = step2[20]; 2295 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; 2296 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; 2297 step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2298 step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2299 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; 2300 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; 2301 step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2302 step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2303 step1[23] = step2[23]; 2304 step1[24] = step2[24]; 2305 step1[27] = step2[27]; 2306 step1[28] = step2[28]; 2307 2308 // stage 4 2309 temp1 = (step1[0] + step1[1]) * cospi_16_64; 2310 temp2 = (step1[0] - step1[1]) * cospi_16_64; 2311 step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2312 step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2313 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; 2314 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; 2315 step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2316 step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2317 step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); 2318 step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); 2319 step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); 2320 step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd); 2321 2322 step2[8] = step1[8]; 2323 step2[15] = step1[15]; 2324 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; 2325 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; 2326 step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2327 step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2328 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; 2329 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; 2330 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2331 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2332 step2[11] = step1[11]; 2333 step2[12] = step1[12]; 2334 2335 step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd); 2336 step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd); 2337 step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd); 2338 step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd); 2339 step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd); 2340 step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd); 2341 step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd); 2342 step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd); 2343 2344 step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd); 2345 step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd); 2346 step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd); 2347 step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd); 2348 step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd); 2349 step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd); 2350 step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd); 2351 step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd); 2352 2353 // stage 5 2354 step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd); 2355 step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd); 2356 step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); 2357 step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); 2358 step1[4] = step2[4]; 2359 temp1 = (step2[6] - step2[5]) * cospi_16_64; 2360 temp2 = (step2[5] + step2[6]) * cospi_16_64; 2361 step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2362 step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2363 step1[7] = step2[7]; 2364 2365 step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd); 2366 step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd); 2367 step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd); 2368 step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd); 2369 step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd); 2370 step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd); 2371 step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd); 2372 step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd); 2373 2374 step1[16] = step2[16]; 2375 step1[17] = step2[17]; 2376 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; 2377 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; 2378 step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2379 step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2380 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; 2381 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; 2382 step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2383 step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2384 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; 2385 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; 2386 step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2387 step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2388 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; 2389 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; 2390 step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2391 step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2392 step1[22] = step2[22]; 2393 step1[23] = step2[23]; 2394 step1[24] = step2[24]; 2395 step1[25] = step2[25]; 2396 step1[30] = step2[30]; 2397 step1[31] = step2[31]; 2398 2399 // stage 6 2400 step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd); 2401 step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd); 2402 step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd); 2403 step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd); 2404 step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd); 2405 step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd); 2406 step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd); 2407 step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); 2408 step2[8] = step1[8]; 2409 step2[9] = step1[9]; 2410 temp1 = (-step1[10] + step1[13]) * cospi_16_64; 2411 temp2 = (step1[10] + step1[13]) * cospi_16_64; 2412 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2413 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2414 temp1 = (-step1[11] + step1[12]) * cospi_16_64; 2415 temp2 = (step1[11] + step1[12]) * cospi_16_64; 2416 step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2417 step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2418 step2[14] = step1[14]; 2419 step2[15] = step1[15]; 2420 2421 step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd); 2422 step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd); 2423 step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd); 2424 step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd); 2425 step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd); 2426 step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd); 2427 step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd); 2428 step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd); 2429 2430 step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd); 2431 step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd); 2432 step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd); 2433 step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd); 2434 step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd); 2435 step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd); 2436 step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd); 2437 step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd); 2438 2439 // stage 7 2440 step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd); 2441 step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd); 2442 step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd); 2443 step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd); 2444 step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd); 2445 step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd); 2446 step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd); 2447 step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd); 2448 step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd); 2449 step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd); 2450 step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd); 2451 step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd); 2452 step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd); 2453 step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd); 2454 step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd); 2455 step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd); 2456 2457 step1[16] = step2[16]; 2458 step1[17] = step2[17]; 2459 step1[18] = step2[18]; 2460 step1[19] = step2[19]; 2461 temp1 = (-step2[20] + step2[27]) * cospi_16_64; 2462 temp2 = (step2[20] + step2[27]) * cospi_16_64; 2463 step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2464 step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2465 temp1 = (-step2[21] + step2[26]) * cospi_16_64; 2466 temp2 = (step2[21] + step2[26]) * cospi_16_64; 2467 step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2468 step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2469 temp1 = (-step2[22] + step2[25]) * cospi_16_64; 2470 temp2 = (step2[22] + step2[25]) * cospi_16_64; 2471 step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2472 step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2473 temp1 = (-step2[23] + step2[24]) * cospi_16_64; 2474 temp2 = (step2[23] + step2[24]) * cospi_16_64; 2475 step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); 2476 step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); 2477 step1[28] = step2[28]; 2478 step1[29] = step2[29]; 2479 step1[30] = step2[30]; 2480 step1[31] = step2[31]; 2481 2482 // final stage 2483 output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd); 2484 output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd); 2485 output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd); 2486 output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd); 2487 output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd); 2488 output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd); 2489 output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd); 2490 output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd); 2491 output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd); 2492 output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd); 2493 output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd); 2494 output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd); 2495 output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd); 2496 output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd); 2497 output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd); 2498 output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd); 2499 output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd); 2500 output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd); 2501 output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd); 2502 output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd); 2503 output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd); 2504 output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd); 2505 output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd); 2506 output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd); 2507 output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd); 2508 output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd); 2509 output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd); 2510 output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd); 2511 output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd); 2512 output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd); 2513 output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd); 2514 output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd); 2515 } 2516 2517 void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, 2518 int stride, int bd) { 2519 int i, j; 2520 tran_low_t out[32 * 32]; 2521 tran_low_t *outptr = out; 2522 tran_low_t temp_in[32], temp_out[32]; 2523 2524 // Rows 2525 for (i = 0; i < 32; ++i) { 2526 tran_low_t zero_coeff = 0; 2527 for (j = 0; j < 32; ++j) zero_coeff |= input[j]; 2528 2529 if (zero_coeff) 2530 highbd_idct32_c(input, outptr, bd); 2531 else 2532 memset(outptr, 0, sizeof(tran_low_t) * 32); 2533 input += 32; 2534 outptr += 32; 2535 } 2536 2537 // Columns 2538 for (i = 0; i < 32; ++i) { 2539 for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; 2540 highbd_idct32_c(temp_in, temp_out, bd); 2541 for (j = 0; j < 32; ++j) { 2542 dest[j * stride + i] = highbd_clip_pixel_add( 2543 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 2544 } 2545 } 2546 } 2547 2548 void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, 2549 int stride, int bd) { 2550 int i, j; 2551 tran_low_t out[32 * 32] = { 0 }; 2552 tran_low_t *outptr = out; 2553 tran_low_t temp_in[32], temp_out[32]; 2554 2555 // Rows 2556 // Only upper-left 16x16 has non-zero coeff 2557 for (i = 0; i < 16; ++i) { 2558 highbd_idct32_c(input, outptr, bd); 2559 input += 32; 2560 outptr += 32; 2561 } 2562 2563 // Columns 2564 for (i = 0; i < 32; ++i) { 2565 uint16_t *destT = dest; 2566 for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; 2567 highbd_idct32_c(temp_in, temp_out, bd); 2568 for (j = 0; j < 32; ++j) { 2569 destT[i] = highbd_clip_pixel_add(destT[i], 2570 ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 2571 destT += stride; 2572 } 2573 } 2574 } 2575 2576 void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, 2577 int stride, int bd) { 2578 int i, j; 2579 tran_low_t out[32 * 32] = { 0 }; 2580 tran_low_t *outptr = out; 2581 tran_low_t temp_in[32], temp_out[32]; 2582 2583 // Rows 2584 // Only upper-left 8x8 has non-zero coeff 2585 for (i = 0; i < 8; ++i) { 2586 highbd_idct32_c(input, outptr, bd); 2587 input += 32; 2588 outptr += 32; 2589 } 2590 2591 // Columns 2592 for (i = 0; i < 32; ++i) { 2593 for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; 2594 highbd_idct32_c(temp_in, temp_out, bd); 2595 for (j = 0; j < 32; ++j) { 2596 dest[j * stride + i] = highbd_clip_pixel_add( 2597 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); 2598 } 2599 } 2600 } 2601 2602 void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, 2603 int stride, int bd) { 2604 int i, j; 2605 int a1; 2606 tran_low_t out = 2607 HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); 2608 2609 out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); 2610 a1 = ROUND_POWER_OF_TWO(out, 6); 2611 2612 for (j = 0; j < 32; ++j) { 2613 for (i = 0; i < 32; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); 2614 dest += stride; 2615 } 2616 } 2617 2618 #endif // CONFIG_VP9_HIGHBITDEPTH 2619