1 /* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <assert.h> 12 #include <math.h> 13 14 #include "./vpx_config.h" 15 #include "./vp9_rtcd.h" 16 17 #include "vp9/common/vp9_blockd.h" 18 #include "vp9/common/vp9_idct.h" 19 #include "vp9/common/vp9_systemdependent.h" 20 21 static INLINE int fdct_round_shift(int input) { 22 int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); 23 assert(INT16_MIN <= rv && rv <= INT16_MAX); 24 return rv; 25 } 26 27 static void fdct4(const int16_t *input, int16_t *output) { 28 int16_t step[4]; 29 int temp1, temp2; 30 31 step[0] = input[0] + input[3]; 32 step[1] = input[1] + input[2]; 33 step[2] = input[1] - input[2]; 34 step[3] = input[0] - input[3]; 35 36 temp1 = (step[0] + step[1]) * cospi_16_64; 37 temp2 = (step[0] - step[1]) * cospi_16_64; 38 output[0] = fdct_round_shift(temp1); 39 output[2] = fdct_round_shift(temp2); 40 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; 41 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; 42 output[1] = fdct_round_shift(temp1); 43 output[3] = fdct_round_shift(temp2); 44 } 45 46 void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride) { 47 int r, c; 48 int16_t sum = 0; 49 for (r = 0; r < 4; ++r) 50 for (c = 0; c < 4; ++c) 51 sum += input[r * stride + c]; 52 53 output[0] = sum << 1; 54 output[1] = 0; 55 } 56 57 void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) { 58 // The 2D transform is done with two passes which are actually pretty 59 // similar. In the first one, we transform the columns and transpose 60 // the results. In the second one, we transform the rows. To achieve that, 61 // as the first pass results are transposed, we transpose the columns (that 62 // is the transposed rows) and transpose the results (so that it goes back 63 // in normal/row positions). 64 int pass; 65 // We need an intermediate buffer between passes. 66 int16_t intermediate[4 * 4]; 67 const int16_t *in = input; 68 int16_t *out = intermediate; 69 // Do the two transform/transpose passes 70 for (pass = 0; pass < 2; ++pass) { 71 /*canbe16*/ int input[4]; 72 /*canbe16*/ int step[4]; 73 /*needs32*/ int temp1, temp2; 74 int i; 75 for (i = 0; i < 4; ++i) { 76 // Load inputs. 77 if (0 == pass) { 78 input[0] = in[0 * stride] * 16; 79 input[1] = in[1 * stride] * 16; 80 input[2] = in[2 * stride] * 16; 81 input[3] = in[3 * stride] * 16; 82 if (i == 0 && input[0]) { 83 input[0] += 1; 84 } 85 } else { 86 input[0] = in[0 * 4]; 87 input[1] = in[1 * 4]; 88 input[2] = in[2 * 4]; 89 input[3] = in[3 * 4]; 90 } 91 // Transform. 92 step[0] = input[0] + input[3]; 93 step[1] = input[1] + input[2]; 94 step[2] = input[1] - input[2]; 95 step[3] = input[0] - input[3]; 96 temp1 = (step[0] + step[1]) * cospi_16_64; 97 temp2 = (step[0] - step[1]) * cospi_16_64; 98 out[0] = fdct_round_shift(temp1); 99 out[2] = fdct_round_shift(temp2); 100 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; 101 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; 102 out[1] = fdct_round_shift(temp1); 103 out[3] = fdct_round_shift(temp2); 104 // Do next column (which is a transposed row in second/horizontal pass) 105 in++; 106 out += 4; 107 } 108 // Setup in/out for next pass. 109 in = intermediate; 110 out = output; 111 } 112 113 { 114 int i, j; 115 for (i = 0; i < 4; ++i) { 116 for (j = 0; j < 4; ++j) 117 output[j + i * 4] = (output[j + i * 4] + 1) >> 2; 118 } 119 } 120 } 121 122 static void fadst4(const int16_t *input, int16_t *output) { 123 int x0, x1, x2, x3; 124 int s0, s1, s2, s3, s4, s5, s6, s7; 125 126 x0 = input[0]; 127 x1 = input[1]; 128 x2 = input[2]; 129 x3 = input[3]; 130 131 if (!(x0 | x1 | x2 | x3)) { 132 output[0] = output[1] = output[2] = output[3] = 0; 133 return; 134 } 135 136 s0 = sinpi_1_9 * x0; 137 s1 = sinpi_4_9 * x0; 138 s2 = sinpi_2_9 * x1; 139 s3 = sinpi_1_9 * x1; 140 s4 = sinpi_3_9 * x2; 141 s5 = sinpi_4_9 * x3; 142 s6 = sinpi_2_9 * x3; 143 s7 = x0 + x1 - x3; 144 145 x0 = s0 + s2 + s5; 146 x1 = sinpi_3_9 * s7; 147 x2 = s1 - s3 + s6; 148 x3 = s4; 149 150 s0 = x0 + x3; 151 s1 = x1; 152 s2 = x2 - x3; 153 s3 = x2 - x0 + x3; 154 155 // 1-D transform scaling factor is sqrt(2). 156 output[0] = fdct_round_shift(s0); 157 output[1] = fdct_round_shift(s1); 158 output[2] = fdct_round_shift(s2); 159 output[3] = fdct_round_shift(s3); 160 } 161 162 static const transform_2d FHT_4[] = { 163 { fdct4, fdct4 }, // DCT_DCT = 0 164 { fadst4, fdct4 }, // ADST_DCT = 1 165 { fdct4, fadst4 }, // DCT_ADST = 2 166 { fadst4, fadst4 } // ADST_ADST = 3 167 }; 168 169 void vp9_fht4x4_c(const int16_t *input, int16_t *output, 170 int stride, int tx_type) { 171 if (tx_type == DCT_DCT) { 172 vp9_fdct4x4_c(input, output, stride); 173 } else { 174 int16_t out[4 * 4]; 175 int16_t *outptr = &out[0]; 176 int i, j; 177 int16_t temp_in[4], temp_out[4]; 178 const transform_2d ht = FHT_4[tx_type]; 179 180 // Columns 181 for (i = 0; i < 4; ++i) { 182 for (j = 0; j < 4; ++j) 183 temp_in[j] = input[j * stride + i] * 16; 184 if (i == 0 && temp_in[0]) 185 temp_in[0] += 1; 186 ht.cols(temp_in, temp_out); 187 for (j = 0; j < 4; ++j) 188 outptr[j * 4 + i] = temp_out[j]; 189 } 190 191 // Rows 192 for (i = 0; i < 4; ++i) { 193 for (j = 0; j < 4; ++j) 194 temp_in[j] = out[j + i * 4]; 195 ht.rows(temp_in, temp_out); 196 for (j = 0; j < 4; ++j) 197 output[j + i * 4] = (temp_out[j] + 1) >> 2; 198 } 199 } 200 } 201 202 static void fdct8(const int16_t *input, int16_t *output) { 203 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; 204 /*needs32*/ int t0, t1, t2, t3; 205 /*canbe16*/ int x0, x1, x2, x3; 206 207 // stage 1 208 s0 = input[0] + input[7]; 209 s1 = input[1] + input[6]; 210 s2 = input[2] + input[5]; 211 s3 = input[3] + input[4]; 212 s4 = input[3] - input[4]; 213 s5 = input[2] - input[5]; 214 s6 = input[1] - input[6]; 215 s7 = input[0] - input[7]; 216 217 // fdct4(step, step); 218 x0 = s0 + s3; 219 x1 = s1 + s2; 220 x2 = s1 - s2; 221 x3 = s0 - s3; 222 t0 = (x0 + x1) * cospi_16_64; 223 t1 = (x0 - x1) * cospi_16_64; 224 t2 = x2 * cospi_24_64 + x3 * cospi_8_64; 225 t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; 226 output[0] = fdct_round_shift(t0); 227 output[2] = fdct_round_shift(t2); 228 output[4] = fdct_round_shift(t1); 229 output[6] = fdct_round_shift(t3); 230 231 // Stage 2 232 t0 = (s6 - s5) * cospi_16_64; 233 t1 = (s6 + s5) * cospi_16_64; 234 t2 = fdct_round_shift(t0); 235 t3 = fdct_round_shift(t1); 236 237 // Stage 3 238 x0 = s4 + t2; 239 x1 = s4 - t2; 240 x2 = s7 - t3; 241 x3 = s7 + t3; 242 243 // Stage 4 244 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; 245 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; 246 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; 247 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; 248 output[1] = fdct_round_shift(t0); 249 output[3] = fdct_round_shift(t2); 250 output[5] = fdct_round_shift(t1); 251 output[7] = fdct_round_shift(t3); 252 } 253 254 void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride) { 255 int r, c; 256 int16_t sum = 0; 257 for (r = 0; r < 8; ++r) 258 for (c = 0; c < 8; ++c) 259 sum += input[r * stride + c]; 260 261 output[0] = sum; 262 output[1] = 0; 263 } 264 265 void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) { 266 int i, j; 267 int16_t intermediate[64]; 268 269 // Transform columns 270 { 271 int16_t *output = intermediate; 272 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; 273 /*needs32*/ int t0, t1, t2, t3; 274 /*canbe16*/ int x0, x1, x2, x3; 275 276 int i; 277 for (i = 0; i < 8; i++) { 278 // stage 1 279 s0 = (input[0 * stride] + input[7 * stride]) * 4; 280 s1 = (input[1 * stride] + input[6 * stride]) * 4; 281 s2 = (input[2 * stride] + input[5 * stride]) * 4; 282 s3 = (input[3 * stride] + input[4 * stride]) * 4; 283 s4 = (input[3 * stride] - input[4 * stride]) * 4; 284 s5 = (input[2 * stride] - input[5 * stride]) * 4; 285 s6 = (input[1 * stride] - input[6 * stride]) * 4; 286 s7 = (input[0 * stride] - input[7 * stride]) * 4; 287 288 // fdct4(step, step); 289 x0 = s0 + s3; 290 x1 = s1 + s2; 291 x2 = s1 - s2; 292 x3 = s0 - s3; 293 t0 = (x0 + x1) * cospi_16_64; 294 t1 = (x0 - x1) * cospi_16_64; 295 t2 = x2 * cospi_24_64 + x3 * cospi_8_64; 296 t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; 297 output[0 * 8] = fdct_round_shift(t0); 298 output[2 * 8] = fdct_round_shift(t2); 299 output[4 * 8] = fdct_round_shift(t1); 300 output[6 * 8] = fdct_round_shift(t3); 301 302 // Stage 2 303 t0 = (s6 - s5) * cospi_16_64; 304 t1 = (s6 + s5) * cospi_16_64; 305 t2 = fdct_round_shift(t0); 306 t3 = fdct_round_shift(t1); 307 308 // Stage 3 309 x0 = s4 + t2; 310 x1 = s4 - t2; 311 x2 = s7 - t3; 312 x3 = s7 + t3; 313 314 // Stage 4 315 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; 316 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; 317 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; 318 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; 319 output[1 * 8] = fdct_round_shift(t0); 320 output[3 * 8] = fdct_round_shift(t2); 321 output[5 * 8] = fdct_round_shift(t1); 322 output[7 * 8] = fdct_round_shift(t3); 323 input++; 324 output++; 325 } 326 } 327 328 // Rows 329 for (i = 0; i < 8; ++i) { 330 fdct8(&intermediate[i * 8], &final_output[i * 8]); 331 for (j = 0; j < 8; ++j) 332 final_output[j + i * 8] /= 2; 333 } 334 } 335 336 void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride) { 337 int r, c; 338 int16_t sum = 0; 339 for (r = 0; r < 16; ++r) 340 for (c = 0; c < 16; ++c) 341 sum += input[r * stride + c]; 342 343 output[0] = sum >> 1; 344 output[1] = 0; 345 } 346 347 void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) { 348 // The 2D transform is done with two passes which are actually pretty 349 // similar. In the first one, we transform the columns and transpose 350 // the results. In the second one, we transform the rows. To achieve that, 351 // as the first pass results are transposed, we transpose the columns (that 352 // is the transposed rows) and transpose the results (so that it goes back 353 // in normal/row positions). 354 int pass; 355 // We need an intermediate buffer between passes. 356 int16_t intermediate[256]; 357 const int16_t *in = input; 358 int16_t *out = intermediate; 359 // Do the two transform/transpose passes 360 for (pass = 0; pass < 2; ++pass) { 361 /*canbe16*/ int step1[8]; 362 /*canbe16*/ int step2[8]; 363 /*canbe16*/ int step3[8]; 364 /*canbe16*/ int input[8]; 365 /*needs32*/ int temp1, temp2; 366 int i; 367 for (i = 0; i < 16; i++) { 368 if (0 == pass) { 369 // Calculate input for the first 8 results. 370 input[0] = (in[0 * stride] + in[15 * stride]) * 4; 371 input[1] = (in[1 * stride] + in[14 * stride]) * 4; 372 input[2] = (in[2 * stride] + in[13 * stride]) * 4; 373 input[3] = (in[3 * stride] + in[12 * stride]) * 4; 374 input[4] = (in[4 * stride] + in[11 * stride]) * 4; 375 input[5] = (in[5 * stride] + in[10 * stride]) * 4; 376 input[6] = (in[6 * stride] + in[ 9 * stride]) * 4; 377 input[7] = (in[7 * stride] + in[ 8 * stride]) * 4; 378 // Calculate input for the next 8 results. 379 step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4; 380 step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4; 381 step1[2] = (in[5 * stride] - in[10 * stride]) * 4; 382 step1[3] = (in[4 * stride] - in[11 * stride]) * 4; 383 step1[4] = (in[3 * stride] - in[12 * stride]) * 4; 384 step1[5] = (in[2 * stride] - in[13 * stride]) * 4; 385 step1[6] = (in[1 * stride] - in[14 * stride]) * 4; 386 step1[7] = (in[0 * stride] - in[15 * stride]) * 4; 387 } else { 388 // Calculate input for the first 8 results. 389 input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2); 390 input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2); 391 input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2); 392 input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2); 393 input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2); 394 input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2); 395 input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2); 396 input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2); 397 // Calculate input for the next 8 results. 398 step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2); 399 step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2); 400 step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2); 401 step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2); 402 step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2); 403 step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2); 404 step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2); 405 step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2); 406 } 407 // Work on the first eight values; fdct8(input, even_results); 408 { 409 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; 410 /*needs32*/ int t0, t1, t2, t3; 411 /*canbe16*/ int x0, x1, x2, x3; 412 413 // stage 1 414 s0 = input[0] + input[7]; 415 s1 = input[1] + input[6]; 416 s2 = input[2] + input[5]; 417 s3 = input[3] + input[4]; 418 s4 = input[3] - input[4]; 419 s5 = input[2] - input[5]; 420 s6 = input[1] - input[6]; 421 s7 = input[0] - input[7]; 422 423 // fdct4(step, step); 424 x0 = s0 + s3; 425 x1 = s1 + s2; 426 x2 = s1 - s2; 427 x3 = s0 - s3; 428 t0 = (x0 + x1) * cospi_16_64; 429 t1 = (x0 - x1) * cospi_16_64; 430 t2 = x3 * cospi_8_64 + x2 * cospi_24_64; 431 t3 = x3 * cospi_24_64 - x2 * cospi_8_64; 432 out[0] = fdct_round_shift(t0); 433 out[4] = fdct_round_shift(t2); 434 out[8] = fdct_round_shift(t1); 435 out[12] = fdct_round_shift(t3); 436 437 // Stage 2 438 t0 = (s6 - s5) * cospi_16_64; 439 t1 = (s6 + s5) * cospi_16_64; 440 t2 = fdct_round_shift(t0); 441 t3 = fdct_round_shift(t1); 442 443 // Stage 3 444 x0 = s4 + t2; 445 x1 = s4 - t2; 446 x2 = s7 - t3; 447 x3 = s7 + t3; 448 449 // Stage 4 450 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; 451 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; 452 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; 453 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; 454 out[2] = fdct_round_shift(t0); 455 out[6] = fdct_round_shift(t2); 456 out[10] = fdct_round_shift(t1); 457 out[14] = fdct_round_shift(t3); 458 } 459 // Work on the next eight values; step1 -> odd_results 460 { 461 // step 2 462 temp1 = (step1[5] - step1[2]) * cospi_16_64; 463 temp2 = (step1[4] - step1[3]) * cospi_16_64; 464 step2[2] = fdct_round_shift(temp1); 465 step2[3] = fdct_round_shift(temp2); 466 temp1 = (step1[4] + step1[3]) * cospi_16_64; 467 temp2 = (step1[5] + step1[2]) * cospi_16_64; 468 step2[4] = fdct_round_shift(temp1); 469 step2[5] = fdct_round_shift(temp2); 470 // step 3 471 step3[0] = step1[0] + step2[3]; 472 step3[1] = step1[1] + step2[2]; 473 step3[2] = step1[1] - step2[2]; 474 step3[3] = step1[0] - step2[3]; 475 step3[4] = step1[7] - step2[4]; 476 step3[5] = step1[6] - step2[5]; 477 step3[6] = step1[6] + step2[5]; 478 step3[7] = step1[7] + step2[4]; 479 // step 4 480 temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; 481 temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64; 482 step2[1] = fdct_round_shift(temp1); 483 step2[2] = fdct_round_shift(temp2); 484 temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64; 485 temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; 486 step2[5] = fdct_round_shift(temp1); 487 step2[6] = fdct_round_shift(temp2); 488 // step 5 489 step1[0] = step3[0] + step2[1]; 490 step1[1] = step3[0] - step2[1]; 491 step1[2] = step3[3] + step2[2]; 492 step1[3] = step3[3] - step2[2]; 493 step1[4] = step3[4] - step2[5]; 494 step1[5] = step3[4] + step2[5]; 495 step1[6] = step3[7] - step2[6]; 496 step1[7] = step3[7] + step2[6]; 497 // step 6 498 temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64; 499 temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64; 500 out[1] = fdct_round_shift(temp1); 501 out[9] = fdct_round_shift(temp2); 502 temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64; 503 temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; 504 out[5] = fdct_round_shift(temp1); 505 out[13] = fdct_round_shift(temp2); 506 temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; 507 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; 508 out[3] = fdct_round_shift(temp1); 509 out[11] = fdct_round_shift(temp2); 510 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; 511 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; 512 out[7] = fdct_round_shift(temp1); 513 out[15] = fdct_round_shift(temp2); 514 } 515 // Do next column (which is a transposed row in second/horizontal pass) 516 in++; 517 out += 16; 518 } 519 // Setup in/out for next pass. 520 in = intermediate; 521 out = output; 522 } 523 } 524 525 static void fadst8(const int16_t *input, int16_t *output) { 526 int s0, s1, s2, s3, s4, s5, s6, s7; 527 528 int x0 = input[7]; 529 int x1 = input[0]; 530 int x2 = input[5]; 531 int x3 = input[2]; 532 int x4 = input[3]; 533 int x5 = input[4]; 534 int x6 = input[1]; 535 int x7 = input[6]; 536 537 // stage 1 538 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; 539 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; 540 s2 = cospi_10_64 * x2 + cospi_22_64 * x3; 541 s3 = cospi_22_64 * x2 - cospi_10_64 * x3; 542 s4 = cospi_18_64 * x4 + cospi_14_64 * x5; 543 s5 = cospi_14_64 * x4 - cospi_18_64 * x5; 544 s6 = cospi_26_64 * x6 + cospi_6_64 * x7; 545 s7 = cospi_6_64 * x6 - cospi_26_64 * x7; 546 547 x0 = fdct_round_shift(s0 + s4); 548 x1 = fdct_round_shift(s1 + s5); 549 x2 = fdct_round_shift(s2 + s6); 550 x3 = fdct_round_shift(s3 + s7); 551 x4 = fdct_round_shift(s0 - s4); 552 x5 = fdct_round_shift(s1 - s5); 553 x6 = fdct_round_shift(s2 - s6); 554 x7 = fdct_round_shift(s3 - s7); 555 556 // stage 2 557 s0 = x0; 558 s1 = x1; 559 s2 = x2; 560 s3 = x3; 561 s4 = cospi_8_64 * x4 + cospi_24_64 * x5; 562 s5 = cospi_24_64 * x4 - cospi_8_64 * x5; 563 s6 = - cospi_24_64 * x6 + cospi_8_64 * x7; 564 s7 = cospi_8_64 * x6 + cospi_24_64 * x7; 565 566 x0 = s0 + s2; 567 x1 = s1 + s3; 568 x2 = s0 - s2; 569 x3 = s1 - s3; 570 x4 = fdct_round_shift(s4 + s6); 571 x5 = fdct_round_shift(s5 + s7); 572 x6 = fdct_round_shift(s4 - s6); 573 x7 = fdct_round_shift(s5 - s7); 574 575 // stage 3 576 s2 = cospi_16_64 * (x2 + x3); 577 s3 = cospi_16_64 * (x2 - x3); 578 s6 = cospi_16_64 * (x6 + x7); 579 s7 = cospi_16_64 * (x6 - x7); 580 581 x2 = fdct_round_shift(s2); 582 x3 = fdct_round_shift(s3); 583 x6 = fdct_round_shift(s6); 584 x7 = fdct_round_shift(s7); 585 586 output[0] = x0; 587 output[1] = - x4; 588 output[2] = x6; 589 output[3] = - x2; 590 output[4] = x3; 591 output[5] = - x7; 592 output[6] = x5; 593 output[7] = - x1; 594 } 595 596 static const transform_2d FHT_8[] = { 597 { fdct8, fdct8 }, // DCT_DCT = 0 598 { fadst8, fdct8 }, // ADST_DCT = 1 599 { fdct8, fadst8 }, // DCT_ADST = 2 600 { fadst8, fadst8 } // ADST_ADST = 3 601 }; 602 603 void vp9_fht8x8_c(const int16_t *input, int16_t *output, 604 int stride, int tx_type) { 605 if (tx_type == DCT_DCT) { 606 vp9_fdct8x8_c(input, output, stride); 607 } else { 608 int16_t out[64]; 609 int16_t *outptr = &out[0]; 610 int i, j; 611 int16_t temp_in[8], temp_out[8]; 612 const transform_2d ht = FHT_8[tx_type]; 613 614 // Columns 615 for (i = 0; i < 8; ++i) { 616 for (j = 0; j < 8; ++j) 617 temp_in[j] = input[j * stride + i] * 4; 618 ht.cols(temp_in, temp_out); 619 for (j = 0; j < 8; ++j) 620 outptr[j * 8 + i] = temp_out[j]; 621 } 622 623 // Rows 624 for (i = 0; i < 8; ++i) { 625 for (j = 0; j < 8; ++j) 626 temp_in[j] = out[j + i * 8]; 627 ht.rows(temp_in, temp_out); 628 for (j = 0; j < 8; ++j) 629 output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; 630 } 631 } 632 } 633 634 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per 635 pixel. */ 636 void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) { 637 int i; 638 int a1, b1, c1, d1, e1; 639 const int16_t *ip = input; 640 int16_t *op = output; 641 642 for (i = 0; i < 4; i++) { 643 a1 = ip[0 * stride]; 644 b1 = ip[1 * stride]; 645 c1 = ip[2 * stride]; 646 d1 = ip[3 * stride]; 647 648 a1 += b1; 649 d1 = d1 - c1; 650 e1 = (a1 - d1) >> 1; 651 b1 = e1 - b1; 652 c1 = e1 - c1; 653 a1 -= c1; 654 d1 += b1; 655 op[0] = a1; 656 op[4] = c1; 657 op[8] = d1; 658 op[12] = b1; 659 660 ip++; 661 op++; 662 } 663 ip = output; 664 op = output; 665 666 for (i = 0; i < 4; i++) { 667 a1 = ip[0]; 668 b1 = ip[1]; 669 c1 = ip[2]; 670 d1 = ip[3]; 671 672 a1 += b1; 673 d1 -= c1; 674 e1 = (a1 - d1) >> 1; 675 b1 = e1 - b1; 676 c1 = e1 - c1; 677 a1 -= c1; 678 d1 += b1; 679 op[0] = a1 * UNIT_QUANT_FACTOR; 680 op[1] = c1 * UNIT_QUANT_FACTOR; 681 op[2] = d1 * UNIT_QUANT_FACTOR; 682 op[3] = b1 * UNIT_QUANT_FACTOR; 683 684 ip += 4; 685 op += 4; 686 } 687 } 688 689 // Rewrote to use same algorithm as others. 690 static void fdct16(const int16_t in[16], int16_t out[16]) { 691 /*canbe16*/ int step1[8]; 692 /*canbe16*/ int step2[8]; 693 /*canbe16*/ int step3[8]; 694 /*canbe16*/ int input[8]; 695 /*needs32*/ int temp1, temp2; 696 697 // step 1 698 input[0] = in[0] + in[15]; 699 input[1] = in[1] + in[14]; 700 input[2] = in[2] + in[13]; 701 input[3] = in[3] + in[12]; 702 input[4] = in[4] + in[11]; 703 input[5] = in[5] + in[10]; 704 input[6] = in[6] + in[ 9]; 705 input[7] = in[7] + in[ 8]; 706 707 step1[0] = in[7] - in[ 8]; 708 step1[1] = in[6] - in[ 9]; 709 step1[2] = in[5] - in[10]; 710 step1[3] = in[4] - in[11]; 711 step1[4] = in[3] - in[12]; 712 step1[5] = in[2] - in[13]; 713 step1[6] = in[1] - in[14]; 714 step1[7] = in[0] - in[15]; 715 716 // fdct8(step, step); 717 { 718 /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; 719 /*needs32*/ int t0, t1, t2, t3; 720 /*canbe16*/ int x0, x1, x2, x3; 721 722 // stage 1 723 s0 = input[0] + input[7]; 724 s1 = input[1] + input[6]; 725 s2 = input[2] + input[5]; 726 s3 = input[3] + input[4]; 727 s4 = input[3] - input[4]; 728 s5 = input[2] - input[5]; 729 s6 = input[1] - input[6]; 730 s7 = input[0] - input[7]; 731 732 // fdct4(step, step); 733 x0 = s0 + s3; 734 x1 = s1 + s2; 735 x2 = s1 - s2; 736 x3 = s0 - s3; 737 t0 = (x0 + x1) * cospi_16_64; 738 t1 = (x0 - x1) * cospi_16_64; 739 t2 = x3 * cospi_8_64 + x2 * cospi_24_64; 740 t3 = x3 * cospi_24_64 - x2 * cospi_8_64; 741 out[0] = fdct_round_shift(t0); 742 out[4] = fdct_round_shift(t2); 743 out[8] = fdct_round_shift(t1); 744 out[12] = fdct_round_shift(t3); 745 746 // Stage 2 747 t0 = (s6 - s5) * cospi_16_64; 748 t1 = (s6 + s5) * cospi_16_64; 749 t2 = fdct_round_shift(t0); 750 t3 = fdct_round_shift(t1); 751 752 // Stage 3 753 x0 = s4 + t2; 754 x1 = s4 - t2; 755 x2 = s7 - t3; 756 x3 = s7 + t3; 757 758 // Stage 4 759 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; 760 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; 761 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; 762 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; 763 out[2] = fdct_round_shift(t0); 764 out[6] = fdct_round_shift(t2); 765 out[10] = fdct_round_shift(t1); 766 out[14] = fdct_round_shift(t3); 767 } 768 769 // step 2 770 temp1 = (step1[5] - step1[2]) * cospi_16_64; 771 temp2 = (step1[4] - step1[3]) * cospi_16_64; 772 step2[2] = fdct_round_shift(temp1); 773 step2[3] = fdct_round_shift(temp2); 774 temp1 = (step1[4] + step1[3]) * cospi_16_64; 775 temp2 = (step1[5] + step1[2]) * cospi_16_64; 776 step2[4] = fdct_round_shift(temp1); 777 step2[5] = fdct_round_shift(temp2); 778 779 // step 3 780 step3[0] = step1[0] + step2[3]; 781 step3[1] = step1[1] + step2[2]; 782 step3[2] = step1[1] - step2[2]; 783 step3[3] = step1[0] - step2[3]; 784 step3[4] = step1[7] - step2[4]; 785 step3[5] = step1[6] - step2[5]; 786 step3[6] = step1[6] + step2[5]; 787 step3[7] = step1[7] + step2[4]; 788 789 // step 4 790 temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; 791 temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64; 792 step2[1] = fdct_round_shift(temp1); 793 step2[2] = fdct_round_shift(temp2); 794 temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64; 795 temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; 796 step2[5] = fdct_round_shift(temp1); 797 step2[6] = fdct_round_shift(temp2); 798 799 // step 5 800 step1[0] = step3[0] + step2[1]; 801 step1[1] = step3[0] - step2[1]; 802 step1[2] = step3[3] + step2[2]; 803 step1[3] = step3[3] - step2[2]; 804 step1[4] = step3[4] - step2[5]; 805 step1[5] = step3[4] + step2[5]; 806 step1[6] = step3[7] - step2[6]; 807 step1[7] = step3[7] + step2[6]; 808 809 // step 6 810 temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64; 811 temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64; 812 out[1] = fdct_round_shift(temp1); 813 out[9] = fdct_round_shift(temp2); 814 815 temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64; 816 temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; 817 out[5] = fdct_round_shift(temp1); 818 out[13] = fdct_round_shift(temp2); 819 820 temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; 821 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; 822 out[3] = fdct_round_shift(temp1); 823 out[11] = fdct_round_shift(temp2); 824 825 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; 826 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; 827 out[7] = fdct_round_shift(temp1); 828 out[15] = fdct_round_shift(temp2); 829 } 830 831 static void fadst16(const int16_t *input, int16_t *output) { 832 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15; 833 834 int x0 = input[15]; 835 int x1 = input[0]; 836 int x2 = input[13]; 837 int x3 = input[2]; 838 int x4 = input[11]; 839 int x5 = input[4]; 840 int x6 = input[9]; 841 int x7 = input[6]; 842 int x8 = input[7]; 843 int x9 = input[8]; 844 int x10 = input[5]; 845 int x11 = input[10]; 846 int x12 = input[3]; 847 int x13 = input[12]; 848 int x14 = input[1]; 849 int x15 = input[14]; 850 851 // stage 1 852 s0 = x0 * cospi_1_64 + x1 * cospi_31_64; 853 s1 = x0 * cospi_31_64 - x1 * cospi_1_64; 854 s2 = x2 * cospi_5_64 + x3 * cospi_27_64; 855 s3 = x2 * cospi_27_64 - x3 * cospi_5_64; 856 s4 = x4 * cospi_9_64 + x5 * cospi_23_64; 857 s5 = x4 * cospi_23_64 - x5 * cospi_9_64; 858 s6 = x6 * cospi_13_64 + x7 * cospi_19_64; 859 s7 = x6 * cospi_19_64 - x7 * cospi_13_64; 860 s8 = x8 * cospi_17_64 + x9 * cospi_15_64; 861 s9 = x8 * cospi_15_64 - x9 * cospi_17_64; 862 s10 = x10 * cospi_21_64 + x11 * cospi_11_64; 863 s11 = x10 * cospi_11_64 - x11 * cospi_21_64; 864 s12 = x12 * cospi_25_64 + x13 * cospi_7_64; 865 s13 = x12 * cospi_7_64 - x13 * cospi_25_64; 866 s14 = x14 * cospi_29_64 + x15 * cospi_3_64; 867 s15 = x14 * cospi_3_64 - x15 * cospi_29_64; 868 869 x0 = fdct_round_shift(s0 + s8); 870 x1 = fdct_round_shift(s1 + s9); 871 x2 = fdct_round_shift(s2 + s10); 872 x3 = fdct_round_shift(s3 + s11); 873 x4 = fdct_round_shift(s4 + s12); 874 x5 = fdct_round_shift(s5 + s13); 875 x6 = fdct_round_shift(s6 + s14); 876 x7 = fdct_round_shift(s7 + s15); 877 x8 = fdct_round_shift(s0 - s8); 878 x9 = fdct_round_shift(s1 - s9); 879 x10 = fdct_round_shift(s2 - s10); 880 x11 = fdct_round_shift(s3 - s11); 881 x12 = fdct_round_shift(s4 - s12); 882 x13 = fdct_round_shift(s5 - s13); 883 x14 = fdct_round_shift(s6 - s14); 884 x15 = fdct_round_shift(s7 - s15); 885 886 // stage 2 887 s0 = x0; 888 s1 = x1; 889 s2 = x2; 890 s3 = x3; 891 s4 = x4; 892 s5 = x5; 893 s6 = x6; 894 s7 = x7; 895 s8 = x8 * cospi_4_64 + x9 * cospi_28_64; 896 s9 = x8 * cospi_28_64 - x9 * cospi_4_64; 897 s10 = x10 * cospi_20_64 + x11 * cospi_12_64; 898 s11 = x10 * cospi_12_64 - x11 * cospi_20_64; 899 s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; 900 s13 = x12 * cospi_4_64 + x13 * cospi_28_64; 901 s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; 902 s15 = x14 * cospi_20_64 + x15 * cospi_12_64; 903 904 x0 = s0 + s4; 905 x1 = s1 + s5; 906 x2 = s2 + s6; 907 x3 = s3 + s7; 908 x4 = s0 - s4; 909 x5 = s1 - s5; 910 x6 = s2 - s6; 911 x7 = s3 - s7; 912 x8 = fdct_round_shift(s8 + s12); 913 x9 = fdct_round_shift(s9 + s13); 914 x10 = fdct_round_shift(s10 + s14); 915 x11 = fdct_round_shift(s11 + s15); 916 x12 = fdct_round_shift(s8 - s12); 917 x13 = fdct_round_shift(s9 - s13); 918 x14 = fdct_round_shift(s10 - s14); 919 x15 = fdct_round_shift(s11 - s15); 920 921 // stage 3 922 s0 = x0; 923 s1 = x1; 924 s2 = x2; 925 s3 = x3; 926 s4 = x4 * cospi_8_64 + x5 * cospi_24_64; 927 s5 = x4 * cospi_24_64 - x5 * cospi_8_64; 928 s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; 929 s7 = x6 * cospi_8_64 + x7 * cospi_24_64; 930 s8 = x8; 931 s9 = x9; 932 s10 = x10; 933 s11 = x11; 934 s12 = x12 * cospi_8_64 + x13 * cospi_24_64; 935 s13 = x12 * cospi_24_64 - x13 * cospi_8_64; 936 s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; 937 s15 = x14 * cospi_8_64 + x15 * cospi_24_64; 938 939 x0 = s0 + s2; 940 x1 = s1 + s3; 941 x2 = s0 - s2; 942 x3 = s1 - s3; 943 x4 = fdct_round_shift(s4 + s6); 944 x5 = fdct_round_shift(s5 + s7); 945 x6 = fdct_round_shift(s4 - s6); 946 x7 = fdct_round_shift(s5 - s7); 947 x8 = s8 + s10; 948 x9 = s9 + s11; 949 x10 = s8 - s10; 950 x11 = s9 - s11; 951 x12 = fdct_round_shift(s12 + s14); 952 x13 = fdct_round_shift(s13 + s15); 953 x14 = fdct_round_shift(s12 - s14); 954 x15 = fdct_round_shift(s13 - s15); 955 956 // stage 4 957 s2 = (- cospi_16_64) * (x2 + x3); 958 s3 = cospi_16_64 * (x2 - x3); 959 s6 = cospi_16_64 * (x6 + x7); 960 s7 = cospi_16_64 * (- x6 + x7); 961 s10 = cospi_16_64 * (x10 + x11); 962 s11 = cospi_16_64 * (- x10 + x11); 963 s14 = (- cospi_16_64) * (x14 + x15); 964 s15 = cospi_16_64 * (x14 - x15); 965 966 x2 = fdct_round_shift(s2); 967 x3 = fdct_round_shift(s3); 968 x6 = fdct_round_shift(s6); 969 x7 = fdct_round_shift(s7); 970 x10 = fdct_round_shift(s10); 971 x11 = fdct_round_shift(s11); 972 x14 = fdct_round_shift(s14); 973 x15 = fdct_round_shift(s15); 974 975 output[0] = x0; 976 output[1] = - x8; 977 output[2] = x12; 978 output[3] = - x4; 979 output[4] = x6; 980 output[5] = x14; 981 output[6] = x10; 982 output[7] = x2; 983 output[8] = x3; 984 output[9] = x11; 985 output[10] = x15; 986 output[11] = x7; 987 output[12] = x5; 988 output[13] = - x13; 989 output[14] = x9; 990 output[15] = - x1; 991 } 992 993 static const transform_2d FHT_16[] = { 994 { fdct16, fdct16 }, // DCT_DCT = 0 995 { fadst16, fdct16 }, // ADST_DCT = 1 996 { fdct16, fadst16 }, // DCT_ADST = 2 997 { fadst16, fadst16 } // ADST_ADST = 3 998 }; 999 1000 void vp9_fht16x16_c(const int16_t *input, int16_t *output, 1001 int stride, int tx_type) { 1002 if (tx_type == DCT_DCT) { 1003 vp9_fdct16x16_c(input, output, stride); 1004 } else { 1005 int16_t out[256]; 1006 int16_t *outptr = &out[0]; 1007 int i, j; 1008 int16_t temp_in[16], temp_out[16]; 1009 const transform_2d ht = FHT_16[tx_type]; 1010 1011 // Columns 1012 for (i = 0; i < 16; ++i) { 1013 for (j = 0; j < 16; ++j) 1014 temp_in[j] = input[j * stride + i] * 4; 1015 ht.cols(temp_in, temp_out); 1016 for (j = 0; j < 16; ++j) 1017 outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; 1018 } 1019 1020 // Rows 1021 for (i = 0; i < 16; ++i) { 1022 for (j = 0; j < 16; ++j) 1023 temp_in[j] = out[j + i * 16]; 1024 ht.rows(temp_in, temp_out); 1025 for (j = 0; j < 16; ++j) 1026 output[j + i * 16] = temp_out[j]; 1027 } 1028 } 1029 } 1030 1031 static INLINE int dct_32_round(int input) { 1032 int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); 1033 assert(-131072 <= rv && rv <= 131071); 1034 return rv; 1035 } 1036 1037 static INLINE int half_round_shift(int input) { 1038 int rv = (input + 1 + (input < 0)) >> 2; 1039 return rv; 1040 } 1041 1042 static void fdct32(const int *input, int *output, int round) { 1043 int step[32]; 1044 // Stage 1 1045 step[0] = input[0] + input[(32 - 1)]; 1046 step[1] = input[1] + input[(32 - 2)]; 1047 step[2] = input[2] + input[(32 - 3)]; 1048 step[3] = input[3] + input[(32 - 4)]; 1049 step[4] = input[4] + input[(32 - 5)]; 1050 step[5] = input[5] + input[(32 - 6)]; 1051 step[6] = input[6] + input[(32 - 7)]; 1052 step[7] = input[7] + input[(32 - 8)]; 1053 step[8] = input[8] + input[(32 - 9)]; 1054 step[9] = input[9] + input[(32 - 10)]; 1055 step[10] = input[10] + input[(32 - 11)]; 1056 step[11] = input[11] + input[(32 - 12)]; 1057 step[12] = input[12] + input[(32 - 13)]; 1058 step[13] = input[13] + input[(32 - 14)]; 1059 step[14] = input[14] + input[(32 - 15)]; 1060 step[15] = input[15] + input[(32 - 16)]; 1061 step[16] = -input[16] + input[(32 - 17)]; 1062 step[17] = -input[17] + input[(32 - 18)]; 1063 step[18] = -input[18] + input[(32 - 19)]; 1064 step[19] = -input[19] + input[(32 - 20)]; 1065 step[20] = -input[20] + input[(32 - 21)]; 1066 step[21] = -input[21] + input[(32 - 22)]; 1067 step[22] = -input[22] + input[(32 - 23)]; 1068 step[23] = -input[23] + input[(32 - 24)]; 1069 step[24] = -input[24] + input[(32 - 25)]; 1070 step[25] = -input[25] + input[(32 - 26)]; 1071 step[26] = -input[26] + input[(32 - 27)]; 1072 step[27] = -input[27] + input[(32 - 28)]; 1073 step[28] = -input[28] + input[(32 - 29)]; 1074 step[29] = -input[29] + input[(32 - 30)]; 1075 step[30] = -input[30] + input[(32 - 31)]; 1076 step[31] = -input[31] + input[(32 - 32)]; 1077 1078 // Stage 2 1079 output[0] = step[0] + step[16 - 1]; 1080 output[1] = step[1] + step[16 - 2]; 1081 output[2] = step[2] + step[16 - 3]; 1082 output[3] = step[3] + step[16 - 4]; 1083 output[4] = step[4] + step[16 - 5]; 1084 output[5] = step[5] + step[16 - 6]; 1085 output[6] = step[6] + step[16 - 7]; 1086 output[7] = step[7] + step[16 - 8]; 1087 output[8] = -step[8] + step[16 - 9]; 1088 output[9] = -step[9] + step[16 - 10]; 1089 output[10] = -step[10] + step[16 - 11]; 1090 output[11] = -step[11] + step[16 - 12]; 1091 output[12] = -step[12] + step[16 - 13]; 1092 output[13] = -step[13] + step[16 - 14]; 1093 output[14] = -step[14] + step[16 - 15]; 1094 output[15] = -step[15] + step[16 - 16]; 1095 1096 output[16] = step[16]; 1097 output[17] = step[17]; 1098 output[18] = step[18]; 1099 output[19] = step[19]; 1100 1101 output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64); 1102 output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64); 1103 output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64); 1104 output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64); 1105 1106 output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64); 1107 output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64); 1108 output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64); 1109 output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64); 1110 1111 output[28] = step[28]; 1112 output[29] = step[29]; 1113 output[30] = step[30]; 1114 output[31] = step[31]; 1115 1116 // dump the magnitude by 4, hence the intermediate values are within 1117 // the range of 16 bits. 1118 if (round) { 1119 output[0] = half_round_shift(output[0]); 1120 output[1] = half_round_shift(output[1]); 1121 output[2] = half_round_shift(output[2]); 1122 output[3] = half_round_shift(output[3]); 1123 output[4] = half_round_shift(output[4]); 1124 output[5] = half_round_shift(output[5]); 1125 output[6] = half_round_shift(output[6]); 1126 output[7] = half_round_shift(output[7]); 1127 output[8] = half_round_shift(output[8]); 1128 output[9] = half_round_shift(output[9]); 1129 output[10] = half_round_shift(output[10]); 1130 output[11] = half_round_shift(output[11]); 1131 output[12] = half_round_shift(output[12]); 1132 output[13] = half_round_shift(output[13]); 1133 output[14] = half_round_shift(output[14]); 1134 output[15] = half_round_shift(output[15]); 1135 1136 output[16] = half_round_shift(output[16]); 1137 output[17] = half_round_shift(output[17]); 1138 output[18] = half_round_shift(output[18]); 1139 output[19] = half_round_shift(output[19]); 1140 output[20] = half_round_shift(output[20]); 1141 output[21] = half_round_shift(output[21]); 1142 output[22] = half_round_shift(output[22]); 1143 output[23] = half_round_shift(output[23]); 1144 output[24] = half_round_shift(output[24]); 1145 output[25] = half_round_shift(output[25]); 1146 output[26] = half_round_shift(output[26]); 1147 output[27] = half_round_shift(output[27]); 1148 output[28] = half_round_shift(output[28]); 1149 output[29] = half_round_shift(output[29]); 1150 output[30] = half_round_shift(output[30]); 1151 output[31] = half_round_shift(output[31]); 1152 } 1153 1154 // Stage 3 1155 step[0] = output[0] + output[(8 - 1)]; 1156 step[1] = output[1] + output[(8 - 2)]; 1157 step[2] = output[2] + output[(8 - 3)]; 1158 step[3] = output[3] + output[(8 - 4)]; 1159 step[4] = -output[4] + output[(8 - 5)]; 1160 step[5] = -output[5] + output[(8 - 6)]; 1161 step[6] = -output[6] + output[(8 - 7)]; 1162 step[7] = -output[7] + output[(8 - 8)]; 1163 step[8] = output[8]; 1164 step[9] = output[9]; 1165 step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64); 1166 step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64); 1167 step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64); 1168 step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64); 1169 step[14] = output[14]; 1170 step[15] = output[15]; 1171 1172 step[16] = output[16] + output[23]; 1173 step[17] = output[17] + output[22]; 1174 step[18] = output[18] + output[21]; 1175 step[19] = output[19] + output[20]; 1176 step[20] = -output[20] + output[19]; 1177 step[21] = -output[21] + output[18]; 1178 step[22] = -output[22] + output[17]; 1179 step[23] = -output[23] + output[16]; 1180 step[24] = -output[24] + output[31]; 1181 step[25] = -output[25] + output[30]; 1182 step[26] = -output[26] + output[29]; 1183 step[27] = -output[27] + output[28]; 1184 step[28] = output[28] + output[27]; 1185 step[29] = output[29] + output[26]; 1186 step[30] = output[30] + output[25]; 1187 step[31] = output[31] + output[24]; 1188 1189 // Stage 4 1190 output[0] = step[0] + step[3]; 1191 output[1] = step[1] + step[2]; 1192 output[2] = -step[2] + step[1]; 1193 output[3] = -step[3] + step[0]; 1194 output[4] = step[4]; 1195 output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64); 1196 output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64); 1197 output[7] = step[7]; 1198 output[8] = step[8] + step[11]; 1199 output[9] = step[9] + step[10]; 1200 output[10] = -step[10] + step[9]; 1201 output[11] = -step[11] + step[8]; 1202 output[12] = -step[12] + step[15]; 1203 output[13] = -step[13] + step[14]; 1204 output[14] = step[14] + step[13]; 1205 output[15] = step[15] + step[12]; 1206 1207 output[16] = step[16]; 1208 output[17] = step[17]; 1209 output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64); 1210 output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64); 1211 output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64); 1212 output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64); 1213 output[22] = step[22]; 1214 output[23] = step[23]; 1215 output[24] = step[24]; 1216 output[25] = step[25]; 1217 output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64); 1218 output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64); 1219 output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64); 1220 output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64); 1221 output[30] = step[30]; 1222 output[31] = step[31]; 1223 1224 // Stage 5 1225 step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64); 1226 step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64); 1227 step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64); 1228 step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64); 1229 step[4] = output[4] + output[5]; 1230 step[5] = -output[5] + output[4]; 1231 step[6] = -output[6] + output[7]; 1232 step[7] = output[7] + output[6]; 1233 step[8] = output[8]; 1234 step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64); 1235 step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64); 1236 step[11] = output[11]; 1237 step[12] = output[12]; 1238 step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64); 1239 step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64); 1240 step[15] = output[15]; 1241 1242 step[16] = output[16] + output[19]; 1243 step[17] = output[17] + output[18]; 1244 step[18] = -output[18] + output[17]; 1245 step[19] = -output[19] + output[16]; 1246 step[20] = -output[20] + output[23]; 1247 step[21] = -output[21] + output[22]; 1248 step[22] = output[22] + output[21]; 1249 step[23] = output[23] + output[20]; 1250 step[24] = output[24] + output[27]; 1251 step[25] = output[25] + output[26]; 1252 step[26] = -output[26] + output[25]; 1253 step[27] = -output[27] + output[24]; 1254 step[28] = -output[28] + output[31]; 1255 step[29] = -output[29] + output[30]; 1256 step[30] = output[30] + output[29]; 1257 step[31] = output[31] + output[28]; 1258 1259 // Stage 6 1260 output[0] = step[0]; 1261 output[1] = step[1]; 1262 output[2] = step[2]; 1263 output[3] = step[3]; 1264 output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64); 1265 output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64); 1266 output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64); 1267 output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64); 1268 output[8] = step[8] + step[9]; 1269 output[9] = -step[9] + step[8]; 1270 output[10] = -step[10] + step[11]; 1271 output[11] = step[11] + step[10]; 1272 output[12] = step[12] + step[13]; 1273 output[13] = -step[13] + step[12]; 1274 output[14] = -step[14] + step[15]; 1275 output[15] = step[15] + step[14]; 1276 1277 output[16] = step[16]; 1278 output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64); 1279 output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64); 1280 output[19] = step[19]; 1281 output[20] = step[20]; 1282 output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64); 1283 output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64); 1284 output[23] = step[23]; 1285 output[24] = step[24]; 1286 output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64); 1287 output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64); 1288 output[27] = step[27]; 1289 output[28] = step[28]; 1290 output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64); 1291 output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64); 1292 output[31] = step[31]; 1293 1294 // Stage 7 1295 step[0] = output[0]; 1296 step[1] = output[1]; 1297 step[2] = output[2]; 1298 step[3] = output[3]; 1299 step[4] = output[4]; 1300 step[5] = output[5]; 1301 step[6] = output[6]; 1302 step[7] = output[7]; 1303 step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64); 1304 step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64); 1305 step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64); 1306 step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64); 1307 step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64); 1308 step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64); 1309 step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64); 1310 step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64); 1311 1312 step[16] = output[16] + output[17]; 1313 step[17] = -output[17] + output[16]; 1314 step[18] = -output[18] + output[19]; 1315 step[19] = output[19] + output[18]; 1316 step[20] = output[20] + output[21]; 1317 step[21] = -output[21] + output[20]; 1318 step[22] = -output[22] + output[23]; 1319 step[23] = output[23] + output[22]; 1320 step[24] = output[24] + output[25]; 1321 step[25] = -output[25] + output[24]; 1322 step[26] = -output[26] + output[27]; 1323 step[27] = output[27] + output[26]; 1324 step[28] = output[28] + output[29]; 1325 step[29] = -output[29] + output[28]; 1326 step[30] = -output[30] + output[31]; 1327 step[31] = output[31] + output[30]; 1328 1329 // Final stage --- outputs indices are bit-reversed. 1330 output[0] = step[0]; 1331 output[16] = step[1]; 1332 output[8] = step[2]; 1333 output[24] = step[3]; 1334 output[4] = step[4]; 1335 output[20] = step[5]; 1336 output[12] = step[6]; 1337 output[28] = step[7]; 1338 output[2] = step[8]; 1339 output[18] = step[9]; 1340 output[10] = step[10]; 1341 output[26] = step[11]; 1342 output[6] = step[12]; 1343 output[22] = step[13]; 1344 output[14] = step[14]; 1345 output[30] = step[15]; 1346 1347 output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64); 1348 output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64); 1349 output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64); 1350 output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64); 1351 output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64); 1352 output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64); 1353 output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64); 1354 output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64); 1355 output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64); 1356 output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64); 1357 output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64); 1358 output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64); 1359 output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64); 1360 output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64); 1361 output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64); 1362 output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); 1363 } 1364 1365 void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride) { 1366 int r, c; 1367 int16_t sum = 0; 1368 for (r = 0; r < 32; ++r) 1369 for (c = 0; c < 32; ++c) 1370 sum += input[r * stride + c]; 1371 1372 output[0] = sum >> 3; 1373 output[1] = 0; 1374 } 1375 1376 void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) { 1377 int i, j; 1378 int output[32 * 32]; 1379 1380 // Columns 1381 for (i = 0; i < 32; ++i) { 1382 int temp_in[32], temp_out[32]; 1383 for (j = 0; j < 32; ++j) 1384 temp_in[j] = input[j * stride + i] * 4; 1385 fdct32(temp_in, temp_out, 0); 1386 for (j = 0; j < 32; ++j) 1387 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; 1388 } 1389 1390 // Rows 1391 for (i = 0; i < 32; ++i) { 1392 int temp_in[32], temp_out[32]; 1393 for (j = 0; j < 32; ++j) 1394 temp_in[j] = output[j + i * 32]; 1395 fdct32(temp_in, temp_out, 0); 1396 for (j = 0; j < 32; ++j) 1397 out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; 1398 } 1399 } 1400 1401 // Note that although we use dct_32_round in dct32 computation flow, 1402 // this 2d fdct32x32 for rate-distortion optimization loop is operating 1403 // within 16 bits precision. 1404 void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) { 1405 int i, j; 1406 int output[32 * 32]; 1407 1408 // Columns 1409 for (i = 0; i < 32; ++i) { 1410 int temp_in[32], temp_out[32]; 1411 for (j = 0; j < 32; ++j) 1412 temp_in[j] = input[j * stride + i] * 4; 1413 fdct32(temp_in, temp_out, 0); 1414 for (j = 0; j < 32; ++j) 1415 // TODO(cd): see quality impact of only doing 1416 // output[j * 32 + i] = (temp_out[j] + 1) >> 2; 1417 // PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c 1418 output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; 1419 } 1420 1421 // Rows 1422 for (i = 0; i < 32; ++i) { 1423 int temp_in[32], temp_out[32]; 1424 for (j = 0; j < 32; ++j) 1425 temp_in[j] = output[j + i * 32]; 1426 fdct32(temp_in, temp_out, 1); 1427 for (j = 0; j < 32; ++j) 1428 out[j + i * 32] = temp_out[j]; 1429 } 1430 } 1431