/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <math.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"

#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_idct.h"
#include "vp9/common/vp9_systemdependent.h"

#include "vp9/encoder/vp9_dct.h"

static void fdct4(const int16_t *input, int16_t *output) {
  int16_t step[4];
  int temp1, temp2;

  step[0] = input[0] + input[3];
  step[1] = input[1] + input[2];
  step[2] = input[1] - input[2];
  step[3] = input[0] - input[3];

  temp1 = (step[0] + step[1]) * cospi_16_64;
  temp2 = (step[0] - step[1]) * cospi_16_64;
  output[0] = dct_const_round_shift(temp1);
  output[2] = dct_const_round_shift(temp2);
  temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
  temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
  output[1] = dct_const_round_shift(temp1);
  output[3] = dct_const_round_shift(temp2);
}

void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
  // as the first pass results are transposed, we transpose the columns (that
  // is the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
  int pass;
  // We need an intermediate buffer between passes.
  int16_t intermediate[4 * 4];
  const int16_t *in = input;
  int16_t *out = intermediate;
  // Do the two transform/transpose passes
  for (pass = 0; pass < 2; ++pass) {
    /*canbe16*/ int input[4];
    /*canbe16*/ int step[4];
    /*needs32*/ int temp1, temp2;
    int i;
    for (i = 0; i < 4; ++i) {
      // Load inputs.
      if (0 == pass) {
        input[0] = in[0 * stride] * 16;
        input[1] = in[1 * stride] * 16;
        input[2] = in[2 * stride] * 16;
        input[3] = in[3 * stride] * 16;
        if (i == 0 && input[0]) {
          input[0] += 1;
        }
      } else {
        input[0] = in[0 * 4];
        input[1] = in[1 * 4];
        input[2] = in[2 * 4];
        input[3] = in[3 * 4];
      }
      // Transform.
      step[0] = input[0] + input[3];
      step[1] = input[1] + input[2];
      step[2] = input[1] - input[2];
      step[3] = input[0] - input[3];
      temp1 = (step[0] + step[1]) * cospi_16_64;
      temp2 = (step[0] - step[1]) * cospi_16_64;
      out[0] = dct_const_round_shift(temp1);
      out[2] = dct_const_round_shift(temp2);
      temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
      temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
      out[1] = dct_const_round_shift(temp1);
      out[3] = dct_const_round_shift(temp2);
      // Do next column (which is a transposed row in second/horizontal pass)
      in++;
      out += 4;
    }
    // Setup in/out for next pass.
    in = intermediate;
    out = output;
  }

  {
    int i, j;
    for (i = 0; i < 4; ++i) {
      for (j = 0; j < 4; ++j)
        output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
    }
  }
}
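// All of the 1-D transforms below work in fixed point: the cospi_*_64
// constants from vp9_idct.h are cos(k*pi/64) scaled by 2^DCT_CONST_BITS,
// the sinpi_*_9 constants are similarly scaled sine values, and
// dct_const_round_shift() rounds each product back down by DCT_CONST_BITS.
//
// fadst4() is the 4-point ADST half of the 4x4 hybrid transforms; the
// all-zero early return skips the multiplies for empty rows/columns.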
static void fadst4(const int16_t *input, int16_t *output) {
  int x0, x1, x2, x3;
  int s0, s1, s2, s3, s4, s5, s6, s7;

  x0 = input[0];
  x1 = input[1];
  x2 = input[2];
  x3 = input[3];

  if (!(x0 | x1 | x2 | x3)) {
    output[0] = output[1] = output[2] = output[3] = 0;
    return;
  }

  s0 = sinpi_1_9 * x0;
  s1 = sinpi_4_9 * x0;
  s2 = sinpi_2_9 * x1;
  s3 = sinpi_1_9 * x1;
  s4 = sinpi_3_9 * x2;
  s5 = sinpi_4_9 * x3;
  s6 = sinpi_2_9 * x3;
  s7 = x0 + x1 - x3;

  x0 = s0 + s2 + s5;
  x1 = sinpi_3_9 * s7;
  x2 = s1 - s3 + s6;
  x3 = s4;

  s0 = x0 + x3;
  s1 = x1;
  s2 = x2 - x3;
  s3 = x2 - x0 + x3;

  // 1-D transform scaling factor is sqrt(2).
  output[0] = dct_const_round_shift(s0);
  output[1] = dct_const_round_shift(s1);
  output[2] = dct_const_round_shift(s2);
  output[3] = dct_const_round_shift(s3);
}

static const transform_2d FHT_4[] = {
  { fdct4, fdct4 },    // DCT_DCT  = 0
  { fadst4, fdct4 },   // ADST_DCT = 1
  { fdct4, fadst4 },   // DCT_ADST = 2
  { fadst4, fadst4 }   // ADST_ADST = 3
};

void vp9_short_fht4x4_c(const int16_t *input, int16_t *output,
                        int stride, int tx_type) {
  int16_t out[4 * 4];
  int16_t *outptr = &out[0];
  int i, j;
  int16_t temp_in[4], temp_out[4];
  const transform_2d ht = FHT_4[tx_type];

  // Columns
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j)
      temp_in[j] = input[j * stride + i] * 16;
    if (i == 0 && temp_in[0])
      temp_in[0] += 1;
    ht.cols(temp_in, temp_out);
    for (j = 0; j < 4; ++j)
      outptr[j * 4 + i] = temp_out[j];
  }

  // Rows
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j)
      temp_in[j] = out[j + i * 4];
    ht.rows(temp_in, temp_out);
    for (j = 0; j < 4; ++j)
      output[j + i * 4] = (temp_out[j] + 1) >> 2;
  }
}

static void fdct8(const int16_t *input, int16_t *output) {
  /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
  /*needs32*/ int t0, t1, t2, t3;
  /*canbe16*/ int x0, x1, x2, x3;

  // stage 1
  s0 = input[0] + input[7];
  s1 = input[1] + input[6];
  s2 = input[2] + input[5];
  s3 = input[3] + input[4];
  s4 = input[3] - input[4];
  s5 = input[2] - input[5];
  s6 = input[1] - input[6];
  s7 = input[0] - input[7];

  // fdct4(step, step);
  x0 = s0 + s3;
  x1 = s1 + s2;
  x2 = s1 - s2;
  x3 = s0 - s3;
  t0 = (x0 + x1) * cospi_16_64;
  t1 = (x0 - x1) * cospi_16_64;
  t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
  t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
  output[0] = dct_const_round_shift(t0);
  output[2] = dct_const_round_shift(t2);
  output[4] = dct_const_round_shift(t1);
  output[6] = dct_const_round_shift(t3);

  // Stage 2
  t0 = (s6 - s5) * cospi_16_64;
  t1 = (s6 + s5) * cospi_16_64;
  t2 = dct_const_round_shift(t0);
  t3 = dct_const_round_shift(t1);

  // Stage 3
  x0 = s4 + t2;
  x1 = s4 - t2;
  x2 = s7 - t3;
  x3 = s7 + t3;

  // Stage 4
  t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
  t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
  t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
  t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
  output[1] = dct_const_round_shift(t0);
  output[3] = dct_const_round_shift(t2);
  output[5] = dct_const_round_shift(t1);
  output[7] = dct_const_round_shift(t3);
}
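// vp9_fdct8x8_c() below follows the usual two-pass layout: the column pass is
// written out in-line with the residuals pre-scaled by 4 and stores its
// results in `intermediate` in natural orientation; the row pass then reuses
// fdct8() on each row and halves every coefficient.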
void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
  int i, j;
  int16_t intermediate[64];

  // Transform columns
  {
    int16_t *output = intermediate;
    /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
    /*needs32*/ int t0, t1, t2, t3;
    /*canbe16*/ int x0, x1, x2, x3;

    int i;
    for (i = 0; i < 8; i++) {
      // stage 1
      s0 = (input[0 * stride] + input[7 * stride]) * 4;
      s1 = (input[1 * stride] + input[6 * stride]) * 4;
      s2 = (input[2 * stride] + input[5 * stride]) * 4;
      s3 = (input[3 * stride] + input[4 * stride]) * 4;
      s4 = (input[3 * stride] - input[4 * stride]) * 4;
      s5 = (input[2 * stride] - input[5 * stride]) * 4;
      s6 = (input[1 * stride] - input[6 * stride]) * 4;
      s7 = (input[0 * stride] - input[7 * stride]) * 4;

      // fdct4(step, step);
      x0 = s0 + s3;
      x1 = s1 + s2;
      x2 = s1 - s2;
      x3 = s0 - s3;
      t0 = (x0 + x1) * cospi_16_64;
      t1 = (x0 - x1) * cospi_16_64;
      t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
      t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
      output[0 * 8] = dct_const_round_shift(t0);
      output[2 * 8] = dct_const_round_shift(t2);
      output[4 * 8] = dct_const_round_shift(t1);
      output[6 * 8] = dct_const_round_shift(t3);

      // Stage 2
      t0 = (s6 - s5) * cospi_16_64;
      t1 = (s6 + s5) * cospi_16_64;
      t2 = dct_const_round_shift(t0);
      t3 = dct_const_round_shift(t1);

      // Stage 3
      x0 = s4 + t2;
      x1 = s4 - t2;
      x2 = s7 - t3;
      x3 = s7 + t3;

      // Stage 4
      t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
      t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
      t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
      output[1 * 8] = dct_const_round_shift(t0);
      output[3 * 8] = dct_const_round_shift(t2);
      output[5 * 8] = dct_const_round_shift(t1);
      output[7 * 8] = dct_const_round_shift(t3);
      input++;
      output++;
    }
  }

  // Rows
  for (i = 0; i < 8; ++i) {
    fdct8(&intermediate[i * 8], &final_output[i * 8]);
    for (j = 0; j < 8; ++j)
      final_output[j + i * 8] /= 2;
  }
}
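// In vp9_fdct16x16_c below, the first (column) pass reads raw residuals
// scaled up by 4 and stores full-precision intermediates; the second (row)
// pass re-reads the intermediate buffer (which holds the transposed column
// results) with a rounding (x + 1) >> 2 while loading, so values are brought
// back into a 16-bit-friendly range before the second 1-D transform.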
void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
  // as the first pass results are transposed, we transpose the columns (that
  // is the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
  int pass;
  // We need an intermediate buffer between passes.
  int16_t intermediate[256];
  const int16_t *in = input;
  int16_t *out = intermediate;
  // Do the two transform/transpose passes
  for (pass = 0; pass < 2; ++pass) {
    /*canbe16*/ int step1[8];
    /*canbe16*/ int step2[8];
    /*canbe16*/ int step3[8];
    /*canbe16*/ int input[8];
    /*needs32*/ int temp1, temp2;
    int i;
    for (i = 0; i < 16; i++) {
      if (0 == pass) {
        // Calculate input for the first 8 results.
        input[0] = (in[0 * stride] + in[15 * stride]) * 4;
        input[1] = (in[1 * stride] + in[14 * stride]) * 4;
        input[2] = (in[2 * stride] + in[13 * stride]) * 4;
        input[3] = (in[3 * stride] + in[12 * stride]) * 4;
        input[4] = (in[4 * stride] + in[11 * stride]) * 4;
        input[5] = (in[5 * stride] + in[10 * stride]) * 4;
        input[6] = (in[6 * stride] + in[ 9 * stride]) * 4;
        input[7] = (in[7 * stride] + in[ 8 * stride]) * 4;
        // Calculate input for the next 8 results.
        step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4;
        step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4;
        step1[2] = (in[5 * stride] - in[10 * stride]) * 4;
        step1[3] = (in[4 * stride] - in[11 * stride]) * 4;
        step1[4] = (in[3 * stride] - in[12 * stride]) * 4;
        step1[5] = (in[2 * stride] - in[13 * stride]) * 4;
        step1[6] = (in[1 * stride] - in[14 * stride]) * 4;
        step1[7] = (in[0 * stride] - in[15 * stride]) * 4;
      } else {
        // Calculate input for the first 8 results.
        input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
        input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2);
        input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2);
        input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2);
        input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2);
        input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2);
        input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2);
        input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2);
        // Calculate input for the next 8 results.
        step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2);
        step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2);
        step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2);
        step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2);
        step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2);
        step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2);
        step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2);
        step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2);
      }
      // Work on the first eight values; fdct8(input, even_results);
      {
        /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
        /*needs32*/ int t0, t1, t2, t3;
        /*canbe16*/ int x0, x1, x2, x3;

        // stage 1
        s0 = input[0] + input[7];
        s1 = input[1] + input[6];
        s2 = input[2] + input[5];
        s3 = input[3] + input[4];
        s4 = input[3] - input[4];
        s5 = input[2] - input[5];
        s6 = input[1] - input[6];
        s7 = input[0] - input[7];

        // fdct4(step, step);
        x0 = s0 + s3;
        x1 = s1 + s2;
        x2 = s1 - s2;
        x3 = s0 - s3;
        t0 = (x0 + x1) * cospi_16_64;
        t1 = (x0 - x1) * cospi_16_64;
        t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
        t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
        out[0] = dct_const_round_shift(t0);
        out[4] = dct_const_round_shift(t2);
        out[8] = dct_const_round_shift(t1);
        out[12] = dct_const_round_shift(t3);

        // Stage 2
        t0 = (s6 - s5) * cospi_16_64;
        t1 = (s6 + s5) * cospi_16_64;
        t2 = dct_const_round_shift(t0);
        t3 = dct_const_round_shift(t1);

        // Stage 3
        x0 = s4 + t2;
        x1 = s4 - t2;
        x2 = s7 - t3;
        x3 = s7 + t3;

        // Stage 4
        t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
        t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
        t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
        t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
        out[2] = dct_const_round_shift(t0);
        out[6] = dct_const_round_shift(t2);
        out[10] = dct_const_round_shift(t1);
        out[14] = dct_const_round_shift(t3);
      }
      // Work on the next eight values; step1 -> odd_results
      {
        // step 2
        temp1 = (step1[5] - step1[2]) * cospi_16_64;
        temp2 = (step1[4] - step1[3]) * cospi_16_64;
        step2[2] = dct_const_round_shift(temp1);
        step2[3] = dct_const_round_shift(temp2);
        temp1 = (step1[4] + step1[3]) * cospi_16_64;
        temp2 = (step1[5] + step1[2]) * cospi_16_64;
        step2[4] = dct_const_round_shift(temp1);
        step2[5] = dct_const_round_shift(temp2);
        // step 3
        step3[0] = step1[0] + step2[3];
        step3[1] = step1[1] + step2[2];
        step3[2] = step1[1] - step2[2];
        step3[3] = step1[0] - step2[3];
        step3[4] = step1[7] - step2[4];
        step3[5] = step1[6] - step2[5];
        step3[6] = step1[6] + step2[5];
        step3[7] = step1[7] + step2[4];
        // step 4
        temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
        temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64;
        step2[1] = dct_const_round_shift(temp1);
        step2[2] = dct_const_round_shift(temp2);
        temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
        temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
        step2[5] = dct_const_round_shift(temp1);
        step2[6] = dct_const_round_shift(temp2);
        // step 5
        step1[0] = step3[0] + step2[1];
        step1[1] = step3[0] - step2[1];
        step1[2] = step3[3] - step2[2];
        step1[3] = step3[3] + step2[2];
        step1[4] = step3[4] + step2[5];
        step1[5] = step3[4] - step2[5];
        step1[6] = step3[7] - step2[6];
        step1[7] = step3[7] + step2[6];
        // step 6
        temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
        temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
        out[1] = dct_const_round_shift(temp1);
        out[9] = dct_const_round_shift(temp2);
        temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
        temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
        out[5] = dct_const_round_shift(temp1);
        out[13] = dct_const_round_shift(temp2);
        temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
        temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
        out[3] = dct_const_round_shift(temp1);
        out[11] = dct_const_round_shift(temp2);
        temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
        temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
        out[7] = dct_const_round_shift(temp1);
        out[15] = dct_const_round_shift(temp2);
      }
      // Do next column (which is a transposed row in second/horizontal pass)
      in++;
      out += 16;
    }
    // Setup in/out for next pass.
    in = intermediate;
    out = output;
  }
}
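// fadst8() is the 8-point ADST used by the 8x8 hybrid transforms. Note the
// permuted input ordering (x0 = input[7], x1 = input[0], ...), the rotation
// stages each followed by dct_const_round_shift() on the rotated terms, and
// the sign flips on the odd-indexed outputs.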
static void fadst8(const int16_t *input, int16_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;

  int x0 = input[7];
  int x1 = input[0];
  int x2 = input[5];
  int x3 = input[2];
  int x4 = input[3];
  int x5 = input[4];
  int x6 = input[1];
  int x7 = input[6];

  // stage 1
  s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
  s7 = cospi_6_64 * x6 - cospi_26_64 * x7;

  x0 = dct_const_round_shift(s0 + s4);
  x1 = dct_const_round_shift(s1 + s5);
  x2 = dct_const_round_shift(s2 + s6);
  x3 = dct_const_round_shift(s3 + s7);
  x4 = dct_const_round_shift(s0 - s4);
  x5 = dct_const_round_shift(s1 - s5);
  x6 = dct_const_round_shift(s2 - s6);
  x7 = dct_const_round_shift(s3 - s7);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
  s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
  s6 = - cospi_24_64 * x6 + cospi_8_64 * x7;
  s7 = cospi_8_64 * x6 + cospi_24_64 * x7;

  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = dct_const_round_shift(s4 + s6);
  x5 = dct_const_round_shift(s5 + s7);
  x6 = dct_const_round_shift(s4 - s6);
  x7 = dct_const_round_shift(s5 - s7);

  // stage 3
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = dct_const_round_shift(s2);
  x3 = dct_const_round_shift(s3);
  x6 = dct_const_round_shift(s6);
  x7 = dct_const_round_shift(s7);

  output[0] = x0;
  output[1] = - x4;
  output[2] = x6;
  output[3] = - x2;
  output[4] = x3;
  output[5] = - x7;
  output[6] = x5;
  output[7] = - x1;
}

static const transform_2d FHT_8[] = {
  { fdct8, fdct8 },    // DCT_DCT  = 0
  { fadst8, fdct8 },   // ADST_DCT = 1
  { fdct8, fadst8 },   // DCT_ADST = 2
  { fadst8, fadst8 }   // ADST_ADST = 3
};

void vp9_short_fht8x8_c(const int16_t *input, int16_t *output,
                        int stride, int tx_type) {
  int16_t out[64];
  int16_t *outptr = &out[0];
  int i, j;
  int16_t temp_in[8], temp_out[8];
  const transform_2d ht = FHT_8[tx_type];

  // Columns
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = input[j * stride + i] * 4;
    ht.cols(temp_in, temp_out);
    for (j = 0; j < 8; ++j)
      outptr[j * 8 + i] = temp_out[j];
  }

  // Rows
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j)
      temp_in[j] = out[j + i * 8];
    ht.rows(temp_in, temp_out);
    for (j = 0; j < 8; ++j)
      output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
  }
}
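// vp9_fwht4x4_c() below is the transform used on the lossless coding path:
// the first loop transforms columns into `output`, the second re-reads
// `output` and transforms the rows in place, scaling the results by
// UNIT_QUANT_FACTOR.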
/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
   pixel. */
void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) {
  int i;
  int a1, b1, c1, d1, e1;
  const int16_t *ip = input;
  int16_t *op = output;

  for (i = 0; i < 4; i++) {
    a1 = ip[0 * stride];
    b1 = ip[1 * stride];
    c1 = ip[2 * stride];
    d1 = ip[3 * stride];

    a1 += b1;
    d1 = d1 - c1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= c1;
    d1 += b1;
    op[0] = a1;
    op[4] = c1;
    op[8] = d1;
    op[12] = b1;

    ip++;
    op++;
  }
  ip = output;
  op = output;

  for (i = 0; i < 4; i++) {
    a1 = ip[0];
    b1 = ip[1];
    c1 = ip[2];
    d1 = ip[3];

    a1 += b1;
    d1 -= c1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= c1;
    d1 += b1;
    op[0] = a1 * UNIT_QUANT_FACTOR;
    op[1] = c1 * UNIT_QUANT_FACTOR;
    op[2] = d1 * UNIT_QUANT_FACTOR;
    op[3] = b1 * UNIT_QUANT_FACTOR;

    ip += 4;
    op += 4;
  }
}

// Rewrote to use same algorithm as others.
static void fdct16(const int16_t in[16], int16_t out[16]) {
  /*canbe16*/ int step1[8];
  /*canbe16*/ int step2[8];
  /*canbe16*/ int step3[8];
  /*canbe16*/ int input[8];
  /*needs32*/ int temp1, temp2;

  // step 1
  input[0] = in[0] + in[15];
  input[1] = in[1] + in[14];
  input[2] = in[2] + in[13];
  input[3] = in[3] + in[12];
  input[4] = in[4] + in[11];
  input[5] = in[5] + in[10];
  input[6] = in[6] + in[ 9];
  input[7] = in[7] + in[ 8];

  step1[0] = in[7] - in[ 8];
  step1[1] = in[6] - in[ 9];
  step1[2] = in[5] - in[10];
  step1[3] = in[4] - in[11];
  step1[4] = in[3] - in[12];
  step1[5] = in[2] - in[13];
  step1[6] = in[1] - in[14];
  step1[7] = in[0] - in[15];

  // fdct8(step, step);
  {
    /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
    /*needs32*/ int t0, t1, t2, t3;
    /*canbe16*/ int x0, x1, x2, x3;

    // stage 1
    s0 = input[0] + input[7];
    s1 = input[1] + input[6];
    s2 = input[2] + input[5];
    s3 = input[3] + input[4];
    s4 = input[3] - input[4];
    s5 = input[2] - input[5];
    s6 = input[1] - input[6];
    s7 = input[0] - input[7];

    // fdct4(step, step);
    x0 = s0 + s3;
    x1 = s1 + s2;
    x2 = s1 - s2;
    x3 = s0 - s3;
    t0 = (x0 + x1) * cospi_16_64;
    t1 = (x0 - x1) * cospi_16_64;
    t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
    t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
    out[0] = dct_const_round_shift(t0);
    out[4] = dct_const_round_shift(t2);
    out[8] = dct_const_round_shift(t1);
    out[12] = dct_const_round_shift(t3);

    // Stage 2
    t0 = (s6 - s5) * cospi_16_64;
    t1 = (s6 + s5) * cospi_16_64;
    t2 = dct_const_round_shift(t0);
    t3 = dct_const_round_shift(t1);

    // Stage 3
    x0 = s4 + t2;
    x1 = s4 - t2;
    x2 = s7 - t3;
    x3 = s7 + t3;

    // Stage 4
    t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
    t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
    t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
    t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
    out[2] = dct_const_round_shift(t0);
    out[6] = dct_const_round_shift(t2);
    out[10] = dct_const_round_shift(t1);
    out[14] = dct_const_round_shift(t3);
  }

  // step 2
  temp1 = (step1[5] - step1[2]) * cospi_16_64;
  temp2 = (step1[4] - step1[3]) * cospi_16_64;
  step2[2] = dct_const_round_shift(temp1);
  step2[3] = dct_const_round_shift(temp2);
  temp1 = (step1[4] + step1[3]) * cospi_16_64;
  temp2 = (step1[5] + step1[2]) * cospi_16_64;
  step2[4] = dct_const_round_shift(temp1);
  step2[5] = dct_const_round_shift(temp2);

  // step 3
  step3[0] = step1[0] + step2[3];
  step3[1] = step1[1] + step2[2];
  step3[2] = step1[1] - step2[2];
  step3[3] = step1[0] - step2[3];
  step3[4] = step1[7] - step2[4];
  step3[5] = step1[6] - step2[5];
  step3[6] = step1[6] + step2[5];
  step3[7] = step1[7] + step2[4];

  // step 4
  temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
  temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64;
  step2[1] = dct_const_round_shift(temp1);
  step2[2] = dct_const_round_shift(temp2);
  temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64;
  temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
  step2[5] = dct_const_round_shift(temp1);
  step2[6] = dct_const_round_shift(temp2);

  // step 5
  step1[0] = step3[0] + step2[1];
  step1[1] = step3[0] - step2[1];
  step1[2] = step3[3] - step2[2];
  step1[3] = step3[3] + step2[2];
  step1[4] = step3[4] + step2[5];
  step1[5] = step3[4] - step2[5];
  step1[6] = step3[7] - step2[6];
  step1[7] = step3[7] + step2[6];

  // step 6
  temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
  temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
  out[1] = dct_const_round_shift(temp1);
  out[9] = dct_const_round_shift(temp2);

  temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
  temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
  out[5] = dct_const_round_shift(temp1);
  out[13] = dct_const_round_shift(temp2);

  temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
  temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
  out[3] = dct_const_round_shift(temp1);
  out[11] = dct_const_round_shift(temp2);

  temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
  temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
  out[7] = dct_const_round_shift(temp1);
  out[15] = dct_const_round_shift(temp2);
}

static void fadst16(const int16_t *input, int16_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;

  int x0 = input[15];
  int x1 = input[0];
  int x2 = input[13];
  int x3 = input[2];
  int x4 = input[11];
  int x5 = input[4];
  int x6 = input[9];
  int x7 = input[6];
  int x8 = input[7];
  int x9 = input[8];
  int x10 = input[5];
  int x11 = input[10];
  int x12 = input[3];
  int x13 = input[12];
  int x14 = input[1];
  int x15 = input[14];

  // stage 1
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  x0 = dct_const_round_shift(s0 + s8);
  x1 = dct_const_round_shift(s1 + s9);
  x2 = dct_const_round_shift(s2 + s10);
  x3 = dct_const_round_shift(s3 + s11);
  x4 = dct_const_round_shift(s4 + s12);
  x5 = dct_const_round_shift(s5 + s13);
  x6 = dct_const_round_shift(s6 + s14);
  x7 = dct_const_round_shift(s7 + s15);
  x8 = dct_const_round_shift(s0 - s8);
  x9 = dct_const_round_shift(s1 - s9);
  x10 = dct_const_round_shift(s2 - s10);
  x11 = dct_const_round_shift(s3 - s11);
  x12 = dct_const_round_shift(s4 - s12);
  x13 = dct_const_round_shift(s5 - s13);
  x14 = dct_const_round_shift(s6 - s14);
  x15 = dct_const_round_shift(s7 - s15);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = s0 + s4;
  x1 = s1 + s5;
  x2 = s2 + s6;
  x3 = s3 + s7;
  x4 = s0 - s4;
  x5 = s1 - s5;
  x6 = s2 - s6;
  x7 = s3 - s7;
  x8 = dct_const_round_shift(s8 + s12);
  x9 = dct_const_round_shift(s9 + s13);
  x10 = dct_const_round_shift(s10 + s14);
  x11 = dct_const_round_shift(s11 + s15);
  x12 = dct_const_round_shift(s8 - s12);
  x13 = dct_const_round_shift(s9 - s13);
  x14 = dct_const_round_shift(s10 - s14);
  x15 = dct_const_round_shift(s11 - s15);

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = dct_const_round_shift(s4 + s6);
  x5 = dct_const_round_shift(s5 + s7);
  x6 = dct_const_round_shift(s4 - s6);
  x7 = dct_const_round_shift(s5 - s7);
  x8 = s8 + s10;
  x9 = s9 + s11;
  x10 = s8 - s10;
  x11 = s9 - s11;
  x12 = dct_const_round_shift(s12 + s14);
  x13 = dct_const_round_shift(s13 + s15);
  x14 = dct_const_round_shift(s12 - s14);
  x15 = dct_const_round_shift(s13 - s15);

  // stage 4
  s2 = (- cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (- x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (- x10 + x11);
  s14 = (- cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = dct_const_round_shift(s2);
  x3 = dct_const_round_shift(s3);
  x6 = dct_const_round_shift(s6);
  x7 = dct_const_round_shift(s7);
  x10 = dct_const_round_shift(s10);
  x11 = dct_const_round_shift(s11);
  x14 = dct_const_round_shift(s14);
  x15 = dct_const_round_shift(s15);

  output[0] = x0;
  output[1] = - x8;
  output[2] = x12;
  output[3] = - x4;
  output[4] = x6;
  output[5] = x14;
  output[6] = x10;
  output[7] = x2;
  output[8] = x3;
  output[9] = x11;
  output[10] = x15;
  output[11] = x7;
  output[12] = x5;
  output[13] = - x13;
  output[14] = x9;
  output[15] = - x1;
}

static const transform_2d FHT_16[] = {
  { fdct16, fdct16 },    // DCT_DCT  = 0
  { fadst16, fdct16 },   // ADST_DCT = 1
  { fdct16, fadst16 },   // DCT_ADST = 2
  { fadst16, fadst16 }   // ADST_ADST = 3
};
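// In each FHT_* table above, the first entry of a pair is applied to the
// columns (ht.cols) and the second to the rows (ht.rows), so e.g. ADST_DCT
// runs the ADST down the columns and the DCT across the rows. Unlike the
// 4x4 and 8x8 hybrids, vp9_short_fht16x16_c() below rescales after the
// column pass rather than after the row pass, mirroring vp9_fdct16x16_c's
// between-pass scaling.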
void vp9_short_fht16x16_c(const int16_t *input, int16_t *output,
                          int stride, int tx_type) {
  int16_t out[256];
  int16_t *outptr = &out[0];
  int i, j;
  int16_t temp_in[16], temp_out[16];
  const transform_2d ht = FHT_16[tx_type];

  // Columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = input[j * stride + i] * 4;
    ht.cols(temp_in, temp_out);
    for (j = 0; j < 16; ++j)
      outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
    // outptr[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
  }

  // Rows
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j)
      temp_in[j] = out[j + i * 16];
    ht.rows(temp_in, temp_out);
    for (j = 0; j < 16; ++j)
      output[j + i * 16] = temp_out[j];
  }
}

static INLINE int dct_32_round(int input) {
  int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
  assert(-131072 <= rv && rv <= 131071);
  return rv;
}

static INLINE int half_round_shift(int input) {
  int rv = (input + 1 + (input < 0)) >> 2;
  return rv;
}

static void dct32_1d(const int *input, int *output, int round) {
  int step[32];
  // Stage 1
  step[0] = input[0] + input[(32 - 1)];
  step[1] = input[1] + input[(32 - 2)];
  step[2] = input[2] + input[(32 - 3)];
  step[3] = input[3] + input[(32 - 4)];
  step[4] = input[4] + input[(32 - 5)];
  step[5] = input[5] + input[(32 - 6)];
  step[6] = input[6] + input[(32 - 7)];
  step[7] = input[7] + input[(32 - 8)];
  step[8] = input[8] + input[(32 - 9)];
  step[9] = input[9] + input[(32 - 10)];
  step[10] = input[10] + input[(32 - 11)];
  step[11] = input[11] + input[(32 - 12)];
  step[12] = input[12] + input[(32 - 13)];
  step[13] = input[13] + input[(32 - 14)];
  step[14] = input[14] + input[(32 - 15)];
  step[15] = input[15] + input[(32 - 16)];
  step[16] = -input[16] + input[(32 - 17)];
  step[17] = -input[17] + input[(32 - 18)];
  step[18] = -input[18] + input[(32 - 19)];
  step[19] = -input[19] + input[(32 - 20)];
  step[20] = -input[20] + input[(32 - 21)];
  step[21] = -input[21] + input[(32 - 22)];
  step[22] = -input[22] + input[(32 - 23)];
  step[23] = -input[23] + input[(32 - 24)];
  step[24] = -input[24] + input[(32 - 25)];
  step[25] = -input[25] + input[(32 - 26)];
  step[26] = -input[26] + input[(32 - 27)];
  step[27] = -input[27] + input[(32 - 28)];
  step[28] = -input[28] + input[(32 - 29)];
  step[29] = -input[29] + input[(32 - 30)];
  step[30] = -input[30] + input[(32 - 31)];
  step[31] = -input[31] + input[(32 - 32)];

  // Stage 2
  output[0] = step[0] + step[16 - 1];
  output[1] = step[1] + step[16 - 2];
  output[2] = step[2] + step[16 - 3];
  output[3] = step[3] + step[16 - 4];
  output[4] = step[4] + step[16 - 5];
  output[5] = step[5] + step[16 - 6];
  output[6] = step[6] + step[16 - 7];
  output[7] = step[7] + step[16 - 8];
  output[8] = -step[8] + step[16 - 9];
  output[9] = -step[9] + step[16 - 10];
  output[10] = -step[10] + step[16 - 11];
  output[11] = -step[11] + step[16 - 12];
  output[12] = -step[12] + step[16 - 13];
  output[13] = -step[13] + step[16 - 14];
  output[14] = -step[14] + step[16 - 15];
  output[15] = -step[15] + step[16 - 16];

  output[16] = step[16];
  output[17] = step[17];
  output[18] = step[18];
  output[19] = step[19];

  output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
  output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
  output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
  output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);

  output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
  output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
  output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
  output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);

  output[28] = step[28];
  output[29] = step[29];
  output[30] = step[30];
  output[31] = step[31];

  // Reduce the magnitude by a factor of 4 so that the intermediate values
  // stay within the range of 16 bits.
  if (round) {
    output[0] = half_round_shift(output[0]);
    output[1] = half_round_shift(output[1]);
    output[2] = half_round_shift(output[2]);
    output[3] = half_round_shift(output[3]);
    output[4] = half_round_shift(output[4]);
    output[5] = half_round_shift(output[5]);
    output[6] = half_round_shift(output[6]);
    output[7] = half_round_shift(output[7]);
    output[8] = half_round_shift(output[8]);
    output[9] = half_round_shift(output[9]);
    output[10] = half_round_shift(output[10]);
    output[11] = half_round_shift(output[11]);
    output[12] = half_round_shift(output[12]);
    output[13] = half_round_shift(output[13]);
    output[14] = half_round_shift(output[14]);
    output[15] = half_round_shift(output[15]);

    output[16] = half_round_shift(output[16]);
    output[17] = half_round_shift(output[17]);
    output[18] = half_round_shift(output[18]);
    output[19] = half_round_shift(output[19]);
    output[20] = half_round_shift(output[20]);
    output[21] = half_round_shift(output[21]);
    output[22] = half_round_shift(output[22]);
    output[23] = half_round_shift(output[23]);
    output[24] = half_round_shift(output[24]);
    output[25] = half_round_shift(output[25]);
    output[26] = half_round_shift(output[26]);
    output[27] = half_round_shift(output[27]);
    output[28] = half_round_shift(output[28]);
    output[29] = half_round_shift(output[29]);
    output[30] = half_round_shift(output[30]);
    output[31] = half_round_shift(output[31]);
  }

  // Stage 3
  step[0] = output[0] + output[(8 - 1)];
  step[1] = output[1] + output[(8 - 2)];
  step[2] = output[2] + output[(8 - 3)];
  step[3] = output[3] + output[(8 - 4)];
  step[4] = -output[4] + output[(8 - 5)];
  step[5] = -output[5] + output[(8 - 6)];
  step[6] = -output[6] + output[(8 - 7)];
  step[7] = -output[7] + output[(8 - 8)];
  step[8] = output[8];
  step[9] = output[9];
  step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
  step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
  step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
  step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
  step[14] = output[14];
  step[15] = output[15];

  step[16] = output[16] + output[23];
  step[17] = output[17] + output[22];
  step[18] = output[18] + output[21];
  step[19] = output[19] + output[20];
  step[20] = -output[20] + output[19];
  step[21] = -output[21] + output[18];
  step[22] = -output[22] + output[17];
  step[23] = -output[23] + output[16];
  step[24] = -output[24] + output[31];
  step[25] = -output[25] + output[30];
  step[26] = -output[26] + output[29];
  step[27] = -output[27] + output[28];
  step[28] = output[28] + output[27];
  step[29] = output[29] + output[26];
  step[30] = output[30] + output[25];
  step[31] = output[31] + output[24];

  // Stage 4
  output[0] = step[0] + step[3];
  output[1] = step[1] + step[2];
  output[2] = -step[2] + step[1];
  output[3] = -step[3] + step[0];
  output[4] = step[4];
  output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
  output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
  output[7] = step[7];
  output[8] = step[8] + step[11];
  output[9] = step[9] + step[10];
  output[10] = -step[10] + step[9];
  output[11] = -step[11] + step[8];
  output[12] = -step[12] + step[15];
  output[13] = -step[13] + step[14];
  output[14] = step[14] + step[13];
  output[15] = step[15] + step[12];

  output[16] = step[16];
  output[17] = step[17];
  output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
  output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
  output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
  output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
  output[22] = step[22];
  output[23] = step[23];
  output[24] = step[24];
  output[25] = step[25];
  output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
  output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
  output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
  output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
  output[30] = step[30];
  output[31] = step[31];

  // Stage 5
  step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
  step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
  step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
  step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
  step[4] = output[4] + output[5];
  step[5] = -output[5] + output[4];
  step[6] = -output[6] + output[7];
  step[7] = output[7] + output[6];
  step[8] = output[8];
  step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
  step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
  step[11] = output[11];
  step[12] = output[12];
  step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
  step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
  step[15] = output[15];

  step[16] = output[16] + output[19];
  step[17] = output[17] + output[18];
  step[18] = -output[18] + output[17];
  step[19] = -output[19] + output[16];
  step[20] = -output[20] + output[23];
  step[21] = -output[21] + output[22];
  step[22] = output[22] + output[21];
  step[23] = output[23] + output[20];
  step[24] = output[24] + output[27];
  step[25] = output[25] + output[26];
  step[26] = -output[26] + output[25];
  step[27] = -output[27] + output[24];
  step[28] = -output[28] + output[31];
  step[29] = -output[29] + output[30];
  step[30] = output[30] + output[29];
  step[31] = output[31] + output[28];

  // Stage 6
  output[0] = step[0];
  output[1] = step[1];
  output[2] = step[2];
  output[3] = step[3];
  output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
  output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
  output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
  output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
  output[8] = step[8] + step[9];
  output[9] = -step[9] + step[8];
  output[10] = -step[10] + step[11];
  output[11] = step[11] + step[10];
  output[12] = step[12] + step[13];
  output[13] = -step[13] + step[12];
  output[14] = -step[14] + step[15];
  output[15] = step[15] + step[14];

  output[16] = step[16];
  output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
  output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
  output[19] = step[19];
  output[20] = step[20];
  output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
  output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
  output[23] = step[23];
  output[24] = step[24];
  output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
  output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
  output[27] = step[27];
  output[28] = step[28];
  output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
  output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
  output[31] = step[31];

  // Stage 7
  step[0] = output[0];
  step[1] = output[1];
  step[2] = output[2];
  step[3] = output[3];
  step[4] = output[4];
  step[5] = output[5];
  step[6] = output[6];
  step[7] = output[7];
  step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
  step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
  step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
  step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
  step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
  step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
  step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
  step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);

  step[16] = output[16] + output[17];
  step[17] = -output[17] + output[16];
  step[18] = -output[18] + output[19];
  step[19] = output[19] + output[18];
  step[20] = output[20] + output[21];
  step[21] = -output[21] + output[20];
  step[22] = -output[22] + output[23];
  step[23] = output[23] + output[22];
  step[24] = output[24] + output[25];
  step[25] = -output[25] + output[24];
  step[26] = -output[26] + output[27];
  step[27] = output[27] + output[26];
  step[28] = output[28] + output[29];
  step[29] = -output[29] + output[28];
  step[30] = -output[30] + output[31];
  step[31] = output[31] + output[30];

  // Final stage --- output indices are bit-reversed.
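  // (Each result lands at the 5-bit bit reversal of its step index, e.g.
  // step[1] -> output[16], step[2] -> output[8], step[3] -> output[24];
  // the upper half, step[16..31], also gets its final rotation here.)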
  output[0] = step[0];
  output[16] = step[1];
  output[8] = step[2];
  output[24] = step[3];
  output[4] = step[4];
  output[20] = step[5];
  output[12] = step[6];
  output[28] = step[7];
  output[2] = step[8];
  output[18] = step[9];
  output[10] = step[10];
  output[26] = step[11];
  output[6] = step[12];
  output[22] = step[13];
  output[14] = step[14];
  output[30] = step[15];

  output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
  output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
  output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
  output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
  output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
  output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
  output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
  output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
  output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
  output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
  output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
  output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
  output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
  output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
  output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
  output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
}

void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
  int i, j;
  int output[32 * 32];

  // Columns
  for (i = 0; i < 32; ++i) {
    int temp_in[32], temp_out[32];
    for (j = 0; j < 32; ++j)
      temp_in[j] = input[j * stride + i] * 4;
    dct32_1d(temp_in, temp_out, 0);
    for (j = 0; j < 32; ++j)
      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
  }

  // Rows
  for (i = 0; i < 32; ++i) {
    int temp_in[32], temp_out[32];
    for (j = 0; j < 32; ++j)
      temp_in[j] = output[j + i * 32];
    dct32_1d(temp_in, temp_out, 0);
    for (j = 0; j < 32; ++j)
      out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
  }
}
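// The `round` argument of dct32_1d() selects whether the Stage 2 results are
// passed through half_round_shift(). vp9_fdct32x32_c above never enables it
// and keeps full precision in both passes; vp9_fdct32x32_rd_c below enables
// it on the row pass so the intermediate values fit in 16 bits.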
// Note that although we use dct_32_round() in the dct32_1d computation flow,
// this 2-D fdct32x32 variant for the rate-distortion optimization loop
// operates within 16-bit precision.
void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
  int i, j;
  int output[32 * 32];

  // Columns
  for (i = 0; i < 32; ++i) {
    int temp_in[32], temp_out[32];
    for (j = 0; j < 32; ++j)
      temp_in[j] = input[j * stride + i] * 4;
    dct32_1d(temp_in, temp_out, 0);
    for (j = 0; j < 32; ++j)
      // TODO(cd): see quality impact of only doing
      // output[j * 32 + i] = (temp_out[j] + 1) >> 2;
      // PS: also change code in vp9/encoder/x86/vp9_dct_sse2.c
      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
  }

  // Rows
  for (i = 0; i < 32; ++i) {
    int temp_in[32], temp_out[32];
    for (j = 0; j < 32; ++j)
      temp_in[j] = output[j + i * 32];
    dct32_1d(temp_in, temp_out, 1);
    for (j = 0; j < 32; ++j)
      out[j + i * 32] = temp_out[j];
  }
}

void vp9_fht4x4(TX_TYPE tx_type, const int16_t *input, int16_t *output,
                int stride) {
  if (tx_type == DCT_DCT)
    vp9_fdct4x4(input, output, stride);
  else
    vp9_short_fht4x4(input, output, stride, tx_type);
}

void vp9_fht8x8(TX_TYPE tx_type, const int16_t *input, int16_t *output,
                int stride) {
  if (tx_type == DCT_DCT)
    vp9_fdct8x8(input, output, stride);
  else
    vp9_short_fht8x8(input, output, stride, tx_type);
}

void vp9_fht16x16(TX_TYPE tx_type, const int16_t *input, int16_t *output,
                  int stride) {
  if (tx_type == DCT_DCT)
    vp9_fdct16x16(input, output, stride);
  else
    vp9_short_fht16x16(input, output, stride, tx_type);
}