/*
 * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/fwd_txfm.h"

// Forward 4x4 transform.
//
// input:  source samples; sample (row r, column c) is read from
//         input[r * stride + c] for r, c in [0, 4).
// output: receives the 16 resulting coefficients, output[0..15].
// stride: distance, in elements, between vertically adjacent input samples.
//
// Scaling visible in this function: first-pass inputs are multiplied by 16,
// and every final coefficient goes through (x + 1) >> 2.
void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
  // as the first pass results are transposed, we transpose the columns (that
  // is the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
  int pass;
  // We need an intermediate buffer between passes.
  tran_low_t intermediate[4 * 4];
  const tran_low_t *in_low = NULL;
  tran_low_t *out = intermediate;
  // Do the two transform/transpose passes
  for (pass = 0; pass < 2; ++pass) {
    // NOTE(review): "canbe16"/"needs32" look like range annotations (values
    // that fit in 16 bits vs. values that need 32) -- confirm with authors.
    tran_high_t in_high[4];    // canbe16
    tran_high_t step[4];       // canbe16
    tran_high_t temp1, temp2;  // needs32
    int i;
    for (i = 0; i < 4; ++i) {
      // Load inputs.
      if (pass == 0) {
        // First pass: read one column of the source block, scaled by 16.
        in_high[0] = input[0 * stride] * 16;
        in_high[1] = input[1 * stride] * 16;
        in_high[2] = input[2 * stride] * 16;
        in_high[3] = input[3 * stride] * 16;
        // Only the top-left sample is biased: +1 when it is nonzero.
        if (i == 0 && in_high[0]) {
          ++in_high[0];
        }
      } else {
        // Second pass: read one column of the (transposed) first-pass result.
        assert(in_low != NULL);
        in_high[0] = in_low[0 * 4];
        in_high[1] = in_low[1 * 4];
        in_high[2] = in_low[2 * 4];
        in_high[3] = in_low[3 * 4];
        ++in_low;
      }
      // Transform.
      // Butterfly: sums of mirrored pairs feed out[0]/out[2], differences
      // feed out[1]/out[3].
      step[0] = in_high[0] + in_high[3];
      step[1] = in_high[1] + in_high[2];
      step[2] = in_high[1] - in_high[2];
      step[3] = in_high[0] - in_high[3];
      temp1 = (step[0] + step[1]) * cospi_16_64;
      temp2 = (step[0] - step[1]) * cospi_16_64;
      out[0] = (tran_low_t)fdct_round_shift(temp1);
      out[2] = (tran_low_t)fdct_round_shift(temp2);
      temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64;
      temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64;
      out[1] = (tran_low_t)fdct_round_shift(temp1);
      out[3] = (tran_low_t)fdct_round_shift(temp2);
      // Do next column (which is a transposed row in second/horizontal pass)
      // `input` is also advanced during the second pass, but it is not read
      // there.
      ++input;
      out += 4;
    }
    // Setup in/out for next pass.
    in_low = intermediate;
    out = output;
  }

  // Final scaling of all 16 coefficients: add 1, then shift right by 2.
  {
    int i, j;
    for (i = 0; i < 4; ++i) {
      for (j = 0; j < 4; ++j) output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
    }
  }
}

// 4x4 variant that only produces the first coefficient: sums the 16 input
// samples and stores twice that sum in output[0]. No other element of
// `output` is written.
void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
  int r, c;
  tran_low_t sum = 0;
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c) sum += input[r * stride + c];

  output[0] = sum * 2;
}

// Forward 8x8 transform.
//
// input:  source samples; sample (row r, column c) is read from
//         input[r * stride + c] for r, c in [0, 8).
// output: receives the 64 resulting coefficients, output[0..63].
// stride: distance, in elements, between vertically adjacent input samples.
//
// Scaling visible in this function: first-pass inputs are multiplied by 4,
// and every final coefficient is divided by 2.
void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride) {
  int i, j;
  tran_low_t intermediate[64];
  int pass;
  tran_low_t *out = intermediate;
  const tran_low_t *in = NULL;

  // Two passes: pass 0 reads columns of `input` and writes `intermediate`;
  // pass 1 reads columns of `intermediate` and writes `output`.
  for (pass = 0; pass < 2; ++pass) {
    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
    tran_high_t t0, t1, t2, t3;                  // needs32
    tran_high_t x0, x1, x2, x3;                  // canbe16

    for (i = 0; i < 8; i++) {
      // stage 1
      // s0..s3 are sums of mirrored pairs (k, 7 - k); s4..s7 are the
      // corresponding differences.
      if (pass == 0) {
        s0 = (input[0 * stride] + input[7 * stride]) * 4;
        s1 = (input[1 * stride] + input[6 * stride]) * 4;
        s2 = (input[2 * stride] + input[5 * stride]) * 4;
        s3 = (input[3 * stride] + input[4 * stride]) * 4;
        s4 = (input[3 * stride] - input[4 * stride]) * 4;
        s5 = (input[2 * stride] - input[5 * stride]) * 4;
        s6 = (input[1 * stride] - input[6 * stride]) * 4;
        s7 = (input[0 * stride] - input[7 * stride]) * 4;
        ++input;
      } else {
        s0 = in[0 * 8] + in[7 * 8];
        s1 = in[1 * 8] + in[6 * 8];
        s2 = in[2 * 8] + in[5 * 8];
        s3 = in[3 * 8] + in[4 * 8];
        s4 = in[3 * 8] - in[4 * 8];
        s5 = in[2 * 8] - in[5 * 8];
        s6 = in[1 * 8] - in[6 * 8];
        s7 = in[0 * 8] - in[7 * 8];
        ++in;
      }

      // fdct4(step, step);
      // Even-indexed outputs come from the four sums s0..s3.
      x0 = s0 + s3;
      x1 = s1 + s2;
      x2 = s1 - s2;
      x3 = s0 - s3;
      t0 = (x0 + x1) * cospi_16_64;
      t1 = (x0 - x1) * cospi_16_64;
      t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
      t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
      out[0] = (tran_low_t)fdct_round_shift(t0);
      out[2] = (tran_low_t)fdct_round_shift(t2);
      out[4] = (tran_low_t)fdct_round_shift(t1);
      out[6] = (tran_low_t)fdct_round_shift(t3);

      // Stage 2
      // Odd-indexed outputs come from the four differences s4..s7.
      t0 = (s6 - s5) * cospi_16_64;
      t1 = (s6 + s5) * cospi_16_64;
      t2 = fdct_round_shift(t0);
      t3 = fdct_round_shift(t1);

      // Stage 3
      x0 = s4 + t2;
      x1 = s4 - t2;
      x2 = s7 - t3;
      x3 = s7 + t3;

      // Stage 4
      t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
      t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
      t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
      t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
      out[1] = (tran_low_t)fdct_round_shift(t0);
      out[3] = (tran_low_t)fdct_round_shift(t2);
      out[5] = (tran_low_t)fdct_round_shift(t1);
      out[7] = (tran_low_t)fdct_round_shift(t3);
      out += 8;
    }
    // Setup in/out for the second pass.
    in = intermediate;
    out = output;
  }

  // Final scaling: halve every coefficient (integer division by 2).
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j) output[j + i * 8] /= 2;
  }
}

// 8x8 variant that only produces the first coefficient: sums the 64 input
// samples and stores that sum, unscaled, in output[0]. No other element of
// `output` is written.
void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
  int r, c;
  tran_low_t sum = 0;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c) sum += input[r * stride + c];

  output[0] = sum;
}

// Forward 16x16 transform.
//
// input:  source samples; sample (row r, column c) is read from
//         input[r * stride + c] for r, c in [0, 16).
// output: receives the 256 resulting coefficients, output[0..255].
// stride: distance, in elements, between vertically adjacent input samples.
//
// Scaling visible in this function: first-pass inputs are multiplied by 4,
// and each first-pass result goes through (x + 1) >> 2 as it is loaded for
// the second pass. Nothing further is applied to the final output.
void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
  // as the first pass results are transposed, we transpose the columns (that
  // is the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
  int pass;
  // We need an intermediate buffer between passes.
  tran_low_t intermediate[256];
  const tran_low_t *in_low = NULL;
  tran_low_t *out = intermediate;
  // Do the two transform/transpose passes
  for (pass = 0; pass < 2; ++pass) {
    tran_high_t step1[8];      // canbe16
    tran_high_t step2[8];      // canbe16
    tran_high_t step3[8];      // canbe16
    tran_high_t in_high[8];    // canbe16
    tran_high_t temp1, temp2;  // needs32
    int i;
    for (i = 0; i < 16; i++) {
      // in_high[] receives the sums of mirrored pairs (k, 15 - k) and feeds
      // the even-indexed outputs; step1[] receives the differences and feeds
      // the odd-indexed outputs.
      if (0 == pass) {
        // Calculate input for the first 8 results.
        in_high[0] = (input[0 * stride] + input[15 * stride]) * 4;
        in_high[1] = (input[1 * stride] + input[14 * stride]) * 4;
        in_high[2] = (input[2 * stride] + input[13 * stride]) * 4;
        in_high[3] = (input[3 * stride] + input[12 * stride]) * 4;
        in_high[4] = (input[4 * stride] + input[11 * stride]) * 4;
        in_high[5] = (input[5 * stride] + input[10 * stride]) * 4;
        in_high[6] = (input[6 * stride] + input[9 * stride]) * 4;
        in_high[7] = (input[7 * stride] + input[8 * stride]) * 4;
        // Calculate input for the next 8 results.
        step1[0] = (input[7 * stride] - input[8 * stride]) * 4;
        step1[1] = (input[6 * stride] - input[9 * stride]) * 4;
        step1[2] = (input[5 * stride] - input[10 * stride]) * 4;
        step1[3] = (input[4 * stride] - input[11 * stride]) * 4;
        step1[4] = (input[3 * stride] - input[12 * stride]) * 4;
        step1[5] = (input[2 * stride] - input[13 * stride]) * 4;
        step1[6] = (input[1 * stride] - input[14 * stride]) * 4;
        step1[7] = (input[0 * stride] - input[15 * stride]) * 4;
      } else {
        // Calculate input for the first 8 results.
        // Each first-pass value is rounded with (x + 1) >> 2 before the
        // pair is combined.
        assert(in_low != NULL);
        in_high[0] = ((in_low[0 * 16] + 1) >> 2) + ((in_low[15 * 16] + 1) >> 2);
        in_high[1] = ((in_low[1 * 16] + 1) >> 2) + ((in_low[14 * 16] + 1) >> 2);
        in_high[2] = ((in_low[2 * 16] + 1) >> 2) + ((in_low[13 * 16] + 1) >> 2);
        in_high[3] = ((in_low[3 * 16] + 1) >> 2) + ((in_low[12 * 16] + 1) >> 2);
        in_high[4] = ((in_low[4 * 16] + 1) >> 2) + ((in_low[11 * 16] + 1) >> 2);
        in_high[5] = ((in_low[5 * 16] + 1) >> 2) + ((in_low[10 * 16] + 1) >> 2);
        in_high[6] = ((in_low[6 * 16] + 1) >> 2) + ((in_low[9 * 16] + 1) >> 2);
        in_high[7] = ((in_low[7 * 16] + 1) >> 2) + ((in_low[8 * 16] + 1) >> 2);
        // Calculate input for the next 8 results.
        step1[0] = ((in_low[7 * 16] + 1) >> 2) - ((in_low[8 * 16] + 1) >> 2);
        step1[1] = ((in_low[6 * 16] + 1) >> 2) - ((in_low[9 * 16] + 1) >> 2);
        step1[2] = ((in_low[5 * 16] + 1) >> 2) - ((in_low[10 * 16] + 1) >> 2);
        step1[3] = ((in_low[4 * 16] + 1) >> 2) - ((in_low[11 * 16] + 1) >> 2);
        step1[4] = ((in_low[3 * 16] + 1) >> 2) - ((in_low[12 * 16] + 1) >> 2);
        step1[5] = ((in_low[2 * 16] + 1) >> 2) - ((in_low[13 * 16] + 1) >> 2);
        step1[6] = ((in_low[1 * 16] + 1) >> 2) - ((in_low[14 * 16] + 1) >> 2);
        step1[7] = ((in_low[0 * 16] + 1) >> 2) - ((in_low[15 * 16] + 1) >> 2);
        in_low++;
      }
      // Work on the first eight values; fdct8(input, even_results);
      // Writes out[0], out[2], ..., out[14].
      {
        tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
        tran_high_t t0, t1, t2, t3;                  // needs32
        tran_high_t x0, x1, x2, x3;                  // canbe16

        // stage 1
        s0 = in_high[0] + in_high[7];
        s1 = in_high[1] + in_high[6];
        s2 = in_high[2] + in_high[5];
        s3 = in_high[3] + in_high[4];
        s4 = in_high[3] - in_high[4];
        s5 = in_high[2] - in_high[5];
        s6 = in_high[1] - in_high[6];
        s7 = in_high[0] - in_high[7];

        // fdct4(step, step);
        x0 = s0 + s3;
        x1 = s1 + s2;
        x2 = s1 - s2;
        x3 = s0 - s3;
        t0 = (x0 + x1) * cospi_16_64;
        t1 = (x0 - x1) * cospi_16_64;
        t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
        t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
        out[0] = (tran_low_t)fdct_round_shift(t0);
        out[4] = (tran_low_t)fdct_round_shift(t2);
        out[8] = (tran_low_t)fdct_round_shift(t1);
        out[12] = (tran_low_t)fdct_round_shift(t3);

        // Stage 2
        t0 = (s6 - s5) * cospi_16_64;
        t1 = (s6 + s5) * cospi_16_64;
        t2 = fdct_round_shift(t0);
        t3 = fdct_round_shift(t1);

        // Stage 3
        x0 = s4 + t2;
        x1 = s4 - t2;
        x2 = s7 - t3;
        x3 = s7 + t3;

        // Stage 4
        t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
        t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
        t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
        t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
        out[2] = (tran_low_t)fdct_round_shift(t0);
        out[6] = (tran_low_t)fdct_round_shift(t2);
        out[10] = (tran_low_t)fdct_round_shift(t1);
        out[14] = (tran_low_t)fdct_round_shift(t3);
      }
      // Work on the next eight values; step1 -> odd_results
      // Writes out[1], out[3], ..., out[15]. step1[] is reused as scratch in
      // step 5, after its original contents have been consumed by step 3.
      {
        // step 2
        temp1 = (step1[5] - step1[2]) * cospi_16_64;
        temp2 = (step1[4] - step1[3]) * cospi_16_64;
        step2[2] = fdct_round_shift(temp1);
        step2[3] = fdct_round_shift(temp2);
        temp1 = (step1[4] + step1[3]) * cospi_16_64;
        temp2 = (step1[5] + step1[2]) * cospi_16_64;
        step2[4] = fdct_round_shift(temp1);
        step2[5] = fdct_round_shift(temp2);
        // step 3
        step3[0] = step1[0] + step2[3];
        step3[1] = step1[1] + step2[2];
        step3[2] = step1[1] - step2[2];
        step3[3] = step1[0] - step2[3];
        step3[4] = step1[7] - step2[4];
        step3[5] = step1[6] - step2[5];
        step3[6] = step1[6] + step2[5];
        step3[7] = step1[7] + step2[4];
        // step 4
        temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
        temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
        step2[1] = fdct_round_shift(temp1);
        step2[2] = fdct_round_shift(temp2);
        temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
        temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
        step2[5] = fdct_round_shift(temp1);
        step2[6] = fdct_round_shift(temp2);
        // step 5
        step1[0] = step3[0] + step2[1];
        step1[1] = step3[0] - step2[1];
        step1[2] = step3[3] + step2[2];
        step1[3] = step3[3] - step2[2];
        step1[4] = step3[4] - step2[5];
        step1[5] = step3[4] + step2[5];
        step1[6] = step3[7] - step2[6];
        step1[7] = step3[7] + step2[6];
        // step 6
        temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
        temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
        out[1] = (tran_low_t)fdct_round_shift(temp1);
        out[9] = (tran_low_t)fdct_round_shift(temp2);
        temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
        temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
        out[5] = (tran_low_t)fdct_round_shift(temp1);
        out[13] = (tran_low_t)fdct_round_shift(temp2);
        temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
        temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
        out[3] = (tran_low_t)fdct_round_shift(temp1);
        out[11] = (tran_low_t)fdct_round_shift(temp2);
        temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
        temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
        out[7] = (tran_low_t)fdct_round_shift(temp1);
        out[15] = (tran_low_t)fdct_round_shift(temp2);
      }
      // Do next column (which is a transposed row in second/horizontal pass)
      // `input` is also advanced during the second pass, but it is not read
      // there.
      input++;
      out += 16;
    }
    // Setup in/out for next pass.
    in_low = intermediate;
    out = output;
  }
}

// 16x16 variant that only produces the first coefficient: sums the 256 input
// samples in an int and stores (sum >> 1) in output[0]. No other element of
// `output` is written.
void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
  int r, c;
  int sum = 0;
  for (r = 0; r < 16; ++r)
    for (c = 0; c < 16; ++c) sum += input[r * stride + c];

  output[0] = (tran_low_t)(sum >> 1);
}

// Applies ROUND_POWER_OF_TWO(input, DCT_CONST_BITS) and returns the result.
// Used after every constant multiplication in vpx_fdct32().
static INLINE tran_high_t dct_32_round(tran_high_t input) {
  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
  // TODO(debargha, peter.derivaz): Find new bounds for this assert,
  // and make the bounds consts.
  // assert(-131072 <= rv && rv <= 131071);
  return rv;
}

// Returns (input + 1 + (input < 0)) >> 2: adds 1, plus one more when the
// input is negative, then shifts right by 2.
static INLINE tran_high_t half_round_shift(tran_high_t input) {
  tran_high_t rv = (input + 1 + (input < 0)) >> 2;
  return rv;
}

// One-dimensional 32-point forward transform.
//
// input:  32 values, input[0..31].
// output: 32 results, output[0..31]. It is also used as working storage: the
//         stages alternate between the local step[] array and output[].
// round:  when nonzero, all 32 stage-2 values are passed through
//         half_round_shift() before stage 3.
void vpx_fdct32(const tran_high_t *input, tran_high_t *output, int round) {
  tran_high_t step[32];
  // Stage 1
  // step[0..15] are sums of mirrored pairs (k, 31 - k); step[16..31] are the
  // differences input[31 - k] - input[k].
  step[0] = input[0] + input[(32 - 1)];
  step[1] = input[1] + input[(32 - 2)];
  step[2] = input[2] + input[(32 - 3)];
  step[3] = input[3] + input[(32 - 4)];
  step[4] = input[4] + input[(32 - 5)];
  step[5] = input[5] + input[(32 - 6)];
  step[6] = input[6] + input[(32 - 7)];
  step[7] = input[7] + input[(32 - 8)];
  step[8] = input[8] + input[(32 - 9)];
  step[9] = input[9] + input[(32 - 10)];
  step[10] = input[10] + input[(32 - 11)];
  step[11] = input[11] + input[(32 - 12)];
  step[12] = input[12] + input[(32 - 13)];
  step[13] = input[13] + input[(32 - 14)];
  step[14] = input[14] + input[(32 - 15)];
  step[15] = input[15] + input[(32 - 16)];
  step[16] = -input[16] + input[(32 - 17)];
  step[17] = -input[17] + input[(32 - 18)];
  step[18] = -input[18] + input[(32 - 19)];
  step[19] = -input[19] + input[(32 - 20)];
  step[20] = -input[20] + input[(32 - 21)];
  step[21] = -input[21] + input[(32 - 22)];
  step[22] = -input[22] + input[(32 - 23)];
  step[23] = -input[23] + input[(32 - 24)];
  step[24] = -input[24] + input[(32 - 25)];
  step[25] = -input[25] + input[(32 - 26)];
  step[26] = -input[26] + input[(32 - 27)];
  step[27] = -input[27] + input[(32 - 28)];
  step[28] = -input[28] + input[(32 - 29)];
  step[29] = -input[29] + input[(32 - 30)];
  step[30] = -input[30] + input[(32 - 31)];
  step[31] = -input[31] + input[(32 - 32)];

  // Stage 2
  output[0] = step[0] + step[16 - 1];
  output[1] = step[1] + step[16 - 2];
  output[2] = step[2] + step[16 - 3];
  output[3] = step[3] + step[16 - 4];
  output[4] = step[4] + step[16 - 5];
  output[5] = step[5] + step[16 - 6];
  output[6] = step[6] + step[16 - 7];
  output[7] = step[7] + step[16 - 8];
  output[8] = -step[8] + step[16 - 9];
  output[9] = -step[9] + step[16 - 10];
  output[10] = -step[10] + step[16 - 11];
  output[11] = -step[11] + step[16 - 12];
  output[12] = -step[12] + step[16 - 13];
  output[13] = -step[13] + step[16 - 14];
  output[14] = -step[14] + step[16 - 15];
  output[15] = -step[15] + step[16 - 16];

  output[16] = step[16];
  output[17] = step[17];
  output[18] = step[18];
  output[19] = step[19];

  output[20] = dct_32_round((-step[20] + step[27]) * cospi_16_64);
  output[21] = dct_32_round((-step[21] + step[26]) * cospi_16_64);
  output[22] = dct_32_round((-step[22] + step[25]) * cospi_16_64);
  output[23] = dct_32_round((-step[23] + step[24]) * cospi_16_64);

  output[24] = dct_32_round((step[24] + step[23]) * cospi_16_64);
  output[25] = dct_32_round((step[25] + step[22]) * cospi_16_64);
  output[26] = dct_32_round((step[26] + step[21]) * cospi_16_64);
  output[27] = dct_32_round((step[27] + step[20]) * cospi_16_64);

  output[28] = step[28];
  output[29] = step[29];
  output[30] = step[30];
  output[31] = step[31];

  // Damp the magnitude by 4, hence the intermediate values are within
  // the range of 16 bits.
  if (round) {
    output[0] = half_round_shift(output[0]);
    output[1] = half_round_shift(output[1]);
    output[2] = half_round_shift(output[2]);
    output[3] = half_round_shift(output[3]);
    output[4] = half_round_shift(output[4]);
    output[5] = half_round_shift(output[5]);
    output[6] = half_round_shift(output[6]);
    output[7] = half_round_shift(output[7]);
    output[8] = half_round_shift(output[8]);
    output[9] = half_round_shift(output[9]);
    output[10] = half_round_shift(output[10]);
    output[11] = half_round_shift(output[11]);
    output[12] = half_round_shift(output[12]);
    output[13] = half_round_shift(output[13]);
    output[14] = half_round_shift(output[14]);
    output[15] = half_round_shift(output[15]);

    output[16] = half_round_shift(output[16]);
    output[17] = half_round_shift(output[17]);
    output[18] = half_round_shift(output[18]);
    output[19] = half_round_shift(output[19]);
    output[20] = half_round_shift(output[20]);
    output[21] = half_round_shift(output[21]);
    output[22] = half_round_shift(output[22]);
    output[23] = half_round_shift(output[23]);
    output[24] = half_round_shift(output[24]);
    output[25] = half_round_shift(output[25]);
    output[26] = half_round_shift(output[26]);
    output[27] = half_round_shift(output[27]);
    output[28] = half_round_shift(output[28]);
    output[29] = half_round_shift(output[29]);
    output[30] = half_round_shift(output[30]);
    output[31] = half_round_shift(output[31]);
  }

  // Stage 3
  step[0] = output[0] + output[(8 - 1)];
  step[1] = output[1] + output[(8 - 2)];
  step[2] = output[2] + output[(8 - 3)];
  step[3] = output[3] + output[(8 - 4)];
  step[4] = -output[4] + output[(8 - 5)];
  step[5] = -output[5] + output[(8 - 6)];
  step[6] = -output[6] + output[(8 - 7)];
  step[7] = -output[7] + output[(8 - 8)];
  step[8] = output[8];
  step[9] = output[9];
  step[10] = dct_32_round((-output[10] + output[13]) * cospi_16_64);
  step[11] = dct_32_round((-output[11] + output[12]) * cospi_16_64);
  step[12] = dct_32_round((output[12] + output[11]) * cospi_16_64);
  step[13] = dct_32_round((output[13] + output[10]) * cospi_16_64);
  step[14] = output[14];
  step[15] = output[15];

  step[16] = output[16] + output[23];
  step[17] = output[17] + output[22];
  step[18] = output[18] + output[21];
  step[19] = output[19] + output[20];
  step[20] = -output[20] + output[19];
  step[21] = -output[21] + output[18];
  step[22] = -output[22] + output[17];
  step[23] = -output[23] + output[16];
  step[24] = -output[24] + output[31];
  step[25] = -output[25] + output[30];
  step[26] = -output[26] + output[29];
  step[27] = -output[27] + output[28];
  step[28] = output[28] + output[27];
  step[29] = output[29] + output[26];
  step[30] = output[30] + output[25];
  step[31] = output[31] + output[24];

  // Stage 4
  output[0] = step[0] + step[3];
  output[1] = step[1] + step[2];
  output[2] = -step[2] + step[1];
  output[3] = -step[3] + step[0];
  output[4] = step[4];
  output[5] = dct_32_round((-step[5] + step[6]) * cospi_16_64);
  output[6] = dct_32_round((step[6] + step[5]) * cospi_16_64);
  output[7] = step[7];
  output[8] = step[8] + step[11];
  output[9] = step[9] + step[10];
  output[10] = -step[10] + step[9];
  output[11] = -step[11] + step[8];
  output[12] = -step[12] + step[15];
  output[13] = -step[13] + step[14];
  output[14] = step[14] + step[13];
  output[15] = step[15] + step[12];

  output[16] = step[16];
  output[17] = step[17];
  output[18] = dct_32_round(step[18] * -cospi_8_64 + step[29] * cospi_24_64);
  output[19] = dct_32_round(step[19] * -cospi_8_64 + step[28] * cospi_24_64);
  output[20] = dct_32_round(step[20] * -cospi_24_64 + step[27] * -cospi_8_64);
  output[21] = dct_32_round(step[21] * -cospi_24_64 + step[26] * -cospi_8_64);
  output[22] = step[22];
  output[23] = step[23];
  output[24] = step[24];
  output[25] = step[25];
  output[26] = dct_32_round(step[26] * cospi_24_64 + step[21] * -cospi_8_64);
  output[27] = dct_32_round(step[27] * cospi_24_64 + step[20] * -cospi_8_64);
  output[28] = dct_32_round(step[28] * cospi_8_64 + step[19] * cospi_24_64);
  output[29] = dct_32_round(step[29] * cospi_8_64 + step[18] * cospi_24_64);
  output[30] = step[30];
  output[31] = step[31];

  // Stage 5
  step[0] = dct_32_round((output[0] + output[1]) * cospi_16_64);
  step[1] = dct_32_round((-output[1] + output[0]) * cospi_16_64);
  step[2] = dct_32_round(output[2] * cospi_24_64 + output[3] * cospi_8_64);
  step[3] = dct_32_round(output[3] * cospi_24_64 - output[2] * cospi_8_64);
  step[4] = output[4] + output[5];
  step[5] = -output[5] + output[4];
  step[6] = -output[6] + output[7];
  step[7] = output[7] + output[6];
  step[8] = output[8];
  step[9] = dct_32_round(output[9] * -cospi_8_64 + output[14] * cospi_24_64);
  step[10] = dct_32_round(output[10] * -cospi_24_64 + output[13] * -cospi_8_64);
  step[11] = output[11];
  step[12] = output[12];
  step[13] = dct_32_round(output[13] * cospi_24_64 + output[10] * -cospi_8_64);
  step[14] = dct_32_round(output[14] * cospi_8_64 + output[9] * cospi_24_64);
  step[15] = output[15];

  step[16] = output[16] + output[19];
  step[17] = output[17] + output[18];
  step[18] = -output[18] + output[17];
  step[19] = -output[19] + output[16];
  step[20] = -output[20] + output[23];
  step[21] = -output[21] + output[22];
  step[22] = output[22] + output[21];
  step[23] = output[23] + output[20];
  step[24] = output[24] + output[27];
  step[25] = output[25] + output[26];
  step[26] = -output[26] + output[25];
  step[27] = -output[27] + output[24];
  step[28] = -output[28] + output[31];
  step[29] = -output[29] + output[30];
  step[30] = output[30] + output[29];
  step[31] = output[31] + output[28];

  // Stage 6
  output[0] = step[0];
  output[1] = step[1];
  output[2] = step[2];
  output[3] = step[3];
  output[4] = dct_32_round(step[4] * cospi_28_64 + step[7] * cospi_4_64);
  output[5] = dct_32_round(step[5] * cospi_12_64 + step[6] * cospi_20_64);
  output[6] = dct_32_round(step[6] * cospi_12_64 + step[5] * -cospi_20_64);
  output[7] = dct_32_round(step[7] * cospi_28_64 + step[4] * -cospi_4_64);
  output[8] = step[8] + step[9];
  output[9] = -step[9] + step[8];
  output[10] = -step[10] + step[11];
  output[11] = step[11] + step[10];
  output[12] = step[12] + step[13];
  output[13] = -step[13] + step[12];
  output[14] = -step[14] + step[15];
  output[15] = step[15] + step[14];

  output[16] = step[16];
  output[17] = dct_32_round(step[17] * -cospi_4_64 + step[30] * cospi_28_64);
  output[18] = dct_32_round(step[18] * -cospi_28_64 + step[29] * -cospi_4_64);
  output[19] = step[19];
  output[20] = step[20];
  output[21] = dct_32_round(step[21] * -cospi_20_64 + step[26] * cospi_12_64);
  output[22] = dct_32_round(step[22] * -cospi_12_64 + step[25] * -cospi_20_64);
  output[23] = step[23];
  output[24] = step[24];
  output[25] = dct_32_round(step[25] * cospi_12_64 + step[22] * -cospi_20_64);
  output[26] = dct_32_round(step[26] * cospi_20_64 + step[21] * cospi_12_64);
  output[27] = step[27];
  output[28] = step[28];
  output[29] = dct_32_round(step[29] * cospi_28_64 + step[18] * -cospi_4_64);
  output[30] = dct_32_round(step[30] * cospi_4_64 + step[17] * cospi_28_64);
  output[31] = step[31];

  // Stage 7
  step[0] = output[0];
  step[1] = output[1];
  step[2] = output[2];
  step[3] = output[3];
  step[4] = output[4];
  step[5] = output[5];
  step[6] = output[6];
  step[7] = output[7];
  step[8] = dct_32_round(output[8] * cospi_30_64 + output[15] * cospi_2_64);
  step[9] = dct_32_round(output[9] * cospi_14_64 + output[14] * cospi_18_64);
  step[10] = dct_32_round(output[10] * cospi_22_64 + output[13] * cospi_10_64);
  step[11] = dct_32_round(output[11] * cospi_6_64 + output[12] * cospi_26_64);
  step[12] = dct_32_round(output[12] * cospi_6_64 + output[11] * -cospi_26_64);
  step[13] = dct_32_round(output[13] * cospi_22_64 + output[10] * -cospi_10_64);
  step[14] = dct_32_round(output[14] * cospi_14_64 + output[9] * -cospi_18_64);
  step[15] = dct_32_round(output[15] * cospi_30_64 + output[8] * -cospi_2_64);

  step[16] = output[16] + output[17];
  step[17] = -output[17] + output[16];
  step[18] = -output[18] + output[19];
  step[19] = output[19] + output[18];
  step[20] = output[20] + output[21];
  step[21] = -output[21] + output[20];
  step[22] = -output[22] + output[23];
  step[23] = output[23] + output[22];
  step[24] = output[24] + output[25];
  step[25] = -output[25] + output[24];
  step[26] = -output[26] + output[27];
  step[27] = output[27] + output[26];
  step[28] = output[28] + output[29];
  step[29] = -output[29] + output[28];
  step[30] = -output[30] + output[31];
  step[31] = output[31] + output[30];

  // Final stage --- outputs indices are bit-reversed.
  output[0] = step[0];
  output[16] = step[1];
  output[8] = step[2];
  output[24] = step[3];
  output[4] = step[4];
  output[20] = step[5];
  output[12] = step[6];
  output[28] = step[7];
  output[2] = step[8];
  output[18] = step[9];
  output[10] = step[10];
  output[26] = step[11];
  output[6] = step[12];
  output[22] = step[13];
  output[14] = step[14];
  output[30] = step[15];

  output[1] = dct_32_round(step[16] * cospi_31_64 + step[31] * cospi_1_64);
  output[17] = dct_32_round(step[17] * cospi_15_64 + step[30] * cospi_17_64);
  output[9] = dct_32_round(step[18] * cospi_23_64 + step[29] * cospi_9_64);
  output[25] = dct_32_round(step[19] * cospi_7_64 + step[28] * cospi_25_64);
  output[5] = dct_32_round(step[20] * cospi_27_64 + step[27] * cospi_5_64);
  output[21] = dct_32_round(step[21] * cospi_11_64 + step[26] * cospi_21_64);
  output[13] = dct_32_round(step[22] * cospi_19_64 + step[25] * cospi_13_64);
  output[29] = dct_32_round(step[23] * cospi_3_64 + step[24] * cospi_29_64);
  output[3] = dct_32_round(step[24] * cospi_3_64 + step[23] * -cospi_29_64);
  output[19] = dct_32_round(step[25] * cospi_19_64 + step[22] * -cospi_13_64);
  output[11] = dct_32_round(step[26] * cospi_11_64 + step[21] * -cospi_21_64);
  output[27] = dct_32_round(step[27] * cospi_27_64 + step[20] * -cospi_5_64);
  output[7] = dct_32_round(step[28] * cospi_7_64 + step[19] * -cospi_25_64);
  output[23] = dct_32_round(step[29] * cospi_23_64 + step[18] * -cospi_9_64);
  output[15] = dct_32_round(step[30] * cospi_15_64 + step[17] * -cospi_17_64);
  output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
}

// Forward 32x32 transform.
//
// input:  source samples; sample (row r, column c) is read from
//         input[r * stride + c] for r, c in [0, 32).
// output: receives the 1024 resulting coefficients, output[0..1023].
// stride: distance, in elements, between vertically adjacent input samples.
//
// Both passes call vpx_fdct32() with round == 0 and then shift the results
// right by 2. The rounding term differs between the passes: the column pass
// adds an extra 1 for positive values, the row pass for negative values.
void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride) {
  int i, j;
  tran_high_t out[32 * 32];

  // Columns
  for (i = 0; i < 32; ++i) {
    tran_high_t temp_in[32], temp_out[32];
    // Gather column i of the source block, scaled by 4.
    for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
    vpx_fdct32(temp_in, temp_out, 0);
    for (j = 0; j < 32; ++j)
      out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
  }

  // Rows
  for (i = 0; i < 32; ++i) {
    tran_high_t temp_in[32], temp_out[32];
    for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
    vpx_fdct32(temp_in, temp_out, 0);
    for (j = 0; j < 32; ++j)
      output[j + i * 32] =
          (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
  }
}

// Note that although we use dct_32_round in dct32 computation flow,
// this 2d fdct32x32 for rate-distortion optimization loop is operating
// within 16 bits precision.
//
// Same interface as vpx_fdct32x32_c(). The column pass is identical; the row
// pass calls vpx_fdct32() with round == 1 and stores its results directly,
// without the trailing rounding shift used by vpx_fdct32x32_c().
void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride) {
  int i, j;
  tran_high_t out[32 * 32];

  // Columns
  for (i = 0; i < 32; ++i) {
    tran_high_t temp_in[32], temp_out[32];
    for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
    vpx_fdct32(temp_in, temp_out, 0);
    for (j = 0; j < 32; ++j)
      // TODO(cd): see quality impact of only doing
      //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;
      //           PS: also change code in vpx_dsp/x86/vpx_dct_sse2.c
      out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
  }

  // Rows
  for (i = 0; i < 32; ++i) {
    tran_high_t temp_in[32], temp_out[32];
    for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
    vpx_fdct32(temp_in, temp_out, 1);
    for (j = 0; j < 32; ++j) output[j + i * 32] = (tran_low_t)temp_out[j];
  }
}

// 32x32 variant that only produces the first coefficient: sums the 1024 input
// samples in an int and stores (sum >> 3) in output[0]. No other element of
// `output` is written.
void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
  int r, c;
  int sum = 0;
  for (r = 0; r < 32; ++r)
    for (c = 0; c < 32; ++c) sum += input[r * stride + c];

  output[0] = (tran_low_t)(sum >> 3);
}

#if CONFIG_VP9_HIGHBITDEPTH
// The vpx_highbd_* entry points below are only compiled when
// CONFIG_VP9_HIGHBITDEPTH is set. Each one forwards its arguments unchanged
// to the corresponding function above.
void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
                          int stride) {
  vpx_fdct4x4_c(input, output, stride);
}

void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output,
                          int stride) {
  vpx_fdct8x8_c(input, output, stride);
}

void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output,
                            int stride) {
  vpx_fdct8x8_1_c(input, output, stride);
}

void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
                            int stride) {
  vpx_fdct16x16_c(input, output, stride);
}

void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
                              int stride) {
  vpx_fdct16x16_1_c(input, output, stride);
}

void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output,
                            int stride) {
  vpx_fdct32x32_c(input, output, stride);
}

void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output,
                               int stride) {
  vpx_fdct32x32_rd_c(input, output, stride);
}

void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output,
                              int stride) {
  vpx_fdct32x32_1_c(input, output, stride);
}
#endif  // CONFIG_VP9_HIGHBITDEPTH