1 /* 2 * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 3 * 4 * Use of this source code is governed by a BSD-style license 5 * that can be found in the LICENSE file in the root of the source 6 * tree. An additional intellectual property rights grant can be found 7 * in the file PATENTS. All contributing project authors may 8 * be found in the AUTHORS file in the root of the source tree. 9 */ 10 11 #include <assert.h> 12 #include <math.h> 13 14 #include "./vp9_rtcd.h" 15 #include "./vpx_config.h" 16 #include "./vpx_dsp_rtcd.h" 17 18 #include "vp9/common/vp9_blockd.h" 19 #include "vp9/common/vp9_idct.h" 20 #include "vpx_dsp/fwd_txfm.h" 21 #include "vpx_ports/mem.h" 22 23 static void fdct4(const tran_low_t *input, tran_low_t *output) { 24 tran_high_t step[4]; 25 tran_high_t temp1, temp2; 26 27 step[0] = input[0] + input[3]; 28 step[1] = input[1] + input[2]; 29 step[2] = input[1] - input[2]; 30 step[3] = input[0] - input[3]; 31 32 temp1 = (step[0] + step[1]) * cospi_16_64; 33 temp2 = (step[0] - step[1]) * cospi_16_64; 34 output[0] = (tran_low_t)fdct_round_shift(temp1); 35 output[2] = (tran_low_t)fdct_round_shift(temp2); 36 temp1 = step[2] * cospi_24_64 + step[3] * cospi_8_64; 37 temp2 = -step[2] * cospi_8_64 + step[3] * cospi_24_64; 38 output[1] = (tran_low_t)fdct_round_shift(temp1); 39 output[3] = (tran_low_t)fdct_round_shift(temp2); 40 } 41 42 static void fdct8(const tran_low_t *input, tran_low_t *output) { 43 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 44 tran_high_t t0, t1, t2, t3; // needs32 45 tran_high_t x0, x1, x2, x3; // canbe16 46 47 // stage 1 48 s0 = input[0] + input[7]; 49 s1 = input[1] + input[6]; 50 s2 = input[2] + input[5]; 51 s3 = input[3] + input[4]; 52 s4 = input[3] - input[4]; 53 s5 = input[2] - input[5]; 54 s6 = input[1] - input[6]; 55 s7 = input[0] - input[7]; 56 57 // fdct4(step, step); 58 x0 = s0 + s3; 59 x1 = s1 + s2; 60 x2 = s1 - s2; 61 x3 = s0 - s3; 62 t0 = (x0 + x1) * cospi_16_64; 63 t1 = (x0 - x1) * cospi_16_64; 64 t2 = x2 * cospi_24_64 + x3 * cospi_8_64; 65 t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; 66 output[0] = (tran_low_t)fdct_round_shift(t0); 67 output[2] = (tran_low_t)fdct_round_shift(t2); 68 output[4] = (tran_low_t)fdct_round_shift(t1); 69 output[6] = (tran_low_t)fdct_round_shift(t3); 70 71 // Stage 2 72 t0 = (s6 - s5) * cospi_16_64; 73 t1 = (s6 + s5) * cospi_16_64; 74 t2 = (tran_low_t)fdct_round_shift(t0); 75 t3 = (tran_low_t)fdct_round_shift(t1); 76 77 // Stage 3 78 x0 = s4 + t2; 79 x1 = s4 - t2; 80 x2 = s7 - t3; 81 x3 = s7 + t3; 82 83 // Stage 4 84 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; 85 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; 86 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; 87 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; 88 output[1] = (tran_low_t)fdct_round_shift(t0); 89 output[3] = (tran_low_t)fdct_round_shift(t2); 90 output[5] = (tran_low_t)fdct_round_shift(t1); 91 output[7] = (tran_low_t)fdct_round_shift(t3); 92 } 93 94 static void fdct16(const tran_low_t in[16], tran_low_t out[16]) { 95 tran_high_t step1[8]; // canbe16 96 tran_high_t step2[8]; // canbe16 97 tran_high_t step3[8]; // canbe16 98 tran_high_t input[8]; // canbe16 99 tran_high_t temp1, temp2; // needs32 100 101 // step 1 102 input[0] = in[0] + in[15]; 103 input[1] = in[1] + in[14]; 104 input[2] = in[2] + in[13]; 105 input[3] = in[3] + in[12]; 106 input[4] = in[4] + in[11]; 107 input[5] = in[5] + in[10]; 108 input[6] = in[6] + in[ 9]; 109 input[7] = in[7] + in[ 8]; 110 111 step1[0] = in[7] - in[ 8]; 112 step1[1] = in[6] - in[ 9]; 113 step1[2] = in[5] - in[10]; 114 step1[3] = in[4] - in[11]; 115 step1[4] = in[3] - in[12]; 116 step1[5] = in[2] - in[13]; 117 step1[6] = in[1] - in[14]; 118 step1[7] = in[0] - in[15]; 119 120 // fdct8(step, step); 121 { 122 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 123 tran_high_t t0, t1, t2, t3; // needs32 124 tran_high_t x0, x1, x2, x3; // canbe16 125 126 // stage 1 127 s0 = input[0] + input[7]; 128 s1 = input[1] + input[6]; 129 s2 = input[2] + input[5]; 130 s3 = input[3] + input[4]; 131 s4 = input[3] - input[4]; 132 s5 = input[2] - input[5]; 133 s6 = input[1] - input[6]; 134 s7 = input[0] - input[7]; 135 136 // fdct4(step, step); 137 x0 = s0 + s3; 138 x1 = s1 + s2; 139 x2 = s1 - s2; 140 x3 = s0 - s3; 141 t0 = (x0 + x1) * cospi_16_64; 142 t1 = (x0 - x1) * cospi_16_64; 143 t2 = x3 * cospi_8_64 + x2 * cospi_24_64; 144 t3 = x3 * cospi_24_64 - x2 * cospi_8_64; 145 out[0] = (tran_low_t)fdct_round_shift(t0); 146 out[4] = (tran_low_t)fdct_round_shift(t2); 147 out[8] = (tran_low_t)fdct_round_shift(t1); 148 out[12] = (tran_low_t)fdct_round_shift(t3); 149 150 // Stage 2 151 t0 = (s6 - s5) * cospi_16_64; 152 t1 = (s6 + s5) * cospi_16_64; 153 t2 = fdct_round_shift(t0); 154 t3 = fdct_round_shift(t1); 155 156 // Stage 3 157 x0 = s4 + t2; 158 x1 = s4 - t2; 159 x2 = s7 - t3; 160 x3 = s7 + t3; 161 162 // Stage 4 163 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; 164 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; 165 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; 166 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; 167 out[2] = (tran_low_t)fdct_round_shift(t0); 168 out[6] = (tran_low_t)fdct_round_shift(t2); 169 out[10] = (tran_low_t)fdct_round_shift(t1); 170 out[14] = (tran_low_t)fdct_round_shift(t3); 171 } 172 173 // step 2 174 temp1 = (step1[5] - step1[2]) * cospi_16_64; 175 temp2 = (step1[4] - step1[3]) * cospi_16_64; 176 step2[2] = fdct_round_shift(temp1); 177 step2[3] = fdct_round_shift(temp2); 178 temp1 = (step1[4] + step1[3]) * cospi_16_64; 179 temp2 = (step1[5] + step1[2]) * cospi_16_64; 180 step2[4] = fdct_round_shift(temp1); 181 step2[5] = fdct_round_shift(temp2); 182 183 // step 3 184 step3[0] = step1[0] + step2[3]; 185 step3[1] = step1[1] + step2[2]; 186 step3[2] = step1[1] - step2[2]; 187 step3[3] = step1[0] - step2[3]; 188 step3[4] = step1[7] - step2[4]; 189 step3[5] = step1[6] - step2[5]; 190 step3[6] = step1[6] + step2[5]; 191 step3[7] = step1[7] + step2[4]; 192 193 // step 4 194 temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; 195 temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64; 196 step2[1] = fdct_round_shift(temp1); 197 step2[2] = fdct_round_shift(temp2); 198 temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64; 199 temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; 200 step2[5] = fdct_round_shift(temp1); 201 step2[6] = fdct_round_shift(temp2); 202 203 // step 5 204 step1[0] = step3[0] + step2[1]; 205 step1[1] = step3[0] - step2[1]; 206 step1[2] = step3[3] + step2[2]; 207 step1[3] = step3[3] - step2[2]; 208 step1[4] = step3[4] - step2[5]; 209 step1[5] = step3[4] + step2[5]; 210 step1[6] = step3[7] - step2[6]; 211 step1[7] = step3[7] + step2[6]; 212 213 // step 6 214 temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64; 215 temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64; 216 out[1] = (tran_low_t)fdct_round_shift(temp1); 217 out[9] = (tran_low_t)fdct_round_shift(temp2); 218 219 temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64; 220 temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; 221 out[5] = (tran_low_t)fdct_round_shift(temp1); 222 out[13] = (tran_low_t)fdct_round_shift(temp2); 223 224 temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; 225 temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; 226 out[3] = (tran_low_t)fdct_round_shift(temp1); 227 out[11] = (tran_low_t)fdct_round_shift(temp2); 228 229 temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64; 230 temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; 231 out[7] = (tran_low_t)fdct_round_shift(temp1); 232 out[15] = (tran_low_t)fdct_round_shift(temp2); 233 } 234 235 static void fadst4(const tran_low_t *input, tran_low_t *output) { 236 tran_high_t x0, x1, x2, x3; 237 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; 238 239 x0 = input[0]; 240 x1 = input[1]; 241 x2 = input[2]; 242 x3 = input[3]; 243 244 if (!(x0 | x1 | x2 | x3)) { 245 output[0] = output[1] = output[2] = output[3] = 0; 246 return; 247 } 248 249 s0 = sinpi_1_9 * x0; 250 s1 = sinpi_4_9 * x0; 251 s2 = sinpi_2_9 * x1; 252 s3 = sinpi_1_9 * x1; 253 s4 = sinpi_3_9 * x2; 254 s5 = sinpi_4_9 * x3; 255 s6 = sinpi_2_9 * x3; 256 s7 = x0 + x1 - x3; 257 258 x0 = s0 + s2 + s5; 259 x1 = sinpi_3_9 * s7; 260 x2 = s1 - s3 + s6; 261 x3 = s4; 262 263 s0 = x0 + x3; 264 s1 = x1; 265 s2 = x2 - x3; 266 s3 = x2 - x0 + x3; 267 268 // 1-D transform scaling factor is sqrt(2). 269 output[0] = (tran_low_t)fdct_round_shift(s0); 270 output[1] = (tran_low_t)fdct_round_shift(s1); 271 output[2] = (tran_low_t)fdct_round_shift(s2); 272 output[3] = (tran_low_t)fdct_round_shift(s3); 273 } 274 275 static void fadst8(const tran_low_t *input, tran_low_t *output) { 276 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; 277 278 tran_high_t x0 = input[7]; 279 tran_high_t x1 = input[0]; 280 tran_high_t x2 = input[5]; 281 tran_high_t x3 = input[2]; 282 tran_high_t x4 = input[3]; 283 tran_high_t x5 = input[4]; 284 tran_high_t x6 = input[1]; 285 tran_high_t x7 = input[6]; 286 287 // stage 1 288 s0 = cospi_2_64 * x0 + cospi_30_64 * x1; 289 s1 = cospi_30_64 * x0 - cospi_2_64 * x1; 290 s2 = cospi_10_64 * x2 + cospi_22_64 * x3; 291 s3 = cospi_22_64 * x2 - cospi_10_64 * x3; 292 s4 = cospi_18_64 * x4 + cospi_14_64 * x5; 293 s5 = cospi_14_64 * x4 - cospi_18_64 * x5; 294 s6 = cospi_26_64 * x6 + cospi_6_64 * x7; 295 s7 = cospi_6_64 * x6 - cospi_26_64 * x7; 296 297 x0 = fdct_round_shift(s0 + s4); 298 x1 = fdct_round_shift(s1 + s5); 299 x2 = fdct_round_shift(s2 + s6); 300 x3 = fdct_round_shift(s3 + s7); 301 x4 = fdct_round_shift(s0 - s4); 302 x5 = fdct_round_shift(s1 - s5); 303 x6 = fdct_round_shift(s2 - s6); 304 x7 = fdct_round_shift(s3 - s7); 305 306 // stage 2 307 s0 = x0; 308 s1 = x1; 309 s2 = x2; 310 s3 = x3; 311 s4 = cospi_8_64 * x4 + cospi_24_64 * x5; 312 s5 = cospi_24_64 * x4 - cospi_8_64 * x5; 313 s6 = - cospi_24_64 * x6 + cospi_8_64 * x7; 314 s7 = cospi_8_64 * x6 + cospi_24_64 * x7; 315 316 x0 = s0 + s2; 317 x1 = s1 + s3; 318 x2 = s0 - s2; 319 x3 = s1 - s3; 320 x4 = fdct_round_shift(s4 + s6); 321 x5 = fdct_round_shift(s5 + s7); 322 x6 = fdct_round_shift(s4 - s6); 323 x7 = fdct_round_shift(s5 - s7); 324 325 // stage 3 326 s2 = cospi_16_64 * (x2 + x3); 327 s3 = cospi_16_64 * (x2 - x3); 328 s6 = cospi_16_64 * (x6 + x7); 329 s7 = cospi_16_64 * (x6 - x7); 330 331 x2 = fdct_round_shift(s2); 332 x3 = fdct_round_shift(s3); 333 x6 = fdct_round_shift(s6); 334 x7 = fdct_round_shift(s7); 335 336 output[0] = (tran_low_t)x0; 337 output[1] = (tran_low_t)-x4; 338 output[2] = (tran_low_t)x6; 339 output[3] = (tran_low_t)-x2; 340 output[4] = (tran_low_t)x3; 341 output[5] = (tran_low_t)-x7; 342 output[6] = (tran_low_t)x5; 343 output[7] = (tran_low_t)-x1; 344 } 345 346 static void fadst16(const tran_low_t *input, tran_low_t *output) { 347 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; 348 tran_high_t s9, s10, s11, s12, s13, s14, s15; 349 350 tran_high_t x0 = input[15]; 351 tran_high_t x1 = input[0]; 352 tran_high_t x2 = input[13]; 353 tran_high_t x3 = input[2]; 354 tran_high_t x4 = input[11]; 355 tran_high_t x5 = input[4]; 356 tran_high_t x6 = input[9]; 357 tran_high_t x7 = input[6]; 358 tran_high_t x8 = input[7]; 359 tran_high_t x9 = input[8]; 360 tran_high_t x10 = input[5]; 361 tran_high_t x11 = input[10]; 362 tran_high_t x12 = input[3]; 363 tran_high_t x13 = input[12]; 364 tran_high_t x14 = input[1]; 365 tran_high_t x15 = input[14]; 366 367 // stage 1 368 s0 = x0 * cospi_1_64 + x1 * cospi_31_64; 369 s1 = x0 * cospi_31_64 - x1 * cospi_1_64; 370 s2 = x2 * cospi_5_64 + x3 * cospi_27_64; 371 s3 = x2 * cospi_27_64 - x3 * cospi_5_64; 372 s4 = x4 * cospi_9_64 + x5 * cospi_23_64; 373 s5 = x4 * cospi_23_64 - x5 * cospi_9_64; 374 s6 = x6 * cospi_13_64 + x7 * cospi_19_64; 375 s7 = x6 * cospi_19_64 - x7 * cospi_13_64; 376 s8 = x8 * cospi_17_64 + x9 * cospi_15_64; 377 s9 = x8 * cospi_15_64 - x9 * cospi_17_64; 378 s10 = x10 * cospi_21_64 + x11 * cospi_11_64; 379 s11 = x10 * cospi_11_64 - x11 * cospi_21_64; 380 s12 = x12 * cospi_25_64 + x13 * cospi_7_64; 381 s13 = x12 * cospi_7_64 - x13 * cospi_25_64; 382 s14 = x14 * cospi_29_64 + x15 * cospi_3_64; 383 s15 = x14 * cospi_3_64 - x15 * cospi_29_64; 384 385 x0 = fdct_round_shift(s0 + s8); 386 x1 = fdct_round_shift(s1 + s9); 387 x2 = fdct_round_shift(s2 + s10); 388 x3 = fdct_round_shift(s3 + s11); 389 x4 = fdct_round_shift(s4 + s12); 390 x5 = fdct_round_shift(s5 + s13); 391 x6 = fdct_round_shift(s6 + s14); 392 x7 = fdct_round_shift(s7 + s15); 393 x8 = fdct_round_shift(s0 - s8); 394 x9 = fdct_round_shift(s1 - s9); 395 x10 = fdct_round_shift(s2 - s10); 396 x11 = fdct_round_shift(s3 - s11); 397 x12 = fdct_round_shift(s4 - s12); 398 x13 = fdct_round_shift(s5 - s13); 399 x14 = fdct_round_shift(s6 - s14); 400 x15 = fdct_round_shift(s7 - s15); 401 402 // stage 2 403 s0 = x0; 404 s1 = x1; 405 s2 = x2; 406 s3 = x3; 407 s4 = x4; 408 s5 = x5; 409 s6 = x6; 410 s7 = x7; 411 s8 = x8 * cospi_4_64 + x9 * cospi_28_64; 412 s9 = x8 * cospi_28_64 - x9 * cospi_4_64; 413 s10 = x10 * cospi_20_64 + x11 * cospi_12_64; 414 s11 = x10 * cospi_12_64 - x11 * cospi_20_64; 415 s12 = - x12 * cospi_28_64 + x13 * cospi_4_64; 416 s13 = x12 * cospi_4_64 + x13 * cospi_28_64; 417 s14 = - x14 * cospi_12_64 + x15 * cospi_20_64; 418 s15 = x14 * cospi_20_64 + x15 * cospi_12_64; 419 420 x0 = s0 + s4; 421 x1 = s1 + s5; 422 x2 = s2 + s6; 423 x3 = s3 + s7; 424 x4 = s0 - s4; 425 x5 = s1 - s5; 426 x6 = s2 - s6; 427 x7 = s3 - s7; 428 x8 = fdct_round_shift(s8 + s12); 429 x9 = fdct_round_shift(s9 + s13); 430 x10 = fdct_round_shift(s10 + s14); 431 x11 = fdct_round_shift(s11 + s15); 432 x12 = fdct_round_shift(s8 - s12); 433 x13 = fdct_round_shift(s9 - s13); 434 x14 = fdct_round_shift(s10 - s14); 435 x15 = fdct_round_shift(s11 - s15); 436 437 // stage 3 438 s0 = x0; 439 s1 = x1; 440 s2 = x2; 441 s3 = x3; 442 s4 = x4 * cospi_8_64 + x5 * cospi_24_64; 443 s5 = x4 * cospi_24_64 - x5 * cospi_8_64; 444 s6 = - x6 * cospi_24_64 + x7 * cospi_8_64; 445 s7 = x6 * cospi_8_64 + x7 * cospi_24_64; 446 s8 = x8; 447 s9 = x9; 448 s10 = x10; 449 s11 = x11; 450 s12 = x12 * cospi_8_64 + x13 * cospi_24_64; 451 s13 = x12 * cospi_24_64 - x13 * cospi_8_64; 452 s14 = - x14 * cospi_24_64 + x15 * cospi_8_64; 453 s15 = x14 * cospi_8_64 + x15 * cospi_24_64; 454 455 x0 = s0 + s2; 456 x1 = s1 + s3; 457 x2 = s0 - s2; 458 x3 = s1 - s3; 459 x4 = fdct_round_shift(s4 + s6); 460 x5 = fdct_round_shift(s5 + s7); 461 x6 = fdct_round_shift(s4 - s6); 462 x7 = fdct_round_shift(s5 - s7); 463 x8 = s8 + s10; 464 x9 = s9 + s11; 465 x10 = s8 - s10; 466 x11 = s9 - s11; 467 x12 = fdct_round_shift(s12 + s14); 468 x13 = fdct_round_shift(s13 + s15); 469 x14 = fdct_round_shift(s12 - s14); 470 x15 = fdct_round_shift(s13 - s15); 471 472 // stage 4 473 s2 = (- cospi_16_64) * (x2 + x3); 474 s3 = cospi_16_64 * (x2 - x3); 475 s6 = cospi_16_64 * (x6 + x7); 476 s7 = cospi_16_64 * (- x6 + x7); 477 s10 = cospi_16_64 * (x10 + x11); 478 s11 = cospi_16_64 * (- x10 + x11); 479 s14 = (- cospi_16_64) * (x14 + x15); 480 s15 = cospi_16_64 * (x14 - x15); 481 482 x2 = fdct_round_shift(s2); 483 x3 = fdct_round_shift(s3); 484 x6 = fdct_round_shift(s6); 485 x7 = fdct_round_shift(s7); 486 x10 = fdct_round_shift(s10); 487 x11 = fdct_round_shift(s11); 488 x14 = fdct_round_shift(s14); 489 x15 = fdct_round_shift(s15); 490 491 output[0] = (tran_low_t)x0; 492 output[1] = (tran_low_t)-x8; 493 output[2] = (tran_low_t)x12; 494 output[3] = (tran_low_t)-x4; 495 output[4] = (tran_low_t)x6; 496 output[5] = (tran_low_t)x14; 497 output[6] = (tran_low_t)x10; 498 output[7] = (tran_low_t)x2; 499 output[8] = (tran_low_t)x3; 500 output[9] = (tran_low_t)x11; 501 output[10] = (tran_low_t)x15; 502 output[11] = (tran_low_t)x7; 503 output[12] = (tran_low_t)x5; 504 output[13] = (tran_low_t)-x13; 505 output[14] = (tran_low_t)x9; 506 output[15] = (tran_low_t)-x1; 507 } 508 509 static const transform_2d FHT_4[] = { 510 { fdct4, fdct4 }, // DCT_DCT = 0 511 { fadst4, fdct4 }, // ADST_DCT = 1 512 { fdct4, fadst4 }, // DCT_ADST = 2 513 { fadst4, fadst4 } // ADST_ADST = 3 514 }; 515 516 static const transform_2d FHT_8[] = { 517 { fdct8, fdct8 }, // DCT_DCT = 0 518 { fadst8, fdct8 }, // ADST_DCT = 1 519 { fdct8, fadst8 }, // DCT_ADST = 2 520 { fadst8, fadst8 } // ADST_ADST = 3 521 }; 522 523 static const transform_2d FHT_16[] = { 524 { fdct16, fdct16 }, // DCT_DCT = 0 525 { fadst16, fdct16 }, // ADST_DCT = 1 526 { fdct16, fadst16 }, // DCT_ADST = 2 527 { fadst16, fadst16 } // ADST_ADST = 3 528 }; 529 530 void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, 531 int stride, int tx_type) { 532 if (tx_type == DCT_DCT) { 533 vpx_fdct4x4_c(input, output, stride); 534 } else { 535 tran_low_t out[4 * 4]; 536 int i, j; 537 tran_low_t temp_in[4], temp_out[4]; 538 const transform_2d ht = FHT_4[tx_type]; 539 540 // Columns 541 for (i = 0; i < 4; ++i) { 542 for (j = 0; j < 4; ++j) 543 temp_in[j] = input[j * stride + i] * 16; 544 if (i == 0 && temp_in[0]) 545 temp_in[0] += 1; 546 ht.cols(temp_in, temp_out); 547 for (j = 0; j < 4; ++j) 548 out[j * 4 + i] = temp_out[j]; 549 } 550 551 // Rows 552 for (i = 0; i < 4; ++i) { 553 for (j = 0; j < 4; ++j) 554 temp_in[j] = out[j + i * 4]; 555 ht.rows(temp_in, temp_out); 556 for (j = 0; j < 4; ++j) 557 output[j + i * 4] = (temp_out[j] + 1) >> 2; 558 } 559 } 560 } 561 562 void vp9_fdct8x8_quant_c(const int16_t *input, int stride, 563 tran_low_t *coeff_ptr, intptr_t n_coeffs, 564 int skip_block, 565 const int16_t *zbin_ptr, const int16_t *round_ptr, 566 const int16_t *quant_ptr, 567 const int16_t *quant_shift_ptr, 568 tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, 569 const int16_t *dequant_ptr, 570 uint16_t *eob_ptr, 571 const int16_t *scan, const int16_t *iscan) { 572 int eob = -1; 573 574 int i, j; 575 tran_low_t intermediate[64]; 576 577 // Transform columns 578 { 579 tran_low_t *output = intermediate; 580 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16 581 tran_high_t t0, t1, t2, t3; // needs32 582 tran_high_t x0, x1, x2, x3; // canbe16 583 584 int i; 585 for (i = 0; i < 8; i++) { 586 // stage 1 587 s0 = (input[0 * stride] + input[7 * stride]) * 4; 588 s1 = (input[1 * stride] + input[6 * stride]) * 4; 589 s2 = (input[2 * stride] + input[5 * stride]) * 4; 590 s3 = (input[3 * stride] + input[4 * stride]) * 4; 591 s4 = (input[3 * stride] - input[4 * stride]) * 4; 592 s5 = (input[2 * stride] - input[5 * stride]) * 4; 593 s6 = (input[1 * stride] - input[6 * stride]) * 4; 594 s7 = (input[0 * stride] - input[7 * stride]) * 4; 595 596 // fdct4(step, step); 597 x0 = s0 + s3; 598 x1 = s1 + s2; 599 x2 = s1 - s2; 600 x3 = s0 - s3; 601 t0 = (x0 + x1) * cospi_16_64; 602 t1 = (x0 - x1) * cospi_16_64; 603 t2 = x2 * cospi_24_64 + x3 * cospi_8_64; 604 t3 = -x2 * cospi_8_64 + x3 * cospi_24_64; 605 output[0 * 8] = (tran_low_t)fdct_round_shift(t0); 606 output[2 * 8] = (tran_low_t)fdct_round_shift(t2); 607 output[4 * 8] = (tran_low_t)fdct_round_shift(t1); 608 output[6 * 8] = (tran_low_t)fdct_round_shift(t3); 609 610 // Stage 2 611 t0 = (s6 - s5) * cospi_16_64; 612 t1 = (s6 + s5) * cospi_16_64; 613 t2 = fdct_round_shift(t0); 614 t3 = fdct_round_shift(t1); 615 616 // Stage 3 617 x0 = s4 + t2; 618 x1 = s4 - t2; 619 x2 = s7 - t3; 620 x3 = s7 + t3; 621 622 // Stage 4 623 t0 = x0 * cospi_28_64 + x3 * cospi_4_64; 624 t1 = x1 * cospi_12_64 + x2 * cospi_20_64; 625 t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; 626 t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; 627 output[1 * 8] = (tran_low_t)fdct_round_shift(t0); 628 output[3 * 8] = (tran_low_t)fdct_round_shift(t2); 629 output[5 * 8] = (tran_low_t)fdct_round_shift(t1); 630 output[7 * 8] = (tran_low_t)fdct_round_shift(t3); 631 input++; 632 output++; 633 } 634 } 635 636 // Rows 637 for (i = 0; i < 8; ++i) { 638 fdct8(&intermediate[i * 8], &coeff_ptr[i * 8]); 639 for (j = 0; j < 8; ++j) 640 coeff_ptr[j + i * 8] /= 2; 641 } 642 643 // TODO(jingning) Decide the need of these arguments after the 644 // quantization process is completed. 645 (void)zbin_ptr; 646 (void)quant_shift_ptr; 647 (void)iscan; 648 649 memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); 650 memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); 651 652 if (!skip_block) { 653 // Quantization pass: All coefficients with index >= zero_flag are 654 // skippable. Note: zero_flag can be zero. 655 for (i = 0; i < n_coeffs; i++) { 656 const int rc = scan[i]; 657 const int coeff = coeff_ptr[rc]; 658 const int coeff_sign = (coeff >> 31); 659 const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; 660 661 int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); 662 tmp = (tmp * quant_ptr[rc != 0]) >> 16; 663 664 qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; 665 dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; 666 667 if (tmp) 668 eob = i; 669 } 670 } 671 *eob_ptr = eob + 1; 672 } 673 674 void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, 675 int stride, int tx_type) { 676 if (tx_type == DCT_DCT) { 677 vpx_fdct8x8_c(input, output, stride); 678 } else { 679 tran_low_t out[64]; 680 int i, j; 681 tran_low_t temp_in[8], temp_out[8]; 682 const transform_2d ht = FHT_8[tx_type]; 683 684 // Columns 685 for (i = 0; i < 8; ++i) { 686 for (j = 0; j < 8; ++j) 687 temp_in[j] = input[j * stride + i] * 4; 688 ht.cols(temp_in, temp_out); 689 for (j = 0; j < 8; ++j) 690 out[j * 8 + i] = temp_out[j]; 691 } 692 693 // Rows 694 for (i = 0; i < 8; ++i) { 695 for (j = 0; j < 8; ++j) 696 temp_in[j] = out[j + i * 8]; 697 ht.rows(temp_in, temp_out); 698 for (j = 0; j < 8; ++j) 699 output[j + i * 8] = (temp_out[j] + (temp_out[j] < 0)) >> 1; 700 } 701 } 702 } 703 704 /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per 705 pixel. */ 706 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) { 707 int i; 708 tran_high_t a1, b1, c1, d1, e1; 709 const int16_t *ip_pass0 = input; 710 const tran_low_t *ip = NULL; 711 tran_low_t *op = output; 712 713 for (i = 0; i < 4; i++) { 714 a1 = ip_pass0[0 * stride]; 715 b1 = ip_pass0[1 * stride]; 716 c1 = ip_pass0[2 * stride]; 717 d1 = ip_pass0[3 * stride]; 718 719 a1 += b1; 720 d1 = d1 - c1; 721 e1 = (a1 - d1) >> 1; 722 b1 = e1 - b1; 723 c1 = e1 - c1; 724 a1 -= c1; 725 d1 += b1; 726 op[0] = (tran_low_t)a1; 727 op[4] = (tran_low_t)c1; 728 op[8] = (tran_low_t)d1; 729 op[12] = (tran_low_t)b1; 730 731 ip_pass0++; 732 op++; 733 } 734 ip = output; 735 op = output; 736 737 for (i = 0; i < 4; i++) { 738 a1 = ip[0]; 739 b1 = ip[1]; 740 c1 = ip[2]; 741 d1 = ip[3]; 742 743 a1 += b1; 744 d1 -= c1; 745 e1 = (a1 - d1) >> 1; 746 b1 = e1 - b1; 747 c1 = e1 - c1; 748 a1 -= c1; 749 d1 += b1; 750 op[0] = (tran_low_t)(a1 * UNIT_QUANT_FACTOR); 751 op[1] = (tran_low_t)(c1 * UNIT_QUANT_FACTOR); 752 op[2] = (tran_low_t)(d1 * UNIT_QUANT_FACTOR); 753 op[3] = (tran_low_t)(b1 * UNIT_QUANT_FACTOR); 754 755 ip += 4; 756 op += 4; 757 } 758 } 759 760 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, 761 int stride, int tx_type) { 762 if (tx_type == DCT_DCT) { 763 vpx_fdct16x16_c(input, output, stride); 764 } else { 765 tran_low_t out[256]; 766 int i, j; 767 tran_low_t temp_in[16], temp_out[16]; 768 const transform_2d ht = FHT_16[tx_type]; 769 770 // Columns 771 for (i = 0; i < 16; ++i) { 772 for (j = 0; j < 16; ++j) 773 temp_in[j] = input[j * stride + i] * 4; 774 ht.cols(temp_in, temp_out); 775 for (j = 0; j < 16; ++j) 776 out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; 777 } 778 779 // Rows 780 for (i = 0; i < 16; ++i) { 781 for (j = 0; j < 16; ++j) 782 temp_in[j] = out[j + i * 16]; 783 ht.rows(temp_in, temp_out); 784 for (j = 0; j < 16; ++j) 785 output[j + i * 16] = temp_out[j]; 786 } 787 } 788 } 789 790 #if CONFIG_VP9_HIGHBITDEPTH 791 void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, 792 int stride, int tx_type) { 793 vp9_fht4x4_c(input, output, stride, tx_type); 794 } 795 796 void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, 797 int stride, int tx_type) { 798 vp9_fht8x8_c(input, output, stride, tx_type); 799 } 800 801 void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, 802 int stride) { 803 vp9_fwht4x4_c(input, output, stride); 804 } 805 806 void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, 807 int stride, int tx_type) { 808 vp9_fht16x16_c(input, output, stride, tx_type); 809 } 810 #endif // CONFIG_VP9_HIGHBITDEPTH 811