1 /* ------------------------------------------------------------------ 2 * Copyright (C) 1998-2009 PacketVideo 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 13 * express or implied. 14 * See the License for the specific language governing permissions 15 * and limitations under the License. 16 * ------------------------------------------------------------------- 17 */ 18 #include "mp4def.h" 19 #include "idct.h" 20 #include "motion_comp.h" 21 22 #ifdef FAST_IDCT 23 24 /**************************************************************** 25 * vca_idct.c : created 6/1/99 for several options 26 * of hard-coded reduced idct function (using nz_coefs) 27 ******************************************************************/ 28 29 /*****************************************************/ 30 //pretested version 31 void idctrow0(int16 *, uint8 *, uint8 *, int) 32 { 33 return ; 34 } 35 void idctcol0(int16 *) 36 { 37 return ; 38 } 39 40 void idctrow1(int16 *blk, uint8 *pred, uint8 *dst, int width) 41 { 42 /* shortcut */ 43 int tmp; 44 int i = 8; 45 uint32 pred_word, dst_word; 46 int res, res2; 47 48 /* preset the offset, such that we can take advantage pre-offset addressing mode */ 49 width -= 4; 50 dst -= width; 51 pred -= 12; 52 blk -= 8; 53 54 while (i--) 55 { 56 tmp = (*(blk += 8) + 32) >> 6; 57 *blk = 0; 58 59 pred_word = *((uint32*)(pred += 12)); /* read 4 bytes from pred */ 60 res = tmp + (pred_word & 0xFF); 61 CLIP_RESULT(res); 62 res2 = tmp + ((pred_word >> 8) & 0xFF); 63 CLIP_RESULT(res2); 64 dst_word = (res2 << 8) | res; 65 res = tmp + ((pred_word >> 16) & 0xFF); 66 CLIP_RESULT(res); 67 dst_word |= (res << 16); 68 res = tmp + ((pred_word >> 24) & 0xFF); 69 CLIP_RESULT(res); 70 dst_word |= (res << 24); 71 *((uint32*)(dst += width)) = dst_word; /* save 4 bytes to dst */ 72 73 pred_word = *((uint32*)(pred += 4)); /* read 4 bytes from pred */ 74 res = tmp + (pred_word & 0xFF); 75 CLIP_RESULT(res); 76 res2 = tmp + ((pred_word >> 8) & 0xFF); 77 CLIP_RESULT(res2); 78 dst_word = (res2 << 8) | res; 79 res = tmp + ((pred_word >> 16) & 0xFF); 80 CLIP_RESULT(res); 81 dst_word |= (res << 16); 82 res = tmp + ((pred_word >> 24) & 0xFF); 83 CLIP_RESULT(res); 84 dst_word |= (res << 24); 85 *((uint32*)(dst += 4)) = dst_word; /* save 4 bytes to dst */ 86 } 87 return; 88 } 89 90 void idctcol1(int16 *blk) 91 { /* shortcut */ 92 blk[0] = blk[8] = blk[16] = blk[24] = blk[32] = blk[40] = blk[48] = blk[56] = 93 blk[0] << 3; 94 return; 95 } 96 97 void idctrow2(int16 *blk, uint8 *pred, uint8 *dst, int width) 98 { 99 int32 x0, x1, x2, x4, x5; 100 int i = 8; 101 uint32 pred_word, dst_word; 102 int res, res2; 103 104 /* preset the offset, such that we can take advantage pre-offset addressing mode */ 105 width -= 4; 106 dst -= width; 107 pred -= 12; 108 blk -= 8; 109 110 while (i--) 111 { 112 /* shortcut */ 113 x4 = blk[9]; 114 blk[9] = 0; 115 x0 = ((*(blk += 8)) << 8) + 8192; 116 *blk = 0; /* for proper rounding in the fourth stage */ 117 118 /* first stage */ 119 x5 = (W7 * x4 + 4) >> 3; 120 x4 = (W1 * x4 + 4) >> 3; 121 122 /* third stage */ 123 x2 = (181 * (x4 + x5) + 128) >> 8; 124 x1 = (181 * (x4 - x5) + 128) >> 8; 125 126 /* fourth stage */ 127 pred_word = *((uint32*)(pred += 12)); /* read 4 bytes from pred */ 128 res = (x0 + x4) >> 14; 129 ADD_AND_CLIP1(res); 130 res2 = (x0 + x2) >> 14; 131 ADD_AND_CLIP2(res2); 132 dst_word = (res2 << 8) | res; 133 res = (x0 + x1) >> 14; 134 ADD_AND_CLIP3(res); 135 dst_word |= (res << 16); 136 res = (x0 + x5) >> 14; 137 ADD_AND_CLIP4(res); 138 dst_word |= (res << 24); 139 *((uint32*)(dst += width)) = dst_word; /* save 4 bytes to dst */ 140 141 pred_word = *((uint32*)(pred += 4)); /* read 4 bytes from pred */ 142 res = (x0 - x5) >> 14; 143 ADD_AND_CLIP1(res); 144 res2 = (x0 - x1) >> 14; 145 ADD_AND_CLIP2(res2); 146 dst_word = (res2 << 8) | res; 147 res = (x0 - x2) >> 14; 148 ADD_AND_CLIP3(res); 149 dst_word |= (res << 16); 150 res = (x0 - x4) >> 14; 151 ADD_AND_CLIP4(res); 152 dst_word |= (res << 24); 153 *((uint32*)(dst += 4)) = dst_word; /* save 4 bytes to dst */ 154 } 155 return ; 156 } 157 158 void idctcol2(int16 *blk) 159 { 160 int32 x0, x1, x3, x5, x7;//, x8; 161 162 x1 = blk[8]; 163 x0 = ((int32)blk[0] << 11) + 128; 164 /* both upper and lower*/ 165 166 x7 = W7 * x1; 167 x1 = W1 * x1; 168 169 x3 = x7; 170 x5 = (181 * (x1 - x7) + 128) >> 8; 171 x7 = (181 * (x1 + x7) + 128) >> 8; 172 173 blk[0] = (x0 + x1) >> 8; 174 blk[8] = (x0 + x7) >> 8; 175 blk[16] = (x0 + x5) >> 8; 176 blk[24] = (x0 + x3) >> 8; 177 blk[56] = (x0 - x1) >> 8; 178 blk[48] = (x0 - x7) >> 8; 179 blk[40] = (x0 - x5) >> 8; 180 blk[32] = (x0 - x3) >> 8; 181 182 return ; 183 } 184 185 void idctrow3(int16 *blk, uint8 *pred, uint8 *dst, int width) 186 { 187 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8; 188 int i = 8; 189 uint32 pred_word, dst_word; 190 int res, res2; 191 192 /* preset the offset, such that we can take advantage pre-offset addressing mode */ 193 width -= 4; 194 dst -= width; 195 pred -= 12; 196 blk -= 8; 197 198 while (i--) 199 { 200 x2 = blk[10]; 201 blk[10] = 0; 202 x1 = blk[9]; 203 blk[9] = 0; 204 x0 = ((*(blk += 8)) << 8) + 8192; 205 *blk = 0; /* for proper rounding in the fourth stage */ 206 /* both upper and lower*/ 207 /* both x2orx6 and x0orx4 */ 208 209 x4 = x0; 210 x6 = (W6 * x2 + 4) >> 3; 211 x2 = (W2 * x2 + 4) >> 3; 212 x8 = x0 - x2; 213 x0 += x2; 214 x2 = x8; 215 x8 = x4 - x6; 216 x4 += x6; 217 x6 = x8; 218 219 x7 = (W7 * x1 + 4) >> 3; 220 x1 = (W1 * x1 + 4) >> 3; 221 x3 = x7; 222 x5 = (181 * (x1 - x7) + 128) >> 8; 223 x7 = (181 * (x1 + x7) + 128) >> 8; 224 225 pred_word = *((uint32*)(pred += 12)); /* read 4 bytes from pred */ 226 res = (x0 + x1) >> 14; 227 ADD_AND_CLIP1(res); 228 res2 = (x4 + x7) >> 14; 229 ADD_AND_CLIP2(res2); 230 dst_word = (res2 << 8) | res; 231 res = (x6 + x5) >> 14; 232 ADD_AND_CLIP3(res); 233 dst_word |= (res << 16); 234 res = (x2 + x3) >> 14; 235 ADD_AND_CLIP4(res); 236 dst_word |= (res << 24); 237 *((uint32*)(dst += width)) = dst_word; /* save 4 bytes to dst */ 238 239 pred_word = *((uint32*)(pred += 4)); /* read 4 bytes from pred */ 240 res = (x2 - x3) >> 14; 241 ADD_AND_CLIP1(res); 242 res2 = (x6 - x5) >> 14; 243 ADD_AND_CLIP2(res2); 244 dst_word = (res2 << 8) | res; 245 res = (x4 - x7) >> 14; 246 ADD_AND_CLIP3(res); 247 dst_word |= (res << 16); 248 res = (x0 - x1) >> 14; 249 ADD_AND_CLIP4(res); 250 dst_word |= (res << 24); 251 *((uint32*)(dst += 4)) = dst_word; /* save 4 bytes to dst */ 252 } 253 254 return ; 255 } 256 257 void idctcol3(int16 *blk) 258 { 259 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8; 260 261 x2 = blk[16]; 262 x1 = blk[8]; 263 x0 = ((int32)blk[0] << 11) + 128; 264 265 x4 = x0; 266 x6 = W6 * x2; 267 x2 = W2 * x2; 268 x8 = x0 - x2; 269 x0 += x2; 270 x2 = x8; 271 x8 = x4 - x6; 272 x4 += x6; 273 x6 = x8; 274 275 x7 = W7 * x1; 276 x1 = W1 * x1; 277 x3 = x7; 278 x5 = (181 * (x1 - x7) + 128) >> 8; 279 x7 = (181 * (x1 + x7) + 128) >> 8; 280 281 blk[0] = (x0 + x1) >> 8; 282 blk[8] = (x4 + x7) >> 8; 283 blk[16] = (x6 + x5) >> 8; 284 blk[24] = (x2 + x3) >> 8; 285 blk[56] = (x0 - x1) >> 8; 286 blk[48] = (x4 - x7) >> 8; 287 blk[40] = (x6 - x5) >> 8; 288 blk[32] = (x2 - x3) >> 8; 289 290 return; 291 } 292 293 294 void idctrow4(int16 *blk, uint8 *pred, uint8 *dst, int width) 295 { 296 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8; 297 int i = 8; 298 uint32 pred_word, dst_word; 299 int res, res2; 300 301 /* preset the offset, such that we can take advantage pre-offset addressing mode */ 302 width -= 4; 303 dst -= width; 304 pred -= 12; 305 blk -= 8; 306 307 while (i--) 308 { 309 x2 = blk[10]; 310 blk[10] = 0; 311 x1 = blk[9]; 312 blk[9] = 0; 313 x3 = blk[11]; 314 blk[11] = 0; 315 x0 = ((*(blk += 8)) << 8) + 8192; 316 *blk = 0; /* for proper rounding in the fourth stage */ 317 318 x4 = x0; 319 x6 = (W6 * x2 + 4) >> 3; 320 x2 = (W2 * x2 + 4) >> 3; 321 x8 = x0 - x2; 322 x0 += x2; 323 x2 = x8; 324 x8 = x4 - x6; 325 x4 += x6; 326 x6 = x8; 327 328 x7 = (W7 * x1 + 4) >> 3; 329 x1 = (W1 * x1 + 4) >> 3; 330 x5 = (W3 * x3 + 4) >> 3; 331 x3 = (- W5 * x3 + 4) >> 3; 332 x8 = x1 - x5; 333 x1 += x5; 334 x5 = x8; 335 x8 = x7 - x3; 336 x3 += x7; 337 x7 = (181 * (x5 + x8) + 128) >> 8; 338 x5 = (181 * (x5 - x8) + 128) >> 8; 339 340 pred_word = *((uint32*)(pred += 12)); /* read 4 bytes from pred */ 341 res = (x0 + x1) >> 14; 342 ADD_AND_CLIP1(res); 343 res2 = (x4 + x7) >> 14; 344 ADD_AND_CLIP2(res2); 345 dst_word = (res2 << 8) | res; 346 res = (x6 + x5) >> 14; 347 ADD_AND_CLIP3(res); 348 dst_word |= (res << 16); 349 res = (x2 + x3) >> 14; 350 ADD_AND_CLIP4(res); 351 dst_word |= (res << 24); 352 *((uint32*)(dst += width)) = dst_word; /* save 4 bytes to dst */ 353 354 pred_word = *((uint32*)(pred += 4)); /* read 4 bytes from pred */ 355 res = (x2 - x3) >> 14; 356 ADD_AND_CLIP1(res); 357 res2 = (x6 - x5) >> 14; 358 ADD_AND_CLIP2(res2); 359 dst_word = (res2 << 8) | res; 360 res = (x4 - x7) >> 14; 361 ADD_AND_CLIP3(res); 362 dst_word |= (res << 16); 363 res = (x0 - x1) >> 14; 364 ADD_AND_CLIP4(res); 365 dst_word |= (res << 24); 366 *((uint32*)(dst += 4)) = dst_word; /* save 4 bytes to dst */ 367 } 368 return ; 369 } 370 371 void idctcol4(int16 *blk) 372 { 373 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8; 374 x2 = blk[16]; 375 x1 = blk[8]; 376 x3 = blk[24]; 377 x0 = ((int32)blk[0] << 11) + 128; 378 379 x4 = x0; 380 x6 = W6 * x2; 381 x2 = W2 * x2; 382 x8 = x0 - x2; 383 x0 += x2; 384 x2 = x8; 385 x8 = x4 - x6; 386 x4 += x6; 387 x6 = x8; 388 389 x7 = W7 * x1; 390 x1 = W1 * x1; 391 x5 = W3 * x3; 392 x3 = -W5 * x3; 393 x8 = x1 - x5; 394 x1 += x5; 395 x5 = x8; 396 x8 = x7 - x3; 397 x3 += x7; 398 x7 = (181 * (x5 + x8) + 128) >> 8; 399 x5 = (181 * (x5 - x8) + 128) >> 8; 400 401 402 blk[0] = (x0 + x1) >> 8; 403 blk[8] = (x4 + x7) >> 8; 404 blk[16] = (x6 + x5) >> 8; 405 blk[24] = (x2 + x3) >> 8; 406 blk[56] = (x0 - x1) >> 8; 407 blk[48] = (x4 - x7) >> 8; 408 blk[40] = (x6 - x5) >> 8; 409 blk[32] = (x2 - x3) >> 8; 410 411 return ; 412 } 413 414 void idctrow0_intra(int16 *, PIXEL *, int) 415 { 416 return ; 417 } 418 419 void idctrow1_intra(int16 *blk, PIXEL *comp, int width) 420 { 421 /* shortcut */ 422 int32 tmp; 423 int i = 8; 424 int offset = width; 425 uint32 word; 426 427 comp -= offset; 428 while (i--) 429 { 430 tmp = ((blk[0] + 32) >> 6); 431 blk[0] = 0; 432 CLIP_RESULT(tmp) 433 434 word = (tmp << 8) | tmp; 435 word = (word << 16) | word; 436 437 *((uint32*)(comp += offset)) = word; 438 *((uint32*)(comp + 4)) = word; 439 440 441 442 443 blk += B_SIZE; 444 } 445 return; 446 } 447 448 void idctrow2_intra(int16 *blk, PIXEL *comp, int width) 449 { 450 int32 x0, x1, x2, x4, x5, temp; 451 int i = 8; 452 int offset = width; 453 int32 word; 454 455 comp -= offset; 456 while (i--) 457 { 458 /* shortcut */ 459 x4 = blk[1]; 460 blk[1] = 0; 461 x0 = ((int32)blk[0] << 8) + 8192; 462 blk[0] = 0; /* for proper rounding in the fourth stage */ 463 464 /* first stage */ 465 x5 = (W7 * x4 + 4) >> 3; 466 x4 = (W1 * x4 + 4) >> 3; 467 468 /* third stage */ 469 x2 = (181 * (x4 + x5) + 128) >> 8; 470 x1 = (181 * (x4 - x5) + 128) >> 8; 471 472 /* fourth stage */ 473 word = ((x0 + x4) >> 14); 474 CLIP_RESULT(word) 475 476 temp = ((x0 + x2) >> 14); 477 CLIP_RESULT(temp) 478 word = word | (temp << 8); 479 temp = ((x0 + x1) >> 14); 480 CLIP_RESULT(temp) 481 word = word | (temp << 16); 482 temp = ((x0 + x5) >> 14); 483 CLIP_RESULT(temp) 484 word = word | (temp << 24); 485 *((int32*)(comp += offset)) = word; 486 487 word = ((x0 - x5) >> 14); 488 CLIP_RESULT(word) 489 temp = ((x0 - x1) >> 14); 490 CLIP_RESULT(temp) 491 word = word | (temp << 8); 492 temp = ((x0 - x2) >> 14); 493 CLIP_RESULT(temp) 494 word = word | (temp << 16); 495 temp = ((x0 - x4) >> 14); 496 CLIP_RESULT(temp) 497 word = word | (temp << 24); 498 *((int32*)(comp + 4)) = word; 499 500 blk += B_SIZE; 501 } 502 return ; 503 } 504 505 void idctrow3_intra(int16 *blk, PIXEL *comp, int width) 506 { 507 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8, temp; 508 int i = 8; 509 int offset = width; 510 int32 word; 511 512 comp -= offset; 513 514 while (i--) 515 { 516 x2 = blk[2]; 517 blk[2] = 0; 518 x1 = blk[1]; 519 blk[1] = 0; 520 x0 = ((int32)blk[0] << 8) + 8192; 521 blk[0] = 0;/* for proper rounding in the fourth stage */ 522 /* both upper and lower*/ 523 /* both x2orx6 and x0orx4 */ 524 525 x4 = x0; 526 x6 = (W6 * x2 + 4) >> 3; 527 x2 = (W2 * x2 + 4) >> 3; 528 x8 = x0 - x2; 529 x0 += x2; 530 x2 = x8; 531 x8 = x4 - x6; 532 x4 += x6; 533 x6 = x8; 534 535 x7 = (W7 * x1 + 4) >> 3; 536 x1 = (W1 * x1 + 4) >> 3; 537 x3 = x7; 538 x5 = (181 * (x1 - x7) + 128) >> 8; 539 x7 = (181 * (x1 + x7) + 128) >> 8; 540 541 word = ((x0 + x1) >> 14); 542 CLIP_RESULT(word) 543 temp = ((x4 + x7) >> 14); 544 CLIP_RESULT(temp) 545 word = word | (temp << 8); 546 547 548 temp = ((x6 + x5) >> 14); 549 CLIP_RESULT(temp) 550 word = word | (temp << 16); 551 552 temp = ((x2 + x3) >> 14); 553 CLIP_RESULT(temp) 554 word = word | (temp << 24); 555 *((int32*)(comp += offset)) = word; 556 557 word = ((x2 - x3) >> 14); 558 CLIP_RESULT(word) 559 560 temp = ((x6 - x5) >> 14); 561 CLIP_RESULT(temp) 562 word = word | (temp << 8); 563 564 temp = ((x4 - x7) >> 14); 565 CLIP_RESULT(temp) 566 word = word | (temp << 16); 567 568 temp = ((x0 - x1) >> 14); 569 CLIP_RESULT(temp) 570 word = word | (temp << 24); 571 *((int32*)(comp + 4)) = word; 572 573 blk += B_SIZE; 574 } 575 return ; 576 } 577 578 void idctrow4_intra(int16 *blk, PIXEL *comp, int width) 579 { 580 int32 x0, x1, x2, x3, x4, x5, x6, x7, x8, temp; 581 int i = 8; 582 int offset = width; 583 int32 word; 584 585 comp -= offset; 586 587 while (i--) 588 { 589 x2 = blk[2]; 590 blk[2] = 0; 591 x1 = blk[1]; 592 blk[1] = 0; 593 x3 = blk[3]; 594 blk[3] = 0; 595 x0 = ((int32)blk[0] << 8) + 8192; 596 blk[0] = 0;/* for proper rounding in the fourth stage */ 597 598 x4 = x0; 599 x6 = (W6 * x2 + 4) >> 3; 600 x2 = (W2 * x2 + 4) >> 3; 601 x8 = x0 - x2; 602 x0 += x2; 603 x2 = x8; 604 x8 = x4 - x6; 605 x4 += x6; 606 x6 = x8; 607 608 x7 = (W7 * x1 + 4) >> 3; 609 x1 = (W1 * x1 + 4) >> 3; 610 x5 = (W3 * x3 + 4) >> 3; 611 x3 = (- W5 * x3 + 4) >> 3; 612 x8 = x1 - x5; 613 x1 += x5; 614 x5 = x8; 615 x8 = x7 - x3; 616 x3 += x7; 617 x7 = (181 * (x5 + x8) + 128) >> 8; 618 x5 = (181 * (x5 - x8) + 128) >> 8; 619 620 word = ((x0 + x1) >> 14); 621 CLIP_RESULT(word) 622 623 temp = ((x4 + x7) >> 14); 624 CLIP_RESULT(temp) 625 word = word | (temp << 8); 626 627 628 temp = ((x6 + x5) >> 14); 629 CLIP_RESULT(temp) 630 word = word | (temp << 16); 631 632 temp = ((x2 + x3) >> 14); 633 CLIP_RESULT(temp) 634 word = word | (temp << 24); 635 *((int32*)(comp += offset)) = word; 636 637 word = ((x2 - x3) >> 14); 638 CLIP_RESULT(word) 639 640 temp = ((x6 - x5) >> 14); 641 CLIP_RESULT(temp) 642 word = word | (temp << 8); 643 644 temp = ((x4 - x7) >> 14); 645 CLIP_RESULT(temp) 646 word = word | (temp << 16); 647 648 temp = ((x0 - x1) >> 14); 649 CLIP_RESULT(temp) 650 word = word | (temp << 24); 651 *((int32*)(comp + 4)) = word; 652 653 blk += B_SIZE; 654 } 655 656 return ; 657 } 658 659 #endif 660 661