1 /* ------------------------------------------------------------------ 2 * Copyright (C) 1998-2009 PacketVideo 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 13 * express or implied. 14 * See the License for the specific language governing permissions 15 * and limitations under the License. 16 * ------------------------------------------------------------------- 17 */ 18 #include "avcdec_lib.h" 19 20 21 #define CLIP_RESULT(x) if((uint)x > 0xFF){ \ 22 x = 0xFF & (~(x>>31));} 23 24 /* (blkwidth << 2) + (dy << 1) + dx */ 25 static void (*const ChromaMC_SIMD[8])(uint8 *, int , int , int , uint8 *, int, int , int) = 26 { 27 &ChromaFullMC_SIMD, 28 &ChromaHorizontalMC_SIMD, 29 &ChromaVerticalMC_SIMD, 30 &ChromaDiagonalMC_SIMD, 31 &ChromaFullMC_SIMD, 32 &ChromaHorizontalMC2_SIMD, 33 &ChromaVerticalMC2_SIMD, 34 &ChromaDiagonalMC2_SIMD 35 }; 36 /* Perform motion prediction and compensation with residue if exist. */ 37 void InterMBPrediction(AVCCommonObj *video) 38 { 39 AVCMacroblock *currMB = video->currMB; 40 AVCPictureData *currPic = video->currPic; 41 int mbPartIdx, subMbPartIdx; 42 int ref_idx; 43 int offset_MbPart_indx = 0; 44 int16 *mv; 45 uint32 x_pos, y_pos; 46 uint8 *curL, *curCb, *curCr; 47 uint8 *ref_l, *ref_Cb, *ref_Cr; 48 uint8 *predBlock, *predCb, *predCr; 49 int block_x, block_y, offset_x, offset_y, offsetP, offset; 50 int x_position = (video->mb_x << 4); 51 int y_position = (video->mb_y << 4); 52 int MbHeight, MbWidth, mbPartIdx_X, mbPartIdx_Y, offset_indx; 53 int picWidth = currPic->pitch; 54 int picHeight = currPic->height; 55 int16 *dataBlock; 56 uint32 cbp4x4; 57 uint32 tmp_word; 58 59 tmp_word = y_position * picWidth; 60 curL = currPic->Sl + tmp_word + x_position; 61 offset = (tmp_word >> 2) + (x_position >> 1); 62 curCb = currPic->Scb + offset; 63 curCr = currPic->Scr + offset; 64 65 #ifdef USE_PRED_BLOCK 66 predBlock = video->pred + 84; 67 predCb = video->pred + 452; 68 predCr = video->pred + 596; 69 #else 70 predBlock = curL; 71 predCb = curCb; 72 predCr = curCr; 73 #endif 74 75 GetMotionVectorPredictor(video, false); 76 77 for (mbPartIdx = 0; mbPartIdx < currMB->NumMbPart; mbPartIdx++) 78 { 79 MbHeight = currMB->SubMbPartHeight[mbPartIdx]; 80 MbWidth = currMB->SubMbPartWidth[mbPartIdx]; 81 mbPartIdx_X = ((mbPartIdx + offset_MbPart_indx) & 1); 82 mbPartIdx_Y = (mbPartIdx + offset_MbPart_indx) >> 1; 83 ref_idx = currMB->ref_idx_L0[(mbPartIdx_Y << 1) + mbPartIdx_X]; 84 offset_indx = 0; 85 86 ref_l = video->RefPicList0[ref_idx]->Sl; 87 ref_Cb = video->RefPicList0[ref_idx]->Scb; 88 ref_Cr = video->RefPicList0[ref_idx]->Scr; 89 90 for (subMbPartIdx = 0; subMbPartIdx < currMB->NumSubMbPart[mbPartIdx]; subMbPartIdx++) 91 { 92 block_x = (mbPartIdx_X << 1) + ((subMbPartIdx + offset_indx) & 1); // check this 93 block_y = (mbPartIdx_Y << 1) + (((subMbPartIdx + offset_indx) >> 1) & 1); 94 mv = (int16*)(currMB->mvL0 + block_x + (block_y << 2)); 95 offset_x = x_position + (block_x << 2); 96 offset_y = y_position + (block_y << 2); 97 x_pos = (offset_x << 2) + *mv++; /*quarter pel */ 98 y_pos = (offset_y << 2) + *mv; /*quarter pel */ 99 100 //offset = offset_y * currPic->width; 101 //offsetC = (offset >> 2) + (offset_x >> 1); 102 #ifdef USE_PRED_BLOCK 103 offsetP = (block_y * 80) + (block_x << 2); 104 LumaMotionComp(ref_l, picWidth, picHeight, x_pos, y_pos, 105 /*comp_Sl + offset + offset_x,*/ 106 predBlock + offsetP, 20, MbWidth, MbHeight); 107 #else 108 offsetP = (block_y << 2) * picWidth + (block_x << 2); 109 LumaMotionComp(ref_l, picWidth, picHeight, x_pos, y_pos, 110 /*comp_Sl + offset + offset_x,*/ 111 predBlock + offsetP, picWidth, MbWidth, MbHeight); 112 #endif 113 114 #ifdef USE_PRED_BLOCK 115 offsetP = (block_y * 24) + (block_x << 1); 116 ChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos, 117 /*comp_Scb + offsetC,*/ 118 predCb + offsetP, 12, MbWidth >> 1, MbHeight >> 1); 119 ChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos, 120 /*comp_Scr + offsetC,*/ 121 predCr + offsetP, 12, MbWidth >> 1, MbHeight >> 1); 122 #else 123 offsetP = (block_y * picWidth) + (block_x << 1); 124 ChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos, 125 /*comp_Scb + offsetC,*/ 126 predCb + offsetP, picWidth >> 1, MbWidth >> 1, MbHeight >> 1); 127 ChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos, 128 /*comp_Scr + offsetC,*/ 129 predCr + offsetP, picWidth >> 1, MbWidth >> 1, MbHeight >> 1); 130 #endif 131 132 offset_indx = currMB->SubMbPartWidth[mbPartIdx] >> 3; 133 } 134 offset_MbPart_indx = currMB->MbPartWidth >> 4; 135 } 136 137 /* used in decoder, used to be if(!encFlag) */ 138 139 /* transform in raster scan order */ 140 dataBlock = video->block; 141 cbp4x4 = video->cbp4x4; 142 /* luma */ 143 for (block_y = 4; block_y > 0; block_y--) 144 { 145 for (block_x = 4; block_x > 0; block_x--) 146 { 147 #ifdef USE_PRED_BLOCK 148 if (cbp4x4&1) 149 { 150 itrans(dataBlock, predBlock, predBlock, 20); 151 } 152 #else 153 if (cbp4x4&1) 154 { 155 itrans(dataBlock, curL, curL, picWidth); 156 } 157 #endif 158 cbp4x4 >>= 1; 159 dataBlock += 4; 160 #ifdef USE_PRED_BLOCK 161 predBlock += 4; 162 #else 163 curL += 4; 164 #endif 165 } 166 dataBlock += 48; 167 #ifdef USE_PRED_BLOCK 168 predBlock += 64; 169 #else 170 curL += ((picWidth << 2) - 16); 171 #endif 172 } 173 174 /* chroma */ 175 picWidth = (picWidth >> 1); 176 for (block_y = 2; block_y > 0; block_y--) 177 { 178 for (block_x = 2; block_x > 0; block_x--) 179 { 180 #ifdef USE_PRED_BLOCK 181 if (cbp4x4&1) 182 { 183 ictrans(dataBlock, predCb, predCb, 12); 184 } 185 #else 186 if (cbp4x4&1) 187 { 188 ictrans(dataBlock, curCb, curCb, picWidth); 189 } 190 #endif 191 cbp4x4 >>= 1; 192 dataBlock += 4; 193 #ifdef USE_PRED_BLOCK 194 predCb += 4; 195 #else 196 curCb += 4; 197 #endif 198 } 199 for (block_x = 2; block_x > 0; block_x--) 200 { 201 #ifdef USE_PRED_BLOCK 202 if (cbp4x4&1) 203 { 204 ictrans(dataBlock, predCr, predCr, 12); 205 } 206 #else 207 if (cbp4x4&1) 208 { 209 ictrans(dataBlock, curCr, curCr, picWidth); 210 } 211 #endif 212 cbp4x4 >>= 1; 213 dataBlock += 4; 214 #ifdef USE_PRED_BLOCK 215 predCr += 4; 216 #else 217 curCr += 4; 218 #endif 219 } 220 dataBlock += 48; 221 #ifdef USE_PRED_BLOCK 222 predCb += 40; 223 predCr += 40; 224 #else 225 curCb += ((picWidth << 2) - 8); 226 curCr += ((picWidth << 2) - 8); 227 #endif 228 } 229 230 #ifdef MB_BASED_DEBLOCK 231 SaveNeighborForIntraPred(video, offset); 232 #endif 233 234 return ; 235 } 236 237 238 /* preform the actual motion comp here */ 239 void LumaMotionComp(uint8 *ref, int picwidth, int picheight, 240 int x_pos, int y_pos, 241 uint8 *pred, int pred_pitch, 242 int blkwidth, int blkheight) 243 { 244 int dx, dy; 245 uint8 temp[24][24]; /* for padding, make the size multiple of 4 for packing */ 246 int temp2[21][21]; /* for intermediate results */ 247 uint8 *ref2; 248 249 dx = x_pos & 3; 250 dy = y_pos & 3; 251 x_pos = x_pos >> 2; /* round it to full-pel resolution */ 252 y_pos = y_pos >> 2; 253 254 /* perform actual motion compensation */ 255 if (dx == 0 && dy == 0) 256 { /* fullpel position *//* G */ 257 if (x_pos >= 0 && x_pos + blkwidth <= picwidth && y_pos >= 0 && y_pos + blkheight <= picheight) 258 { 259 ref += y_pos * picwidth + x_pos; 260 FullPelMC(ref, picwidth, pred, pred_pitch, blkwidth, blkheight); 261 } 262 else 263 { 264 CreatePad(ref, picwidth, picheight, x_pos, y_pos, &temp[0][0], blkwidth, blkheight); 265 FullPelMC(&temp[0][0], 24, pred, pred_pitch, blkwidth, blkheight); 266 } 267 268 } /* other positions */ 269 else if (dy == 0) 270 { /* no vertical interpolation *//* a,b,c*/ 271 272 if (x_pos - 2 >= 0 && x_pos + 3 + blkwidth <= picwidth && y_pos >= 0 && y_pos + blkheight <= picheight) 273 { 274 ref += y_pos * picwidth + x_pos; 275 276 HorzInterp1MC(ref, picwidth, pred, pred_pitch, blkwidth, blkheight, dx); 277 } 278 else /* need padding */ 279 { 280 CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos, &temp[0][0], blkwidth + 5, blkheight); 281 282 HorzInterp1MC(&temp[0][2], 24, pred, pred_pitch, blkwidth, blkheight, dx); 283 } 284 } 285 else if (dx == 0) 286 { /*no horizontal interpolation *//* d,h,n */ 287 288 if (x_pos >= 0 && x_pos + blkwidth <= picwidth && y_pos - 2 >= 0 && y_pos + 3 + blkheight <= picheight) 289 { 290 ref += y_pos * picwidth + x_pos; 291 292 VertInterp1MC(ref, picwidth, pred, pred_pitch, blkwidth, blkheight, dy); 293 } 294 else /* need padding */ 295 { 296 CreatePad(ref, picwidth, picheight, x_pos, y_pos - 2, &temp[0][0], blkwidth, blkheight + 5); 297 298 VertInterp1MC(&temp[2][0], 24, pred, pred_pitch, blkwidth, blkheight, dy); 299 } 300 } 301 else if (dy == 2) 302 { /* horizontal cross *//* i, j, k */ 303 304 if (x_pos - 2 >= 0 && x_pos + 3 + blkwidth <= picwidth && y_pos - 2 >= 0 && y_pos + 3 + blkheight <= picheight) 305 { 306 ref += y_pos * picwidth + x_pos - 2; /* move to the left 2 pixels */ 307 308 VertInterp2MC(ref, picwidth, &temp2[0][0], 21, blkwidth + 5, blkheight); 309 310 HorzInterp2MC(&temp2[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx); 311 } 312 else /* need padding */ 313 { 314 CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &temp[0][0], blkwidth + 5, blkheight + 5); 315 316 VertInterp2MC(&temp[2][0], 24, &temp2[0][0], 21, blkwidth + 5, blkheight); 317 318 HorzInterp2MC(&temp2[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx); 319 } 320 } 321 else if (dx == 2) 322 { /* vertical cross */ /* f,q */ 323 324 if (x_pos - 2 >= 0 && x_pos + 3 + blkwidth <= picwidth && y_pos - 2 >= 0 && y_pos + 3 + blkheight <= picheight) 325 { 326 ref += (y_pos - 2) * picwidth + x_pos; /* move to up 2 lines */ 327 328 HorzInterp3MC(ref, picwidth, &temp2[0][0], 21, blkwidth, blkheight + 5); 329 VertInterp3MC(&temp2[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy); 330 } 331 else /* need padding */ 332 { 333 CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &temp[0][0], blkwidth + 5, blkheight + 5); 334 HorzInterp3MC(&temp[0][2], 24, &temp2[0][0], 21, blkwidth, blkheight + 5); 335 VertInterp3MC(&temp2[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy); 336 } 337 } 338 else 339 { /* diagonal *//* e,g,p,r */ 340 341 if (x_pos - 2 >= 0 && x_pos + 3 + (dx / 2) + blkwidth <= picwidth && 342 y_pos - 2 >= 0 && y_pos + 3 + blkheight + (dy / 2) <= picheight) 343 { 344 ref2 = ref + (y_pos + (dy / 2)) * picwidth + x_pos; 345 346 ref += (y_pos * picwidth) + x_pos + (dx / 2); 347 348 DiagonalInterpMC(ref2, ref, picwidth, pred, pred_pitch, blkwidth, blkheight); 349 } 350 else /* need padding */ 351 { 352 CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &temp[0][0], blkwidth + 5 + (dx / 2), blkheight + 5 + (dy / 2)); 353 354 ref2 = &temp[2 + (dy/2)][2]; 355 356 ref = &temp[2][2 + (dx/2)]; 357 358 DiagonalInterpMC(ref2, ref, 24, pred, pred_pitch, blkwidth, blkheight); 359 } 360 } 361 362 return ; 363 } 364 365 void CreateAlign(uint8 *ref, int picwidth, int y_pos, 366 uint8 *out, int blkwidth, int blkheight) 367 { 368 int i, j; 369 int offset, out_offset; 370 uint32 prev_pix, result, pix1, pix2, pix4; 371 372 out_offset = 24 - blkwidth; 373 374 //switch(x_pos&0x3){ 375 switch (((uint32)ref)&0x3) 376 { 377 case 1: 378 ref += y_pos * picwidth; 379 offset = picwidth - blkwidth - 3; 380 for (j = 0; j < blkheight; j++) 381 { 382 pix1 = *ref++; 383 pix2 = *((uint16*)ref); 384 ref += 2; 385 result = (pix2 << 8) | pix1; 386 387 for (i = 3; i < blkwidth; i += 4) 388 { 389 pix4 = *((uint32*)ref); 390 ref += 4; 391 prev_pix = (pix4 << 24) & 0xFF000000; /* mask out byte belong to previous word */ 392 result |= prev_pix; 393 *((uint32*)out) = result; /* write 4 bytes */ 394 out += 4; 395 result = pix4 >> 8; /* for the next loop */ 396 } 397 ref += offset; 398 out += out_offset; 399 } 400 break; 401 case 2: 402 ref += y_pos * picwidth; 403 offset = picwidth - blkwidth - 2; 404 for (j = 0; j < blkheight; j++) 405 { 406 result = *((uint16*)ref); 407 ref += 2; 408 for (i = 2; i < blkwidth; i += 4) 409 { 410 pix4 = *((uint32*)ref); 411 ref += 4; 412 prev_pix = (pix4 << 16) & 0xFFFF0000; /* mask out byte belong to previous word */ 413 result |= prev_pix; 414 *((uint32*)out) = result; /* write 4 bytes */ 415 out += 4; 416 result = pix4 >> 16; /* for the next loop */ 417 } 418 ref += offset; 419 out += out_offset; 420 } 421 break; 422 case 3: 423 ref += y_pos * picwidth; 424 offset = picwidth - blkwidth - 1; 425 for (j = 0; j < blkheight; j++) 426 { 427 result = *ref++; 428 for (i = 1; i < blkwidth; i += 4) 429 { 430 pix4 = *((uint32*)ref); 431 ref += 4; 432 prev_pix = (pix4 << 8) & 0xFFFFFF00; /* mask out byte belong to previous word */ 433 result |= prev_pix; 434 *((uint32*)out) = result; /* write 4 bytes */ 435 out += 4; 436 result = pix4 >> 24; /* for the next loop */ 437 } 438 ref += offset; 439 out += out_offset; 440 } 441 break; 442 } 443 } 444 445 void CreatePad(uint8 *ref, int picwidth, int picheight, int x_pos, int y_pos, 446 uint8 *out, int blkwidth, int blkheight) 447 { 448 int x_inc0, x_mid; 449 int y_inc, y_inc0, y_inc1, y_mid; 450 int i, j; 451 int offset; 452 453 if (x_pos < 0) 454 { 455 x_inc0 = 0; /* increment for the first part */ 456 x_mid = ((blkwidth + x_pos > 0) ? -x_pos : blkwidth); /* stopping point */ 457 x_pos = 0; 458 } 459 else if (x_pos + blkwidth > picwidth) 460 { 461 x_inc0 = 1; /* increasing */ 462 x_mid = ((picwidth > x_pos) ? picwidth - x_pos - 1 : 0); /* clip negative to zero, encode fool proof! */ 463 } 464 else /* normal case */ 465 { 466 x_inc0 = 1; 467 x_mid = blkwidth; /* just one run */ 468 } 469 470 471 /* boundary for y_pos, taking the result from x_pos into account */ 472 if (y_pos < 0) 473 { 474 y_inc0 = (x_inc0 ? - x_mid : -blkwidth + x_mid); /* offset depending on x_inc1 and x_inc0 */ 475 y_inc1 = picwidth + y_inc0; 476 y_mid = ((blkheight + y_pos > 0) ? -y_pos : blkheight); /* clip to prevent memory corruption */ 477 y_pos = 0; 478 } 479 else if (y_pos + blkheight > picheight) 480 { 481 y_inc1 = (x_inc0 ? - x_mid : -blkwidth + x_mid); /* saturate */ 482 y_inc0 = picwidth + y_inc1; /* increasing */ 483 y_mid = ((picheight > y_pos) ? picheight - 1 - y_pos : 0); 484 } 485 else /* normal case */ 486 { 487 y_inc1 = (x_inc0 ? - x_mid : -blkwidth + x_mid); 488 y_inc0 = picwidth + y_inc1; 489 y_mid = blkheight; 490 } 491 492 /* clip y_pos and x_pos */ 493 if (y_pos > picheight - 1) y_pos = picheight - 1; 494 if (x_pos > picwidth - 1) x_pos = picwidth - 1; 495 496 ref += y_pos * picwidth + x_pos; 497 498 y_inc = y_inc0; /* start with top half */ 499 500 offset = 24 - blkwidth; /* to use in offset out */ 501 blkwidth -= x_mid; /* to use in the loop limit */ 502 503 if (x_inc0 == 0) 504 { 505 for (j = 0; j < blkheight; j++) 506 { 507 if (j == y_mid) /* put a check here to reduce the code size (for unrolling the loop) */ 508 { 509 y_inc = y_inc1; /* switch to lower half */ 510 } 511 for (i = x_mid; i > 0; i--) /* first or third quarter */ 512 { 513 *out++ = *ref; 514 } 515 for (i = blkwidth; i > 0; i--) /* second or fourth quarter */ 516 { 517 *out++ = *ref++; 518 } 519 out += offset; 520 ref += y_inc; 521 } 522 } 523 else 524 { 525 for (j = 0; j < blkheight; j++) 526 { 527 if (j == y_mid) /* put a check here to reduce the code size (for unrolling the loop) */ 528 { 529 y_inc = y_inc1; /* switch to lower half */ 530 } 531 for (i = x_mid; i > 0; i--) /* first or third quarter */ 532 { 533 *out++ = *ref++; 534 } 535 for (i = blkwidth; i > 0; i--) /* second or fourth quarter */ 536 { 537 *out++ = *ref; 538 } 539 out += offset; 540 ref += y_inc; 541 } 542 } 543 544 return ; 545 } 546 547 void HorzInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch, 548 int blkwidth, int blkheight, int dx) 549 { 550 uint8 *p_ref; 551 uint32 *p_cur; 552 uint32 tmp, pkres; 553 int result, curr_offset, ref_offset; 554 int j; 555 int32 r0, r1, r2, r3, r4, r5; 556 int32 r13, r6; 557 558 p_cur = (uint32*)out; /* assume it's word aligned */ 559 curr_offset = (outpitch - blkwidth) >> 2; 560 p_ref = in; 561 ref_offset = inpitch - blkwidth; 562 563 if (dx&1) 564 { 565 dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */ 566 p_ref -= 2; 567 r13 = 0; 568 for (j = blkheight; j > 0; j--) 569 { 570 tmp = (uint32)(p_ref + blkwidth); 571 r0 = p_ref[0]; 572 r1 = p_ref[2]; 573 r0 |= (r1 << 16); /* 0,c,0,a */ 574 r1 = p_ref[1]; 575 r2 = p_ref[3]; 576 r1 |= (r2 << 16); /* 0,d,0,b */ 577 while ((uint32)p_ref < tmp) 578 { 579 r2 = *(p_ref += 4); /* move pointer to e */ 580 r3 = p_ref[2]; 581 r2 |= (r3 << 16); /* 0,g,0,e */ 582 r3 = p_ref[1]; 583 r4 = p_ref[3]; 584 r3 |= (r4 << 16); /* 0,h,0,f */ 585 586 r4 = r0 + r3; /* c+h, a+f */ 587 r5 = r0 + r1; /* c+d, a+b */ 588 r6 = r2 + r3; /* g+h, e+f */ 589 r5 >>= 16; 590 r5 |= (r6 << 16); /* e+f, c+d */ 591 r4 += r5 * 20; /* c+20*e+20*f+h, a+20*c+20*d+f */ 592 r4 += 0x100010; /* +16, +16 */ 593 r5 = r1 + r2; /* d+g, b+e */ 594 r4 -= r5 * 5; /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */ 595 r4 >>= 5; 596 r13 |= r4; /* check clipping */ 597 598 r5 = p_ref[dx+2]; 599 r6 = p_ref[dx+4]; 600 r5 |= (r6 << 16); 601 r4 += r5; 602 r4 += 0x10001; 603 r4 = (r4 >> 1) & 0xFF00FF; 604 605 r5 = p_ref[4]; /* i */ 606 r6 = (r5 << 16); 607 r5 = r6 | (r2 >> 16);/* 0,i,0,g */ 608 r5 += r1; /* d+i, b+g */ /* r5 not free */ 609 r1 >>= 16; 610 r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */ 611 r1 += r2; /* f+g, d+e */ 612 r5 += 20 * r1; /* d+20f+20g+i, b+20d+20e+g */ 613 r0 >>= 16; 614 r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */ 615 r0 += r3; /* e+h, c+f */ 616 r5 += 0x100010; /* 16,16 */ 617 r5 -= r0 * 5; /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */ 618 r5 >>= 5; 619 r13 |= r5; /* check clipping */ 620 621 r0 = p_ref[dx+3]; 622 r1 = p_ref[dx+5]; 623 r0 |= (r1 << 16); 624 r5 += r0; 625 r5 += 0x10001; 626 r5 = (r5 >> 1) & 0xFF00FF; 627 628 r4 |= (r5 << 8); /* pack them together */ 629 *p_cur++ = r4; 630 r1 = r3; 631 r0 = r2; 632 } 633 p_cur += curr_offset; /* move to the next line */ 634 p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */ 635 636 if (r13&0xFF000700) /* need clipping */ 637 { 638 /* move back to the beginning of the line */ 639 p_ref -= (ref_offset + blkwidth); /* input */ 640 p_cur -= (outpitch >> 2); 641 642 tmp = (uint32)(p_ref + blkwidth); 643 for (; (uint32)p_ref < tmp;) 644 { 645 646 r0 = *p_ref++; 647 r1 = *p_ref++; 648 r2 = *p_ref++; 649 r3 = *p_ref++; 650 r4 = *p_ref++; 651 /* first pixel */ 652 r5 = *p_ref++; 653 result = (r0 + r5); 654 r0 = (r1 + r4); 655 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 656 r0 = (r2 + r3); 657 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 658 result = (result + 16) >> 5; 659 CLIP_RESULT(result) 660 /* 3/4 pel, no need to clip */ 661 result = (result + p_ref[dx] + 1); 662 pkres = (result >> 1) ; 663 /* second pixel */ 664 r0 = *p_ref++; 665 result = (r1 + r0); 666 r1 = (r2 + r5); 667 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 668 r1 = (r3 + r4); 669 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 670 result = (result + 16) >> 5; 671 CLIP_RESULT(result) 672 /* 3/4 pel, no need to clip */ 673 result = (result + p_ref[dx] + 1); 674 result = (result >> 1); 675 pkres |= (result << 8); 676 /* third pixel */ 677 r1 = *p_ref++; 678 result = (r2 + r1); 679 r2 = (r3 + r0); 680 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 681 r2 = (r4 + r5); 682 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 683 result = (result + 16) >> 5; 684 CLIP_RESULT(result) 685 /* 3/4 pel, no need to clip */ 686 result = (result + p_ref[dx] + 1); 687 result = (result >> 1); 688 pkres |= (result << 16); 689 /* fourth pixel */ 690 r2 = *p_ref++; 691 result = (r3 + r2); 692 r3 = (r4 + r1); 693 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 694 r3 = (r5 + r0); 695 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 696 result = (result + 16) >> 5; 697 CLIP_RESULT(result) 698 /* 3/4 pel, no need to clip */ 699 result = (result + p_ref[dx] + 1); 700 result = (result >> 1); 701 pkres |= (result << 24); 702 *p_cur++ = pkres; /* write 4 pixels */ 703 p_ref -= 5; /* offset back to the middle of filter */ 704 } 705 p_cur += curr_offset; /* move to the next line */ 706 p_ref += ref_offset; /* move to the next line */ 707 } 708 } 709 } 710 else 711 { 712 p_ref -= 2; 713 r13 = 0; 714 for (j = blkheight; j > 0; j--) 715 { 716 tmp = (uint32)(p_ref + blkwidth); 717 r0 = p_ref[0]; 718 r1 = p_ref[2]; 719 r0 |= (r1 << 16); /* 0,c,0,a */ 720 r1 = p_ref[1]; 721 r2 = p_ref[3]; 722 r1 |= (r2 << 16); /* 0,d,0,b */ 723 while ((uint32)p_ref < tmp) 724 { 725 r2 = *(p_ref += 4); /* move pointer to e */ 726 r3 = p_ref[2]; 727 r2 |= (r3 << 16); /* 0,g,0,e */ 728 r3 = p_ref[1]; 729 r4 = p_ref[3]; 730 r3 |= (r4 << 16); /* 0,h,0,f */ 731 732 r4 = r0 + r3; /* c+h, a+f */ 733 r5 = r0 + r1; /* c+d, a+b */ 734 r6 = r2 + r3; /* g+h, e+f */ 735 r5 >>= 16; 736 r5 |= (r6 << 16); /* e+f, c+d */ 737 r4 += r5 * 20; /* c+20*e+20*f+h, a+20*c+20*d+f */ 738 r4 += 0x100010; /* +16, +16 */ 739 r5 = r1 + r2; /* d+g, b+e */ 740 r4 -= r5 * 5; /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */ 741 r4 >>= 5; 742 r13 |= r4; /* check clipping */ 743 r4 &= 0xFF00FF; /* mask */ 744 745 r5 = p_ref[4]; /* i */ 746 r6 = (r5 << 16); 747 r5 = r6 | (r2 >> 16);/* 0,i,0,g */ 748 r5 += r1; /* d+i, b+g */ /* r5 not free */ 749 r1 >>= 16; 750 r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */ 751 r1 += r2; /* f+g, d+e */ 752 r5 += 20 * r1; /* d+20f+20g+i, b+20d+20e+g */ 753 r0 >>= 16; 754 r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */ 755 r0 += r3; /* e+h, c+f */ 756 r5 += 0x100010; /* 16,16 */ 757 r5 -= r0 * 5; /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */ 758 r5 >>= 5; 759 r13 |= r5; /* check clipping */ 760 r5 &= 0xFF00FF; /* mask */ 761 762 r4 |= (r5 << 8); /* pack them together */ 763 *p_cur++ = r4; 764 r1 = r3; 765 r0 = r2; 766 } 767 p_cur += curr_offset; /* move to the next line */ 768 p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */ 769 770 if (r13&0xFF000700) /* need clipping */ 771 { 772 /* move back to the beginning of the line */ 773 p_ref -= (ref_offset + blkwidth); /* input */ 774 p_cur -= (outpitch >> 2); 775 776 tmp = (uint32)(p_ref + blkwidth); 777 for (; (uint32)p_ref < tmp;) 778 { 779 780 r0 = *p_ref++; 781 r1 = *p_ref++; 782 r2 = *p_ref++; 783 r3 = *p_ref++; 784 r4 = *p_ref++; 785 /* first pixel */ 786 r5 = *p_ref++; 787 result = (r0 + r5); 788 r0 = (r1 + r4); 789 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 790 r0 = (r2 + r3); 791 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 792 result = (result + 16) >> 5; 793 CLIP_RESULT(result) 794 pkres = result; 795 /* second pixel */ 796 r0 = *p_ref++; 797 result = (r1 + r0); 798 r1 = (r2 + r5); 799 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 800 r1 = (r3 + r4); 801 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 802 result = (result + 16) >> 5; 803 CLIP_RESULT(result) 804 pkres |= (result << 8); 805 /* third pixel */ 806 r1 = *p_ref++; 807 result = (r2 + r1); 808 r2 = (r3 + r0); 809 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 810 r2 = (r4 + r5); 811 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 812 result = (result + 16) >> 5; 813 CLIP_RESULT(result) 814 pkres |= (result << 16); 815 /* fourth pixel */ 816 r2 = *p_ref++; 817 result = (r3 + r2); 818 r3 = (r4 + r1); 819 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 820 r3 = (r5 + r0); 821 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 822 result = (result + 16) >> 5; 823 CLIP_RESULT(result) 824 pkres |= (result << 24); 825 *p_cur++ = pkres; /* write 4 pixels */ 826 p_ref -= 5; 827 } 828 p_cur += curr_offset; /* move to the next line */ 829 p_ref += ref_offset; 830 } 831 } 832 } 833 834 return ; 835 } 836 837 void HorzInterp2MC(int *in, int inpitch, uint8 *out, int outpitch, 838 int blkwidth, int blkheight, int dx) 839 { 840 int *p_ref; 841 uint32 *p_cur; 842 uint32 tmp, pkres; 843 int result, result2, curr_offset, ref_offset; 844 int j, r0, r1, r2, r3, r4, r5; 845 846 p_cur = (uint32*)out; /* assume it's word aligned */ 847 curr_offset = (outpitch - blkwidth) >> 2; 848 p_ref = in; 849 ref_offset = inpitch - blkwidth; 850 851 if (dx&1) 852 { 853 dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */ 854 855 for (j = blkheight; j > 0 ; j--) 856 { 857 tmp = (uint32)(p_ref + blkwidth); 858 for (; (uint32)p_ref < tmp;) 859 { 860 861 r0 = p_ref[-2]; 862 r1 = p_ref[-1]; 863 r2 = *p_ref++; 864 r3 = *p_ref++; 865 r4 = *p_ref++; 866 /* first pixel */ 867 r5 = *p_ref++; 868 result = (r0 + r5); 869 r0 = (r1 + r4); 870 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 871 r0 = (r2 + r3); 872 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 873 result = (result + 512) >> 10; 874 CLIP_RESULT(result) 875 result2 = ((p_ref[dx] + 16) >> 5); 876 CLIP_RESULT(result2) 877 /* 3/4 pel, no need to clip */ 878 result = (result + result2 + 1); 879 pkres = (result >> 1); 880 /* second pixel */ 881 r0 = *p_ref++; 882 result = (r1 + r0); 883 r1 = (r2 + r5); 884 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 885 r1 = (r3 + r4); 886 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 887 result = (result + 512) >> 10; 888 CLIP_RESULT(result) 889 result2 = ((p_ref[dx] + 16) >> 5); 890 CLIP_RESULT(result2) 891 /* 3/4 pel, no need to clip */ 892 result = (result + result2 + 1); 893 result = (result >> 1); 894 pkres |= (result << 8); 895 /* third pixel */ 896 r1 = *p_ref++; 897 result = (r2 + r1); 898 r2 = (r3 + r0); 899 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 900 r2 = (r4 + r5); 901 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 902 result = (result + 512) >> 10; 903 CLIP_RESULT(result) 904 result2 = ((p_ref[dx] + 16) >> 5); 905 CLIP_RESULT(result2) 906 /* 3/4 pel, no need to clip */ 907 result = (result + result2 + 1); 908 result = (result >> 1); 909 pkres |= (result << 16); 910 /* fourth pixel */ 911 r2 = *p_ref++; 912 result = (r3 + r2); 913 r3 = (r4 + r1); 914 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 915 r3 = (r5 + r0); 916 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 917 result = (result + 512) >> 10; 918 CLIP_RESULT(result) 919 result2 = ((p_ref[dx] + 16) >> 5); 920 CLIP_RESULT(result2) 921 /* 3/4 pel, no need to clip */ 922 result = (result + result2 + 1); 923 result = (result >> 1); 924 pkres |= (result << 24); 925 *p_cur++ = pkres; /* write 4 pixels */ 926 p_ref -= 3; /* offset back to the middle of filter */ 927 } 928 p_cur += curr_offset; /* move to the next line */ 929 p_ref += ref_offset; /* move to the next line */ 930 } 931 } 932 else 933 { 934 for (j = blkheight; j > 0 ; j--) 935 { 936 tmp = (uint32)(p_ref + blkwidth); 937 for (; (uint32)p_ref < tmp;) 938 { 939 940 r0 = p_ref[-2]; 941 r1 = p_ref[-1]; 942 r2 = *p_ref++; 943 r3 = *p_ref++; 944 r4 = *p_ref++; 945 /* first pixel */ 946 r5 = *p_ref++; 947 result = (r0 + r5); 948 r0 = (r1 + r4); 949 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 950 r0 = (r2 + r3); 951 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 952 result = (result + 512) >> 10; 953 CLIP_RESULT(result) 954 pkres = result; 955 /* second pixel */ 956 r0 = *p_ref++; 957 result = (r1 + r0); 958 r1 = (r2 + r5); 959 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 960 r1 = (r3 + r4); 961 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 962 result = (result + 512) >> 10; 963 CLIP_RESULT(result) 964 pkres |= (result << 8); 965 /* third pixel */ 966 r1 = *p_ref++; 967 result = (r2 + r1); 968 r2 = (r3 + r0); 969 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 970 r2 = (r4 + r5); 971 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 972 result = (result + 512) >> 10; 973 CLIP_RESULT(result) 974 pkres |= (result << 16); 975 /* fourth pixel */ 976 r2 = *p_ref++; 977 result = (r3 + r2); 978 r3 = (r4 + r1); 979 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 980 r3 = (r5 + r0); 981 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 982 result = (result + 512) >> 10; 983 CLIP_RESULT(result) 984 pkres |= (result << 24); 985 *p_cur++ = pkres; /* write 4 pixels */ 986 p_ref -= 3; /* offset back to the middle of filter */ 987 } 988 p_cur += curr_offset; /* move to the next line */ 989 p_ref += ref_offset; /* move to the next line */ 990 } 991 } 992 993 return ; 994 } 995 996 void HorzInterp3MC(uint8 *in, int inpitch, int *out, int outpitch, 997 int blkwidth, int blkheight) 998 { 999 uint8 *p_ref; 1000 int *p_cur; 1001 uint32 tmp; 1002 int result, curr_offset, ref_offset; 1003 int j, r0, r1, r2, r3, r4, r5; 1004 1005 p_cur = out; 1006 curr_offset = (outpitch - blkwidth); 1007 p_ref = in; 1008 ref_offset = inpitch - blkwidth; 1009 1010 for (j = blkheight; j > 0 ; j--) 1011 { 1012 tmp = (uint32)(p_ref + blkwidth); 1013 for (; (uint32)p_ref < tmp;) 1014 { 1015 1016 r0 = p_ref[-2]; 1017 r1 = p_ref[-1]; 1018 r2 = *p_ref++; 1019 r3 = *p_ref++; 1020 r4 = *p_ref++; 1021 /* first pixel */ 1022 r5 = *p_ref++; 1023 result = (r0 + r5); 1024 r0 = (r1 + r4); 1025 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 1026 r0 = (r2 + r3); 1027 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 1028 *p_cur++ = result; 1029 /* second pixel */ 1030 r0 = *p_ref++; 1031 result = (r1 + r0); 1032 r1 = (r2 + r5); 1033 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 1034 r1 = (r3 + r4); 1035 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 1036 *p_cur++ = result; 1037 /* third pixel */ 1038 r1 = *p_ref++; 1039 result = (r2 + r1); 1040 r2 = (r3 + r0); 1041 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 1042 r2 = (r4 + r5); 1043 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 1044 *p_cur++ = result; 1045 /* fourth pixel */ 1046 r2 = *p_ref++; 1047 result = (r3 + r2); 1048 r3 = (r4 + r1); 1049 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 1050 r3 = (r5 + r0); 1051 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 1052 *p_cur++ = result; 1053 p_ref -= 3; /* move back to the middle of the filter */ 1054 } 1055 p_cur += curr_offset; /* move to the next line */ 1056 p_ref += ref_offset; 1057 } 1058 1059 return ; 1060 } 1061 void VertInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch, 1062 int blkwidth, int blkheight, int dy) 1063 { 1064 uint8 *p_cur, *p_ref; 1065 uint32 tmp; 1066 int result, curr_offset, ref_offset; 1067 int j, i; 1068 int32 r0, r1, r2, r3, r4, r5, r6, r7, r8, r13; 1069 uint8 tmp_in[24][24]; 1070 1071 /* not word-aligned */ 1072 if (((uint32)in)&0x3) 1073 { 1074 CreateAlign(in, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5); 1075 in = &tmp_in[2][0]; 1076 inpitch = 24; 1077 } 1078 p_cur = out; 1079 curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */ 1080 ref_offset = blkheight * inpitch; /* for limit */ 1081 1082 curr_offset += 3; 1083 1084 if (dy&1) 1085 { 1086 dy = (dy >> 1) ? 0 : -inpitch; 1087 1088 for (j = 0; j < blkwidth; j += 4, in += 4) 1089 { 1090 r13 = 0; 1091 p_ref = in; 1092 p_cur -= outpitch; /* compensate for the first offset */ 1093 tmp = (uint32)(p_ref + ref_offset); /* limit */ 1094 while ((uint32)p_ref < tmp) /* the loop un-rolled */ 1095 { 1096 r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */ 1097 p_ref += inpitch; 1098 r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */ 1099 r0 &= 0xFF00FF; 1100 1101 r1 = *((uint32*)(p_ref + (inpitch << 1))); /* r1, r7, ref[3] */ 1102 r7 = (r1 >> 8) & 0xFF00FF; 1103 r1 &= 0xFF00FF; 1104 1105 r0 += r1; 1106 r6 += r7; 1107 1108 r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */ 1109 r8 = (r2 >> 8) & 0xFF00FF; 1110 r2 &= 0xFF00FF; 1111 1112 r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */ 1113 r7 = (r1 >> 8) & 0xFF00FF; 1114 r1 &= 0xFF00FF; 1115 r1 += r2; 1116 1117 r7 += r8; 1118 1119 r0 += 20 * r1; 1120 r6 += 20 * r7; 1121 r0 += 0x100010; 1122 r6 += 0x100010; 1123 1124 r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */ 1125 r8 = (r2 >> 8) & 0xFF00FF; 1126 r2 &= 0xFF00FF; 1127 1128 r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */ 1129 r7 = (r1 >> 8) & 0xFF00FF; 1130 r1 &= 0xFF00FF; 1131 r1 += r2; 1132 1133 r7 += r8; 1134 1135 r0 -= 5 * r1; 1136 r6 -= 5 * r7; 1137 1138 r0 >>= 5; 1139 r6 >>= 5; 1140 /* clip */ 1141 r13 |= r6; 1142 r13 |= r0; 1143 //CLIPPACK(r6,result) 1144 1145 r1 = *((uint32*)(p_ref + dy)); 1146 r2 = (r1 >> 8) & 0xFF00FF; 1147 r1 &= 0xFF00FF; 1148 r0 += r1; 1149 r6 += r2; 1150 r0 += 0x10001; 1151 r6 += 0x10001; 1152 r0 = (r0 >> 1) & 0xFF00FF; 1153 r6 = (r6 >> 1) & 0xFF00FF; 1154 1155 r0 |= (r6 << 8); /* pack it back */ 1156 *((uint32*)(p_cur += outpitch)) = r0; 1157 } 1158 p_cur += curr_offset; /* offset to the next pixel */ 1159 if (r13 & 0xFF000700) /* this column need clipping */ 1160 { 1161 p_cur -= 4; 1162 for (i = 0; i < 4; i++) 1163 { 1164 p_ref = in + i; 1165 p_cur -= outpitch; /* compensate for the first offset */ 1166 1167 tmp = (uint32)(p_ref + ref_offset); /* limit */ 1168 while ((uint32)p_ref < tmp) 1169 { /* loop un-rolled */ 1170 r0 = *(p_ref - (inpitch << 1)); 1171 r1 = *(p_ref - inpitch); 1172 r2 = *p_ref; 1173 r3 = *(p_ref += inpitch); /* modify pointer before loading */ 1174 r4 = *(p_ref += inpitch); 1175 /* first pixel */ 1176 r5 = *(p_ref += inpitch); 1177 result = (r0 + r5); 1178 r0 = (r1 + r4); 1179 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 1180 r0 = (r2 + r3); 1181 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 1182 result = (result + 16) >> 5; 1183 CLIP_RESULT(result) 1184 /* 3/4 pel, no need to clip */ 1185 result = (result + p_ref[dy-(inpitch<<1)] + 1); 1186 result = (result >> 1); 1187 *(p_cur += outpitch) = result; 1188 /* second pixel */ 1189 r0 = *(p_ref += inpitch); 1190 result = (r1 + r0); 1191 r1 = (r2 + r5); 1192 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 1193 r1 = (r3 + r4); 1194 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 1195 result = (result + 16) >> 5; 1196 CLIP_RESULT(result) 1197 /* 3/4 pel, no need to clip */ 1198 result = (result + p_ref[dy-(inpitch<<1)] + 1); 1199 result = (result >> 1); 1200 *(p_cur += outpitch) = result; 1201 /* third pixel */ 1202 r1 = *(p_ref += inpitch); 1203 result = (r2 + r1); 1204 r2 = (r3 + r0); 1205 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 1206 r2 = (r4 + r5); 1207 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 1208 result = (result + 16) >> 5; 1209 CLIP_RESULT(result) 1210 /* 3/4 pel, no need to clip */ 1211 result = (result + p_ref[dy-(inpitch<<1)] + 1); 1212 result = (result >> 1); 1213 *(p_cur += outpitch) = result; 1214 /* fourth pixel */ 1215 r2 = *(p_ref += inpitch); 1216 result = (r3 + r2); 1217 r3 = (r4 + r1); 1218 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 1219 r3 = (r5 + r0); 1220 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 1221 result = (result + 16) >> 5; 1222 CLIP_RESULT(result) 1223 /* 3/4 pel, no need to clip */ 1224 result = (result + p_ref[dy-(inpitch<<1)] + 1); 1225 result = (result >> 1); 1226 *(p_cur += outpitch) = result; 1227 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */ 1228 } 1229 p_cur += (curr_offset - 3); 1230 } 1231 } 1232 } 1233 } 1234 else 1235 { 1236 for (j = 0; j < blkwidth; j += 4, in += 4) 1237 { 1238 r13 = 0; 1239 p_ref = in; 1240 p_cur -= outpitch; /* compensate for the first offset */ 1241 tmp = (uint32)(p_ref + ref_offset); /* limit */ 1242 while ((uint32)p_ref < tmp) /* the loop un-rolled */ 1243 { 1244 r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */ 1245 p_ref += inpitch; 1246 r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */ 1247 r0 &= 0xFF00FF; 1248 1249 r1 = *((uint32*)(p_ref + (inpitch << 1))); /* r1, r7, ref[3] */ 1250 r7 = (r1 >> 8) & 0xFF00FF; 1251 r1 &= 0xFF00FF; 1252 1253 r0 += r1; 1254 r6 += r7; 1255 1256 r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */ 1257 r8 = (r2 >> 8) & 0xFF00FF; 1258 r2 &= 0xFF00FF; 1259 1260 r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */ 1261 r7 = (r1 >> 8) & 0xFF00FF; 1262 r1 &= 0xFF00FF; 1263 r1 += r2; 1264 1265 r7 += r8; 1266 1267 r0 += 20 * r1; 1268 r6 += 20 * r7; 1269 r0 += 0x100010; 1270 r6 += 0x100010; 1271 1272 r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */ 1273 r8 = (r2 >> 8) & 0xFF00FF; 1274 r2 &= 0xFF00FF; 1275 1276 r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */ 1277 r7 = (r1 >> 8) & 0xFF00FF; 1278 r1 &= 0xFF00FF; 1279 r1 += r2; 1280 1281 r7 += r8; 1282 1283 r0 -= 5 * r1; 1284 r6 -= 5 * r7; 1285 1286 r0 >>= 5; 1287 r6 >>= 5; 1288 /* clip */ 1289 r13 |= r6; 1290 r13 |= r0; 1291 //CLIPPACK(r6,result) 1292 r0 &= 0xFF00FF; 1293 r6 &= 0xFF00FF; 1294 r0 |= (r6 << 8); /* pack it back */ 1295 *((uint32*)(p_cur += outpitch)) = r0; 1296 } 1297 p_cur += curr_offset; /* offset to the next pixel */ 1298 if (r13 & 0xFF000700) /* this column need clipping */ 1299 { 1300 p_cur -= 4; 1301 for (i = 0; i < 4; i++) 1302 { 1303 p_ref = in + i; 1304 p_cur -= outpitch; /* compensate for the first offset */ 1305 tmp = (uint32)(p_ref + ref_offset); /* limit */ 1306 while ((uint32)p_ref < tmp) 1307 { /* loop un-rolled */ 1308 r0 = *(p_ref - (inpitch << 1)); 1309 r1 = *(p_ref - inpitch); 1310 r2 = *p_ref; 1311 r3 = *(p_ref += inpitch); /* modify pointer before loading */ 1312 r4 = *(p_ref += inpitch); 1313 /* first pixel */ 1314 r5 = *(p_ref += inpitch); 1315 result = (r0 + r5); 1316 r0 = (r1 + r4); 1317 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 1318 r0 = (r2 + r3); 1319 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 1320 result = (result + 16) >> 5; 1321 CLIP_RESULT(result) 1322 *(p_cur += outpitch) = result; 1323 /* second pixel */ 1324 r0 = *(p_ref += inpitch); 1325 result = (r1 + r0); 1326 r1 = (r2 + r5); 1327 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 1328 r1 = (r3 + r4); 1329 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 1330 result = (result + 16) >> 5; 1331 CLIP_RESULT(result) 1332 *(p_cur += outpitch) = result; 1333 /* third pixel */ 1334 r1 = *(p_ref += inpitch); 1335 result = (r2 + r1); 1336 r2 = (r3 + r0); 1337 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 1338 r2 = (r4 + r5); 1339 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 1340 result = (result + 16) >> 5; 1341 CLIP_RESULT(result) 1342 *(p_cur += outpitch) = result; 1343 /* fourth pixel */ 1344 r2 = *(p_ref += inpitch); 1345 result = (r3 + r2); 1346 r3 = (r4 + r1); 1347 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 1348 r3 = (r5 + r0); 1349 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 1350 result = (result + 16) >> 5; 1351 CLIP_RESULT(result) 1352 *(p_cur += outpitch) = result; 1353 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */ 1354 } 1355 p_cur += (curr_offset - 3); 1356 } 1357 } 1358 } 1359 } 1360 1361 return ; 1362 } 1363 1364 void VertInterp2MC(uint8 *in, int inpitch, int *out, int outpitch, 1365 int blkwidth, int blkheight) 1366 { 1367 int *p_cur; 1368 uint8 *p_ref; 1369 uint32 tmp; 1370 int result, curr_offset, ref_offset; 1371 int j, r0, r1, r2, r3, r4, r5; 1372 1373 p_cur = out; 1374 curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */ 1375 ref_offset = blkheight * inpitch; /* for limit */ 1376 1377 for (j = 0; j < blkwidth; j++) 1378 { 1379 p_cur -= outpitch; /* compensate for the first offset */ 1380 p_ref = in++; 1381 1382 tmp = (uint32)(p_ref + ref_offset); /* limit */ 1383 while ((uint32)p_ref < tmp) 1384 { /* loop un-rolled */ 1385 r0 = *(p_ref - (inpitch << 1)); 1386 r1 = *(p_ref - inpitch); 1387 r2 = *p_ref; 1388 r3 = *(p_ref += inpitch); /* modify pointer before loading */ 1389 r4 = *(p_ref += inpitch); 1390 /* first pixel */ 1391 r5 = *(p_ref += inpitch); 1392 result = (r0 + r5); 1393 r0 = (r1 + r4); 1394 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 1395 r0 = (r2 + r3); 1396 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 1397 *(p_cur += outpitch) = result; 1398 /* second pixel */ 1399 r0 = *(p_ref += inpitch); 1400 result = (r1 + r0); 1401 r1 = (r2 + r5); 1402 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 1403 r1 = (r3 + r4); 1404 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 1405 *(p_cur += outpitch) = result; 1406 /* third pixel */ 1407 r1 = *(p_ref += inpitch); 1408 result = (r2 + r1); 1409 r2 = (r3 + r0); 1410 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 1411 r2 = (r4 + r5); 1412 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 1413 *(p_cur += outpitch) = result; 1414 /* fourth pixel */ 1415 r2 = *(p_ref += inpitch); 1416 result = (r3 + r2); 1417 r3 = (r4 + r1); 1418 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 1419 r3 = (r5 + r0); 1420 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 1421 *(p_cur += outpitch) = result; 1422 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */ 1423 } 1424 p_cur += curr_offset; 1425 } 1426 1427 return ; 1428 } 1429 1430 void VertInterp3MC(int *in, int inpitch, uint8 *out, int outpitch, 1431 int blkwidth, int blkheight, int dy) 1432 { 1433 uint8 *p_cur; 1434 int *p_ref; 1435 uint32 tmp; 1436 int result, result2, curr_offset, ref_offset; 1437 int j, r0, r1, r2, r3, r4, r5; 1438 1439 p_cur = out; 1440 curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */ 1441 ref_offset = blkheight * inpitch; /* for limit */ 1442 1443 if (dy&1) 1444 { 1445 dy = (dy >> 1) ? -(inpitch << 1) : -(inpitch << 1) - inpitch; 1446 1447 for (j = 0; j < blkwidth; j++) 1448 { 1449 p_cur -= outpitch; /* compensate for the first offset */ 1450 p_ref = in++; 1451 1452 tmp = (uint32)(p_ref + ref_offset); /* limit */ 1453 while ((uint32)p_ref < tmp) 1454 { /* loop un-rolled */ 1455 r0 = *(p_ref - (inpitch << 1)); 1456 r1 = *(p_ref - inpitch); 1457 r2 = *p_ref; 1458 r3 = *(p_ref += inpitch); /* modify pointer before loading */ 1459 r4 = *(p_ref += inpitch); 1460 /* first pixel */ 1461 r5 = *(p_ref += inpitch); 1462 result = (r0 + r5); 1463 r0 = (r1 + r4); 1464 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 1465 r0 = (r2 + r3); 1466 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 1467 result = (result + 512) >> 10; 1468 CLIP_RESULT(result) 1469 result2 = ((p_ref[dy] + 16) >> 5); 1470 CLIP_RESULT(result2) 1471 /* 3/4 pel, no need to clip */ 1472 result = (result + result2 + 1); 1473 result = (result >> 1); 1474 *(p_cur += outpitch) = result; 1475 /* second pixel */ 1476 r0 = *(p_ref += inpitch); 1477 result = (r1 + r0); 1478 r1 = (r2 + r5); 1479 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 1480 r1 = (r3 + r4); 1481 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 1482 result = (result + 512) >> 10; 1483 CLIP_RESULT(result) 1484 result2 = ((p_ref[dy] + 16) >> 5); 1485 CLIP_RESULT(result2) 1486 /* 3/4 pel, no need to clip */ 1487 result = (result + result2 + 1); 1488 result = (result >> 1); 1489 *(p_cur += outpitch) = result; 1490 /* third pixel */ 1491 r1 = *(p_ref += inpitch); 1492 result = (r2 + r1); 1493 r2 = (r3 + r0); 1494 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 1495 r2 = (r4 + r5); 1496 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 1497 result = (result + 512) >> 10; 1498 CLIP_RESULT(result) 1499 result2 = ((p_ref[dy] + 16) >> 5); 1500 CLIP_RESULT(result2) 1501 /* 3/4 pel, no need to clip */ 1502 result = (result + result2 + 1); 1503 result = (result >> 1); 1504 *(p_cur += outpitch) = result; 1505 /* fourth pixel */ 1506 r2 = *(p_ref += inpitch); 1507 result = (r3 + r2); 1508 r3 = (r4 + r1); 1509 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 1510 r3 = (r5 + r0); 1511 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 1512 result = (result + 512) >> 10; 1513 CLIP_RESULT(result) 1514 result2 = ((p_ref[dy] + 16) >> 5); 1515 CLIP_RESULT(result2) 1516 /* 3/4 pel, no need to clip */ 1517 result = (result + result2 + 1); 1518 result = (result >> 1); 1519 *(p_cur += outpitch) = result; 1520 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */ 1521 } 1522 p_cur += curr_offset; 1523 } 1524 } 1525 else 1526 { 1527 for (j = 0; j < blkwidth; j++) 1528 { 1529 p_cur -= outpitch; /* compensate for the first offset */ 1530 p_ref = in++; 1531 1532 tmp = (uint32)(p_ref + ref_offset); /* limit */ 1533 while ((uint32)p_ref < tmp) 1534 { /* loop un-rolled */ 1535 r0 = *(p_ref - (inpitch << 1)); 1536 r1 = *(p_ref - inpitch); 1537 r2 = *p_ref; 1538 r3 = *(p_ref += inpitch); /* modify pointer before loading */ 1539 r4 = *(p_ref += inpitch); 1540 /* first pixel */ 1541 r5 = *(p_ref += inpitch); 1542 result = (r0 + r5); 1543 r0 = (r1 + r4); 1544 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 1545 r0 = (r2 + r3); 1546 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 1547 result = (result + 512) >> 10; 1548 CLIP_RESULT(result) 1549 *(p_cur += outpitch) = result; 1550 /* second pixel */ 1551 r0 = *(p_ref += inpitch); 1552 result = (r1 + r0); 1553 r1 = (r2 + r5); 1554 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 1555 r1 = (r3 + r4); 1556 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 1557 result = (result + 512) >> 10; 1558 CLIP_RESULT(result) 1559 *(p_cur += outpitch) = result; 1560 /* third pixel */ 1561 r1 = *(p_ref += inpitch); 1562 result = (r2 + r1); 1563 r2 = (r3 + r0); 1564 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 1565 r2 = (r4 + r5); 1566 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 1567 result = (result + 512) >> 10; 1568 CLIP_RESULT(result) 1569 *(p_cur += outpitch) = result; 1570 /* fourth pixel */ 1571 r2 = *(p_ref += inpitch); 1572 result = (r3 + r2); 1573 r3 = (r4 + r1); 1574 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 1575 r3 = (r5 + r0); 1576 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 1577 result = (result + 512) >> 10; 1578 CLIP_RESULT(result) 1579 *(p_cur += outpitch) = result; 1580 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */ 1581 } 1582 p_cur += curr_offset; 1583 } 1584 } 1585 1586 return ; 1587 } 1588 1589 void DiagonalInterpMC(uint8 *in1, uint8 *in2, int inpitch, 1590 uint8 *out, int outpitch, 1591 int blkwidth, int blkheight) 1592 { 1593 int j, i; 1594 int result; 1595 uint8 *p_cur, *p_ref, *p_tmp8; 1596 int curr_offset, ref_offset; 1597 uint8 tmp_res[24][24], tmp_in[24][24]; 1598 uint32 *p_tmp; 1599 uint32 tmp, pkres, tmp_result; 1600 int32 r0, r1, r2, r3, r4, r5; 1601 int32 r6, r7, r8, r9, r10, r13; 1602 1603 ref_offset = inpitch - blkwidth; 1604 p_ref = in1 - 2; 1605 /* perform horizontal interpolation */ 1606 /* not word-aligned */ 1607 /* It is faster to read 1 byte at time to avoid calling CreateAlign */ 1608 /* if(((uint32)p_ref)&0x3) 1609 { 1610 CreateAlign(p_ref,inpitch,0,&tmp_in[0][0],blkwidth+8,blkheight); 1611 p_ref = &tmp_in[0][0]; 1612 ref_offset = 24-blkwidth; 1613 }*/ 1614 1615 p_tmp = (uint32*) & (tmp_res[0][0]); 1616 for (j = blkheight; j > 0; j--) 1617 { 1618 r13 = 0; 1619 tmp = (uint32)(p_ref + blkwidth); 1620 1621 //r0 = *((uint32*)p_ref); /* d,c,b,a */ 1622 //r1 = (r0>>8)&0xFF00FF; /* 0,d,0,b */ 1623 //r0 &= 0xFF00FF; /* 0,c,0,a */ 1624 /* It is faster to read 1 byte at a time, */ 1625 r0 = p_ref[0]; 1626 r1 = p_ref[2]; 1627 r0 |= (r1 << 16); /* 0,c,0,a */ 1628 r1 = p_ref[1]; 1629 r2 = p_ref[3]; 1630 r1 |= (r2 << 16); /* 0,d,0,b */ 1631 1632 while ((uint32)p_ref < tmp) 1633 { 1634 //r2 = *((uint32*)(p_ref+=4));/* h,g,f,e */ 1635 //r3 = (r2>>8)&0xFF00FF; /* 0,h,0,f */ 1636 //r2 &= 0xFF00FF; /* 0,g,0,e */ 1637 /* It is faster to read 1 byte at a time, */ 1638 r2 = *(p_ref += 4); 1639 r3 = p_ref[2]; 1640 r2 |= (r3 << 16); /* 0,g,0,e */ 1641 r3 = p_ref[1]; 1642 r4 = p_ref[3]; 1643 r3 |= (r4 << 16); /* 0,h,0,f */ 1644 1645 r4 = r0 + r3; /* c+h, a+f */ 1646 r5 = r0 + r1; /* c+d, a+b */ 1647 r6 = r2 + r3; /* g+h, e+f */ 1648 r5 >>= 16; 1649 r5 |= (r6 << 16); /* e+f, c+d */ 1650 r4 += r5 * 20; /* c+20*e+20*f+h, a+20*c+20*d+f */ 1651 r4 += 0x100010; /* +16, +16 */ 1652 r5 = r1 + r2; /* d+g, b+e */ 1653 r4 -= r5 * 5; /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */ 1654 r4 >>= 5; 1655 r13 |= r4; /* check clipping */ 1656 r4 &= 0xFF00FF; /* mask */ 1657 1658 r5 = p_ref[4]; /* i */ 1659 r6 = (r5 << 16); 1660 r5 = r6 | (r2 >> 16);/* 0,i,0,g */ 1661 r5 += r1; /* d+i, b+g */ /* r5 not free */ 1662 r1 >>= 16; 1663 r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */ 1664 r1 += r2; /* f+g, d+e */ 1665 r5 += 20 * r1; /* d+20f+20g+i, b+20d+20e+g */ 1666 r0 >>= 16; 1667 r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */ 1668 r0 += r3; /* e+h, c+f */ 1669 r5 += 0x100010; /* 16,16 */ 1670 r5 -= r0 * 5; /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */ 1671 r5 >>= 5; 1672 r13 |= r5; /* check clipping */ 1673 r5 &= 0xFF00FF; /* mask */ 1674 1675 r4 |= (r5 << 8); /* pack them together */ 1676 *p_tmp++ = r4; 1677 r1 = r3; 1678 r0 = r2; 1679 } 1680 p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */ 1681 p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */ 1682 1683 if (r13&0xFF000700) /* need clipping */ 1684 { 1685 /* move back to the beginning of the line */ 1686 p_ref -= (ref_offset + blkwidth); /* input */ 1687 p_tmp -= 6; /* intermediate output */ 1688 tmp = (uint32)(p_ref + blkwidth); 1689 while ((uint32)p_ref < tmp) 1690 { 1691 r0 = *p_ref++; 1692 r1 = *p_ref++; 1693 r2 = *p_ref++; 1694 r3 = *p_ref++; 1695 r4 = *p_ref++; 1696 /* first pixel */ 1697 r5 = *p_ref++; 1698 result = (r0 + r5); 1699 r0 = (r1 + r4); 1700 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 1701 r0 = (r2 + r3); 1702 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 1703 result = (result + 16) >> 5; 1704 CLIP_RESULT(result) 1705 pkres = result; 1706 /* second pixel */ 1707 r0 = *p_ref++; 1708 result = (r1 + r0); 1709 r1 = (r2 + r5); 1710 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 1711 r1 = (r3 + r4); 1712 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 1713 result = (result + 16) >> 5; 1714 CLIP_RESULT(result) 1715 pkres |= (result << 8); 1716 /* third pixel */ 1717 r1 = *p_ref++; 1718 result = (r2 + r1); 1719 r2 = (r3 + r0); 1720 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 1721 r2 = (r4 + r5); 1722 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 1723 result = (result + 16) >> 5; 1724 CLIP_RESULT(result) 1725 pkres |= (result << 16); 1726 /* fourth pixel */ 1727 r2 = *p_ref++; 1728 result = (r3 + r2); 1729 r3 = (r4 + r1); 1730 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 1731 r3 = (r5 + r0); 1732 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 1733 result = (result + 16) >> 5; 1734 CLIP_RESULT(result) 1735 pkres |= (result << 24); 1736 1737 *p_tmp++ = pkres; /* write 4 pixel */ 1738 p_ref -= 5; 1739 } 1740 p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */ 1741 p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */ 1742 } 1743 } 1744 1745 /* perform vertical interpolation */ 1746 /* not word-aligned */ 1747 if (((uint32)in2)&0x3) 1748 { 1749 CreateAlign(in2, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5); 1750 in2 = &tmp_in[2][0]; 1751 inpitch = 24; 1752 } 1753 1754 p_cur = out; 1755 curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically up and one pixel right */ 1756 pkres = blkheight * inpitch; /* reuse it for limit */ 1757 1758 curr_offset += 3; 1759 1760 for (j = 0; j < blkwidth; j += 4, in2 += 4) 1761 { 1762 r13 = 0; 1763 p_ref = in2; 1764 p_tmp8 = &(tmp_res[0][j]); /* intermediate result */ 1765 p_tmp8 -= 24; /* compensate for the first offset */ 1766 p_cur -= outpitch; /* compensate for the first offset */ 1767 tmp = (uint32)(p_ref + pkres); /* limit */ 1768 while ((uint32)p_ref < tmp) /* the loop un-rolled */ 1769 { 1770 /* Read 1 byte at a time is too slow, too many read and pack ops, need to call CreateAlign, */ 1771 /*p_ref8 = p_ref-(inpitch<<1); r0 = p_ref8[0]; r1 = p_ref8[2]; 1772 r0 |= (r1<<16); r6 = p_ref8[1]; r1 = p_ref8[3]; 1773 r6 |= (r1<<16); p_ref+=inpitch; */ 1774 r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */ 1775 p_ref += inpitch; 1776 r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */ 1777 r0 &= 0xFF00FF; 1778 1779 /*p_ref8 = p_ref+(inpitch<<1); 1780 r1 = p_ref8[0]; r7 = p_ref8[2]; r1 |= (r7<<16); 1781 r7 = p_ref8[1]; r2 = p_ref8[3]; r7 |= (r2<<16);*/ 1782 r1 = *((uint32*)(p_ref + (inpitch << 1))); /* r1, r7, ref[3] */ 1783 r7 = (r1 >> 8) & 0xFF00FF; 1784 r1 &= 0xFF00FF; 1785 1786 r0 += r1; 1787 r6 += r7; 1788 1789 /*r2 = p_ref[0]; r8 = p_ref[2]; r2 |= (r8<<16); 1790 r8 = p_ref[1]; r1 = p_ref[3]; r8 |= (r1<<16);*/ 1791 r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */ 1792 r8 = (r2 >> 8) & 0xFF00FF; 1793 r2 &= 0xFF00FF; 1794 1795 /*p_ref8 = p_ref-inpitch; r1 = p_ref8[0]; r7 = p_ref8[2]; 1796 r1 |= (r7<<16); r1 += r2; r7 = p_ref8[1]; 1797 r2 = p_ref8[3]; r7 |= (r2<<16);*/ 1798 r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */ 1799 r7 = (r1 >> 8) & 0xFF00FF; 1800 r1 &= 0xFF00FF; 1801 r1 += r2; 1802 1803 r7 += r8; 1804 1805 r0 += 20 * r1; 1806 r6 += 20 * r7; 1807 r0 += 0x100010; 1808 r6 += 0x100010; 1809 1810 /*p_ref8 = p_ref-(inpitch<<1); r2 = p_ref8[0]; r8 = p_ref8[2]; 1811 r2 |= (r8<<16); r8 = p_ref8[1]; r1 = p_ref8[3]; r8 |= (r1<<16);*/ 1812 r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */ 1813 r8 = (r2 >> 8) & 0xFF00FF; 1814 r2 &= 0xFF00FF; 1815 1816 /*p_ref8 = p_ref+inpitch; r1 = p_ref8[0]; r7 = p_ref8[2]; 1817 r1 |= (r7<<16); r1 += r2; r7 = p_ref8[1]; 1818 r2 = p_ref8[3]; r7 |= (r2<<16);*/ 1819 r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */ 1820 r7 = (r1 >> 8) & 0xFF00FF; 1821 r1 &= 0xFF00FF; 1822 r1 += r2; 1823 1824 r7 += r8; 1825 1826 r0 -= 5 * r1; 1827 r6 -= 5 * r7; 1828 1829 r0 >>= 5; 1830 r6 >>= 5; 1831 /* clip */ 1832 r13 |= r6; 1833 r13 |= r0; 1834 //CLIPPACK(r6,result) 1835 /* add with horizontal results */ 1836 r10 = *((uint32*)(p_tmp8 += 24)); 1837 r9 = (r10 >> 8) & 0xFF00FF; 1838 r10 &= 0xFF00FF; 1839 1840 r0 += r10; 1841 r0 += 0x10001; 1842 r0 = (r0 >> 1) & 0xFF00FF; /* mask to 8 bytes */ 1843 1844 r6 += r9; 1845 r6 += 0x10001; 1846 r6 = (r6 >> 1) & 0xFF00FF; /* mask to 8 bytes */ 1847 1848 r0 |= (r6 << 8); /* pack it back */ 1849 *((uint32*)(p_cur += outpitch)) = r0; 1850 } 1851 p_cur += curr_offset; /* offset to the next pixel */ 1852 if (r13 & 0xFF000700) /* this column need clipping */ 1853 { 1854 p_cur -= 4; 1855 for (i = 0; i < 4; i++) 1856 { 1857 p_ref = in2 + i; 1858 p_tmp8 = &(tmp_res[0][j+i]); /* intermediate result */ 1859 p_tmp8 -= 24; /* compensate for the first offset */ 1860 p_cur -= outpitch; /* compensate for the first offset */ 1861 tmp = (uint32)(p_ref + pkres); /* limit */ 1862 while ((uint32)p_ref < tmp) /* the loop un-rolled */ 1863 { 1864 r0 = *(p_ref - (inpitch << 1)); 1865 r1 = *(p_ref - inpitch); 1866 r2 = *p_ref; 1867 r3 = *(p_ref += inpitch); /* modify pointer before loading */ 1868 r4 = *(p_ref += inpitch); 1869 /* first pixel */ 1870 r5 = *(p_ref += inpitch); 1871 result = (r0 + r5); 1872 r0 = (r1 + r4); 1873 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 1874 r0 = (r2 + r3); 1875 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 1876 result = (result + 16) >> 5; 1877 CLIP_RESULT(result) 1878 tmp_result = *(p_tmp8 += 24); /* modify pointer before loading */ 1879 result = (result + tmp_result + 1); /* no clip */ 1880 result = (result >> 1); 1881 *(p_cur += outpitch) = result; 1882 /* second pixel */ 1883 r0 = *(p_ref += inpitch); 1884 result = (r1 + r0); 1885 r1 = (r2 + r5); 1886 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 1887 r1 = (r3 + r4); 1888 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 1889 result = (result + 16) >> 5; 1890 CLIP_RESULT(result) 1891 tmp_result = *(p_tmp8 += 24); /* intermediate result */ 1892 result = (result + tmp_result + 1); /* no clip */ 1893 result = (result >> 1); 1894 *(p_cur += outpitch) = result; 1895 /* third pixel */ 1896 r1 = *(p_ref += inpitch); 1897 result = (r2 + r1); 1898 r2 = (r3 + r0); 1899 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 1900 r2 = (r4 + r5); 1901 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 1902 result = (result + 16) >> 5; 1903 CLIP_RESULT(result) 1904 tmp_result = *(p_tmp8 += 24); /* intermediate result */ 1905 result = (result + tmp_result + 1); /* no clip */ 1906 result = (result >> 1); 1907 *(p_cur += outpitch) = result; 1908 /* fourth pixel */ 1909 r2 = *(p_ref += inpitch); 1910 result = (r3 + r2); 1911 r3 = (r4 + r1); 1912 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 1913 r3 = (r5 + r0); 1914 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 1915 result = (result + 16) >> 5; 1916 CLIP_RESULT(result) 1917 tmp_result = *(p_tmp8 += 24); /* intermediate result */ 1918 result = (result + tmp_result + 1); /* no clip */ 1919 result = (result >> 1); 1920 *(p_cur += outpitch) = result; 1921 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */ 1922 } 1923 p_cur += (curr_offset - 3); 1924 } 1925 } 1926 } 1927 1928 return ; 1929 } 1930 1931 /* position G */ 1932 void FullPelMC(uint8 *in, int inpitch, uint8 *out, int outpitch, 1933 int blkwidth, int blkheight) 1934 { 1935 int i, j; 1936 int offset_in = inpitch - blkwidth; 1937 int offset_out = outpitch - blkwidth; 1938 uint32 temp; 1939 uint8 byte; 1940 1941 if (((uint32)in)&3) 1942 { 1943 for (j = blkheight; j > 0; j--) 1944 { 1945 for (i = blkwidth; i > 0; i -= 4) 1946 { 1947 temp = *in++; 1948 byte = *in++; 1949 temp |= (byte << 8); 1950 byte = *in++; 1951 temp |= (byte << 16); 1952 byte = *in++; 1953 temp |= (byte << 24); 1954 1955 *((uint32*)out) = temp; /* write 4 bytes */ 1956 out += 4; 1957 } 1958 out += offset_out; 1959 in += offset_in; 1960 } 1961 } 1962 else 1963 { 1964 for (j = blkheight; j > 0; j--) 1965 { 1966 for (i = blkwidth; i > 0; i -= 4) 1967 { 1968 temp = *((uint32*)in); 1969 *((uint32*)out) = temp; 1970 in += 4; 1971 out += 4; 1972 } 1973 out += offset_out; 1974 in += offset_in; 1975 } 1976 } 1977 return ; 1978 } 1979 1980 void ChromaMotionComp(uint8 *ref, int picwidth, int picheight, 1981 int x_pos, int y_pos, 1982 uint8 *pred, int pred_pitch, 1983 int blkwidth, int blkheight) 1984 { 1985 int dx, dy; 1986 int offset_dx, offset_dy; 1987 int index; 1988 uint8 temp[24][24]; 1989 1990 dx = x_pos & 7; 1991 dy = y_pos & 7; 1992 offset_dx = (dx + 7) >> 3; 1993 offset_dy = (dy + 7) >> 3; 1994 x_pos = x_pos >> 3; /* round it to full-pel resolution */ 1995 y_pos = y_pos >> 3; 1996 1997 if ((x_pos >= 0 && x_pos + blkwidth + offset_dx <= picwidth) && (y_pos >= 0 && y_pos + blkheight + offset_dy <= picheight)) 1998 { 1999 ref += y_pos * picwidth + x_pos; 2000 } 2001 else 2002 { 2003 CreatePad(ref, picwidth, picheight, x_pos, y_pos, &temp[0][0], blkwidth + offset_dx, blkheight + offset_dy); 2004 ref = &temp[0][0]; 2005 picwidth = 24; 2006 } 2007 2008 index = offset_dx + (offset_dy << 1) + ((blkwidth << 1) & 0x7); 2009 2010 (*(ChromaMC_SIMD[index]))(ref, picwidth , dx, dy, pred, pred_pitch, blkwidth, blkheight); 2011 return ; 2012 } 2013 2014 2015 /* SIMD routines, unroll the loops in vertical direction, decreasing loops (things to be done) */ 2016 void ChromaDiagonalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, 2017 uint8 *pOut, int predPitch, int blkwidth, int blkheight) 2018 { 2019 int32 r0, r1, r2, r3, result0, result1; 2020 uint8 temp[288]; 2021 uint8 *ref, *out; 2022 int i, j; 2023 int dx_8 = 8 - dx; 2024 int dy_8 = 8 - dy; 2025 2026 /* horizontal first */ 2027 out = temp; 2028 for (i = 0; i < blkheight + 1; i++) 2029 { 2030 ref = pRef; 2031 r0 = ref[0]; 2032 for (j = 0; j < blkwidth; j += 4) 2033 { 2034 r0 |= (ref[2] << 16); 2035 result0 = dx_8 * r0; 2036 2037 r1 = ref[1] | (ref[3] << 16); 2038 result0 += dx * r1; 2039 *(int32 *)out = result0; 2040 2041 result0 = dx_8 * r1; 2042 2043 r2 = ref[4]; 2044 r0 = r0 >> 16; 2045 r1 = r0 | (r2 << 16); 2046 result0 += dx * r1; 2047 *(int32 *)(out + 16) = result0; 2048 2049 ref += 4; 2050 out += 4; 2051 r0 = r2; 2052 } 2053 pRef += srcPitch; 2054 out += (32 - blkwidth); 2055 } 2056 2057 // pRef -= srcPitch*(blkheight+1); 2058 ref = temp; 2059 2060 for (j = 0; j < blkwidth; j += 4) 2061 { 2062 r0 = *(int32 *)ref; 2063 r1 = *(int32 *)(ref + 16); 2064 ref += 32; 2065 out = pOut; 2066 for (i = 0; i < (blkheight >> 1); i++) 2067 { 2068 result0 = dy_8 * r0 + 0x00200020; 2069 r2 = *(int32 *)ref; 2070 result0 += dy * r2; 2071 result0 >>= 6; 2072 result0 &= 0x00FF00FF; 2073 r0 = r2; 2074 2075 result1 = dy_8 * r1 + 0x00200020; 2076 r3 = *(int32 *)(ref + 16); 2077 result1 += dy * r3; 2078 result1 >>= 6; 2079 result1 &= 0x00FF00FF; 2080 r1 = r3; 2081 *(int32 *)out = result0 | (result1 << 8); 2082 out += predPitch; 2083 ref += 32; 2084 2085 result0 = dy_8 * r0 + 0x00200020; 2086 r2 = *(int32 *)ref; 2087 result0 += dy * r2; 2088 result0 >>= 6; 2089 result0 &= 0x00FF00FF; 2090 r0 = r2; 2091 2092 result1 = dy_8 * r1 + 0x00200020; 2093 r3 = *(int32 *)(ref + 16); 2094 result1 += dy * r3; 2095 result1 >>= 6; 2096 result1 &= 0x00FF00FF; 2097 r1 = r3; 2098 *(int32 *)out = result0 | (result1 << 8); 2099 out += predPitch; 2100 ref += 32; 2101 } 2102 pOut += 4; 2103 ref = temp + 4; /* since it can only iterate twice max */ 2104 } 2105 return; 2106 } 2107 2108 void ChromaHorizontalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, 2109 uint8 *pOut, int predPitch, int blkwidth, int blkheight) 2110 { 2111 OSCL_UNUSED_ARG(dy); 2112 int32 r0, r1, r2, result0, result1; 2113 uint8 *ref, *out; 2114 int i, j; 2115 int dx_8 = 8 - dx; 2116 2117 /* horizontal first */ 2118 for (i = 0; i < blkheight; i++) 2119 { 2120 ref = pRef; 2121 out = pOut; 2122 2123 r0 = ref[0]; 2124 for (j = 0; j < blkwidth; j += 4) 2125 { 2126 r0 |= (ref[2] << 16); 2127 result0 = dx_8 * r0 + 0x00040004; 2128 2129 r1 = ref[1] | (ref[3] << 16); 2130 result0 += dx * r1; 2131 result0 >>= 3; 2132 result0 &= 0x00FF00FF; 2133 2134 result1 = dx_8 * r1 + 0x00040004; 2135 2136 r2 = ref[4]; 2137 r0 = r0 >> 16; 2138 r1 = r0 | (r2 << 16); 2139 result1 += dx * r1; 2140 result1 >>= 3; 2141 result1 &= 0x00FF00FF; 2142 2143 *(int32 *)out = result0 | (result1 << 8); 2144 2145 ref += 4; 2146 out += 4; 2147 r0 = r2; 2148 } 2149 2150 pRef += srcPitch; 2151 pOut += predPitch; 2152 } 2153 return; 2154 } 2155 2156 void ChromaVerticalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, 2157 uint8 *pOut, int predPitch, int blkwidth, int blkheight) 2158 { 2159 OSCL_UNUSED_ARG(dx); 2160 int32 r0, r1, r2, r3, result0, result1; 2161 int i, j; 2162 uint8 *ref, *out; 2163 int dy_8 = 8 - dy; 2164 /* vertical first */ 2165 for (i = 0; i < blkwidth; i += 4) 2166 { 2167 ref = pRef; 2168 out = pOut; 2169 2170 r0 = ref[0] | (ref[2] << 16); 2171 r1 = ref[1] | (ref[3] << 16); 2172 ref += srcPitch; 2173 for (j = 0; j < blkheight; j++) 2174 { 2175 result0 = dy_8 * r0 + 0x00040004; 2176 r2 = ref[0] | (ref[2] << 16); 2177 result0 += dy * r2; 2178 result0 >>= 3; 2179 result0 &= 0x00FF00FF; 2180 r0 = r2; 2181 2182 result1 = dy_8 * r1 + 0x00040004; 2183 r3 = ref[1] | (ref[3] << 16); 2184 result1 += dy * r3; 2185 result1 >>= 3; 2186 result1 &= 0x00FF00FF; 2187 r1 = r3; 2188 *(int32 *)out = result0 | (result1 << 8); 2189 ref += srcPitch; 2190 out += predPitch; 2191 } 2192 pOut += 4; 2193 pRef += 4; 2194 } 2195 return; 2196 } 2197 2198 void ChromaDiagonalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, 2199 uint8 *pOut, int predPitch, int blkwidth, int blkheight) 2200 { 2201 OSCL_UNUSED_ARG(blkwidth); 2202 int32 r0, r1, temp0, temp1, result; 2203 int32 temp[9]; 2204 int32 *out; 2205 int i, r_temp; 2206 int dy_8 = 8 - dy; 2207 2208 /* horizontal first */ 2209 out = temp; 2210 for (i = 0; i < blkheight + 1; i++) 2211 { 2212 r_temp = pRef[1]; 2213 temp0 = (pRef[0] << 3) + dx * (r_temp - pRef[0]); 2214 temp1 = (r_temp << 3) + dx * (pRef[2] - r_temp); 2215 r0 = temp0 | (temp1 << 16); 2216 *out++ = r0; 2217 pRef += srcPitch; 2218 } 2219 2220 pRef -= srcPitch * (blkheight + 1); 2221 2222 out = temp; 2223 2224 r0 = *out++; 2225 2226 for (i = 0; i < blkheight; i++) 2227 { 2228 result = dy_8 * r0 + 0x00200020; 2229 r1 = *out++; 2230 result += dy * r1; 2231 result >>= 6; 2232 result &= 0x00FF00FF; 2233 *(int16 *)pOut = (result >> 8) | (result & 0xFF); 2234 r0 = r1; 2235 pOut += predPitch; 2236 } 2237 return; 2238 } 2239 2240 void ChromaHorizontalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, 2241 uint8 *pOut, int predPitch, int blkwidth, int blkheight) 2242 { 2243 OSCL_UNUSED_ARG(dy); 2244 OSCL_UNUSED_ARG(blkwidth); 2245 int i, temp, temp0, temp1; 2246 2247 /* horizontal first */ 2248 for (i = 0; i < blkheight; i++) 2249 { 2250 temp = pRef[1]; 2251 temp0 = ((pRef[0] << 3) + dx * (temp - pRef[0]) + 4) >> 3; 2252 temp1 = ((temp << 3) + dx * (pRef[2] - temp) + 4) >> 3; 2253 2254 *(int16 *)pOut = temp0 | (temp1 << 8); 2255 pRef += srcPitch; 2256 pOut += predPitch; 2257 2258 } 2259 return; 2260 } 2261 void ChromaVerticalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, 2262 uint8 *pOut, int predPitch, int blkwidth, int blkheight) 2263 { 2264 OSCL_UNUSED_ARG(dx); 2265 OSCL_UNUSED_ARG(blkwidth); 2266 int32 r0, r1, result; 2267 int i; 2268 int dy_8 = 8 - dy; 2269 r0 = pRef[0] | (pRef[1] << 16); 2270 pRef += srcPitch; 2271 for (i = 0; i < blkheight; i++) 2272 { 2273 result = dy_8 * r0 + 0x00040004; 2274 r1 = pRef[0] | (pRef[1] << 16); 2275 result += dy * r1; 2276 result >>= 3; 2277 result &= 0x00FF00FF; 2278 *(int16 *)pOut = (result >> 8) | (result & 0xFF); 2279 r0 = r1; 2280 pRef += srcPitch; 2281 pOut += predPitch; 2282 } 2283 return; 2284 } 2285 2286 void ChromaFullMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, 2287 uint8 *pOut, int predPitch, int blkwidth, int blkheight) 2288 { 2289 OSCL_UNUSED_ARG(dx); 2290 OSCL_UNUSED_ARG(dy); 2291 int i, j; 2292 int offset_in = srcPitch - blkwidth; 2293 int offset_out = predPitch - blkwidth; 2294 uint16 temp; 2295 uint8 byte; 2296 2297 if (((uint32)pRef)&1) 2298 { 2299 for (j = blkheight; j > 0; j--) 2300 { 2301 for (i = blkwidth; i > 0; i -= 2) 2302 { 2303 temp = *pRef++; 2304 byte = *pRef++; 2305 temp |= (byte << 8); 2306 *((uint16*)pOut) = temp; /* write 2 bytes */ 2307 pOut += 2; 2308 } 2309 pOut += offset_out; 2310 pRef += offset_in; 2311 } 2312 } 2313 else 2314 { 2315 for (j = blkheight; j > 0; j--) 2316 { 2317 for (i = blkwidth; i > 0; i -= 2) 2318 { 2319 temp = *((uint16*)pRef); 2320 *((uint16*)pOut) = temp; 2321 pRef += 2; 2322 pOut += 2; 2323 } 2324 pOut += offset_out; 2325 pRef += offset_in; 2326 } 2327 } 2328 return ; 2329 } 2330