1 /* ------------------------------------------------------------------ 2 * Copyright (C) 1998-2009 PacketVideo 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 13 * express or implied. 14 * See the License for the specific language governing permissions 15 * and limitations under the License. 16 * ------------------------------------------------------------------- 17 */ 18 #include "avcenc_lib.h" 19 #include "avcenc_int.h" 20 21 22 #define CLIP_RESULT(x) if((uint)x > 0xFF){ \ 23 x = 0xFF & (~(x>>31));} 24 25 /* (blkwidth << 2) + (dy << 1) + dx */ 26 static void (*const eChromaMC_SIMD[8])(uint8 *, int , int , int , uint8 *, int, int , int) = 27 { 28 &eChromaFullMC_SIMD, 29 &eChromaHorizontalMC_SIMD, 30 &eChromaVerticalMC_SIMD, 31 &eChromaDiagonalMC_SIMD, 32 &eChromaFullMC_SIMD, 33 &eChromaHorizontalMC2_SIMD, 34 &eChromaVerticalMC2_SIMD, 35 &eChromaDiagonalMC2_SIMD 36 }; 37 /* Perform motion prediction and compensation with residue if exist. */ 38 void AVCMBMotionComp(AVCEncObject *encvid, AVCCommonObj *video) 39 { 40 (void)(encvid); 41 42 AVCMacroblock *currMB = video->currMB; 43 AVCPictureData *currPic = video->currPic; 44 int mbPartIdx, subMbPartIdx; 45 int ref_idx; 46 int offset_MbPart_indx = 0; 47 int16 *mv; 48 uint32 x_pos, y_pos; 49 uint8 *curL, *curCb, *curCr; 50 uint8 *ref_l, *ref_Cb, *ref_Cr; 51 uint8 *predBlock, *predCb, *predCr; 52 int block_x, block_y, offset_x, offset_y, offsetP, offset; 53 int x_position = (video->mb_x << 4); 54 int y_position = (video->mb_y << 4); 55 int MbHeight, MbWidth, mbPartIdx_X, mbPartIdx_Y, offset_indx; 56 int picWidth = currPic->width; 57 int picPitch = currPic->pitch; 58 int picHeight = currPic->height; 59 uint32 tmp_word; 60 61 tmp_word = y_position * picPitch; 62 curL = currPic->Sl + tmp_word + x_position; 63 offset = (tmp_word >> 2) + (x_position >> 1); 64 curCb = currPic->Scb + offset; 65 curCr = currPic->Scr + offset; 66 67 predBlock = curL; 68 predCb = curCb; 69 predCr = curCr; 70 71 GetMotionVectorPredictor(video, 1); 72 73 for (mbPartIdx = 0; mbPartIdx < currMB->NumMbPart; mbPartIdx++) 74 { 75 MbHeight = currMB->SubMbPartHeight[mbPartIdx]; 76 MbWidth = currMB->SubMbPartWidth[mbPartIdx]; 77 mbPartIdx_X = ((mbPartIdx + offset_MbPart_indx) & 1); 78 mbPartIdx_Y = (mbPartIdx + offset_MbPart_indx) >> 1; 79 ref_idx = currMB->ref_idx_L0[(mbPartIdx_Y << 1) + mbPartIdx_X]; 80 offset_indx = 0; 81 82 ref_l = video->RefPicList0[ref_idx]->Sl; 83 ref_Cb = video->RefPicList0[ref_idx]->Scb; 84 ref_Cr = video->RefPicList0[ref_idx]->Scr; 85 86 for (subMbPartIdx = 0; subMbPartIdx < currMB->NumSubMbPart[mbPartIdx]; subMbPartIdx++) 87 { 88 block_x = (mbPartIdx_X << 1) + ((subMbPartIdx + offset_indx) & 1); 89 block_y = (mbPartIdx_Y << 1) + (((subMbPartIdx + offset_indx) >> 1) & 1); 90 mv = (int16*)(currMB->mvL0 + block_x + (block_y << 2)); 91 offset_x = x_position + (block_x << 2); 92 offset_y = y_position + (block_y << 2); 93 x_pos = (offset_x << 2) + *mv++; /*quarter pel */ 94 y_pos = (offset_y << 2) + *mv; /*quarter pel */ 95 96 //offset = offset_y * currPic->width; 97 //offsetC = (offset >> 2) + (offset_x >> 1); 98 offsetP = (block_y << 2) * picPitch + (block_x << 2); 99 eLumaMotionComp(ref_l, picPitch, picHeight, x_pos, y_pos, 100 /*comp_Sl + offset + offset_x,*/ 101 predBlock + offsetP, picPitch, MbWidth, MbHeight); 102 103 offsetP = (block_y * picWidth) + (block_x << 1); 104 eChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos, 105 /*comp_Scb + offsetC,*/ 106 predCb + offsetP, picPitch >> 1, MbWidth >> 1, MbHeight >> 1); 107 eChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos, 108 /*comp_Scr + offsetC,*/ 109 predCr + offsetP, picPitch >> 1, MbWidth >> 1, MbHeight >> 1); 110 111 offset_indx = currMB->SubMbPartWidth[mbPartIdx] >> 3; 112 } 113 offset_MbPart_indx = currMB->MbPartWidth >> 4; 114 } 115 116 return ; 117 } 118 119 120 /* preform the actual motion comp here */ 121 void eLumaMotionComp(uint8 *ref, int picpitch, int picheight, 122 int x_pos, int y_pos, 123 uint8 *pred, int pred_pitch, 124 int blkwidth, int blkheight) 125 { 126 (void)(picheight); 127 128 int dx, dy; 129 int temp2[21][21]; /* for intermediate results */ 130 uint8 *ref2; 131 132 dx = x_pos & 3; 133 dy = y_pos & 3; 134 x_pos = x_pos >> 2; /* round it to full-pel resolution */ 135 y_pos = y_pos >> 2; 136 137 /* perform actual motion compensation */ 138 if (dx == 0 && dy == 0) 139 { /* fullpel position *//* G */ 140 141 ref += y_pos * picpitch + x_pos; 142 143 eFullPelMC(ref, picpitch, pred, pred_pitch, blkwidth, blkheight); 144 145 } /* other positions */ 146 else if (dy == 0) 147 { /* no vertical interpolation *//* a,b,c*/ 148 149 ref += y_pos * picpitch + x_pos; 150 151 eHorzInterp1MC(ref, picpitch, pred, pred_pitch, blkwidth, blkheight, dx); 152 } 153 else if (dx == 0) 154 { /*no horizontal interpolation *//* d,h,n */ 155 156 ref += y_pos * picpitch + x_pos; 157 158 eVertInterp1MC(ref, picpitch, pred, pred_pitch, blkwidth, blkheight, dy); 159 } 160 else if (dy == 2) 161 { /* horizontal cross *//* i, j, k */ 162 163 ref += y_pos * picpitch + x_pos - 2; /* move to the left 2 pixels */ 164 165 eVertInterp2MC(ref, picpitch, &temp2[0][0], 21, blkwidth + 5, blkheight); 166 167 eHorzInterp2MC(&temp2[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx); 168 } 169 else if (dx == 2) 170 { /* vertical cross */ /* f,q */ 171 172 ref += (y_pos - 2) * picpitch + x_pos; /* move to up 2 lines */ 173 174 eHorzInterp3MC(ref, picpitch, &temp2[0][0], 21, blkwidth, blkheight + 5); 175 eVertInterp3MC(&temp2[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy); 176 } 177 else 178 { /* diagonal *//* e,g,p,r */ 179 180 ref2 = ref + (y_pos + (dy / 2)) * picpitch + x_pos; 181 182 ref += (y_pos * picpitch) + x_pos + (dx / 2); 183 184 eDiagonalInterpMC(ref2, ref, picpitch, pred, pred_pitch, blkwidth, blkheight); 185 } 186 187 return ; 188 } 189 190 void eCreateAlign(uint8 *ref, int picpitch, int y_pos, 191 uint8 *out, int blkwidth, int blkheight) 192 { 193 int i, j; 194 int offset, out_offset; 195 uint32 prev_pix, result, pix1, pix2, pix4; 196 197 ref += y_pos * picpitch;// + x_pos; 198 out_offset = 24 - blkwidth; 199 200 //switch(x_pos&0x3){ 201 switch (((uint32)ref)&0x3) 202 { 203 case 1: 204 offset = picpitch - blkwidth - 3; 205 for (j = 0; j < blkheight; j++) 206 { 207 pix1 = *ref++; 208 pix2 = *((uint16*)ref); 209 ref += 2; 210 result = (pix2 << 8) | pix1; 211 212 for (i = 3; i < blkwidth; i += 4) 213 { 214 pix4 = *((uint32*)ref); 215 ref += 4; 216 prev_pix = (pix4 << 24) & 0xFF000000; /* mask out byte belong to previous word */ 217 result |= prev_pix; 218 *((uint32*)out) = result; /* write 4 bytes */ 219 out += 4; 220 result = pix4 >> 8; /* for the next loop */ 221 } 222 ref += offset; 223 out += out_offset; 224 } 225 break; 226 case 2: 227 offset = picpitch - blkwidth - 2; 228 for (j = 0; j < blkheight; j++) 229 { 230 result = *((uint16*)ref); 231 ref += 2; 232 for (i = 2; i < blkwidth; i += 4) 233 { 234 pix4 = *((uint32*)ref); 235 ref += 4; 236 prev_pix = (pix4 << 16) & 0xFFFF0000; /* mask out byte belong to previous word */ 237 result |= prev_pix; 238 *((uint32*)out) = result; /* write 4 bytes */ 239 out += 4; 240 result = pix4 >> 16; /* for the next loop */ 241 } 242 ref += offset; 243 out += out_offset; 244 } 245 break; 246 case 3: 247 offset = picpitch - blkwidth - 1; 248 for (j = 0; j < blkheight; j++) 249 { 250 result = *ref++; 251 for (i = 1; i < blkwidth; i += 4) 252 { 253 pix4 = *((uint32*)ref); 254 ref += 4; 255 prev_pix = (pix4 << 8) & 0xFFFFFF00; /* mask out byte belong to previous word */ 256 result |= prev_pix; 257 *((uint32*)out) = result; /* write 4 bytes */ 258 out += 4; 259 result = pix4 >> 24; /* for the next loop */ 260 } 261 ref += offset; 262 out += out_offset; 263 } 264 break; 265 } 266 } 267 268 void eHorzInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch, 269 int blkwidth, int blkheight, int dx) 270 { 271 uint8 *p_ref; 272 uint32 *p_cur; 273 uint32 tmp, pkres; 274 int result, curr_offset, ref_offset; 275 int j; 276 int32 r0, r1, r2, r3, r4, r5; 277 int32 r13, r6; 278 279 p_cur = (uint32*)out; /* assume it's word aligned */ 280 curr_offset = (outpitch - blkwidth) >> 2; 281 p_ref = in; 282 ref_offset = inpitch - blkwidth; 283 284 if (dx&1) 285 { 286 dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */ 287 p_ref -= 2; 288 r13 = 0; 289 for (j = blkheight; j > 0; j--) 290 { 291 tmp = (uint32)(p_ref + blkwidth); 292 r0 = p_ref[0]; 293 r1 = p_ref[2]; 294 r0 |= (r1 << 16); /* 0,c,0,a */ 295 r1 = p_ref[1]; 296 r2 = p_ref[3]; 297 r1 |= (r2 << 16); /* 0,d,0,b */ 298 while ((uint32)p_ref < tmp) 299 { 300 r2 = *(p_ref += 4); /* move pointer to e */ 301 r3 = p_ref[2]; 302 r2 |= (r3 << 16); /* 0,g,0,e */ 303 r3 = p_ref[1]; 304 r4 = p_ref[3]; 305 r3 |= (r4 << 16); /* 0,h,0,f */ 306 307 r4 = r0 + r3; /* c+h, a+f */ 308 r5 = r0 + r1; /* c+d, a+b */ 309 r6 = r2 + r3; /* g+h, e+f */ 310 r5 >>= 16; 311 r5 |= (r6 << 16); /* e+f, c+d */ 312 r4 += r5 * 20; /* c+20*e+20*f+h, a+20*c+20*d+f */ 313 r4 += 0x100010; /* +16, +16 */ 314 r5 = r1 + r2; /* d+g, b+e */ 315 r4 -= r5 * 5; /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */ 316 r4 >>= 5; 317 r13 |= r4; /* check clipping */ 318 319 r5 = p_ref[dx+2]; 320 r6 = p_ref[dx+4]; 321 r5 |= (r6 << 16); 322 r4 += r5; 323 r4 += 0x10001; 324 r4 = (r4 >> 1) & 0xFF00FF; 325 326 r5 = p_ref[4]; /* i */ 327 r6 = (r5 << 16); 328 r5 = r6 | (r2 >> 16);/* 0,i,0,g */ 329 r5 += r1; /* d+i, b+g */ /* r5 not free */ 330 r1 >>= 16; 331 r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */ 332 r1 += r2; /* f+g, d+e */ 333 r5 += 20 * r1; /* d+20f+20g+i, b+20d+20e+g */ 334 r0 >>= 16; 335 r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */ 336 r0 += r3; /* e+h, c+f */ 337 r5 += 0x100010; /* 16,16 */ 338 r5 -= r0 * 5; /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */ 339 r5 >>= 5; 340 r13 |= r5; /* check clipping */ 341 342 r0 = p_ref[dx+3]; 343 r1 = p_ref[dx+5]; 344 r0 |= (r1 << 16); 345 r5 += r0; 346 r5 += 0x10001; 347 r5 = (r5 >> 1) & 0xFF00FF; 348 349 r4 |= (r5 << 8); /* pack them together */ 350 *p_cur++ = r4; 351 r1 = r3; 352 r0 = r2; 353 } 354 p_cur += curr_offset; /* move to the next line */ 355 p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */ 356 357 if (r13&0xFF000700) /* need clipping */ 358 { 359 /* move back to the beginning of the line */ 360 p_ref -= (ref_offset + blkwidth); /* input */ 361 p_cur -= (outpitch >> 2); 362 363 tmp = (uint32)(p_ref + blkwidth); 364 for (; (uint32)p_ref < tmp;) 365 { 366 367 r0 = *p_ref++; 368 r1 = *p_ref++; 369 r2 = *p_ref++; 370 r3 = *p_ref++; 371 r4 = *p_ref++; 372 /* first pixel */ 373 r5 = *p_ref++; 374 result = (r0 + r5); 375 r0 = (r1 + r4); 376 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 377 r0 = (r2 + r3); 378 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 379 result = (result + 16) >> 5; 380 CLIP_RESULT(result) 381 /* 3/4 pel, no need to clip */ 382 result = (result + p_ref[dx] + 1); 383 pkres = (result >> 1) ; 384 /* second pixel */ 385 r0 = *p_ref++; 386 result = (r1 + r0); 387 r1 = (r2 + r5); 388 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 389 r1 = (r3 + r4); 390 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 391 result = (result + 16) >> 5; 392 CLIP_RESULT(result) 393 /* 3/4 pel, no need to clip */ 394 result = (result + p_ref[dx] + 1); 395 result = (result >> 1); 396 pkres |= (result << 8); 397 /* third pixel */ 398 r1 = *p_ref++; 399 result = (r2 + r1); 400 r2 = (r3 + r0); 401 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 402 r2 = (r4 + r5); 403 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 404 result = (result + 16) >> 5; 405 CLIP_RESULT(result) 406 /* 3/4 pel, no need to clip */ 407 result = (result + p_ref[dx] + 1); 408 result = (result >> 1); 409 pkres |= (result << 16); 410 /* fourth pixel */ 411 r2 = *p_ref++; 412 result = (r3 + r2); 413 r3 = (r4 + r1); 414 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 415 r3 = (r5 + r0); 416 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 417 result = (result + 16) >> 5; 418 CLIP_RESULT(result) 419 /* 3/4 pel, no need to clip */ 420 result = (result + p_ref[dx] + 1); 421 result = (result >> 1); 422 pkres |= (result << 24); 423 *p_cur++ = pkres; /* write 4 pixels */ 424 p_ref -= 5; /* offset back to the middle of filter */ 425 } 426 p_cur += curr_offset; /* move to the next line */ 427 p_ref += ref_offset; /* move to the next line */ 428 } 429 } 430 } 431 else 432 { 433 p_ref -= 2; 434 r13 = 0; 435 for (j = blkheight; j > 0; j--) 436 { 437 tmp = (uint32)(p_ref + blkwidth); 438 r0 = p_ref[0]; 439 r1 = p_ref[2]; 440 r0 |= (r1 << 16); /* 0,c,0,a */ 441 r1 = p_ref[1]; 442 r2 = p_ref[3]; 443 r1 |= (r2 << 16); /* 0,d,0,b */ 444 while ((uint32)p_ref < tmp) 445 { 446 r2 = *(p_ref += 4); /* move pointer to e */ 447 r3 = p_ref[2]; 448 r2 |= (r3 << 16); /* 0,g,0,e */ 449 r3 = p_ref[1]; 450 r4 = p_ref[3]; 451 r3 |= (r4 << 16); /* 0,h,0,f */ 452 453 r4 = r0 + r3; /* c+h, a+f */ 454 r5 = r0 + r1; /* c+d, a+b */ 455 r6 = r2 + r3; /* g+h, e+f */ 456 r5 >>= 16; 457 r5 |= (r6 << 16); /* e+f, c+d */ 458 r4 += r5 * 20; /* c+20*e+20*f+h, a+20*c+20*d+f */ 459 r4 += 0x100010; /* +16, +16 */ 460 r5 = r1 + r2; /* d+g, b+e */ 461 r4 -= r5 * 5; /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */ 462 r4 >>= 5; 463 r13 |= r4; /* check clipping */ 464 r4 &= 0xFF00FF; /* mask */ 465 466 r5 = p_ref[4]; /* i */ 467 r6 = (r5 << 16); 468 r5 = r6 | (r2 >> 16);/* 0,i,0,g */ 469 r5 += r1; /* d+i, b+g */ /* r5 not free */ 470 r1 >>= 16; 471 r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */ 472 r1 += r2; /* f+g, d+e */ 473 r5 += 20 * r1; /* d+20f+20g+i, b+20d+20e+g */ 474 r0 >>= 16; 475 r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */ 476 r0 += r3; /* e+h, c+f */ 477 r5 += 0x100010; /* 16,16 */ 478 r5 -= r0 * 5; /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */ 479 r5 >>= 5; 480 r13 |= r5; /* check clipping */ 481 r5 &= 0xFF00FF; /* mask */ 482 483 r4 |= (r5 << 8); /* pack them together */ 484 *p_cur++ = r4; 485 r1 = r3; 486 r0 = r2; 487 } 488 p_cur += curr_offset; /* move to the next line */ 489 p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */ 490 491 if (r13&0xFF000700) /* need clipping */ 492 { 493 /* move back to the beginning of the line */ 494 p_ref -= (ref_offset + blkwidth); /* input */ 495 p_cur -= (outpitch >> 2); 496 497 tmp = (uint32)(p_ref + blkwidth); 498 for (; (uint32)p_ref < tmp;) 499 { 500 501 r0 = *p_ref++; 502 r1 = *p_ref++; 503 r2 = *p_ref++; 504 r3 = *p_ref++; 505 r4 = *p_ref++; 506 /* first pixel */ 507 r5 = *p_ref++; 508 result = (r0 + r5); 509 r0 = (r1 + r4); 510 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 511 r0 = (r2 + r3); 512 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 513 result = (result + 16) >> 5; 514 CLIP_RESULT(result) 515 pkres = result; 516 /* second pixel */ 517 r0 = *p_ref++; 518 result = (r1 + r0); 519 r1 = (r2 + r5); 520 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 521 r1 = (r3 + r4); 522 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 523 result = (result + 16) >> 5; 524 CLIP_RESULT(result) 525 pkres |= (result << 8); 526 /* third pixel */ 527 r1 = *p_ref++; 528 result = (r2 + r1); 529 r2 = (r3 + r0); 530 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 531 r2 = (r4 + r5); 532 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 533 result = (result + 16) >> 5; 534 CLIP_RESULT(result) 535 pkres |= (result << 16); 536 /* fourth pixel */ 537 r2 = *p_ref++; 538 result = (r3 + r2); 539 r3 = (r4 + r1); 540 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 541 r3 = (r5 + r0); 542 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 543 result = (result + 16) >> 5; 544 CLIP_RESULT(result) 545 pkres |= (result << 24); 546 *p_cur++ = pkres; /* write 4 pixels */ 547 p_ref -= 5; 548 } 549 p_cur += curr_offset; /* move to the next line */ 550 p_ref += ref_offset; 551 } 552 } 553 } 554 555 return ; 556 } 557 558 void eHorzInterp2MC(int *in, int inpitch, uint8 *out, int outpitch, 559 int blkwidth, int blkheight, int dx) 560 { 561 int *p_ref; 562 uint32 *p_cur; 563 uint32 tmp, pkres; 564 int result, result2, curr_offset, ref_offset; 565 int j, r0, r1, r2, r3, r4, r5; 566 567 p_cur = (uint32*)out; /* assume it's word aligned */ 568 curr_offset = (outpitch - blkwidth) >> 2; 569 p_ref = in; 570 ref_offset = inpitch - blkwidth; 571 572 if (dx&1) 573 { 574 dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */ 575 576 for (j = blkheight; j > 0 ; j--) 577 { 578 tmp = (uint32)(p_ref + blkwidth); 579 for (; (uint32)p_ref < tmp;) 580 { 581 582 r0 = p_ref[-2]; 583 r1 = p_ref[-1]; 584 r2 = *p_ref++; 585 r3 = *p_ref++; 586 r4 = *p_ref++; 587 /* first pixel */ 588 r5 = *p_ref++; 589 result = (r0 + r5); 590 r0 = (r1 + r4); 591 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 592 r0 = (r2 + r3); 593 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 594 result = (result + 512) >> 10; 595 CLIP_RESULT(result) 596 result2 = ((p_ref[dx] + 16) >> 5); 597 CLIP_RESULT(result2) 598 /* 3/4 pel, no need to clip */ 599 result = (result + result2 + 1); 600 pkres = (result >> 1); 601 /* second pixel */ 602 r0 = *p_ref++; 603 result = (r1 + r0); 604 r1 = (r2 + r5); 605 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 606 r1 = (r3 + r4); 607 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 608 result = (result + 512) >> 10; 609 CLIP_RESULT(result) 610 result2 = ((p_ref[dx] + 16) >> 5); 611 CLIP_RESULT(result2) 612 /* 3/4 pel, no need to clip */ 613 result = (result + result2 + 1); 614 result = (result >> 1); 615 pkres |= (result << 8); 616 /* third pixel */ 617 r1 = *p_ref++; 618 result = (r2 + r1); 619 r2 = (r3 + r0); 620 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 621 r2 = (r4 + r5); 622 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 623 result = (result + 512) >> 10; 624 CLIP_RESULT(result) 625 result2 = ((p_ref[dx] + 16) >> 5); 626 CLIP_RESULT(result2) 627 /* 3/4 pel, no need to clip */ 628 result = (result + result2 + 1); 629 result = (result >> 1); 630 pkres |= (result << 16); 631 /* fourth pixel */ 632 r2 = *p_ref++; 633 result = (r3 + r2); 634 r3 = (r4 + r1); 635 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 636 r3 = (r5 + r0); 637 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 638 result = (result + 512) >> 10; 639 CLIP_RESULT(result) 640 result2 = ((p_ref[dx] + 16) >> 5); 641 CLIP_RESULT(result2) 642 /* 3/4 pel, no need to clip */ 643 result = (result + result2 + 1); 644 result = (result >> 1); 645 pkres |= (result << 24); 646 *p_cur++ = pkres; /* write 4 pixels */ 647 p_ref -= 3; /* offset back to the middle of filter */ 648 } 649 p_cur += curr_offset; /* move to the next line */ 650 p_ref += ref_offset; /* move to the next line */ 651 } 652 } 653 else 654 { 655 for (j = blkheight; j > 0 ; j--) 656 { 657 tmp = (uint32)(p_ref + blkwidth); 658 for (; (uint32)p_ref < tmp;) 659 { 660 661 r0 = p_ref[-2]; 662 r1 = p_ref[-1]; 663 r2 = *p_ref++; 664 r3 = *p_ref++; 665 r4 = *p_ref++; 666 /* first pixel */ 667 r5 = *p_ref++; 668 result = (r0 + r5); 669 r0 = (r1 + r4); 670 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 671 r0 = (r2 + r3); 672 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 673 result = (result + 512) >> 10; 674 CLIP_RESULT(result) 675 pkres = result; 676 /* second pixel */ 677 r0 = *p_ref++; 678 result = (r1 + r0); 679 r1 = (r2 + r5); 680 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 681 r1 = (r3 + r4); 682 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 683 result = (result + 512) >> 10; 684 CLIP_RESULT(result) 685 pkres |= (result << 8); 686 /* third pixel */ 687 r1 = *p_ref++; 688 result = (r2 + r1); 689 r2 = (r3 + r0); 690 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 691 r2 = (r4 + r5); 692 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 693 result = (result + 512) >> 10; 694 CLIP_RESULT(result) 695 pkres |= (result << 16); 696 /* fourth pixel */ 697 r2 = *p_ref++; 698 result = (r3 + r2); 699 r3 = (r4 + r1); 700 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 701 r3 = (r5 + r0); 702 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 703 result = (result + 512) >> 10; 704 CLIP_RESULT(result) 705 pkres |= (result << 24); 706 *p_cur++ = pkres; /* write 4 pixels */ 707 p_ref -= 3; /* offset back to the middle of filter */ 708 } 709 p_cur += curr_offset; /* move to the next line */ 710 p_ref += ref_offset; /* move to the next line */ 711 } 712 } 713 714 return ; 715 } 716 717 void eHorzInterp3MC(uint8 *in, int inpitch, int *out, int outpitch, 718 int blkwidth, int blkheight) 719 { 720 uint8 *p_ref; 721 int *p_cur; 722 uint32 tmp; 723 int result, curr_offset, ref_offset; 724 int j, r0, r1, r2, r3, r4, r5; 725 726 p_cur = out; 727 curr_offset = (outpitch - blkwidth); 728 p_ref = in; 729 ref_offset = inpitch - blkwidth; 730 731 for (j = blkheight; j > 0 ; j--) 732 { 733 tmp = (uint32)(p_ref + blkwidth); 734 for (; (uint32)p_ref < tmp;) 735 { 736 737 r0 = p_ref[-2]; 738 r1 = p_ref[-1]; 739 r2 = *p_ref++; 740 r3 = *p_ref++; 741 r4 = *p_ref++; 742 /* first pixel */ 743 r5 = *p_ref++; 744 result = (r0 + r5); 745 r0 = (r1 + r4); 746 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 747 r0 = (r2 + r3); 748 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 749 *p_cur++ = result; 750 /* second pixel */ 751 r0 = *p_ref++; 752 result = (r1 + r0); 753 r1 = (r2 + r5); 754 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 755 r1 = (r3 + r4); 756 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 757 *p_cur++ = result; 758 /* third pixel */ 759 r1 = *p_ref++; 760 result = (r2 + r1); 761 r2 = (r3 + r0); 762 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 763 r2 = (r4 + r5); 764 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 765 *p_cur++ = result; 766 /* fourth pixel */ 767 r2 = *p_ref++; 768 result = (r3 + r2); 769 r3 = (r4 + r1); 770 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 771 r3 = (r5 + r0); 772 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 773 *p_cur++ = result; 774 p_ref -= 3; /* move back to the middle of the filter */ 775 } 776 p_cur += curr_offset; /* move to the next line */ 777 p_ref += ref_offset; 778 } 779 780 return ; 781 } 782 void eVertInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch, 783 int blkwidth, int blkheight, int dy) 784 { 785 uint8 *p_cur, *p_ref; 786 uint32 tmp; 787 int result, curr_offset, ref_offset; 788 int j, i; 789 int32 r0, r1, r2, r3, r4, r5, r6, r7, r8, r13; 790 uint8 tmp_in[24][24]; 791 792 /* not word-aligned */ 793 if (((uint32)in)&0x3) 794 { 795 eCreateAlign(in, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5); 796 in = &tmp_in[2][0]; 797 inpitch = 24; 798 } 799 p_cur = out; 800 curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */ 801 ref_offset = blkheight * inpitch; /* for limit */ 802 803 curr_offset += 3; 804 805 if (dy&1) 806 { 807 dy = (dy >> 1) ? 0 : -inpitch; 808 809 for (j = 0; j < blkwidth; j += 4, in += 4) 810 { 811 r13 = 0; 812 p_ref = in; 813 p_cur -= outpitch; /* compensate for the first offset */ 814 tmp = (uint32)(p_ref + ref_offset); /* limit */ 815 while ((uint32)p_ref < tmp) /* the loop un-rolled */ 816 { 817 r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */ 818 p_ref += inpitch; 819 r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */ 820 r0 &= 0xFF00FF; 821 822 r1 = *((uint32*)(p_ref + (inpitch << 1))); /* r1, r7, ref[3] */ 823 r7 = (r1 >> 8) & 0xFF00FF; 824 r1 &= 0xFF00FF; 825 826 r0 += r1; 827 r6 += r7; 828 829 r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */ 830 r8 = (r2 >> 8) & 0xFF00FF; 831 r2 &= 0xFF00FF; 832 833 r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */ 834 r7 = (r1 >> 8) & 0xFF00FF; 835 r1 &= 0xFF00FF; 836 r1 += r2; 837 838 r7 += r8; 839 840 r0 += 20 * r1; 841 r6 += 20 * r7; 842 r0 += 0x100010; 843 r6 += 0x100010; 844 845 r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */ 846 r8 = (r2 >> 8) & 0xFF00FF; 847 r2 &= 0xFF00FF; 848 849 r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */ 850 r7 = (r1 >> 8) & 0xFF00FF; 851 r1 &= 0xFF00FF; 852 r1 += r2; 853 854 r7 += r8; 855 856 r0 -= 5 * r1; 857 r6 -= 5 * r7; 858 859 r0 >>= 5; 860 r6 >>= 5; 861 /* clip */ 862 r13 |= r6; 863 r13 |= r0; 864 //CLIPPACK(r6,result) 865 866 r1 = *((uint32*)(p_ref + dy)); 867 r2 = (r1 >> 8) & 0xFF00FF; 868 r1 &= 0xFF00FF; 869 r0 += r1; 870 r6 += r2; 871 r0 += 0x10001; 872 r6 += 0x10001; 873 r0 = (r0 >> 1) & 0xFF00FF; 874 r6 = (r6 >> 1) & 0xFF00FF; 875 876 r0 |= (r6 << 8); /* pack it back */ 877 *((uint32*)(p_cur += outpitch)) = r0; 878 } 879 p_cur += curr_offset; /* offset to the next pixel */ 880 if (r13 & 0xFF000700) /* this column need clipping */ 881 { 882 p_cur -= 4; 883 for (i = 0; i < 4; i++) 884 { 885 p_ref = in + i; 886 p_cur -= outpitch; /* compensate for the first offset */ 887 888 tmp = (uint32)(p_ref + ref_offset); /* limit */ 889 while ((uint32)p_ref < tmp) 890 { /* loop un-rolled */ 891 r0 = *(p_ref - (inpitch << 1)); 892 r1 = *(p_ref - inpitch); 893 r2 = *p_ref; 894 r3 = *(p_ref += inpitch); /* modify pointer before loading */ 895 r4 = *(p_ref += inpitch); 896 /* first pixel */ 897 r5 = *(p_ref += inpitch); 898 result = (r0 + r5); 899 r0 = (r1 + r4); 900 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 901 r0 = (r2 + r3); 902 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 903 result = (result + 16) >> 5; 904 CLIP_RESULT(result) 905 /* 3/4 pel, no need to clip */ 906 result = (result + p_ref[dy-(inpitch<<1)] + 1); 907 result = (result >> 1); 908 *(p_cur += outpitch) = result; 909 /* second pixel */ 910 r0 = *(p_ref += inpitch); 911 result = (r1 + r0); 912 r1 = (r2 + r5); 913 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 914 r1 = (r3 + r4); 915 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 916 result = (result + 16) >> 5; 917 CLIP_RESULT(result) 918 /* 3/4 pel, no need to clip */ 919 result = (result + p_ref[dy-(inpitch<<1)] + 1); 920 result = (result >> 1); 921 *(p_cur += outpitch) = result; 922 /* third pixel */ 923 r1 = *(p_ref += inpitch); 924 result = (r2 + r1); 925 r2 = (r3 + r0); 926 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 927 r2 = (r4 + r5); 928 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 929 result = (result + 16) >> 5; 930 CLIP_RESULT(result) 931 /* 3/4 pel, no need to clip */ 932 result = (result + p_ref[dy-(inpitch<<1)] + 1); 933 result = (result >> 1); 934 *(p_cur += outpitch) = result; 935 /* fourth pixel */ 936 r2 = *(p_ref += inpitch); 937 result = (r3 + r2); 938 r3 = (r4 + r1); 939 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 940 r3 = (r5 + r0); 941 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 942 result = (result + 16) >> 5; 943 CLIP_RESULT(result) 944 /* 3/4 pel, no need to clip */ 945 result = (result + p_ref[dy-(inpitch<<1)] + 1); 946 result = (result >> 1); 947 *(p_cur += outpitch) = result; 948 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */ 949 } 950 p_cur += (curr_offset - 3); 951 } 952 } 953 } 954 } 955 else 956 { 957 for (j = 0; j < blkwidth; j += 4, in += 4) 958 { 959 r13 = 0; 960 p_ref = in; 961 p_cur -= outpitch; /* compensate for the first offset */ 962 tmp = (uint32)(p_ref + ref_offset); /* limit */ 963 while ((uint32)p_ref < tmp) /* the loop un-rolled */ 964 { 965 r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */ 966 p_ref += inpitch; 967 r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */ 968 r0 &= 0xFF00FF; 969 970 r1 = *((uint32*)(p_ref + (inpitch << 1))); /* r1, r7, ref[3] */ 971 r7 = (r1 >> 8) & 0xFF00FF; 972 r1 &= 0xFF00FF; 973 974 r0 += r1; 975 r6 += r7; 976 977 r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */ 978 r8 = (r2 >> 8) & 0xFF00FF; 979 r2 &= 0xFF00FF; 980 981 r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */ 982 r7 = (r1 >> 8) & 0xFF00FF; 983 r1 &= 0xFF00FF; 984 r1 += r2; 985 986 r7 += r8; 987 988 r0 += 20 * r1; 989 r6 += 20 * r7; 990 r0 += 0x100010; 991 r6 += 0x100010; 992 993 r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */ 994 r8 = (r2 >> 8) & 0xFF00FF; 995 r2 &= 0xFF00FF; 996 997 r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */ 998 r7 = (r1 >> 8) & 0xFF00FF; 999 r1 &= 0xFF00FF; 1000 r1 += r2; 1001 1002 r7 += r8; 1003 1004 r0 -= 5 * r1; 1005 r6 -= 5 * r7; 1006 1007 r0 >>= 5; 1008 r6 >>= 5; 1009 /* clip */ 1010 r13 |= r6; 1011 r13 |= r0; 1012 //CLIPPACK(r6,result) 1013 r0 &= 0xFF00FF; 1014 r6 &= 0xFF00FF; 1015 r0 |= (r6 << 8); /* pack it back */ 1016 *((uint32*)(p_cur += outpitch)) = r0; 1017 } 1018 p_cur += curr_offset; /* offset to the next pixel */ 1019 if (r13 & 0xFF000700) /* this column need clipping */ 1020 { 1021 p_cur -= 4; 1022 for (i = 0; i < 4; i++) 1023 { 1024 p_ref = in + i; 1025 p_cur -= outpitch; /* compensate for the first offset */ 1026 tmp = (uint32)(p_ref + ref_offset); /* limit */ 1027 while ((uint32)p_ref < tmp) 1028 { /* loop un-rolled */ 1029 r0 = *(p_ref - (inpitch << 1)); 1030 r1 = *(p_ref - inpitch); 1031 r2 = *p_ref; 1032 r3 = *(p_ref += inpitch); /* modify pointer before loading */ 1033 r4 = *(p_ref += inpitch); 1034 /* first pixel */ 1035 r5 = *(p_ref += inpitch); 1036 result = (r0 + r5); 1037 r0 = (r1 + r4); 1038 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 1039 r0 = (r2 + r3); 1040 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 1041 result = (result + 16) >> 5; 1042 CLIP_RESULT(result) 1043 *(p_cur += outpitch) = result; 1044 /* second pixel */ 1045 r0 = *(p_ref += inpitch); 1046 result = (r1 + r0); 1047 r1 = (r2 + r5); 1048 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 1049 r1 = (r3 + r4); 1050 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 1051 result = (result + 16) >> 5; 1052 CLIP_RESULT(result) 1053 *(p_cur += outpitch) = result; 1054 /* third pixel */ 1055 r1 = *(p_ref += inpitch); 1056 result = (r2 + r1); 1057 r2 = (r3 + r0); 1058 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 1059 r2 = (r4 + r5); 1060 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 1061 result = (result + 16) >> 5; 1062 CLIP_RESULT(result) 1063 *(p_cur += outpitch) = result; 1064 /* fourth pixel */ 1065 r2 = *(p_ref += inpitch); 1066 result = (r3 + r2); 1067 r3 = (r4 + r1); 1068 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 1069 r3 = (r5 + r0); 1070 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 1071 result = (result + 16) >> 5; 1072 CLIP_RESULT(result) 1073 *(p_cur += outpitch) = result; 1074 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */ 1075 } 1076 p_cur += (curr_offset - 3); 1077 } 1078 } 1079 } 1080 } 1081 1082 return ; 1083 } 1084 1085 void eVertInterp2MC(uint8 *in, int inpitch, int *out, int outpitch, 1086 int blkwidth, int blkheight) 1087 { 1088 int *p_cur; 1089 uint8 *p_ref; 1090 uint32 tmp; 1091 int result, curr_offset, ref_offset; 1092 int j, r0, r1, r2, r3, r4, r5; 1093 1094 p_cur = out; 1095 curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */ 1096 ref_offset = blkheight * inpitch; /* for limit */ 1097 1098 for (j = 0; j < blkwidth; j++) 1099 { 1100 p_cur -= outpitch; /* compensate for the first offset */ 1101 p_ref = in++; 1102 1103 tmp = (uint32)(p_ref + ref_offset); /* limit */ 1104 while ((uint32)p_ref < tmp) 1105 { /* loop un-rolled */ 1106 r0 = *(p_ref - (inpitch << 1)); 1107 r1 = *(p_ref - inpitch); 1108 r2 = *p_ref; 1109 r3 = *(p_ref += inpitch); /* modify pointer before loading */ 1110 r4 = *(p_ref += inpitch); 1111 /* first pixel */ 1112 r5 = *(p_ref += inpitch); 1113 result = (r0 + r5); 1114 r0 = (r1 + r4); 1115 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 1116 r0 = (r2 + r3); 1117 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 1118 *(p_cur += outpitch) = result; 1119 /* second pixel */ 1120 r0 = *(p_ref += inpitch); 1121 result = (r1 + r0); 1122 r1 = (r2 + r5); 1123 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 1124 r1 = (r3 + r4); 1125 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 1126 *(p_cur += outpitch) = result; 1127 /* third pixel */ 1128 r1 = *(p_ref += inpitch); 1129 result = (r2 + r1); 1130 r2 = (r3 + r0); 1131 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 1132 r2 = (r4 + r5); 1133 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 1134 *(p_cur += outpitch) = result; 1135 /* fourth pixel */ 1136 r2 = *(p_ref += inpitch); 1137 result = (r3 + r2); 1138 r3 = (r4 + r1); 1139 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 1140 r3 = (r5 + r0); 1141 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 1142 *(p_cur += outpitch) = result; 1143 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */ 1144 } 1145 p_cur += curr_offset; 1146 } 1147 1148 return ; 1149 } 1150 1151 void eVertInterp3MC(int *in, int inpitch, uint8 *out, int outpitch, 1152 int blkwidth, int blkheight, int dy) 1153 { 1154 uint8 *p_cur; 1155 int *p_ref; 1156 uint32 tmp; 1157 int result, result2, curr_offset, ref_offset; 1158 int j, r0, r1, r2, r3, r4, r5; 1159 1160 p_cur = out; 1161 curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */ 1162 ref_offset = blkheight * inpitch; /* for limit */ 1163 1164 if (dy&1) 1165 { 1166 dy = (dy >> 1) ? -(inpitch << 1) : -(inpitch << 1) - inpitch; 1167 1168 for (j = 0; j < blkwidth; j++) 1169 { 1170 p_cur -= outpitch; /* compensate for the first offset */ 1171 p_ref = in++; 1172 1173 tmp = (uint32)(p_ref + ref_offset); /* limit */ 1174 while ((uint32)p_ref < tmp) 1175 { /* loop un-rolled */ 1176 r0 = *(p_ref - (inpitch << 1)); 1177 r1 = *(p_ref - inpitch); 1178 r2 = *p_ref; 1179 r3 = *(p_ref += inpitch); /* modify pointer before loading */ 1180 r4 = *(p_ref += inpitch); 1181 /* first pixel */ 1182 r5 = *(p_ref += inpitch); 1183 result = (r0 + r5); 1184 r0 = (r1 + r4); 1185 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 1186 r0 = (r2 + r3); 1187 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 1188 result = (result + 512) >> 10; 1189 CLIP_RESULT(result) 1190 result2 = ((p_ref[dy] + 16) >> 5); 1191 CLIP_RESULT(result2) 1192 /* 3/4 pel, no need to clip */ 1193 result = (result + result2 + 1); 1194 result = (result >> 1); 1195 *(p_cur += outpitch) = result; 1196 /* second pixel */ 1197 r0 = *(p_ref += inpitch); 1198 result = (r1 + r0); 1199 r1 = (r2 + r5); 1200 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 1201 r1 = (r3 + r4); 1202 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 1203 result = (result + 512) >> 10; 1204 CLIP_RESULT(result) 1205 result2 = ((p_ref[dy] + 16) >> 5); 1206 CLIP_RESULT(result2) 1207 /* 3/4 pel, no need to clip */ 1208 result = (result + result2 + 1); 1209 result = (result >> 1); 1210 *(p_cur += outpitch) = result; 1211 /* third pixel */ 1212 r1 = *(p_ref += inpitch); 1213 result = (r2 + r1); 1214 r2 = (r3 + r0); 1215 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 1216 r2 = (r4 + r5); 1217 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 1218 result = (result + 512) >> 10; 1219 CLIP_RESULT(result) 1220 result2 = ((p_ref[dy] + 16) >> 5); 1221 CLIP_RESULT(result2) 1222 /* 3/4 pel, no need to clip */ 1223 result = (result + result2 + 1); 1224 result = (result >> 1); 1225 *(p_cur += outpitch) = result; 1226 /* fourth pixel */ 1227 r2 = *(p_ref += inpitch); 1228 result = (r3 + r2); 1229 r3 = (r4 + r1); 1230 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 1231 r3 = (r5 + r0); 1232 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 1233 result = (result + 512) >> 10; 1234 CLIP_RESULT(result) 1235 result2 = ((p_ref[dy] + 16) >> 5); 1236 CLIP_RESULT(result2) 1237 /* 3/4 pel, no need to clip */ 1238 result = (result + result2 + 1); 1239 result = (result >> 1); 1240 *(p_cur += outpitch) = result; 1241 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */ 1242 } 1243 p_cur += curr_offset; 1244 } 1245 } 1246 else 1247 { 1248 for (j = 0; j < blkwidth; j++) 1249 { 1250 p_cur -= outpitch; /* compensate for the first offset */ 1251 p_ref = in++; 1252 1253 tmp = (uint32)(p_ref + ref_offset); /* limit */ 1254 while ((uint32)p_ref < tmp) 1255 { /* loop un-rolled */ 1256 r0 = *(p_ref - (inpitch << 1)); 1257 r1 = *(p_ref - inpitch); 1258 r2 = *p_ref; 1259 r3 = *(p_ref += inpitch); /* modify pointer before loading */ 1260 r4 = *(p_ref += inpitch); 1261 /* first pixel */ 1262 r5 = *(p_ref += inpitch); 1263 result = (r0 + r5); 1264 r0 = (r1 + r4); 1265 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 1266 r0 = (r2 + r3); 1267 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 1268 result = (result + 512) >> 10; 1269 CLIP_RESULT(result) 1270 *(p_cur += outpitch) = result; 1271 /* second pixel */ 1272 r0 = *(p_ref += inpitch); 1273 result = (r1 + r0); 1274 r1 = (r2 + r5); 1275 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 1276 r1 = (r3 + r4); 1277 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 1278 result = (result + 512) >> 10; 1279 CLIP_RESULT(result) 1280 *(p_cur += outpitch) = result; 1281 /* third pixel */ 1282 r1 = *(p_ref += inpitch); 1283 result = (r2 + r1); 1284 r2 = (r3 + r0); 1285 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 1286 r2 = (r4 + r5); 1287 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 1288 result = (result + 512) >> 10; 1289 CLIP_RESULT(result) 1290 *(p_cur += outpitch) = result; 1291 /* fourth pixel */ 1292 r2 = *(p_ref += inpitch); 1293 result = (r3 + r2); 1294 r3 = (r4 + r1); 1295 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 1296 r3 = (r5 + r0); 1297 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 1298 result = (result + 512) >> 10; 1299 CLIP_RESULT(result) 1300 *(p_cur += outpitch) = result; 1301 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */ 1302 } 1303 p_cur += curr_offset; 1304 } 1305 } 1306 1307 return ; 1308 } 1309 1310 void eDiagonalInterpMC(uint8 *in1, uint8 *in2, int inpitch, 1311 uint8 *out, int outpitch, 1312 int blkwidth, int blkheight) 1313 { 1314 int j, i; 1315 int result; 1316 uint8 *p_cur, *p_ref, *p_tmp8; 1317 int curr_offset, ref_offset; 1318 uint8 tmp_res[24][24], tmp_in[24][24]; 1319 uint32 *p_tmp; 1320 uint32 tmp, pkres, tmp_result; 1321 int32 r0, r1, r2, r3, r4, r5; 1322 int32 r6, r7, r8, r9, r10, r13; 1323 1324 ref_offset = inpitch - blkwidth; 1325 p_ref = in1 - 2; 1326 /* perform horizontal interpolation */ 1327 /* not word-aligned */ 1328 /* It is faster to read 1 byte at time to avoid calling CreateAlign */ 1329 /* if(((uint32)p_ref)&0x3) 1330 { 1331 CreateAlign(p_ref,inpitch,0,&tmp_in[0][0],blkwidth+8,blkheight); 1332 p_ref = &tmp_in[0][0]; 1333 ref_offset = 24-blkwidth; 1334 }*/ 1335 1336 p_tmp = (uint32*) & (tmp_res[0][0]); 1337 for (j = blkheight; j > 0; j--) 1338 { 1339 r13 = 0; 1340 tmp = (uint32)(p_ref + blkwidth); 1341 1342 //r0 = *((uint32*)p_ref); /* d,c,b,a */ 1343 //r1 = (r0>>8)&0xFF00FF; /* 0,d,0,b */ 1344 //r0 &= 0xFF00FF; /* 0,c,0,a */ 1345 /* It is faster to read 1 byte at a time */ 1346 r0 = p_ref[0]; 1347 r1 = p_ref[2]; 1348 r0 |= (r1 << 16); /* 0,c,0,a */ 1349 r1 = p_ref[1]; 1350 r2 = p_ref[3]; 1351 r1 |= (r2 << 16); /* 0,d,0,b */ 1352 1353 while ((uint32)p_ref < tmp) 1354 { 1355 //r2 = *((uint32*)(p_ref+=4));/* h,g,f,e */ 1356 //r3 = (r2>>8)&0xFF00FF; /* 0,h,0,f */ 1357 //r2 &= 0xFF00FF; /* 0,g,0,e */ 1358 /* It is faster to read 1 byte at a time */ 1359 r2 = *(p_ref += 4); 1360 r3 = p_ref[2]; 1361 r2 |= (r3 << 16); /* 0,g,0,e */ 1362 r3 = p_ref[1]; 1363 r4 = p_ref[3]; 1364 r3 |= (r4 << 16); /* 0,h,0,f */ 1365 1366 r4 = r0 + r3; /* c+h, a+f */ 1367 r5 = r0 + r1; /* c+d, a+b */ 1368 r6 = r2 + r3; /* g+h, e+f */ 1369 r5 >>= 16; 1370 r5 |= (r6 << 16); /* e+f, c+d */ 1371 r4 += r5 * 20; /* c+20*e+20*f+h, a+20*c+20*d+f */ 1372 r4 += 0x100010; /* +16, +16 */ 1373 r5 = r1 + r2; /* d+g, b+e */ 1374 r4 -= r5 * 5; /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */ 1375 r4 >>= 5; 1376 r13 |= r4; /* check clipping */ 1377 r4 &= 0xFF00FF; /* mask */ 1378 1379 r5 = p_ref[4]; /* i */ 1380 r6 = (r5 << 16); 1381 r5 = r6 | (r2 >> 16);/* 0,i,0,g */ 1382 r5 += r1; /* d+i, b+g */ /* r5 not free */ 1383 r1 >>= 16; 1384 r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */ 1385 r1 += r2; /* f+g, d+e */ 1386 r5 += 20 * r1; /* d+20f+20g+i, b+20d+20e+g */ 1387 r0 >>= 16; 1388 r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */ 1389 r0 += r3; /* e+h, c+f */ 1390 r5 += 0x100010; /* 16,16 */ 1391 r5 -= r0 * 5; /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */ 1392 r5 >>= 5; 1393 r13 |= r5; /* check clipping */ 1394 r5 &= 0xFF00FF; /* mask */ 1395 1396 r4 |= (r5 << 8); /* pack them together */ 1397 *p_tmp++ = r4; 1398 r1 = r3; 1399 r0 = r2; 1400 } 1401 p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */ 1402 p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */ 1403 1404 if (r13&0xFF000700) /* need clipping */ 1405 { 1406 /* move back to the beginning of the line */ 1407 p_ref -= (ref_offset + blkwidth); /* input */ 1408 p_tmp -= 6; /* intermediate output */ 1409 tmp = (uint32)(p_ref + blkwidth); 1410 while ((uint32)p_ref < tmp) 1411 { 1412 r0 = *p_ref++; 1413 r1 = *p_ref++; 1414 r2 = *p_ref++; 1415 r3 = *p_ref++; 1416 r4 = *p_ref++; 1417 /* first pixel */ 1418 r5 = *p_ref++; 1419 result = (r0 + r5); 1420 r0 = (r1 + r4); 1421 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 1422 r0 = (r2 + r3); 1423 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 1424 result = (result + 16) >> 5; 1425 CLIP_RESULT(result) 1426 pkres = result; 1427 /* second pixel */ 1428 r0 = *p_ref++; 1429 result = (r1 + r0); 1430 r1 = (r2 + r5); 1431 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 1432 r1 = (r3 + r4); 1433 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 1434 result = (result + 16) >> 5; 1435 CLIP_RESULT(result) 1436 pkres |= (result << 8); 1437 /* third pixel */ 1438 r1 = *p_ref++; 1439 result = (r2 + r1); 1440 r2 = (r3 + r0); 1441 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 1442 r2 = (r4 + r5); 1443 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 1444 result = (result + 16) >> 5; 1445 CLIP_RESULT(result) 1446 pkres |= (result << 16); 1447 /* fourth pixel */ 1448 r2 = *p_ref++; 1449 result = (r3 + r2); 1450 r3 = (r4 + r1); 1451 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 1452 r3 = (r5 + r0); 1453 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 1454 result = (result + 16) >> 5; 1455 CLIP_RESULT(result) 1456 pkres |= (result << 24); 1457 1458 *p_tmp++ = pkres; /* write 4 pixel */ 1459 p_ref -= 5; 1460 } 1461 p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */ 1462 p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */ 1463 } 1464 } 1465 1466 /* perform vertical interpolation */ 1467 /* not word-aligned */ 1468 if (((uint32)in2)&0x3) 1469 { 1470 eCreateAlign(in2, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5); 1471 in2 = &tmp_in[2][0]; 1472 inpitch = 24; 1473 } 1474 1475 p_cur = out; 1476 curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically up and one pixel right */ 1477 pkres = blkheight * inpitch; /* reuse it for limit */ 1478 1479 curr_offset += 3; 1480 1481 for (j = 0; j < blkwidth; j += 4, in2 += 4) 1482 { 1483 r13 = 0; 1484 p_ref = in2; 1485 p_tmp8 = &(tmp_res[0][j]); /* intermediate result */ 1486 p_tmp8 -= 24; /* compensate for the first offset */ 1487 p_cur -= outpitch; /* compensate for the first offset */ 1488 tmp = (uint32)(p_ref + pkres); /* limit */ 1489 while ((uint32)p_ref < tmp) /* the loop un-rolled */ 1490 { 1491 /* Read 1 byte at a time is too slow, too many read and pack ops, need to call CreateAlign */ 1492 /*p_ref8 = p_ref-(inpitch<<1); r0 = p_ref8[0]; r1 = p_ref8[2]; 1493 r0 |= (r1<<16); r6 = p_ref8[1]; r1 = p_ref8[3]; 1494 r6 |= (r1<<16); p_ref+=inpitch; */ 1495 r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */ 1496 p_ref += inpitch; 1497 r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */ 1498 r0 &= 0xFF00FF; 1499 1500 /*p_ref8 = p_ref+(inpitch<<1); 1501 r1 = p_ref8[0]; r7 = p_ref8[2]; r1 |= (r7<<16); 1502 r7 = p_ref8[1]; r2 = p_ref8[3]; r7 |= (r2<<16);*/ 1503 r1 = *((uint32*)(p_ref + (inpitch << 1))); /* r1, r7, ref[3] */ 1504 r7 = (r1 >> 8) & 0xFF00FF; 1505 r1 &= 0xFF00FF; 1506 1507 r0 += r1; 1508 r6 += r7; 1509 1510 /*r2 = p_ref[0]; r8 = p_ref[2]; r2 |= (r8<<16); 1511 r8 = p_ref[1]; r1 = p_ref[3]; r8 |= (r1<<16);*/ 1512 r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */ 1513 r8 = (r2 >> 8) & 0xFF00FF; 1514 r2 &= 0xFF00FF; 1515 1516 /*p_ref8 = p_ref-inpitch; r1 = p_ref8[0]; r7 = p_ref8[2]; 1517 r1 |= (r7<<16); r1 += r2; r7 = p_ref8[1]; 1518 r2 = p_ref8[3]; r7 |= (r2<<16);*/ 1519 r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */ 1520 r7 = (r1 >> 8) & 0xFF00FF; 1521 r1 &= 0xFF00FF; 1522 r1 += r2; 1523 1524 r7 += r8; 1525 1526 r0 += 20 * r1; 1527 r6 += 20 * r7; 1528 r0 += 0x100010; 1529 r6 += 0x100010; 1530 1531 /*p_ref8 = p_ref-(inpitch<<1); r2 = p_ref8[0]; r8 = p_ref8[2]; 1532 r2 |= (r8<<16); r8 = p_ref8[1]; r1 = p_ref8[3]; r8 |= (r1<<16);*/ 1533 r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */ 1534 r8 = (r2 >> 8) & 0xFF00FF; 1535 r2 &= 0xFF00FF; 1536 1537 /*p_ref8 = p_ref+inpitch; r1 = p_ref8[0]; r7 = p_ref8[2]; 1538 r1 |= (r7<<16); r1 += r2; r7 = p_ref8[1]; 1539 r2 = p_ref8[3]; r7 |= (r2<<16);*/ 1540 r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */ 1541 r7 = (r1 >> 8) & 0xFF00FF; 1542 r1 &= 0xFF00FF; 1543 r1 += r2; 1544 1545 r7 += r8; 1546 1547 r0 -= 5 * r1; 1548 r6 -= 5 * r7; 1549 1550 r0 >>= 5; 1551 r6 >>= 5; 1552 /* clip */ 1553 r13 |= r6; 1554 r13 |= r0; 1555 //CLIPPACK(r6,result) 1556 /* add with horizontal results */ 1557 r10 = *((uint32*)(p_tmp8 += 24)); 1558 r9 = (r10 >> 8) & 0xFF00FF; 1559 r10 &= 0xFF00FF; 1560 1561 r0 += r10; 1562 r0 += 0x10001; 1563 r0 = (r0 >> 1) & 0xFF00FF; /* mask to 8 bytes */ 1564 1565 r6 += r9; 1566 r6 += 0x10001; 1567 r6 = (r6 >> 1) & 0xFF00FF; /* mask to 8 bytes */ 1568 1569 r0 |= (r6 << 8); /* pack it back */ 1570 *((uint32*)(p_cur += outpitch)) = r0; 1571 } 1572 p_cur += curr_offset; /* offset to the next pixel */ 1573 if (r13 & 0xFF000700) /* this column need clipping */ 1574 { 1575 p_cur -= 4; 1576 for (i = 0; i < 4; i++) 1577 { 1578 p_ref = in2 + i; 1579 p_tmp8 = &(tmp_res[0][j+i]); /* intermediate result */ 1580 p_tmp8 -= 24; /* compensate for the first offset */ 1581 p_cur -= outpitch; /* compensate for the first offset */ 1582 tmp = (uint32)(p_ref + pkres); /* limit */ 1583 while ((uint32)p_ref < tmp) /* the loop un-rolled */ 1584 { 1585 r0 = *(p_ref - (inpitch << 1)); 1586 r1 = *(p_ref - inpitch); 1587 r2 = *p_ref; 1588 r3 = *(p_ref += inpitch); /* modify pointer before loading */ 1589 r4 = *(p_ref += inpitch); 1590 /* first pixel */ 1591 r5 = *(p_ref += inpitch); 1592 result = (r0 + r5); 1593 r0 = (r1 + r4); 1594 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 1595 r0 = (r2 + r3); 1596 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 1597 result = (result + 16) >> 5; 1598 CLIP_RESULT(result) 1599 tmp_result = *(p_tmp8 += 24); /* modify pointer before loading */ 1600 result = (result + tmp_result + 1); /* no clip */ 1601 result = (result >> 1); 1602 *(p_cur += outpitch) = result; 1603 /* second pixel */ 1604 r0 = *(p_ref += inpitch); 1605 result = (r1 + r0); 1606 r1 = (r2 + r5); 1607 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 1608 r1 = (r3 + r4); 1609 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 1610 result = (result + 16) >> 5; 1611 CLIP_RESULT(result) 1612 tmp_result = *(p_tmp8 += 24); /* intermediate result */ 1613 result = (result + tmp_result + 1); /* no clip */ 1614 result = (result >> 1); 1615 *(p_cur += outpitch) = result; 1616 /* third pixel */ 1617 r1 = *(p_ref += inpitch); 1618 result = (r2 + r1); 1619 r2 = (r3 + r0); 1620 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 1621 r2 = (r4 + r5); 1622 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 1623 result = (result + 16) >> 5; 1624 CLIP_RESULT(result) 1625 tmp_result = *(p_tmp8 += 24); /* intermediate result */ 1626 result = (result + tmp_result + 1); /* no clip */ 1627 result = (result >> 1); 1628 *(p_cur += outpitch) = result; 1629 /* fourth pixel */ 1630 r2 = *(p_ref += inpitch); 1631 result = (r3 + r2); 1632 r3 = (r4 + r1); 1633 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 1634 r3 = (r5 + r0); 1635 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 1636 result = (result + 16) >> 5; 1637 CLIP_RESULT(result) 1638 tmp_result = *(p_tmp8 += 24); /* intermediate result */ 1639 result = (result + tmp_result + 1); /* no clip */ 1640 result = (result >> 1); 1641 *(p_cur += outpitch) = result; 1642 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */ 1643 } 1644 p_cur += (curr_offset - 3); 1645 } 1646 } 1647 } 1648 1649 return ; 1650 } 1651 1652 /* position G */ 1653 void eFullPelMC(uint8 *in, int inpitch, uint8 *out, int outpitch, 1654 int blkwidth, int blkheight) 1655 { 1656 int i, j; 1657 int offset_in = inpitch - blkwidth; 1658 int offset_out = outpitch - blkwidth; 1659 uint32 temp; 1660 uint8 byte; 1661 1662 if (((uint32)in)&3) 1663 { 1664 for (j = blkheight; j > 0; j--) 1665 { 1666 for (i = blkwidth; i > 0; i -= 4) 1667 { 1668 temp = *in++; 1669 byte = *in++; 1670 temp |= (byte << 8); 1671 byte = *in++; 1672 temp |= (byte << 16); 1673 byte = *in++; 1674 temp |= (byte << 24); 1675 1676 *((uint32*)out) = temp; /* write 4 bytes */ 1677 out += 4; 1678 } 1679 out += offset_out; 1680 in += offset_in; 1681 } 1682 } 1683 else 1684 { 1685 for (j = blkheight; j > 0; j--) 1686 { 1687 for (i = blkwidth; i > 0; i -= 4) 1688 { 1689 temp = *((uint32*)in); 1690 *((uint32*)out) = temp; 1691 in += 4; 1692 out += 4; 1693 } 1694 out += offset_out; 1695 in += offset_in; 1696 } 1697 } 1698 return ; 1699 } 1700 1701 void ePadChroma(uint8 *ref, int picwidth, int picheight, int picpitch, int x_pos, int y_pos) 1702 { 1703 int pad_height; 1704 int pad_width; 1705 uint8 *start; 1706 uint32 word1, word2, word3; 1707 int offset, j; 1708 1709 1710 pad_height = 8 + ((y_pos & 7) ? 1 : 0); 1711 pad_width = 8 + ((x_pos & 7) ? 1 : 0); 1712 1713 y_pos >>= 3; 1714 x_pos >>= 3; 1715 // pad vertical first 1716 if (y_pos < 0) // need to pad up 1717 { 1718 if (x_pos < -8) start = ref - 8; 1719 else if (x_pos + pad_width > picwidth + 7) start = ref + picwidth + 7 - pad_width; 1720 else start = ref + x_pos; 1721 1722 /* word-align start */ 1723 offset = (uint32)start & 0x3; 1724 if (offset) start -= offset; 1725 1726 word1 = *((uint32*)start); 1727 word2 = *((uint32*)(start + 4)); 1728 word3 = *((uint32*)(start + 8)); 1729 1730 /* pad up N rows */ 1731 j = -y_pos; 1732 if (j > 8) j = 8; 1733 while (j--) 1734 { 1735 *((uint32*)(start -= picpitch)) = word1; 1736 *((uint32*)(start + 4)) = word2; 1737 *((uint32*)(start + 8)) = word3; 1738 } 1739 1740 } 1741 else if (y_pos + pad_height >= picheight) /* pad down */ 1742 { 1743 if (x_pos < -8) start = ref + picpitch * (picheight - 1) - 8; 1744 else if (x_pos + pad_width > picwidth + 7) start = ref + picpitch * (picheight - 1) + 1745 picwidth + 7 - pad_width; 1746 else start = ref + picpitch * (picheight - 1) + x_pos; 1747 1748 /* word-align start */ 1749 offset = (uint32)start & 0x3; 1750 if (offset) start -= offset; 1751 1752 word1 = *((uint32*)start); 1753 word2 = *((uint32*)(start + 4)); 1754 word3 = *((uint32*)(start + 8)); 1755 1756 /* pad down N rows */ 1757 j = y_pos + pad_height - picheight; 1758 if (j > 8) j = 8; 1759 while (j--) 1760 { 1761 *((uint32*)(start += picpitch)) = word1; 1762 *((uint32*)(start + 4)) = word2; 1763 *((uint32*)(start + 8)) = word3; 1764 } 1765 } 1766 1767 /* now pad horizontal */ 1768 if (x_pos < 0) // pad left 1769 { 1770 if (y_pos < -8) start = ref - (picpitch << 3); 1771 else if (y_pos + pad_height > picheight + 7) start = ref + (picheight + 7 - pad_height) * picpitch; 1772 else start = ref + y_pos * picpitch; 1773 1774 // now pad left 8 pixels for pad_height rows */ 1775 j = pad_height; 1776 start -= picpitch; 1777 while (j--) 1778 { 1779 word1 = *(start += picpitch); 1780 word1 |= (word1 << 8); 1781 word1 |= (word1 << 16); 1782 *((uint32*)(start - 8)) = word1; 1783 *((uint32*)(start - 4)) = word1; 1784 } 1785 } 1786 else if (x_pos + pad_width >= picwidth) /* pad right */ 1787 { 1788 if (y_pos < -8) start = ref - (picpitch << 3) + picwidth - 1; 1789 else if (y_pos + pad_height > picheight + 7) start = ref + (picheight + 7 - pad_height) * picpitch + picwidth - 1; 1790 else start = ref + y_pos * picpitch + picwidth - 1; 1791 1792 // now pad right 8 pixels for pad_height rows */ 1793 j = pad_height; 1794 start -= picpitch; 1795 while (j--) 1796 { 1797 word1 = *(start += picpitch); 1798 word1 |= (word1 << 8); 1799 word1 |= (word1 << 16); 1800 *((uint32*)(start + 1)) = word1; 1801 *((uint32*)(start + 5)) = word1; 1802 } 1803 } 1804 1805 return ; 1806 } 1807 1808 1809 void eChromaMotionComp(uint8 *ref, int picwidth, int picheight, 1810 int x_pos, int y_pos, 1811 uint8 *pred, int picpitch, 1812 int blkwidth, int blkheight) 1813 { 1814 int dx, dy; 1815 int offset_dx, offset_dy; 1816 int index; 1817 1818 ePadChroma(ref, picwidth, picheight, picpitch, x_pos, y_pos); 1819 1820 dx = x_pos & 7; 1821 dy = y_pos & 7; 1822 offset_dx = (dx + 7) >> 3; 1823 offset_dy = (dy + 7) >> 3; 1824 x_pos = x_pos >> 3; /* round it to full-pel resolution */ 1825 y_pos = y_pos >> 3; 1826 1827 ref += y_pos * picpitch + x_pos; 1828 1829 index = offset_dx + (offset_dy << 1) + ((blkwidth << 1) & 0x7); 1830 1831 (*(eChromaMC_SIMD[index]))(ref, picpitch , dx, dy, pred, picpitch, blkwidth, blkheight); 1832 return ; 1833 } 1834 1835 1836 /* SIMD routines, unroll the loops in vertical direction, decreasing loops (things to be done) */ 1837 void eChromaDiagonalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, 1838 uint8 *pOut, int predPitch, int blkwidth, int blkheight) 1839 { 1840 int32 r0, r1, r2, r3, result0, result1; 1841 uint8 temp[288]; 1842 uint8 *ref, *out; 1843 int i, j; 1844 int dx_8 = 8 - dx; 1845 int dy_8 = 8 - dy; 1846 1847 /* horizontal first */ 1848 out = temp; 1849 for (i = 0; i < blkheight + 1; i++) 1850 { 1851 ref = pRef; 1852 r0 = ref[0]; 1853 for (j = 0; j < blkwidth; j += 4) 1854 { 1855 r0 |= (ref[2] << 16); 1856 result0 = dx_8 * r0; 1857 1858 r1 = ref[1] | (ref[3] << 16); 1859 result0 += dx * r1; 1860 *(int32 *)out = result0; 1861 1862 result0 = dx_8 * r1; 1863 1864 r2 = ref[4]; 1865 r0 = r0 >> 16; 1866 r1 = r0 | (r2 << 16); 1867 result0 += dx * r1; 1868 *(int32 *)(out + 16) = result0; 1869 1870 ref += 4; 1871 out += 4; 1872 r0 = r2; 1873 } 1874 pRef += srcPitch; 1875 out += (32 - blkwidth); 1876 } 1877 1878 // pRef -= srcPitch*(blkheight+1); 1879 ref = temp; 1880 1881 for (j = 0; j < blkwidth; j += 4) 1882 { 1883 r0 = *(int32 *)ref; 1884 r1 = *(int32 *)(ref + 16); 1885 ref += 32; 1886 out = pOut; 1887 for (i = 0; i < (blkheight >> 1); i++) 1888 { 1889 result0 = dy_8 * r0 + 0x00200020; 1890 r2 = *(int32 *)ref; 1891 result0 += dy * r2; 1892 result0 >>= 6; 1893 result0 &= 0x00FF00FF; 1894 r0 = r2; 1895 1896 result1 = dy_8 * r1 + 0x00200020; 1897 r3 = *(int32 *)(ref + 16); 1898 result1 += dy * r3; 1899 result1 >>= 6; 1900 result1 &= 0x00FF00FF; 1901 r1 = r3; 1902 *(int32 *)out = result0 | (result1 << 8); 1903 out += predPitch; 1904 ref += 32; 1905 1906 result0 = dy_8 * r0 + 0x00200020; 1907 r2 = *(int32 *)ref; 1908 result0 += dy * r2; 1909 result0 >>= 6; 1910 result0 &= 0x00FF00FF; 1911 r0 = r2; 1912 1913 result1 = dy_8 * r1 + 0x00200020; 1914 r3 = *(int32 *)(ref + 16); 1915 result1 += dy * r3; 1916 result1 >>= 6; 1917 result1 &= 0x00FF00FF; 1918 r1 = r3; 1919 *(int32 *)out = result0 | (result1 << 8); 1920 out += predPitch; 1921 ref += 32; 1922 } 1923 pOut += 4; 1924 ref = temp + 4; /* since it can only iterate twice max */ 1925 } 1926 return; 1927 } 1928 1929 void eChromaHorizontalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, 1930 uint8 *pOut, int predPitch, int blkwidth, int blkheight) 1931 { 1932 (void)(dy); 1933 1934 int32 r0, r1, r2, result0, result1; 1935 uint8 *ref, *out; 1936 int i, j; 1937 int dx_8 = 8 - dx; 1938 1939 /* horizontal first */ 1940 for (i = 0; i < blkheight; i++) 1941 { 1942 ref = pRef; 1943 out = pOut; 1944 1945 r0 = ref[0]; 1946 for (j = 0; j < blkwidth; j += 4) 1947 { 1948 r0 |= (ref[2] << 16); 1949 result0 = dx_8 * r0 + 0x00040004; 1950 1951 r1 = ref[1] | (ref[3] << 16); 1952 result0 += dx * r1; 1953 result0 >>= 3; 1954 result0 &= 0x00FF00FF; 1955 1956 result1 = dx_8 * r1 + 0x00040004; 1957 1958 r2 = ref[4]; 1959 r0 = r0 >> 16; 1960 r1 = r0 | (r2 << 16); 1961 result1 += dx * r1; 1962 result1 >>= 3; 1963 result1 &= 0x00FF00FF; 1964 1965 *(int32 *)out = result0 | (result1 << 8); 1966 1967 ref += 4; 1968 out += 4; 1969 r0 = r2; 1970 } 1971 1972 pRef += srcPitch; 1973 pOut += predPitch; 1974 } 1975 return; 1976 } 1977 1978 void eChromaVerticalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, 1979 uint8 *pOut, int predPitch, int blkwidth, int blkheight) 1980 { 1981 (void)(dx); 1982 1983 int32 r0, r1, r2, r3, result0, result1; 1984 int i, j; 1985 uint8 *ref, *out; 1986 int dy_8 = 8 - dy; 1987 /* vertical first */ 1988 for (i = 0; i < blkwidth; i += 4) 1989 { 1990 ref = pRef; 1991 out = pOut; 1992 1993 r0 = ref[0] | (ref[2] << 16); 1994 r1 = ref[1] | (ref[3] << 16); 1995 ref += srcPitch; 1996 for (j = 0; j < blkheight; j++) 1997 { 1998 result0 = dy_8 * r0 + 0x00040004; 1999 r2 = ref[0] | (ref[2] << 16); 2000 result0 += dy * r2; 2001 result0 >>= 3; 2002 result0 &= 0x00FF00FF; 2003 r0 = r2; 2004 2005 result1 = dy_8 * r1 + 0x00040004; 2006 r3 = ref[1] | (ref[3] << 16); 2007 result1 += dy * r3; 2008 result1 >>= 3; 2009 result1 &= 0x00FF00FF; 2010 r1 = r3; 2011 *(int32 *)out = result0 | (result1 << 8); 2012 ref += srcPitch; 2013 out += predPitch; 2014 } 2015 pOut += 4; 2016 pRef += 4; 2017 } 2018 return; 2019 } 2020 2021 void eChromaDiagonalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, 2022 uint8 *pOut, int predPitch, int blkwidth, int blkheight) 2023 { 2024 (void)(blkwidth); 2025 2026 int32 r0, r1, temp0, temp1, result; 2027 int32 temp[9]; 2028 int32 *out; 2029 int i, r_temp; 2030 int dy_8 = 8 - dy; 2031 2032 /* horizontal first */ 2033 out = temp; 2034 for (i = 0; i < blkheight + 1; i++) 2035 { 2036 r_temp = pRef[1]; 2037 temp0 = (pRef[0] << 3) + dx * (r_temp - pRef[0]); 2038 temp1 = (r_temp << 3) + dx * (pRef[2] - r_temp); 2039 r0 = temp0 | (temp1 << 16); 2040 *out++ = r0; 2041 pRef += srcPitch; 2042 } 2043 2044 pRef -= srcPitch * (blkheight + 1); 2045 2046 out = temp; 2047 2048 r0 = *out++; 2049 2050 for (i = 0; i < blkheight; i++) 2051 { 2052 result = dy_8 * r0 + 0x00200020; 2053 r1 = *out++; 2054 result += dy * r1; 2055 result >>= 6; 2056 result &= 0x00FF00FF; 2057 *(int16 *)pOut = (result >> 8) | (result & 0xFF); 2058 r0 = r1; 2059 pOut += predPitch; 2060 } 2061 return; 2062 } 2063 2064 void eChromaHorizontalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, 2065 uint8 *pOut, int predPitch, int blkwidth, int blkheight) 2066 { 2067 (void)(dy); 2068 (void)(blkwidth); 2069 2070 int i, temp, temp0, temp1; 2071 2072 /* horizontal first */ 2073 for (i = 0; i < blkheight; i++) 2074 { 2075 temp = pRef[1]; 2076 temp0 = ((pRef[0] << 3) + dx * (temp - pRef[0]) + 4) >> 3; 2077 temp1 = ((temp << 3) + dx * (pRef[2] - temp) + 4) >> 3; 2078 2079 *(int16 *)pOut = temp0 | (temp1 << 8); 2080 pRef += srcPitch; 2081 pOut += predPitch; 2082 2083 } 2084 return; 2085 } 2086 void eChromaVerticalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, 2087 uint8 *pOut, int predPitch, int blkwidth, int blkheight) 2088 { 2089 (void)(dx); 2090 (void)(blkwidth); 2091 2092 int32 r0, r1, result; 2093 int i; 2094 int dy_8 = 8 - dy; 2095 r0 = pRef[0] | (pRef[1] << 16); 2096 pRef += srcPitch; 2097 for (i = 0; i < blkheight; i++) 2098 { 2099 result = dy_8 * r0 + 0x00040004; 2100 r1 = pRef[0] | (pRef[1] << 16); 2101 result += dy * r1; 2102 result >>= 3; 2103 result &= 0x00FF00FF; 2104 *(int16 *)pOut = (result >> 8) | (result & 0xFF); 2105 r0 = r1; 2106 pRef += srcPitch; 2107 pOut += predPitch; 2108 } 2109 return; 2110 } 2111 2112 void eChromaFullMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, 2113 uint8 *pOut, int predPitch, int blkwidth, int blkheight) 2114 { 2115 (void)(dx); 2116 (void)(dy); 2117 2118 int i, j; 2119 int offset_in = srcPitch - blkwidth; 2120 int offset_out = predPitch - blkwidth; 2121 uint16 temp; 2122 uint8 byte; 2123 2124 if (((uint32)pRef)&1) 2125 { 2126 for (j = blkheight; j > 0; j--) 2127 { 2128 for (i = blkwidth; i > 0; i -= 2) 2129 { 2130 temp = *pRef++; 2131 byte = *pRef++; 2132 temp |= (byte << 8); 2133 *((uint16*)pOut) = temp; /* write 2 bytes */ 2134 pOut += 2; 2135 } 2136 pOut += offset_out; 2137 pRef += offset_in; 2138 } 2139 } 2140 else 2141 { 2142 for (j = blkheight; j > 0; j--) 2143 { 2144 for (i = blkwidth; i > 0; i -= 2) 2145 { 2146 temp = *((uint16*)pRef); 2147 *((uint16*)pOut) = temp; 2148 pRef += 2; 2149 pOut += 2; 2150 } 2151 pOut += offset_out; 2152 pRef += offset_in; 2153 } 2154 } 2155 return ; 2156 } 2157