1 /* ------------------------------------------------------------------ 2 * Copyright (C) 1998-2009 PacketVideo 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 13 * express or implied. 14 * See the License for the specific language governing permissions 15 * and limitations under the License. 16 * ------------------------------------------------------------------- 17 */ 18 #include "avcenc_lib.h" 19 #include "avcenc_int.h" 20 21 22 #define CLIP_RESULT(x) if((uint)x > 0xFF){ \ 23 x = 0xFF & (~(x>>31));} 24 25 /* (blkwidth << 2) + (dy << 1) + dx */ 26 static void (*const eChromaMC_SIMD[8])(uint8 *, int , int , int , uint8 *, int, int , int) = 27 { 28 &eChromaFullMC_SIMD, 29 &eChromaHorizontalMC_SIMD, 30 &eChromaVerticalMC_SIMD, 31 &eChromaDiagonalMC_SIMD, 32 &eChromaFullMC_SIMD, 33 &eChromaHorizontalMC2_SIMD, 34 &eChromaVerticalMC2_SIMD, 35 &eChromaDiagonalMC2_SIMD 36 }; 37 /* Perform motion prediction and compensation with residue if exist. */ 38 void AVCMBMotionComp(AVCEncObject *encvid, AVCCommonObj *video) 39 { 40 (void)(encvid); 41 42 AVCMacroblock *currMB = video->currMB; 43 AVCPictureData *currPic = video->currPic; 44 int mbPartIdx, subMbPartIdx; 45 int ref_idx; 46 int offset_MbPart_indx = 0; 47 int16 *mv; 48 uint32 x_pos, y_pos; 49 uint8 *curL, *curCb, *curCr; 50 uint8 *ref_l, *ref_Cb, *ref_Cr; 51 uint8 *predBlock, *predCb, *predCr; 52 int block_x, block_y, offset_x, offset_y, offsetP, offset; 53 int x_position = (video->mb_x << 4); 54 int y_position = (video->mb_y << 4); 55 int MbHeight, MbWidth, mbPartIdx_X, mbPartIdx_Y, offset_indx; 56 int picWidth = currPic->width; 57 int picPitch = currPic->pitch; 58 int picHeight = currPic->height; 59 uint32 tmp_word; 60 61 tmp_word = y_position * picPitch; 62 curL = currPic->Sl + tmp_word + x_position; 63 offset = (tmp_word >> 2) + (x_position >> 1); 64 curCb = currPic->Scb + offset; 65 curCr = currPic->Scr + offset; 66 67 predBlock = curL; 68 predCb = curCb; 69 predCr = curCr; 70 71 GetMotionVectorPredictor(video, 1); 72 73 for (mbPartIdx = 0; mbPartIdx < currMB->NumMbPart; mbPartIdx++) 74 { 75 MbHeight = currMB->SubMbPartHeight[mbPartIdx]; 76 MbWidth = currMB->SubMbPartWidth[mbPartIdx]; 77 mbPartIdx_X = ((mbPartIdx + offset_MbPart_indx) & 1); 78 mbPartIdx_Y = (mbPartIdx + offset_MbPart_indx) >> 1; 79 ref_idx = currMB->ref_idx_L0[(mbPartIdx_Y << 1) + mbPartIdx_X]; 80 offset_indx = 0; 81 82 ref_l = video->RefPicList0[ref_idx]->Sl; 83 ref_Cb = video->RefPicList0[ref_idx]->Scb; 84 ref_Cr = video->RefPicList0[ref_idx]->Scr; 85 86 for (subMbPartIdx = 0; subMbPartIdx < currMB->NumSubMbPart[mbPartIdx]; subMbPartIdx++) 87 { 88 block_x = (mbPartIdx_X << 1) + ((subMbPartIdx + offset_indx) & 1); 89 block_y = (mbPartIdx_Y << 1) + (((subMbPartIdx + offset_indx) >> 1) & 1); 90 mv = (int16*)(currMB->mvL0 + block_x + (block_y << 2)); 91 offset_x = x_position + (block_x << 2); 92 offset_y = y_position + (block_y << 2); 93 x_pos = (offset_x << 2) + *mv++; /*quarter pel */ 94 y_pos = (offset_y << 2) + *mv; /*quarter pel */ 95 96 //offset = offset_y * currPic->width; 97 //offsetC = (offset >> 2) + (offset_x >> 1); 98 offsetP = (block_y << 2) * picPitch + (block_x << 2); 99 eLumaMotionComp(ref_l, picPitch, picHeight, x_pos, y_pos, 100 /*comp_Sl + offset + offset_x,*/ 101 predBlock + offsetP, picPitch, MbWidth, MbHeight); 102 103 offsetP = (block_y * picWidth) + (block_x << 1); 104 eChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos, 105 /*comp_Scb + offsetC,*/ 106 predCb + offsetP, picPitch >> 1, MbWidth >> 1, MbHeight >> 1); 107 eChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos, 108 /*comp_Scr + offsetC,*/ 109 predCr + offsetP, picPitch >> 1, MbWidth >> 1, MbHeight >> 1); 110 111 offset_indx = currMB->SubMbPartWidth[mbPartIdx] >> 3; 112 } 113 offset_MbPart_indx = currMB->MbPartWidth >> 4; 114 } 115 116 return ; 117 } 118 119 120 /* preform the actual motion comp here */ 121 void eLumaMotionComp(uint8 *ref, int picpitch, int picheight, 122 int x_pos, int y_pos, 123 uint8 *pred, int pred_pitch, 124 int blkwidth, int blkheight) 125 { 126 (void)(picheight); 127 128 int dx, dy; 129 int temp2[21][21]; /* for intermediate results */ 130 uint8 *ref2; 131 132 dx = x_pos & 3; 133 dy = y_pos & 3; 134 x_pos = x_pos >> 2; /* round it to full-pel resolution */ 135 y_pos = y_pos >> 2; 136 137 /* perform actual motion compensation */ 138 if (dx == 0 && dy == 0) 139 { /* fullpel position *//* G */ 140 141 ref += y_pos * picpitch + x_pos; 142 143 eFullPelMC(ref, picpitch, pred, pred_pitch, blkwidth, blkheight); 144 145 } /* other positions */ 146 else if (dy == 0) 147 { /* no vertical interpolation *//* a,b,c*/ 148 149 ref += y_pos * picpitch + x_pos; 150 151 eHorzInterp1MC(ref, picpitch, pred, pred_pitch, blkwidth, blkheight, dx); 152 } 153 else if (dx == 0) 154 { /*no horizontal interpolation *//* d,h,n */ 155 156 ref += y_pos * picpitch + x_pos; 157 158 eVertInterp1MC(ref, picpitch, pred, pred_pitch, blkwidth, blkheight, dy); 159 } 160 else if (dy == 2) 161 { /* horizontal cross *//* i, j, k */ 162 163 ref += y_pos * picpitch + x_pos - 2; /* move to the left 2 pixels */ 164 165 eVertInterp2MC(ref, picpitch, &temp2[0][0], 21, blkwidth + 5, blkheight); 166 167 eHorzInterp2MC(&temp2[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx); 168 } 169 else if (dx == 2) 170 { /* vertical cross */ /* f,q */ 171 172 ref += (y_pos - 2) * picpitch + x_pos; /* move to up 2 lines */ 173 174 eHorzInterp3MC(ref, picpitch, &temp2[0][0], 21, blkwidth, blkheight + 5); 175 eVertInterp3MC(&temp2[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy); 176 } 177 else 178 { /* diagonal *//* e,g,p,r */ 179 180 ref2 = ref + (y_pos + (dy / 2)) * picpitch + x_pos; 181 182 ref += (y_pos * picpitch) + x_pos + (dx / 2); 183 184 eDiagonalInterpMC(ref2, ref, picpitch, pred, pred_pitch, blkwidth, blkheight); 185 } 186 187 return ; 188 } 189 190 void eCreateAlign(uint8 *ref, int picpitch, int y_pos, 191 uint8 *out, int blkwidth, int blkheight) 192 { 193 int i, j; 194 int offset, out_offset; 195 uint32 prev_pix, result, pix1, pix2, pix4; 196 197 ref += y_pos * picpitch;// + x_pos; 198 out_offset = 24 - blkwidth; 199 200 //switch(x_pos&0x3){ 201 switch (((intptr_t)ref)&0x3) 202 { 203 case 1: 204 offset = picpitch - blkwidth - 3; 205 for (j = 0; j < blkheight; j++) 206 { 207 pix1 = *ref++; 208 pix2 = *((uint16*)ref); 209 ref += 2; 210 result = (pix2 << 8) | pix1; 211 212 for (i = 3; i < blkwidth; i += 4) 213 { 214 pix4 = *((uint32*)ref); 215 ref += 4; 216 prev_pix = (pix4 << 24) & 0xFF000000; /* mask out byte belong to previous word */ 217 result |= prev_pix; 218 *((uint32*)out) = result; /* write 4 bytes */ 219 out += 4; 220 result = pix4 >> 8; /* for the next loop */ 221 } 222 ref += offset; 223 out += out_offset; 224 } 225 break; 226 case 2: 227 offset = picpitch - blkwidth - 2; 228 for (j = 0; j < blkheight; j++) 229 { 230 result = *((uint16*)ref); 231 ref += 2; 232 for (i = 2; i < blkwidth; i += 4) 233 { 234 pix4 = *((uint32*)ref); 235 ref += 4; 236 prev_pix = (pix4 << 16) & 0xFFFF0000; /* mask out byte belong to previous word */ 237 result |= prev_pix; 238 *((uint32*)out) = result; /* write 4 bytes */ 239 out += 4; 240 result = pix4 >> 16; /* for the next loop */ 241 } 242 ref += offset; 243 out += out_offset; 244 } 245 break; 246 case 3: 247 offset = picpitch - blkwidth - 1; 248 for (j = 0; j < blkheight; j++) 249 { 250 result = *ref++; 251 for (i = 1; i < blkwidth; i += 4) 252 { 253 pix4 = *((uint32*)ref); 254 ref += 4; 255 prev_pix = (pix4 << 8) & 0xFFFFFF00; /* mask out byte belong to previous word */ 256 result |= prev_pix; 257 *((uint32*)out) = result; /* write 4 bytes */ 258 out += 4; 259 result = pix4 >> 24; /* for the next loop */ 260 } 261 ref += offset; 262 out += out_offset; 263 } 264 break; 265 } 266 } 267 268 void eHorzInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch, 269 int blkwidth, int blkheight, int dx) 270 { 271 uint8 *p_ref, *tmp; 272 uint32 *p_cur; 273 uint32 pkres; 274 int result, curr_offset, ref_offset; 275 int j; 276 int32 r0, r1, r2, r3, r4, r5; 277 int32 r13, r6; 278 279 p_cur = (uint32*)out; /* assume it's word aligned */ 280 curr_offset = (outpitch - blkwidth) >> 2; 281 p_ref = in; 282 ref_offset = inpitch - blkwidth; 283 284 if (dx&1) 285 { 286 dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */ 287 p_ref -= 2; 288 r13 = 0; 289 for (j = blkheight; j > 0; j--) 290 { 291 tmp = p_ref + blkwidth; 292 r0 = p_ref[0]; 293 r1 = p_ref[2]; 294 r0 |= (r1 << 16); /* 0,c,0,a */ 295 r1 = p_ref[1]; 296 r2 = p_ref[3]; 297 r1 |= (r2 << 16); /* 0,d,0,b */ 298 while (p_ref < tmp) 299 { 300 r2 = *(p_ref += 4); /* move pointer to e */ 301 r3 = p_ref[2]; 302 r2 |= (r3 << 16); /* 0,g,0,e */ 303 r3 = p_ref[1]; 304 r4 = p_ref[3]; 305 r3 |= (r4 << 16); /* 0,h,0,f */ 306 307 r4 = r0 + r3; /* c+h, a+f */ 308 r5 = r0 + r1; /* c+d, a+b */ 309 r6 = r2 + r3; /* g+h, e+f */ 310 r5 >>= 16; 311 r5 |= (r6 << 16); /* e+f, c+d */ 312 r4 += r5 * 20; /* c+20*e+20*f+h, a+20*c+20*d+f */ 313 r4 += 0x100010; /* +16, +16 */ 314 r5 = r1 + r2; /* d+g, b+e */ 315 r4 -= r5 * 5; /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */ 316 r4 >>= 5; 317 r13 |= r4; /* check clipping */ 318 319 r5 = p_ref[dx+2]; 320 r6 = p_ref[dx+4]; 321 r5 |= (r6 << 16); 322 r4 += r5; 323 r4 += 0x10001; 324 r4 = (r4 >> 1) & 0xFF00FF; 325 326 r5 = p_ref[4]; /* i */ 327 r6 = (r5 << 16); 328 r5 = r6 | (r2 >> 16);/* 0,i,0,g */ 329 r5 += r1; /* d+i, b+g */ /* r5 not free */ 330 r1 >>= 16; 331 r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */ 332 r1 += r2; /* f+g, d+e */ 333 r5 += 20 * r1; /* d+20f+20g+i, b+20d+20e+g */ 334 r0 >>= 16; 335 r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */ 336 r0 += r3; /* e+h, c+f */ 337 r5 += 0x100010; /* 16,16 */ 338 r5 -= r0 * 5; /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */ 339 r5 >>= 5; 340 r13 |= r5; /* check clipping */ 341 342 r0 = p_ref[dx+3]; 343 r1 = p_ref[dx+5]; 344 r0 |= (r1 << 16); 345 r5 += r0; 346 r5 += 0x10001; 347 r5 = (r5 >> 1) & 0xFF00FF; 348 349 r4 |= (r5 << 8); /* pack them together */ 350 *p_cur++ = r4; 351 r1 = r3; 352 r0 = r2; 353 } 354 p_cur += curr_offset; /* move to the next line */ 355 p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */ 356 357 if (r13&0xFF000700) /* need clipping */ 358 { 359 /* move back to the beginning of the line */ 360 p_ref -= (ref_offset + blkwidth); /* input */ 361 p_cur -= (outpitch >> 2); 362 363 tmp = p_ref + blkwidth; 364 for (; p_ref < tmp;) 365 { 366 367 r0 = *p_ref++; 368 r1 = *p_ref++; 369 r2 = *p_ref++; 370 r3 = *p_ref++; 371 r4 = *p_ref++; 372 /* first pixel */ 373 r5 = *p_ref++; 374 result = (r0 + r5); 375 r0 = (r1 + r4); 376 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 377 r0 = (r2 + r3); 378 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 379 result = (result + 16) >> 5; 380 CLIP_RESULT(result) 381 /* 3/4 pel, no need to clip */ 382 result = (result + p_ref[dx] + 1); 383 pkres = (result >> 1) ; 384 /* second pixel */ 385 r0 = *p_ref++; 386 result = (r1 + r0); 387 r1 = (r2 + r5); 388 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 389 r1 = (r3 + r4); 390 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 391 result = (result + 16) >> 5; 392 CLIP_RESULT(result) 393 /* 3/4 pel, no need to clip */ 394 result = (result + p_ref[dx] + 1); 395 result = (result >> 1); 396 pkres |= (result << 8); 397 /* third pixel */ 398 r1 = *p_ref++; 399 result = (r2 + r1); 400 r2 = (r3 + r0); 401 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 402 r2 = (r4 + r5); 403 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 404 result = (result + 16) >> 5; 405 CLIP_RESULT(result) 406 /* 3/4 pel, no need to clip */ 407 result = (result + p_ref[dx] + 1); 408 result = (result >> 1); 409 pkres |= (result << 16); 410 /* fourth pixel */ 411 r2 = *p_ref++; 412 result = (r3 + r2); 413 r3 = (r4 + r1); 414 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 415 r3 = (r5 + r0); 416 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 417 result = (result + 16) >> 5; 418 CLIP_RESULT(result) 419 /* 3/4 pel, no need to clip */ 420 result = (result + p_ref[dx] + 1); 421 result = (result >> 1); 422 pkres |= (result << 24); 423 *p_cur++ = pkres; /* write 4 pixels */ 424 p_ref -= 5; /* offset back to the middle of filter */ 425 } 426 p_cur += curr_offset; /* move to the next line */ 427 p_ref += ref_offset; /* move to the next line */ 428 } 429 } 430 } 431 else 432 { 433 p_ref -= 2; 434 r13 = 0; 435 for (j = blkheight; j > 0; j--) 436 { 437 tmp = p_ref + blkwidth; 438 r0 = p_ref[0]; 439 r1 = p_ref[2]; 440 r0 |= (r1 << 16); /* 0,c,0,a */ 441 r1 = p_ref[1]; 442 r2 = p_ref[3]; 443 r1 |= (r2 << 16); /* 0,d,0,b */ 444 while (p_ref < tmp) 445 { 446 r2 = *(p_ref += 4); /* move pointer to e */ 447 r3 = p_ref[2]; 448 r2 |= (r3 << 16); /* 0,g,0,e */ 449 r3 = p_ref[1]; 450 r4 = p_ref[3]; 451 r3 |= (r4 << 16); /* 0,h,0,f */ 452 453 r4 = r0 + r3; /* c+h, a+f */ 454 r5 = r0 + r1; /* c+d, a+b */ 455 r6 = r2 + r3; /* g+h, e+f */ 456 r5 >>= 16; 457 r5 |= (r6 << 16); /* e+f, c+d */ 458 r4 += r5 * 20; /* c+20*e+20*f+h, a+20*c+20*d+f */ 459 r4 += 0x100010; /* +16, +16 */ 460 r5 = r1 + r2; /* d+g, b+e */ 461 r4 -= r5 * 5; /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */ 462 r4 >>= 5; 463 r13 |= r4; /* check clipping */ 464 r4 &= 0xFF00FF; /* mask */ 465 466 r5 = p_ref[4]; /* i */ 467 r6 = (r5 << 16); 468 r5 = r6 | (r2 >> 16);/* 0,i,0,g */ 469 r5 += r1; /* d+i, b+g */ /* r5 not free */ 470 r1 >>= 16; 471 r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */ 472 r1 += r2; /* f+g, d+e */ 473 r5 += 20 * r1; /* d+20f+20g+i, b+20d+20e+g */ 474 r0 >>= 16; 475 r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */ 476 r0 += r3; /* e+h, c+f */ 477 r5 += 0x100010; /* 16,16 */ 478 r5 -= r0 * 5; /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */ 479 r5 >>= 5; 480 r13 |= r5; /* check clipping */ 481 r5 &= 0xFF00FF; /* mask */ 482 483 r4 |= (r5 << 8); /* pack them together */ 484 *p_cur++ = r4; 485 r1 = r3; 486 r0 = r2; 487 } 488 p_cur += curr_offset; /* move to the next line */ 489 p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */ 490 491 if (r13&0xFF000700) /* need clipping */ 492 { 493 /* move back to the beginning of the line */ 494 p_ref -= (ref_offset + blkwidth); /* input */ 495 p_cur -= (outpitch >> 2); 496 497 tmp = p_ref + blkwidth; 498 for (; p_ref < tmp;) 499 { 500 501 r0 = *p_ref++; 502 r1 = *p_ref++; 503 r2 = *p_ref++; 504 r3 = *p_ref++; 505 r4 = *p_ref++; 506 /* first pixel */ 507 r5 = *p_ref++; 508 result = (r0 + r5); 509 r0 = (r1 + r4); 510 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 511 r0 = (r2 + r3); 512 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 513 result = (result + 16) >> 5; 514 CLIP_RESULT(result) 515 pkres = result; 516 /* second pixel */ 517 r0 = *p_ref++; 518 result = (r1 + r0); 519 r1 = (r2 + r5); 520 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 521 r1 = (r3 + r4); 522 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 523 result = (result + 16) >> 5; 524 CLIP_RESULT(result) 525 pkres |= (result << 8); 526 /* third pixel */ 527 r1 = *p_ref++; 528 result = (r2 + r1); 529 r2 = (r3 + r0); 530 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 531 r2 = (r4 + r5); 532 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 533 result = (result + 16) >> 5; 534 CLIP_RESULT(result) 535 pkres |= (result << 16); 536 /* fourth pixel */ 537 r2 = *p_ref++; 538 result = (r3 + r2); 539 r3 = (r4 + r1); 540 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 541 r3 = (r5 + r0); 542 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 543 result = (result + 16) >> 5; 544 CLIP_RESULT(result) 545 pkres |= (result << 24); 546 *p_cur++ = pkres; /* write 4 pixels */ 547 p_ref -= 5; 548 } 549 p_cur += curr_offset; /* move to the next line */ 550 p_ref += ref_offset; 551 } 552 } 553 } 554 555 return ; 556 } 557 558 void eHorzInterp2MC(int *in, int inpitch, uint8 *out, int outpitch, 559 int blkwidth, int blkheight, int dx) 560 { 561 int *p_ref, *tmp; 562 uint32 *p_cur; 563 uint32 pkres; 564 int result, result2, curr_offset, ref_offset; 565 int j, r0, r1, r2, r3, r4, r5; 566 567 p_cur = (uint32*)out; /* assume it's word aligned */ 568 curr_offset = (outpitch - blkwidth) >> 2; 569 p_ref = in; 570 ref_offset = inpitch - blkwidth; 571 572 if (dx&1) 573 { 574 dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */ 575 576 for (j = blkheight; j > 0 ; j--) 577 { 578 tmp = p_ref + blkwidth; 579 for (; p_ref < tmp;) 580 { 581 582 r0 = p_ref[-2]; 583 r1 = p_ref[-1]; 584 r2 = *p_ref++; 585 r3 = *p_ref++; 586 r4 = *p_ref++; 587 /* first pixel */ 588 r5 = *p_ref++; 589 result = (r0 + r5); 590 r0 = (r1 + r4); 591 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 592 r0 = (r2 + r3); 593 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 594 result = (result + 512) >> 10; 595 CLIP_RESULT(result) 596 result2 = ((p_ref[dx] + 16) >> 5); 597 CLIP_RESULT(result2) 598 /* 3/4 pel, no need to clip */ 599 result = (result + result2 + 1); 600 pkres = (result >> 1); 601 /* second pixel */ 602 r0 = *p_ref++; 603 result = (r1 + r0); 604 r1 = (r2 + r5); 605 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 606 r1 = (r3 + r4); 607 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 608 result = (result + 512) >> 10; 609 CLIP_RESULT(result) 610 result2 = ((p_ref[dx] + 16) >> 5); 611 CLIP_RESULT(result2) 612 /* 3/4 pel, no need to clip */ 613 result = (result + result2 + 1); 614 result = (result >> 1); 615 pkres |= (result << 8); 616 /* third pixel */ 617 r1 = *p_ref++; 618 result = (r2 + r1); 619 r2 = (r3 + r0); 620 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 621 r2 = (r4 + r5); 622 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 623 result = (result + 512) >> 10; 624 CLIP_RESULT(result) 625 result2 = ((p_ref[dx] + 16) >> 5); 626 CLIP_RESULT(result2) 627 /* 3/4 pel, no need to clip */ 628 result = (result + result2 + 1); 629 result = (result >> 1); 630 pkres |= (result << 16); 631 /* fourth pixel */ 632 r2 = *p_ref++; 633 result = (r3 + r2); 634 r3 = (r4 + r1); 635 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 636 r3 = (r5 + r0); 637 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 638 result = (result + 512) >> 10; 639 CLIP_RESULT(result) 640 result2 = ((p_ref[dx] + 16) >> 5); 641 CLIP_RESULT(result2) 642 /* 3/4 pel, no need to clip */ 643 result = (result + result2 + 1); 644 result = (result >> 1); 645 pkres |= (result << 24); 646 *p_cur++ = pkres; /* write 4 pixels */ 647 p_ref -= 3; /* offset back to the middle of filter */ 648 } 649 p_cur += curr_offset; /* move to the next line */ 650 p_ref += ref_offset; /* move to the next line */ 651 } 652 } 653 else 654 { 655 for (j = blkheight; j > 0 ; j--) 656 { 657 tmp = p_ref + blkwidth; 658 for (; p_ref < tmp;) 659 { 660 661 r0 = p_ref[-2]; 662 r1 = p_ref[-1]; 663 r2 = *p_ref++; 664 r3 = *p_ref++; 665 r4 = *p_ref++; 666 /* first pixel */ 667 r5 = *p_ref++; 668 result = (r0 + r5); 669 r0 = (r1 + r4); 670 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 671 r0 = (r2 + r3); 672 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 673 result = (result + 512) >> 10; 674 CLIP_RESULT(result) 675 pkres = result; 676 /* second pixel */ 677 r0 = *p_ref++; 678 result = (r1 + r0); 679 r1 = (r2 + r5); 680 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 681 r1 = (r3 + r4); 682 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 683 result = (result + 512) >> 10; 684 CLIP_RESULT(result) 685 pkres |= (result << 8); 686 /* third pixel */ 687 r1 = *p_ref++; 688 result = (r2 + r1); 689 r2 = (r3 + r0); 690 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 691 r2 = (r4 + r5); 692 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 693 result = (result + 512) >> 10; 694 CLIP_RESULT(result) 695 pkres |= (result << 16); 696 /* fourth pixel */ 697 r2 = *p_ref++; 698 result = (r3 + r2); 699 r3 = (r4 + r1); 700 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 701 r3 = (r5 + r0); 702 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 703 result = (result + 512) >> 10; 704 CLIP_RESULT(result) 705 pkres |= (result << 24); 706 *p_cur++ = pkres; /* write 4 pixels */ 707 p_ref -= 3; /* offset back to the middle of filter */ 708 } 709 p_cur += curr_offset; /* move to the next line */ 710 p_ref += ref_offset; /* move to the next line */ 711 } 712 } 713 714 return ; 715 } 716 717 void eHorzInterp3MC(uint8 *in, int inpitch, int *out, int outpitch, 718 int blkwidth, int blkheight) 719 { 720 uint8 *p_ref, *tmp; 721 int *p_cur; 722 int result, curr_offset, ref_offset; 723 int j, r0, r1, r2, r3, r4, r5; 724 725 p_cur = out; 726 curr_offset = (outpitch - blkwidth); 727 p_ref = in; 728 ref_offset = inpitch - blkwidth; 729 730 for (j = blkheight; j > 0 ; j--) 731 { 732 tmp = p_ref + blkwidth; 733 for (; p_ref < tmp;) 734 { 735 736 r0 = p_ref[-2]; 737 r1 = p_ref[-1]; 738 r2 = *p_ref++; 739 r3 = *p_ref++; 740 r4 = *p_ref++; 741 /* first pixel */ 742 r5 = *p_ref++; 743 result = (r0 + r5); 744 r0 = (r1 + r4); 745 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 746 r0 = (r2 + r3); 747 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 748 *p_cur++ = result; 749 /* second pixel */ 750 r0 = *p_ref++; 751 result = (r1 + r0); 752 r1 = (r2 + r5); 753 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 754 r1 = (r3 + r4); 755 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 756 *p_cur++ = result; 757 /* third pixel */ 758 r1 = *p_ref++; 759 result = (r2 + r1); 760 r2 = (r3 + r0); 761 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 762 r2 = (r4 + r5); 763 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 764 *p_cur++ = result; 765 /* fourth pixel */ 766 r2 = *p_ref++; 767 result = (r3 + r2); 768 r3 = (r4 + r1); 769 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 770 r3 = (r5 + r0); 771 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 772 *p_cur++ = result; 773 p_ref -= 3; /* move back to the middle of the filter */ 774 } 775 p_cur += curr_offset; /* move to the next line */ 776 p_ref += ref_offset; 777 } 778 779 return ; 780 } 781 void eVertInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch, 782 int blkwidth, int blkheight, int dy) 783 { 784 uint8 *p_cur, *p_ref, *tmp; 785 int result, curr_offset, ref_offset; 786 int j, i; 787 int32 r0, r1, r2, r3, r4, r5, r6, r7, r8, r13; 788 uint8 tmp_in[24][24]; 789 790 /* not word-aligned */ 791 if (((intptr_t)in)&0x3) 792 { 793 eCreateAlign(in, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5); 794 in = &tmp_in[2][0]; 795 inpitch = 24; 796 } 797 p_cur = out; 798 curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */ 799 ref_offset = blkheight * inpitch; /* for limit */ 800 801 curr_offset += 3; 802 803 if (dy&1) 804 { 805 dy = (dy >> 1) ? 0 : -inpitch; 806 807 for (j = 0; j < blkwidth; j += 4, in += 4) 808 { 809 r13 = 0; 810 p_ref = in; 811 p_cur -= outpitch; /* compensate for the first offset */ 812 tmp = p_ref + ref_offset; /* limit */ 813 while (p_ref < tmp) /* the loop un-rolled */ 814 { 815 r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */ 816 p_ref += inpitch; 817 r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */ 818 r0 &= 0xFF00FF; 819 820 r1 = *((uint32*)(p_ref + (inpitch << 1))); /* r1, r7, ref[3] */ 821 r7 = (r1 >> 8) & 0xFF00FF; 822 r1 &= 0xFF00FF; 823 824 r0 += r1; 825 r6 += r7; 826 827 r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */ 828 r8 = (r2 >> 8) & 0xFF00FF; 829 r2 &= 0xFF00FF; 830 831 r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */ 832 r7 = (r1 >> 8) & 0xFF00FF; 833 r1 &= 0xFF00FF; 834 r1 += r2; 835 836 r7 += r8; 837 838 r0 += 20 * r1; 839 r6 += 20 * r7; 840 r0 += 0x100010; 841 r6 += 0x100010; 842 843 r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */ 844 r8 = (r2 >> 8) & 0xFF00FF; 845 r2 &= 0xFF00FF; 846 847 r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */ 848 r7 = (r1 >> 8) & 0xFF00FF; 849 r1 &= 0xFF00FF; 850 r1 += r2; 851 852 r7 += r8; 853 854 r0 -= 5 * r1; 855 r6 -= 5 * r7; 856 857 r0 >>= 5; 858 r6 >>= 5; 859 /* clip */ 860 r13 |= r6; 861 r13 |= r0; 862 //CLIPPACK(r6,result) 863 864 r1 = *((uint32*)(p_ref + dy)); 865 r2 = (r1 >> 8) & 0xFF00FF; 866 r1 &= 0xFF00FF; 867 r0 += r1; 868 r6 += r2; 869 r0 += 0x10001; 870 r6 += 0x10001; 871 r0 = (r0 >> 1) & 0xFF00FF; 872 r6 = (r6 >> 1) & 0xFF00FF; 873 874 r0 |= (r6 << 8); /* pack it back */ 875 *((uint32*)(p_cur += outpitch)) = r0; 876 } 877 p_cur += curr_offset; /* offset to the next pixel */ 878 if (r13 & 0xFF000700) /* this column need clipping */ 879 { 880 p_cur -= 4; 881 for (i = 0; i < 4; i++) 882 { 883 p_ref = in + i; 884 p_cur -= outpitch; /* compensate for the first offset */ 885 886 tmp = p_ref + ref_offset; /* limit */ 887 while (p_ref < tmp) 888 { /* loop un-rolled */ 889 r0 = *(p_ref - (inpitch << 1)); 890 r1 = *(p_ref - inpitch); 891 r2 = *p_ref; 892 r3 = *(p_ref += inpitch); /* modify pointer before loading */ 893 r4 = *(p_ref += inpitch); 894 /* first pixel */ 895 r5 = *(p_ref += inpitch); 896 result = (r0 + r5); 897 r0 = (r1 + r4); 898 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 899 r0 = (r2 + r3); 900 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 901 result = (result + 16) >> 5; 902 CLIP_RESULT(result) 903 /* 3/4 pel, no need to clip */ 904 result = (result + p_ref[dy-(inpitch<<1)] + 1); 905 result = (result >> 1); 906 *(p_cur += outpitch) = result; 907 /* second pixel */ 908 r0 = *(p_ref += inpitch); 909 result = (r1 + r0); 910 r1 = (r2 + r5); 911 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 912 r1 = (r3 + r4); 913 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 914 result = (result + 16) >> 5; 915 CLIP_RESULT(result) 916 /* 3/4 pel, no need to clip */ 917 result = (result + p_ref[dy-(inpitch<<1)] + 1); 918 result = (result >> 1); 919 *(p_cur += outpitch) = result; 920 /* third pixel */ 921 r1 = *(p_ref += inpitch); 922 result = (r2 + r1); 923 r2 = (r3 + r0); 924 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 925 r2 = (r4 + r5); 926 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 927 result = (result + 16) >> 5; 928 CLIP_RESULT(result) 929 /* 3/4 pel, no need to clip */ 930 result = (result + p_ref[dy-(inpitch<<1)] + 1); 931 result = (result >> 1); 932 *(p_cur += outpitch) = result; 933 /* fourth pixel */ 934 r2 = *(p_ref += inpitch); 935 result = (r3 + r2); 936 r3 = (r4 + r1); 937 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 938 r3 = (r5 + r0); 939 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 940 result = (result + 16) >> 5; 941 CLIP_RESULT(result) 942 /* 3/4 pel, no need to clip */ 943 result = (result + p_ref[dy-(inpitch<<1)] + 1); 944 result = (result >> 1); 945 *(p_cur += outpitch) = result; 946 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */ 947 } 948 p_cur += (curr_offset - 3); 949 } 950 } 951 } 952 } 953 else 954 { 955 for (j = 0; j < blkwidth; j += 4, in += 4) 956 { 957 r13 = 0; 958 p_ref = in; 959 p_cur -= outpitch; /* compensate for the first offset */ 960 tmp = p_ref + ref_offset; /* limit */ 961 while (p_ref < tmp) /* the loop un-rolled */ 962 { 963 r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */ 964 p_ref += inpitch; 965 r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */ 966 r0 &= 0xFF00FF; 967 968 r1 = *((uint32*)(p_ref + (inpitch << 1))); /* r1, r7, ref[3] */ 969 r7 = (r1 >> 8) & 0xFF00FF; 970 r1 &= 0xFF00FF; 971 972 r0 += r1; 973 r6 += r7; 974 975 r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */ 976 r8 = (r2 >> 8) & 0xFF00FF; 977 r2 &= 0xFF00FF; 978 979 r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */ 980 r7 = (r1 >> 8) & 0xFF00FF; 981 r1 &= 0xFF00FF; 982 r1 += r2; 983 984 r7 += r8; 985 986 r0 += 20 * r1; 987 r6 += 20 * r7; 988 r0 += 0x100010; 989 r6 += 0x100010; 990 991 r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */ 992 r8 = (r2 >> 8) & 0xFF00FF; 993 r2 &= 0xFF00FF; 994 995 r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */ 996 r7 = (r1 >> 8) & 0xFF00FF; 997 r1 &= 0xFF00FF; 998 r1 += r2; 999 1000 r7 += r8; 1001 1002 r0 -= 5 * r1; 1003 r6 -= 5 * r7; 1004 1005 r0 >>= 5; 1006 r6 >>= 5; 1007 /* clip */ 1008 r13 |= r6; 1009 r13 |= r0; 1010 //CLIPPACK(r6,result) 1011 r0 &= 0xFF00FF; 1012 r6 &= 0xFF00FF; 1013 r0 |= (r6 << 8); /* pack it back */ 1014 *((uint32*)(p_cur += outpitch)) = r0; 1015 } 1016 p_cur += curr_offset; /* offset to the next pixel */ 1017 if (r13 & 0xFF000700) /* this column need clipping */ 1018 { 1019 p_cur -= 4; 1020 for (i = 0; i < 4; i++) 1021 { 1022 p_ref = in + i; 1023 p_cur -= outpitch; /* compensate for the first offset */ 1024 tmp = p_ref + ref_offset; /* limit */ 1025 while (p_ref < tmp) 1026 { /* loop un-rolled */ 1027 r0 = *(p_ref - (inpitch << 1)); 1028 r1 = *(p_ref - inpitch); 1029 r2 = *p_ref; 1030 r3 = *(p_ref += inpitch); /* modify pointer before loading */ 1031 r4 = *(p_ref += inpitch); 1032 /* first pixel */ 1033 r5 = *(p_ref += inpitch); 1034 result = (r0 + r5); 1035 r0 = (r1 + r4); 1036 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 1037 r0 = (r2 + r3); 1038 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 1039 result = (result + 16) >> 5; 1040 CLIP_RESULT(result) 1041 *(p_cur += outpitch) = result; 1042 /* second pixel */ 1043 r0 = *(p_ref += inpitch); 1044 result = (r1 + r0); 1045 r1 = (r2 + r5); 1046 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 1047 r1 = (r3 + r4); 1048 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 1049 result = (result + 16) >> 5; 1050 CLIP_RESULT(result) 1051 *(p_cur += outpitch) = result; 1052 /* third pixel */ 1053 r1 = *(p_ref += inpitch); 1054 result = (r2 + r1); 1055 r2 = (r3 + r0); 1056 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 1057 r2 = (r4 + r5); 1058 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 1059 result = (result + 16) >> 5; 1060 CLIP_RESULT(result) 1061 *(p_cur += outpitch) = result; 1062 /* fourth pixel */ 1063 r2 = *(p_ref += inpitch); 1064 result = (r3 + r2); 1065 r3 = (r4 + r1); 1066 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 1067 r3 = (r5 + r0); 1068 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 1069 result = (result + 16) >> 5; 1070 CLIP_RESULT(result) 1071 *(p_cur += outpitch) = result; 1072 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */ 1073 } 1074 p_cur += (curr_offset - 3); 1075 } 1076 } 1077 } 1078 } 1079 1080 return ; 1081 } 1082 1083 void eVertInterp2MC(uint8 *in, int inpitch, int *out, int outpitch, 1084 int blkwidth, int blkheight) 1085 { 1086 int *p_cur; 1087 uint8 *p_ref, *tmp; 1088 int result, curr_offset, ref_offset; 1089 int j, r0, r1, r2, r3, r4, r5; 1090 1091 p_cur = out; 1092 curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */ 1093 ref_offset = blkheight * inpitch; /* for limit */ 1094 1095 for (j = 0; j < blkwidth; j++) 1096 { 1097 p_cur -= outpitch; /* compensate for the first offset */ 1098 p_ref = in++; 1099 1100 tmp = p_ref + ref_offset; /* limit */ 1101 while (p_ref < tmp) 1102 { /* loop un-rolled */ 1103 r0 = *(p_ref - (inpitch << 1)); 1104 r1 = *(p_ref - inpitch); 1105 r2 = *p_ref; 1106 r3 = *(p_ref += inpitch); /* modify pointer before loading */ 1107 r4 = *(p_ref += inpitch); 1108 /* first pixel */ 1109 r5 = *(p_ref += inpitch); 1110 result = (r0 + r5); 1111 r0 = (r1 + r4); 1112 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 1113 r0 = (r2 + r3); 1114 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 1115 *(p_cur += outpitch) = result; 1116 /* second pixel */ 1117 r0 = *(p_ref += inpitch); 1118 result = (r1 + r0); 1119 r1 = (r2 + r5); 1120 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 1121 r1 = (r3 + r4); 1122 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 1123 *(p_cur += outpitch) = result; 1124 /* third pixel */ 1125 r1 = *(p_ref += inpitch); 1126 result = (r2 + r1); 1127 r2 = (r3 + r0); 1128 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 1129 r2 = (r4 + r5); 1130 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 1131 *(p_cur += outpitch) = result; 1132 /* fourth pixel */ 1133 r2 = *(p_ref += inpitch); 1134 result = (r3 + r2); 1135 r3 = (r4 + r1); 1136 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 1137 r3 = (r5 + r0); 1138 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 1139 *(p_cur += outpitch) = result; 1140 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */ 1141 } 1142 p_cur += curr_offset; 1143 } 1144 1145 return ; 1146 } 1147 1148 void eVertInterp3MC(int *in, int inpitch, uint8 *out, int outpitch, 1149 int blkwidth, int blkheight, int dy) 1150 { 1151 uint8 *p_cur; 1152 int *p_ref, *tmp; 1153 int result, result2, curr_offset, ref_offset; 1154 int j, r0, r1, r2, r3, r4, r5; 1155 1156 p_cur = out; 1157 curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */ 1158 ref_offset = blkheight * inpitch; /* for limit */ 1159 1160 if (dy&1) 1161 { 1162 dy = (dy >> 1) ? -(inpitch << 1) : -(inpitch << 1) - inpitch; 1163 1164 for (j = 0; j < blkwidth; j++) 1165 { 1166 p_cur -= outpitch; /* compensate for the first offset */ 1167 p_ref = in++; 1168 1169 tmp = p_ref + ref_offset; /* limit */ 1170 while (p_ref < tmp) 1171 { /* loop un-rolled */ 1172 r0 = *(p_ref - (inpitch << 1)); 1173 r1 = *(p_ref - inpitch); 1174 r2 = *p_ref; 1175 r3 = *(p_ref += inpitch); /* modify pointer before loading */ 1176 r4 = *(p_ref += inpitch); 1177 /* first pixel */ 1178 r5 = *(p_ref += inpitch); 1179 result = (r0 + r5); 1180 r0 = (r1 + r4); 1181 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 1182 r0 = (r2 + r3); 1183 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 1184 result = (result + 512) >> 10; 1185 CLIP_RESULT(result) 1186 result2 = ((p_ref[dy] + 16) >> 5); 1187 CLIP_RESULT(result2) 1188 /* 3/4 pel, no need to clip */ 1189 result = (result + result2 + 1); 1190 result = (result >> 1); 1191 *(p_cur += outpitch) = result; 1192 /* second pixel */ 1193 r0 = *(p_ref += inpitch); 1194 result = (r1 + r0); 1195 r1 = (r2 + r5); 1196 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 1197 r1 = (r3 + r4); 1198 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 1199 result = (result + 512) >> 10; 1200 CLIP_RESULT(result) 1201 result2 = ((p_ref[dy] + 16) >> 5); 1202 CLIP_RESULT(result2) 1203 /* 3/4 pel, no need to clip */ 1204 result = (result + result2 + 1); 1205 result = (result >> 1); 1206 *(p_cur += outpitch) = result; 1207 /* third pixel */ 1208 r1 = *(p_ref += inpitch); 1209 result = (r2 + r1); 1210 r2 = (r3 + r0); 1211 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 1212 r2 = (r4 + r5); 1213 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 1214 result = (result + 512) >> 10; 1215 CLIP_RESULT(result) 1216 result2 = ((p_ref[dy] + 16) >> 5); 1217 CLIP_RESULT(result2) 1218 /* 3/4 pel, no need to clip */ 1219 result = (result + result2 + 1); 1220 result = (result >> 1); 1221 *(p_cur += outpitch) = result; 1222 /* fourth pixel */ 1223 r2 = *(p_ref += inpitch); 1224 result = (r3 + r2); 1225 r3 = (r4 + r1); 1226 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 1227 r3 = (r5 + r0); 1228 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 1229 result = (result + 512) >> 10; 1230 CLIP_RESULT(result) 1231 result2 = ((p_ref[dy] + 16) >> 5); 1232 CLIP_RESULT(result2) 1233 /* 3/4 pel, no need to clip */ 1234 result = (result + result2 + 1); 1235 result = (result >> 1); 1236 *(p_cur += outpitch) = result; 1237 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */ 1238 } 1239 p_cur += curr_offset; 1240 } 1241 } 1242 else 1243 { 1244 for (j = 0; j < blkwidth; j++) 1245 { 1246 p_cur -= outpitch; /* compensate for the first offset */ 1247 p_ref = in++; 1248 1249 tmp = p_ref + ref_offset; /* limit */ 1250 while (p_ref < tmp) 1251 { /* loop un-rolled */ 1252 r0 = *(p_ref - (inpitch << 1)); 1253 r1 = *(p_ref - inpitch); 1254 r2 = *p_ref; 1255 r3 = *(p_ref += inpitch); /* modify pointer before loading */ 1256 r4 = *(p_ref += inpitch); 1257 /* first pixel */ 1258 r5 = *(p_ref += inpitch); 1259 result = (r0 + r5); 1260 r0 = (r1 + r4); 1261 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 1262 r0 = (r2 + r3); 1263 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 1264 result = (result + 512) >> 10; 1265 CLIP_RESULT(result) 1266 *(p_cur += outpitch) = result; 1267 /* second pixel */ 1268 r0 = *(p_ref += inpitch); 1269 result = (r1 + r0); 1270 r1 = (r2 + r5); 1271 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 1272 r1 = (r3 + r4); 1273 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 1274 result = (result + 512) >> 10; 1275 CLIP_RESULT(result) 1276 *(p_cur += outpitch) = result; 1277 /* third pixel */ 1278 r1 = *(p_ref += inpitch); 1279 result = (r2 + r1); 1280 r2 = (r3 + r0); 1281 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 1282 r2 = (r4 + r5); 1283 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 1284 result = (result + 512) >> 10; 1285 CLIP_RESULT(result) 1286 *(p_cur += outpitch) = result; 1287 /* fourth pixel */ 1288 r2 = *(p_ref += inpitch); 1289 result = (r3 + r2); 1290 r3 = (r4 + r1); 1291 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 1292 r3 = (r5 + r0); 1293 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 1294 result = (result + 512) >> 10; 1295 CLIP_RESULT(result) 1296 *(p_cur += outpitch) = result; 1297 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */ 1298 } 1299 p_cur += curr_offset; 1300 } 1301 } 1302 1303 return ; 1304 } 1305 1306 void eDiagonalInterpMC(uint8 *in1, uint8 *in2, int inpitch, 1307 uint8 *out, int outpitch, 1308 int blkwidth, int blkheight) 1309 { 1310 int j, i; 1311 int result; 1312 uint8 *p_cur, *p_ref, *p_tmp8, *tmp; 1313 int curr_offset, ref_offset; 1314 uint8 tmp_res[24][24], tmp_in[24][24]; 1315 uint32 *p_tmp; 1316 uint32 pkres, tmp_result; 1317 int32 r0, r1, r2, r3, r4, r5; 1318 int32 r6, r7, r8, r9, r10, r13; 1319 1320 ref_offset = inpitch - blkwidth; 1321 p_ref = in1 - 2; 1322 /* perform horizontal interpolation */ 1323 /* not word-aligned */ 1324 /* It is faster to read 1 byte at time to avoid calling CreateAlign */ 1325 /* if(((uint32)p_ref)&0x3) 1326 { 1327 CreateAlign(p_ref,inpitch,0,&tmp_in[0][0],blkwidth+8,blkheight); 1328 p_ref = &tmp_in[0][0]; 1329 ref_offset = 24-blkwidth; 1330 }*/ 1331 1332 p_tmp = (uint32*) & (tmp_res[0][0]); 1333 for (j = blkheight; j > 0; j--) 1334 { 1335 r13 = 0; 1336 tmp = p_ref + blkwidth; 1337 1338 //r0 = *((uint32*)p_ref); /* d,c,b,a */ 1339 //r1 = (r0>>8)&0xFF00FF; /* 0,d,0,b */ 1340 //r0 &= 0xFF00FF; /* 0,c,0,a */ 1341 /* It is faster to read 1 byte at a time */ 1342 r0 = p_ref[0]; 1343 r1 = p_ref[2]; 1344 r0 |= (r1 << 16); /* 0,c,0,a */ 1345 r1 = p_ref[1]; 1346 r2 = p_ref[3]; 1347 r1 |= (r2 << 16); /* 0,d,0,b */ 1348 1349 while (p_ref < tmp) 1350 { 1351 //r2 = *((uint32*)(p_ref+=4));/* h,g,f,e */ 1352 //r3 = (r2>>8)&0xFF00FF; /* 0,h,0,f */ 1353 //r2 &= 0xFF00FF; /* 0,g,0,e */ 1354 /* It is faster to read 1 byte at a time */ 1355 r2 = *(p_ref += 4); 1356 r3 = p_ref[2]; 1357 r2 |= (r3 << 16); /* 0,g,0,e */ 1358 r3 = p_ref[1]; 1359 r4 = p_ref[3]; 1360 r3 |= (r4 << 16); /* 0,h,0,f */ 1361 1362 r4 = r0 + r3; /* c+h, a+f */ 1363 r5 = r0 + r1; /* c+d, a+b */ 1364 r6 = r2 + r3; /* g+h, e+f */ 1365 r5 >>= 16; 1366 r5 |= (r6 << 16); /* e+f, c+d */ 1367 r4 += r5 * 20; /* c+20*e+20*f+h, a+20*c+20*d+f */ 1368 r4 += 0x100010; /* +16, +16 */ 1369 r5 = r1 + r2; /* d+g, b+e */ 1370 r4 -= r5 * 5; /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */ 1371 r4 >>= 5; 1372 r13 |= r4; /* check clipping */ 1373 r4 &= 0xFF00FF; /* mask */ 1374 1375 r5 = p_ref[4]; /* i */ 1376 r6 = (r5 << 16); 1377 r5 = r6 | (r2 >> 16);/* 0,i,0,g */ 1378 r5 += r1; /* d+i, b+g */ /* r5 not free */ 1379 r1 >>= 16; 1380 r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */ 1381 r1 += r2; /* f+g, d+e */ 1382 r5 += 20 * r1; /* d+20f+20g+i, b+20d+20e+g */ 1383 r0 >>= 16; 1384 r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */ 1385 r0 += r3; /* e+h, c+f */ 1386 r5 += 0x100010; /* 16,16 */ 1387 r5 -= r0 * 5; /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */ 1388 r5 >>= 5; 1389 r13 |= r5; /* check clipping */ 1390 r5 &= 0xFF00FF; /* mask */ 1391 1392 r4 |= (r5 << 8); /* pack them together */ 1393 *p_tmp++ = r4; 1394 r1 = r3; 1395 r0 = r2; 1396 } 1397 p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */ 1398 p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */ 1399 1400 if (r13&0xFF000700) /* need clipping */ 1401 { 1402 /* move back to the beginning of the line */ 1403 p_ref -= (ref_offset + blkwidth); /* input */ 1404 p_tmp -= 6; /* intermediate output */ 1405 tmp = p_ref + blkwidth; 1406 while (p_ref < tmp) 1407 { 1408 r0 = *p_ref++; 1409 r1 = *p_ref++; 1410 r2 = *p_ref++; 1411 r3 = *p_ref++; 1412 r4 = *p_ref++; 1413 /* first pixel */ 1414 r5 = *p_ref++; 1415 result = (r0 + r5); 1416 r0 = (r1 + r4); 1417 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 1418 r0 = (r2 + r3); 1419 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 1420 result = (result + 16) >> 5; 1421 CLIP_RESULT(result) 1422 pkres = result; 1423 /* second pixel */ 1424 r0 = *p_ref++; 1425 result = (r1 + r0); 1426 r1 = (r2 + r5); 1427 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 1428 r1 = (r3 + r4); 1429 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 1430 result = (result + 16) >> 5; 1431 CLIP_RESULT(result) 1432 pkres |= (result << 8); 1433 /* third pixel */ 1434 r1 = *p_ref++; 1435 result = (r2 + r1); 1436 r2 = (r3 + r0); 1437 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 1438 r2 = (r4 + r5); 1439 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 1440 result = (result + 16) >> 5; 1441 CLIP_RESULT(result) 1442 pkres |= (result << 16); 1443 /* fourth pixel */ 1444 r2 = *p_ref++; 1445 result = (r3 + r2); 1446 r3 = (r4 + r1); 1447 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 1448 r3 = (r5 + r0); 1449 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 1450 result = (result + 16) >> 5; 1451 CLIP_RESULT(result) 1452 pkres |= (result << 24); 1453 1454 *p_tmp++ = pkres; /* write 4 pixel */ 1455 p_ref -= 5; 1456 } 1457 p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */ 1458 p_ref += ref_offset; /* ref_offset = inpitch-blkwidth; */ 1459 } 1460 } 1461 1462 /* perform vertical interpolation */ 1463 /* not word-aligned */ 1464 if (((intptr_t)in2)&0x3) 1465 { 1466 eCreateAlign(in2, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5); 1467 in2 = &tmp_in[2][0]; 1468 inpitch = 24; 1469 } 1470 1471 p_cur = out; 1472 curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically up and one pixel right */ 1473 pkres = blkheight * inpitch; /* reuse it for limit */ 1474 1475 curr_offset += 3; 1476 1477 for (j = 0; j < blkwidth; j += 4, in2 += 4) 1478 { 1479 r13 = 0; 1480 p_ref = in2; 1481 p_tmp8 = &(tmp_res[0][j]); /* intermediate result */ 1482 p_tmp8 -= 24; /* compensate for the first offset */ 1483 p_cur -= outpitch; /* compensate for the first offset */ 1484 tmp = p_ref + pkres; /* limit */ 1485 while (p_ref < tmp) /* the loop un-rolled */ 1486 { 1487 /* Read 1 byte at a time is too slow, too many read and pack ops, need to call CreateAlign */ 1488 /*p_ref8 = p_ref-(inpitch<<1); r0 = p_ref8[0]; r1 = p_ref8[2]; 1489 r0 |= (r1<<16); r6 = p_ref8[1]; r1 = p_ref8[3]; 1490 r6 |= (r1<<16); p_ref+=inpitch; */ 1491 r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */ 1492 p_ref += inpitch; 1493 r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */ 1494 r0 &= 0xFF00FF; 1495 1496 /*p_ref8 = p_ref+(inpitch<<1); 1497 r1 = p_ref8[0]; r7 = p_ref8[2]; r1 |= (r7<<16); 1498 r7 = p_ref8[1]; r2 = p_ref8[3]; r7 |= (r2<<16);*/ 1499 r1 = *((uint32*)(p_ref + (inpitch << 1))); /* r1, r7, ref[3] */ 1500 r7 = (r1 >> 8) & 0xFF00FF; 1501 r1 &= 0xFF00FF; 1502 1503 r0 += r1; 1504 r6 += r7; 1505 1506 /*r2 = p_ref[0]; r8 = p_ref[2]; r2 |= (r8<<16); 1507 r8 = p_ref[1]; r1 = p_ref[3]; r8 |= (r1<<16);*/ 1508 r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */ 1509 r8 = (r2 >> 8) & 0xFF00FF; 1510 r2 &= 0xFF00FF; 1511 1512 /*p_ref8 = p_ref-inpitch; r1 = p_ref8[0]; r7 = p_ref8[2]; 1513 r1 |= (r7<<16); r1 += r2; r7 = p_ref8[1]; 1514 r2 = p_ref8[3]; r7 |= (r2<<16);*/ 1515 r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */ 1516 r7 = (r1 >> 8) & 0xFF00FF; 1517 r1 &= 0xFF00FF; 1518 r1 += r2; 1519 1520 r7 += r8; 1521 1522 r0 += 20 * r1; 1523 r6 += 20 * r7; 1524 r0 += 0x100010; 1525 r6 += 0x100010; 1526 1527 /*p_ref8 = p_ref-(inpitch<<1); r2 = p_ref8[0]; r8 = p_ref8[2]; 1528 r2 |= (r8<<16); r8 = p_ref8[1]; r1 = p_ref8[3]; r8 |= (r1<<16);*/ 1529 r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */ 1530 r8 = (r2 >> 8) & 0xFF00FF; 1531 r2 &= 0xFF00FF; 1532 1533 /*p_ref8 = p_ref+inpitch; r1 = p_ref8[0]; r7 = p_ref8[2]; 1534 r1 |= (r7<<16); r1 += r2; r7 = p_ref8[1]; 1535 r2 = p_ref8[3]; r7 |= (r2<<16);*/ 1536 r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */ 1537 r7 = (r1 >> 8) & 0xFF00FF; 1538 r1 &= 0xFF00FF; 1539 r1 += r2; 1540 1541 r7 += r8; 1542 1543 r0 -= 5 * r1; 1544 r6 -= 5 * r7; 1545 1546 r0 >>= 5; 1547 r6 >>= 5; 1548 /* clip */ 1549 r13 |= r6; 1550 r13 |= r0; 1551 //CLIPPACK(r6,result) 1552 /* add with horizontal results */ 1553 r10 = *((uint32*)(p_tmp8 += 24)); 1554 r9 = (r10 >> 8) & 0xFF00FF; 1555 r10 &= 0xFF00FF; 1556 1557 r0 += r10; 1558 r0 += 0x10001; 1559 r0 = (r0 >> 1) & 0xFF00FF; /* mask to 8 bytes */ 1560 1561 r6 += r9; 1562 r6 += 0x10001; 1563 r6 = (r6 >> 1) & 0xFF00FF; /* mask to 8 bytes */ 1564 1565 r0 |= (r6 << 8); /* pack it back */ 1566 *((uint32*)(p_cur += outpitch)) = r0; 1567 } 1568 p_cur += curr_offset; /* offset to the next pixel */ 1569 if (r13 & 0xFF000700) /* this column need clipping */ 1570 { 1571 p_cur -= 4; 1572 for (i = 0; i < 4; i++) 1573 { 1574 p_ref = in2 + i; 1575 p_tmp8 = &(tmp_res[0][j+i]); /* intermediate result */ 1576 p_tmp8 -= 24; /* compensate for the first offset */ 1577 p_cur -= outpitch; /* compensate for the first offset */ 1578 tmp = p_ref + pkres; /* limit */ 1579 while (p_ref < tmp) /* the loop un-rolled */ 1580 { 1581 r0 = *(p_ref - (inpitch << 1)); 1582 r1 = *(p_ref - inpitch); 1583 r2 = *p_ref; 1584 r3 = *(p_ref += inpitch); /* modify pointer before loading */ 1585 r4 = *(p_ref += inpitch); 1586 /* first pixel */ 1587 r5 = *(p_ref += inpitch); 1588 result = (r0 + r5); 1589 r0 = (r1 + r4); 1590 result -= (r0 * 5);//result -= r0; result -= (r0<<2); 1591 r0 = (r2 + r3); 1592 result += (r0 * 20);//result += (r0<<4); result += (r0<<2); 1593 result = (result + 16) >> 5; 1594 CLIP_RESULT(result) 1595 tmp_result = *(p_tmp8 += 24); /* modify pointer before loading */ 1596 result = (result + tmp_result + 1); /* no clip */ 1597 result = (result >> 1); 1598 *(p_cur += outpitch) = result; 1599 /* second pixel */ 1600 r0 = *(p_ref += inpitch); 1601 result = (r1 + r0); 1602 r1 = (r2 + r5); 1603 result -= (r1 * 5);//result -= r1; result -= (r1<<2); 1604 r1 = (r3 + r4); 1605 result += (r1 * 20);//result += (r1<<4); result += (r1<<2); 1606 result = (result + 16) >> 5; 1607 CLIP_RESULT(result) 1608 tmp_result = *(p_tmp8 += 24); /* intermediate result */ 1609 result = (result + tmp_result + 1); /* no clip */ 1610 result = (result >> 1); 1611 *(p_cur += outpitch) = result; 1612 /* third pixel */ 1613 r1 = *(p_ref += inpitch); 1614 result = (r2 + r1); 1615 r2 = (r3 + r0); 1616 result -= (r2 * 5);//result -= r2; result -= (r2<<2); 1617 r2 = (r4 + r5); 1618 result += (r2 * 20);//result += (r2<<4); result += (r2<<2); 1619 result = (result + 16) >> 5; 1620 CLIP_RESULT(result) 1621 tmp_result = *(p_tmp8 += 24); /* intermediate result */ 1622 result = (result + tmp_result + 1); /* no clip */ 1623 result = (result >> 1); 1624 *(p_cur += outpitch) = result; 1625 /* fourth pixel */ 1626 r2 = *(p_ref += inpitch); 1627 result = (r3 + r2); 1628 r3 = (r4 + r1); 1629 result -= (r3 * 5);//result -= r3; result -= (r3<<2); 1630 r3 = (r5 + r0); 1631 result += (r3 * 20);//result += (r3<<4); result += (r3<<2); 1632 result = (result + 16) >> 5; 1633 CLIP_RESULT(result) 1634 tmp_result = *(p_tmp8 += 24); /* intermediate result */ 1635 result = (result + tmp_result + 1); /* no clip */ 1636 result = (result >> 1); 1637 *(p_cur += outpitch) = result; 1638 p_ref -= (inpitch << 1); /* move back to center of the filter of the next one */ 1639 } 1640 p_cur += (curr_offset - 3); 1641 } 1642 } 1643 } 1644 1645 return ; 1646 } 1647 1648 /* position G */ 1649 void eFullPelMC(uint8 *in, int inpitch, uint8 *out, int outpitch, 1650 int blkwidth, int blkheight) 1651 { 1652 int i, j; 1653 int offset_in = inpitch - blkwidth; 1654 int offset_out = outpitch - blkwidth; 1655 uint32 temp; 1656 uint8 byte; 1657 1658 if (((intptr_t)in)&3) 1659 { 1660 for (j = blkheight; j > 0; j--) 1661 { 1662 for (i = blkwidth; i > 0; i -= 4) 1663 { 1664 temp = *in++; 1665 byte = *in++; 1666 temp |= (byte << 8); 1667 byte = *in++; 1668 temp |= (byte << 16); 1669 byte = *in++; 1670 temp |= (byte << 24); 1671 1672 *((uint32*)out) = temp; /* write 4 bytes */ 1673 out += 4; 1674 } 1675 out += offset_out; 1676 in += offset_in; 1677 } 1678 } 1679 else 1680 { 1681 for (j = blkheight; j > 0; j--) 1682 { 1683 for (i = blkwidth; i > 0; i -= 4) 1684 { 1685 temp = *((uint32*)in); 1686 *((uint32*)out) = temp; 1687 in += 4; 1688 out += 4; 1689 } 1690 out += offset_out; 1691 in += offset_in; 1692 } 1693 } 1694 return ; 1695 } 1696 1697 void ePadChroma(uint8 *ref, int picwidth, int picheight, int picpitch, int x_pos, int y_pos) 1698 { 1699 int pad_height; 1700 int pad_width; 1701 uint8 *start; 1702 uint32 word1, word2, word3; 1703 int offset, j; 1704 1705 1706 pad_height = 8 + ((y_pos & 7) ? 1 : 0); 1707 pad_width = 8 + ((x_pos & 7) ? 1 : 0); 1708 1709 y_pos >>= 3; 1710 x_pos >>= 3; 1711 // pad vertical first 1712 if (y_pos < 0) // need to pad up 1713 { 1714 if (x_pos < -8) start = ref - 8; 1715 else if (x_pos + pad_width > picwidth + 7) start = ref + picwidth + 7 - pad_width; 1716 else start = ref + x_pos; 1717 1718 /* word-align start */ 1719 offset = (intptr_t)start & 0x3; 1720 if (offset) start -= offset; 1721 1722 word1 = *((uint32*)start); 1723 word2 = *((uint32*)(start + 4)); 1724 word3 = *((uint32*)(start + 8)); 1725 1726 /* pad up N rows */ 1727 j = -y_pos; 1728 if (j > 8) j = 8; 1729 while (j--) 1730 { 1731 *((uint32*)(start -= picpitch)) = word1; 1732 *((uint32*)(start + 4)) = word2; 1733 *((uint32*)(start + 8)) = word3; 1734 } 1735 1736 } 1737 else if (y_pos + pad_height >= picheight) /* pad down */ 1738 { 1739 if (x_pos < -8) start = ref + picpitch * (picheight - 1) - 8; 1740 else if (x_pos + pad_width > picwidth + 7) start = ref + picpitch * (picheight - 1) + 1741 picwidth + 7 - pad_width; 1742 else start = ref + picpitch * (picheight - 1) + x_pos; 1743 1744 /* word-align start */ 1745 offset = (intptr_t)start & 0x3; 1746 if (offset) start -= offset; 1747 1748 word1 = *((uint32*)start); 1749 word2 = *((uint32*)(start + 4)); 1750 word3 = *((uint32*)(start + 8)); 1751 1752 /* pad down N rows */ 1753 j = y_pos + pad_height - picheight; 1754 if (j > 8) j = 8; 1755 while (j--) 1756 { 1757 *((uint32*)(start += picpitch)) = word1; 1758 *((uint32*)(start + 4)) = word2; 1759 *((uint32*)(start + 8)) = word3; 1760 } 1761 } 1762 1763 /* now pad horizontal */ 1764 if (x_pos < 0) // pad left 1765 { 1766 if (y_pos < -8) start = ref - (picpitch << 3); 1767 else if (y_pos + pad_height > picheight + 7) start = ref + (picheight + 7 - pad_height) * picpitch; 1768 else start = ref + y_pos * picpitch; 1769 1770 // now pad left 8 pixels for pad_height rows */ 1771 j = pad_height; 1772 start -= picpitch; 1773 while (j--) 1774 { 1775 word1 = *(start += picpitch); 1776 word1 |= (word1 << 8); 1777 word1 |= (word1 << 16); 1778 *((uint32*)(start - 8)) = word1; 1779 *((uint32*)(start - 4)) = word1; 1780 } 1781 } 1782 else if (x_pos + pad_width >= picwidth) /* pad right */ 1783 { 1784 if (y_pos < -8) start = ref - (picpitch << 3) + picwidth - 1; 1785 else if (y_pos + pad_height > picheight + 7) start = ref + (picheight + 7 - pad_height) * picpitch + picwidth - 1; 1786 else start = ref + y_pos * picpitch + picwidth - 1; 1787 1788 // now pad right 8 pixels for pad_height rows */ 1789 j = pad_height; 1790 start -= picpitch; 1791 while (j--) 1792 { 1793 word1 = *(start += picpitch); 1794 word1 |= (word1 << 8); 1795 word1 |= (word1 << 16); 1796 *((uint32*)(start + 1)) = word1; 1797 *((uint32*)(start + 5)) = word1; 1798 } 1799 } 1800 1801 return ; 1802 } 1803 1804 1805 void eChromaMotionComp(uint8 *ref, int picwidth, int picheight, 1806 int x_pos, int y_pos, 1807 uint8 *pred, int picpitch, 1808 int blkwidth, int blkheight) 1809 { 1810 int dx, dy; 1811 int offset_dx, offset_dy; 1812 int index; 1813 1814 ePadChroma(ref, picwidth, picheight, picpitch, x_pos, y_pos); 1815 1816 dx = x_pos & 7; 1817 dy = y_pos & 7; 1818 offset_dx = (dx + 7) >> 3; 1819 offset_dy = (dy + 7) >> 3; 1820 x_pos = x_pos >> 3; /* round it to full-pel resolution */ 1821 y_pos = y_pos >> 3; 1822 1823 ref += y_pos * picpitch + x_pos; 1824 1825 index = offset_dx + (offset_dy << 1) + ((blkwidth << 1) & 0x7); 1826 1827 (*(eChromaMC_SIMD[index]))(ref, picpitch , dx, dy, pred, picpitch, blkwidth, blkheight); 1828 return ; 1829 } 1830 1831 1832 /* SIMD routines, unroll the loops in vertical direction, decreasing loops (things to be done) */ 1833 void eChromaDiagonalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, 1834 uint8 *pOut, int predPitch, int blkwidth, int blkheight) 1835 { 1836 int32 r0, r1, r2, r3, result0, result1; 1837 uint8 temp[288]; 1838 uint8 *ref, *out; 1839 int i, j; 1840 int dx_8 = 8 - dx; 1841 int dy_8 = 8 - dy; 1842 1843 /* horizontal first */ 1844 out = temp; 1845 for (i = 0; i < blkheight + 1; i++) 1846 { 1847 ref = pRef; 1848 r0 = ref[0]; 1849 for (j = 0; j < blkwidth; j += 4) 1850 { 1851 r0 |= (ref[2] << 16); 1852 result0 = dx_8 * r0; 1853 1854 r1 = ref[1] | (ref[3] << 16); 1855 result0 += dx * r1; 1856 *(int32 *)out = result0; 1857 1858 result0 = dx_8 * r1; 1859 1860 r2 = ref[4]; 1861 r0 = r0 >> 16; 1862 r1 = r0 | (r2 << 16); 1863 result0 += dx * r1; 1864 *(int32 *)(out + 16) = result0; 1865 1866 ref += 4; 1867 out += 4; 1868 r0 = r2; 1869 } 1870 pRef += srcPitch; 1871 out += (32 - blkwidth); 1872 } 1873 1874 // pRef -= srcPitch*(blkheight+1); 1875 ref = temp; 1876 1877 for (j = 0; j < blkwidth; j += 4) 1878 { 1879 r0 = *(int32 *)ref; 1880 r1 = *(int32 *)(ref + 16); 1881 ref += 32; 1882 out = pOut; 1883 for (i = 0; i < (blkheight >> 1); i++) 1884 { 1885 result0 = dy_8 * r0 + 0x00200020; 1886 r2 = *(int32 *)ref; 1887 result0 += dy * r2; 1888 result0 >>= 6; 1889 result0 &= 0x00FF00FF; 1890 r0 = r2; 1891 1892 result1 = dy_8 * r1 + 0x00200020; 1893 r3 = *(int32 *)(ref + 16); 1894 result1 += dy * r3; 1895 result1 >>= 6; 1896 result1 &= 0x00FF00FF; 1897 r1 = r3; 1898 *(int32 *)out = result0 | (result1 << 8); 1899 out += predPitch; 1900 ref += 32; 1901 1902 result0 = dy_8 * r0 + 0x00200020; 1903 r2 = *(int32 *)ref; 1904 result0 += dy * r2; 1905 result0 >>= 6; 1906 result0 &= 0x00FF00FF; 1907 r0 = r2; 1908 1909 result1 = dy_8 * r1 + 0x00200020; 1910 r3 = *(int32 *)(ref + 16); 1911 result1 += dy * r3; 1912 result1 >>= 6; 1913 result1 &= 0x00FF00FF; 1914 r1 = r3; 1915 *(int32 *)out = result0 | (result1 << 8); 1916 out += predPitch; 1917 ref += 32; 1918 } 1919 pOut += 4; 1920 ref = temp + 4; /* since it can only iterate twice max */ 1921 } 1922 return; 1923 } 1924 1925 void eChromaHorizontalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, 1926 uint8 *pOut, int predPitch, int blkwidth, int blkheight) 1927 { 1928 (void)(dy); 1929 1930 int32 r0, r1, r2, result0, result1; 1931 uint8 *ref, *out; 1932 int i, j; 1933 int dx_8 = 8 - dx; 1934 1935 /* horizontal first */ 1936 for (i = 0; i < blkheight; i++) 1937 { 1938 ref = pRef; 1939 out = pOut; 1940 1941 r0 = ref[0]; 1942 for (j = 0; j < blkwidth; j += 4) 1943 { 1944 r0 |= (ref[2] << 16); 1945 result0 = dx_8 * r0 + 0x00040004; 1946 1947 r1 = ref[1] | (ref[3] << 16); 1948 result0 += dx * r1; 1949 result0 >>= 3; 1950 result0 &= 0x00FF00FF; 1951 1952 result1 = dx_8 * r1 + 0x00040004; 1953 1954 r2 = ref[4]; 1955 r0 = r0 >> 16; 1956 r1 = r0 | (r2 << 16); 1957 result1 += dx * r1; 1958 result1 >>= 3; 1959 result1 &= 0x00FF00FF; 1960 1961 *(int32 *)out = result0 | (result1 << 8); 1962 1963 ref += 4; 1964 out += 4; 1965 r0 = r2; 1966 } 1967 1968 pRef += srcPitch; 1969 pOut += predPitch; 1970 } 1971 return; 1972 } 1973 1974 void eChromaVerticalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, 1975 uint8 *pOut, int predPitch, int blkwidth, int blkheight) 1976 { 1977 (void)(dx); 1978 1979 int32 r0, r1, r2, r3, result0, result1; 1980 int i, j; 1981 uint8 *ref, *out; 1982 int dy_8 = 8 - dy; 1983 /* vertical first */ 1984 for (i = 0; i < blkwidth; i += 4) 1985 { 1986 ref = pRef; 1987 out = pOut; 1988 1989 r0 = ref[0] | (ref[2] << 16); 1990 r1 = ref[1] | (ref[3] << 16); 1991 ref += srcPitch; 1992 for (j = 0; j < blkheight; j++) 1993 { 1994 result0 = dy_8 * r0 + 0x00040004; 1995 r2 = ref[0] | (ref[2] << 16); 1996 result0 += dy * r2; 1997 result0 >>= 3; 1998 result0 &= 0x00FF00FF; 1999 r0 = r2; 2000 2001 result1 = dy_8 * r1 + 0x00040004; 2002 r3 = ref[1] | (ref[3] << 16); 2003 result1 += dy * r3; 2004 result1 >>= 3; 2005 result1 &= 0x00FF00FF; 2006 r1 = r3; 2007 *(int32 *)out = result0 | (result1 << 8); 2008 ref += srcPitch; 2009 out += predPitch; 2010 } 2011 pOut += 4; 2012 pRef += 4; 2013 } 2014 return; 2015 } 2016 2017 void eChromaDiagonalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, 2018 uint8 *pOut, int predPitch, int blkwidth, int blkheight) 2019 { 2020 (void)(blkwidth); 2021 2022 int32 r0, r1, temp0, temp1, result; 2023 int32 temp[9]; 2024 int32 *out; 2025 int i, r_temp; 2026 int dy_8 = 8 - dy; 2027 2028 /* horizontal first */ 2029 out = temp; 2030 for (i = 0; i < blkheight + 1; i++) 2031 { 2032 r_temp = pRef[1]; 2033 temp0 = (pRef[0] << 3) + dx * (r_temp - pRef[0]); 2034 temp1 = (r_temp << 3) + dx * (pRef[2] - r_temp); 2035 r0 = temp0 | (temp1 << 16); 2036 *out++ = r0; 2037 pRef += srcPitch; 2038 } 2039 2040 pRef -= srcPitch * (blkheight + 1); 2041 2042 out = temp; 2043 2044 r0 = *out++; 2045 2046 for (i = 0; i < blkheight; i++) 2047 { 2048 result = dy_8 * r0 + 0x00200020; 2049 r1 = *out++; 2050 result += dy * r1; 2051 result >>= 6; 2052 result &= 0x00FF00FF; 2053 *(int16 *)pOut = (result >> 8) | (result & 0xFF); 2054 r0 = r1; 2055 pOut += predPitch; 2056 } 2057 return; 2058 } 2059 2060 void eChromaHorizontalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, 2061 uint8 *pOut, int predPitch, int blkwidth, int blkheight) 2062 { 2063 (void)(dy); 2064 (void)(blkwidth); 2065 2066 int i, temp, temp0, temp1; 2067 2068 /* horizontal first */ 2069 for (i = 0; i < blkheight; i++) 2070 { 2071 temp = pRef[1]; 2072 temp0 = ((pRef[0] << 3) + dx * (temp - pRef[0]) + 4) >> 3; 2073 temp1 = ((temp << 3) + dx * (pRef[2] - temp) + 4) >> 3; 2074 2075 *(int16 *)pOut = temp0 | (temp1 << 8); 2076 pRef += srcPitch; 2077 pOut += predPitch; 2078 2079 } 2080 return; 2081 } 2082 void eChromaVerticalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, 2083 uint8 *pOut, int predPitch, int blkwidth, int blkheight) 2084 { 2085 (void)(dx); 2086 (void)(blkwidth); 2087 2088 int32 r0, r1, result; 2089 int i; 2090 int dy_8 = 8 - dy; 2091 r0 = pRef[0] | (pRef[1] << 16); 2092 pRef += srcPitch; 2093 for (i = 0; i < blkheight; i++) 2094 { 2095 result = dy_8 * r0 + 0x00040004; 2096 r1 = pRef[0] | (pRef[1] << 16); 2097 result += dy * r1; 2098 result >>= 3; 2099 result &= 0x00FF00FF; 2100 *(int16 *)pOut = (result >> 8) | (result & 0xFF); 2101 r0 = r1; 2102 pRef += srcPitch; 2103 pOut += predPitch; 2104 } 2105 return; 2106 } 2107 2108 void eChromaFullMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy, 2109 uint8 *pOut, int predPitch, int blkwidth, int blkheight) 2110 { 2111 (void)(dx); 2112 (void)(dy); 2113 2114 int i, j; 2115 int offset_in = srcPitch - blkwidth; 2116 int offset_out = predPitch - blkwidth; 2117 uint16 temp; 2118 uint8 byte; 2119 2120 if (((intptr_t)pRef)&1) 2121 { 2122 for (j = blkheight; j > 0; j--) 2123 { 2124 for (i = blkwidth; i > 0; i -= 2) 2125 { 2126 temp = *pRef++; 2127 byte = *pRef++; 2128 temp |= (byte << 8); 2129 *((uint16*)pOut) = temp; /* write 2 bytes */ 2130 pOut += 2; 2131 } 2132 pOut += offset_out; 2133 pRef += offset_in; 2134 } 2135 } 2136 else 2137 { 2138 for (j = blkheight; j > 0; j--) 2139 { 2140 for (i = blkwidth; i > 0; i -= 2) 2141 { 2142 temp = *((uint16*)pRef); 2143 *((uint16*)pOut) = temp; 2144 pRef += 2; 2145 pOut += 2; 2146 } 2147 pOut += offset_out; 2148 pRef += offset_in; 2149 } 2150 } 2151 return ; 2152 } 2153