1 /* ------------------------------------------------------------------ 2 * Copyright (C) 1998-2009 PacketVideo 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 13 * express or implied. 14 * See the License for the specific language governing permissions 15 * and limitations under the License. 16 * ------------------------------------------------------------------- 17 */ 18 #include "avcenc_lib.h" 19 /* 3/29/01 fast half-pel search based on neighboring guess */ 20 /* value ranging from 0 to 4, high complexity (more accurate) to 21 low complexity (less accurate) */ 22 #define HP_DISTANCE_TH 5 // 2 /* half-pel distance threshold */ 23 24 #define PREF_16_VEC 129 /* 1MV bias versus 4MVs*/ 25 26 const static int distance_tab[9][9] = /* [hp_guess][k] */ 27 { 28 {0, 1, 1, 1, 1, 1, 1, 1, 1}, 29 {1, 0, 1, 2, 3, 4, 3, 2, 1}, 30 {1, 0, 0, 0, 1, 2, 3, 2, 1}, 31 {1, 2, 1, 0, 1, 2, 3, 4, 3}, 32 {1, 2, 1, 0, 0, 0, 1, 2, 3}, 33 {1, 4, 3, 2, 1, 0, 1, 2, 3}, 34 {1, 2, 3, 2, 1, 0, 0, 0, 1}, 35 {1, 2, 3, 4, 3, 2, 1, 0, 1}, 36 {1, 0, 1, 2, 3, 2, 1, 0, 0} 37 }; 38 39 #define CLIP_RESULT(x) if((uint)x > 0xFF){ \ 40 x = 0xFF & (~(x>>31));} 41 42 #define CLIP_UPPER16(x) if((uint)x >= 0x20000000){ \ 43 x = 0xFF0000 & (~(x>>31));} \ 44 else { \ 45 x = (x>>5)&0xFF0000; \ 46 } 47 48 /*===================================================================== 49 Function: AVCFindHalfPelMB 50 Date: 10/31/2007 51 Purpose: Find half pel resolution MV surrounding the full-pel MV 52 =====================================================================*/ 53 54 int AVCFindHalfPelMB(AVCEncObject *encvid, uint8 *cur, AVCMV *mot, uint8 *ncand, 55 int xpos, int ypos, int hp_guess, int cmvx, int cmvy) 56 { 57 AVCPictureData *currPic = encvid->common->currPic; 58 int lx = currPic->pitch; 59 int d, dmin, satd_min; 60 uint8* cand; 61 int lambda_motion = encvid->lambda_motion; 62 uint8 *mvbits = encvid->mvbits; 63 int mvcost; 64 /* list of candidate to go through for half-pel search*/ 65 uint8 *subpel_pred = (uint8*) encvid->subpel_pred; // all 16 sub-pel positions 66 uint8 **hpel_cand = (uint8**) encvid->hpel_cand; /* half-pel position */ 67 68 int xh[9] = {0, 0, 2, 2, 2, 0, -2, -2, -2}; 69 int yh[9] = {0, -2, -2, 0, 2, 2, 2, 0, -2}; 70 int xq[8] = {0, 1, 1, 1, 0, -1, -1, -1}; 71 int yq[8] = { -1, -1, 0, 1, 1, 1, 0, -1}; 72 int h, hmin, q, qmin; 73 74 OSCL_UNUSED_ARG(xpos); 75 OSCL_UNUSED_ARG(ypos); 76 OSCL_UNUSED_ARG(hp_guess); 77 78 GenerateHalfPelPred(subpel_pred, ncand, lx); 79 80 cur = encvid->currYMB; // pre-load current original MB 81 82 cand = hpel_cand[0]; 83 84 // find cost for the current full-pel position 85 dmin = SATD_MB(cand, cur, 65535); // get Hadamaard transform SAD 86 mvcost = MV_COST_S(lambda_motion, mot->x, mot->y, cmvx, cmvy); 87 satd_min = dmin; 88 dmin += mvcost; 89 hmin = 0; 90 91 /* find half-pel */ 92 for (h = 1; h < 9; h++) 93 { 94 d = SATD_MB(hpel_cand[h], cur, dmin); 95 mvcost = MV_COST_S(lambda_motion, mot->x + xh[h], mot->y + yh[h], cmvx, cmvy); 96 d += mvcost; 97 98 if (d < dmin) 99 { 100 dmin = d; 101 hmin = h; 102 satd_min = d - mvcost; 103 } 104 } 105 106 mot->sad = dmin; 107 mot->x += xh[hmin]; 108 mot->y += yh[hmin]; 109 encvid->best_hpel_pos = hmin; 110 111 /*** search for quarter-pel ****/ 112 GenerateQuartPelPred(encvid->bilin_base[hmin], &(encvid->qpel_cand[0][0]), hmin); 113 114 encvid->best_qpel_pos = qmin = -1; 115 116 for (q = 0; q < 8; q++) 117 { 118 d = SATD_MB(encvid->qpel_cand[q], cur, dmin); 119 mvcost = MV_COST_S(lambda_motion, mot->x + xq[q], mot->y + yq[q], cmvx, cmvy); 120 d += mvcost; 121 if (d < dmin) 122 { 123 dmin = d; 124 qmin = q; 125 satd_min = d - mvcost; 126 } 127 } 128 129 if (qmin != -1) 130 { 131 mot->sad = dmin; 132 mot->x += xq[qmin]; 133 mot->y += yq[qmin]; 134 encvid->best_qpel_pos = qmin; 135 } 136 137 return satd_min; 138 } 139 140 141 142 /** This function generates sub-pel prediction around the full-pel candidate. 143 Each sub-pel position array is 20 pixel wide (for word-alignment) and 17 pixel tall. */ 144 /** The sub-pel position is labeled in spiral manner from the center. */ 145 146 void GenerateHalfPelPred(uint8* subpel_pred, uint8 *ncand, int lx) 147 { 148 /* let's do straightforward way first */ 149 uint8 *ref; 150 uint8 *dst; 151 uint8 tmp8; 152 int32 tmp32; 153 int16 tmp_horz[18*22], *dst_16, *src_16; 154 register int a = 0, b = 0, c = 0, d = 0, e = 0, f = 0; // temp register 155 int msk; 156 int i, j; 157 158 /* first copy full-pel to the first array */ 159 /* to be optimized later based on byte-offset load */ 160 ref = ncand - 3 - lx - (lx << 1); /* move back (-3,-3) */ 161 dst = subpel_pred; 162 163 dst -= 4; /* offset */ 164 for (j = 0; j < 22; j++) /* 24x22 */ 165 { 166 i = 6; 167 while (i > 0) 168 { 169 tmp32 = *ref++; 170 tmp8 = *ref++; 171 tmp32 |= (tmp8 << 8); 172 tmp8 = *ref++; 173 tmp32 |= (tmp8 << 16); 174 tmp8 = *ref++; 175 tmp32 |= (tmp8 << 24); 176 *((uint32*)(dst += 4)) = tmp32; 177 i--; 178 } 179 ref += (lx - 24); 180 } 181 182 /* from the first array, we do horizontal interp */ 183 ref = subpel_pred + 2; 184 dst_16 = tmp_horz; /* 17 x 22 */ 185 186 for (j = 4; j > 0; j--) 187 { 188 for (i = 16; i > 0; i -= 4) 189 { 190 a = ref[-2]; 191 b = ref[-1]; 192 c = ref[0]; 193 d = ref[1]; 194 e = ref[2]; 195 f = ref[3]; 196 *dst_16++ = a + f - 5 * (b + e) + 20 * (c + d); 197 a = ref[4]; 198 *dst_16++ = b + a - 5 * (c + f) + 20 * (d + e); 199 b = ref[5]; 200 *dst_16++ = c + b - 5 * (d + a) + 20 * (e + f); 201 c = ref[6]; 202 *dst_16++ = d + c - 5 * (e + b) + 20 * (f + a); 203 204 ref += 4; 205 } 206 /* do the 17th column here */ 207 d = ref[3]; 208 *dst_16 = e + d - 5 * (f + c) + 20 * (a + b); 209 dst_16 += 2; /* stride for tmp_horz is 18 */ 210 ref += 8; /* stride for ref is 24 */ 211 if (j == 3) // move 18 lines down 212 { 213 dst_16 += 324;//18*18; 214 ref += 432;//18*24; 215 } 216 } 217 218 ref -= 480;//20*24; 219 dst_16 -= 360;//20*18; 220 dst = subpel_pred + V0Q_H2Q * SUBPEL_PRED_BLK_SIZE; /* go to the 14th array 17x18*/ 221 222 for (j = 18; j > 0; j--) 223 { 224 for (i = 16; i > 0; i -= 4) 225 { 226 a = ref[-2]; 227 b = ref[-1]; 228 c = ref[0]; 229 d = ref[1]; 230 e = ref[2]; 231 f = ref[3]; 232 tmp32 = a + f - 5 * (b + e) + 20 * (c + d); 233 *dst_16++ = tmp32; 234 tmp32 = (tmp32 + 16) >> 5; 235 CLIP_RESULT(tmp32) 236 *dst++ = tmp32; 237 238 a = ref[4]; 239 tmp32 = b + a - 5 * (c + f) + 20 * (d + e); 240 *dst_16++ = tmp32; 241 tmp32 = (tmp32 + 16) >> 5; 242 CLIP_RESULT(tmp32) 243 *dst++ = tmp32; 244 245 b = ref[5]; 246 tmp32 = c + b - 5 * (d + a) + 20 * (e + f); 247 *dst_16++ = tmp32; 248 tmp32 = (tmp32 + 16) >> 5; 249 CLIP_RESULT(tmp32) 250 *dst++ = tmp32; 251 252 c = ref[6]; 253 tmp32 = d + c - 5 * (e + b) + 20 * (f + a); 254 *dst_16++ = tmp32; 255 tmp32 = (tmp32 + 16) >> 5; 256 CLIP_RESULT(tmp32) 257 *dst++ = tmp32; 258 259 ref += 4; 260 } 261 /* do the 17th column here */ 262 d = ref[3]; 263 tmp32 = e + d - 5 * (f + c) + 20 * (a + b); 264 *dst_16 = tmp32; 265 tmp32 = (tmp32 + 16) >> 5; 266 CLIP_RESULT(tmp32) 267 *dst = tmp32; 268 269 dst += 8; /* stride for dst is 24 */ 270 dst_16 += 2; /* stride for tmp_horz is 18 */ 271 ref += 8; /* stride for ref is 24 */ 272 } 273 274 275 /* Do middle point filtering*/ 276 src_16 = tmp_horz; /* 17 x 22 */ 277 dst = subpel_pred + V2Q_H2Q * SUBPEL_PRED_BLK_SIZE; /* 12th array 17x17*/ 278 dst -= 24; // offset 279 for (i = 0; i < 17; i++) 280 { 281 for (j = 16; j > 0; j -= 4) 282 { 283 a = *src_16; 284 b = *(src_16 += 18); 285 c = *(src_16 += 18); 286 d = *(src_16 += 18); 287 e = *(src_16 += 18); 288 f = *(src_16 += 18); 289 290 tmp32 = a + f - 5 * (b + e) + 20 * (c + d); 291 tmp32 = (tmp32 + 512) >> 10; 292 CLIP_RESULT(tmp32) 293 *(dst += 24) = tmp32; 294 295 a = *(src_16 += 18); 296 tmp32 = b + a - 5 * (c + f) + 20 * (d + e); 297 tmp32 = (tmp32 + 512) >> 10; 298 CLIP_RESULT(tmp32) 299 *(dst += 24) = tmp32; 300 301 b = *(src_16 += 18); 302 tmp32 = c + b - 5 * (d + a) + 20 * (e + f); 303 tmp32 = (tmp32 + 512) >> 10; 304 CLIP_RESULT(tmp32) 305 *(dst += 24) = tmp32; 306 307 c = *(src_16 += 18); 308 tmp32 = d + c - 5 * (e + b) + 20 * (f + a); 309 tmp32 = (tmp32 + 512) >> 10; 310 CLIP_RESULT(tmp32) 311 *(dst += 24) = tmp32; 312 313 src_16 -= (18 << 2); 314 } 315 316 d = src_16[90]; // 18*5 317 tmp32 = e + d - 5 * (f + c) + 20 * (a + b); 318 tmp32 = (tmp32 + 512) >> 10; 319 CLIP_RESULT(tmp32) 320 dst[24] = tmp32; 321 322 src_16 -= ((18 << 4) - 1); 323 dst -= ((24 << 4) - 1); 324 } 325 326 /* do vertical interpolation */ 327 ref = subpel_pred + 2; 328 dst = subpel_pred + V2Q_H0Q * SUBPEL_PRED_BLK_SIZE; /* 10th array 18x17 */ 329 dst -= 24; // offset 330 331 for (i = 2; i > 0; i--) 332 { 333 for (j = 16; j > 0; j -= 4) 334 { 335 a = *ref; 336 b = *(ref += 24); 337 c = *(ref += 24); 338 d = *(ref += 24); 339 e = *(ref += 24); 340 f = *(ref += 24); 341 342 tmp32 = a + f - 5 * (b + e) + 20 * (c + d); 343 tmp32 = (tmp32 + 16) >> 5; 344 CLIP_RESULT(tmp32) 345 *(dst += 24) = tmp32; // 10th 346 347 a = *(ref += 24); 348 tmp32 = b + a - 5 * (c + f) + 20 * (d + e); 349 tmp32 = (tmp32 + 16) >> 5; 350 CLIP_RESULT(tmp32) 351 *(dst += 24) = tmp32; // 10th 352 353 b = *(ref += 24); 354 tmp32 = c + b - 5 * (d + a) + 20 * (e + f); 355 tmp32 = (tmp32 + 16) >> 5; 356 CLIP_RESULT(tmp32) 357 *(dst += 24) = tmp32; // 10th 358 359 c = *(ref += 24); 360 tmp32 = d + c - 5 * (e + b) + 20 * (f + a); 361 tmp32 = (tmp32 + 16) >> 5; 362 CLIP_RESULT(tmp32) 363 *(dst += 24) = tmp32; // 10th 364 365 ref -= (24 << 2); 366 } 367 368 d = ref[120]; // 24*5 369 tmp32 = e + d - 5 * (f + c) + 20 * (a + b); 370 tmp32 = (tmp32 + 16) >> 5; 371 CLIP_RESULT(tmp32) 372 dst[24] = tmp32; // 10th 373 374 dst -= ((24 << 4) - 1); 375 ref -= ((24 << 4) - 1); 376 } 377 378 // note that using SIMD here doesn't help much, the cycle almost stays the same 379 // one can just use the above code and change the for(i=2 to for(i=18 380 for (i = 16; i > 0; i -= 4) 381 { 382 msk = 0; 383 for (j = 17; j > 0; j--) 384 { 385 a = *((uint32*)ref); /* load 4 bytes */ 386 b = (a >> 8) & 0xFF00FF; /* second and fourth byte */ 387 a &= 0xFF00FF; 388 389 c = *((uint32*)(ref + 120)); 390 d = (c >> 8) & 0xFF00FF; 391 c &= 0xFF00FF; 392 393 a += c; 394 b += d; 395 396 e = *((uint32*)(ref + 72)); /* e, f */ 397 f = (e >> 8) & 0xFF00FF; 398 e &= 0xFF00FF; 399 400 c = *((uint32*)(ref + 48)); /* c, d */ 401 d = (c >> 8) & 0xFF00FF; 402 c &= 0xFF00FF; 403 404 c += e; 405 d += f; 406 407 a += 20 * c; 408 b += 20 * d; 409 a += 0x100010; 410 b += 0x100010; 411 412 e = *((uint32*)(ref += 24)); /* e, f */ 413 f = (e >> 8) & 0xFF00FF; 414 e &= 0xFF00FF; 415 416 c = *((uint32*)(ref + 72)); /* c, d */ 417 d = (c >> 8) & 0xFF00FF; 418 c &= 0xFF00FF; 419 420 c += e; 421 d += f; 422 423 a -= 5 * c; 424 b -= 5 * d; 425 426 c = a << 16; 427 d = b << 16; 428 CLIP_UPPER16(a) 429 CLIP_UPPER16(c) 430 CLIP_UPPER16(b) 431 CLIP_UPPER16(d) 432 433 a |= (c >> 16); 434 b |= (d >> 16); 435 // a>>=5; 436 // b>>=5; 437 /* clip */ 438 // msk |= b; msk|=a; 439 // a &= 0xFF00FF; 440 // b &= 0xFF00FF; 441 a |= (b << 8); /* pack it back */ 442 443 *((uint16*)(dst += 24)) = a & 0xFFFF; //dst is not word-aligned. 444 *((uint16*)(dst + 2)) = a >> 16; 445 446 } 447 dst -= 404; // 24*17-4 448 ref -= 404; 449 /* if(msk & 0xFF00FF00) // need clipping 450 { 451 VertInterpWClip(dst,ref); // re-do 4 column with clip 452 }*/ 453 } 454 455 return ; 456 } 457 458 void VertInterpWClip(uint8 *dst, uint8 *ref) 459 { 460 int i, j; 461 int a, b, c, d, e, f; 462 int32 tmp32; 463 464 dst -= 4; 465 ref -= 4; 466 467 for (i = 4; i > 0; i--) 468 { 469 for (j = 16; j > 0; j -= 4) 470 { 471 a = *ref; 472 b = *(ref += 24); 473 c = *(ref += 24); 474 d = *(ref += 24); 475 e = *(ref += 24); 476 f = *(ref += 24); 477 478 tmp32 = a + f - 5 * (b + e) + 20 * (c + d); 479 tmp32 = (tmp32 + 16) >> 5; 480 CLIP_RESULT(tmp32) 481 *(dst += 24) = tmp32; // 10th 482 483 a = *(ref += 24); 484 tmp32 = b + a - 5 * (c + f) + 20 * (d + e); 485 tmp32 = (tmp32 + 16) >> 5; 486 CLIP_RESULT(tmp32) 487 *(dst += 24) = tmp32; // 10th 488 489 b = *(ref += 24); 490 tmp32 = c + b - 5 * (d + a) + 20 * (e + f); 491 tmp32 = (tmp32 + 16) >> 5; 492 CLIP_RESULT(tmp32) 493 *(dst += 24) = tmp32; // 10th 494 495 c = *(ref += 24); 496 tmp32 = d + c - 5 * (e + b) + 20 * (f + a); 497 tmp32 = (tmp32 + 16) >> 5; 498 CLIP_RESULT(tmp32) 499 *(dst += 24) = tmp32; // 10th 500 501 ref -= (24 << 2); 502 } 503 504 d = ref[120]; // 24*5 505 tmp32 = e + d - 5 * (f + c) + 20 * (a + b); 506 tmp32 = (tmp32 + 16) >> 5; 507 CLIP_RESULT(tmp32) 508 dst[24] = tmp32; // 10th 509 510 dst -= ((24 << 4) - 1); 511 ref -= ((24 << 4) - 1); 512 } 513 514 return ; 515 } 516 517 518 void GenerateQuartPelPred(uint8 **bilin_base, uint8 *qpel_cand, int hpel_pos) 519 { 520 // for even value of hpel_pos, start with pattern 1, otherwise, start with pattern 2 521 int i, j; 522 523 uint8 *c1 = qpel_cand; 524 uint8 *tl = bilin_base[0]; 525 uint8 *tr = bilin_base[1]; 526 uint8 *bl = bilin_base[2]; 527 uint8 *br = bilin_base[3]; 528 int a, b, c, d; 529 int offset = 1 - (384 * 7); 530 531 if (!(hpel_pos&1)) // diamond pattern 532 { 533 j = 16; 534 while (j--) 535 { 536 i = 16; 537 while (i--) 538 { 539 d = tr[24]; 540 a = *tr++; 541 b = bl[1]; 542 c = *br++; 543 544 *c1 = (c + a + 1) >> 1; 545 *(c1 += 384) = (b + a + 1) >> 1; /* c2 */ 546 *(c1 += 384) = (b + c + 1) >> 1; /* c3 */ 547 *(c1 += 384) = (b + d + 1) >> 1; /* c4 */ 548 549 b = *bl++; 550 551 *(c1 += 384) = (c + d + 1) >> 1; /* c5 */ 552 *(c1 += 384) = (b + d + 1) >> 1; /* c6 */ 553 *(c1 += 384) = (b + c + 1) >> 1; /* c7 */ 554 *(c1 += 384) = (b + a + 1) >> 1; /* c8 */ 555 556 c1 += offset; 557 } 558 // advance to the next line, pitch is 24 559 tl += 8; 560 tr += 8; 561 bl += 8; 562 br += 8; 563 c1 += 8; 564 } 565 } 566 else // star pattern 567 { 568 j = 16; 569 while (j--) 570 { 571 i = 16; 572 while (i--) 573 { 574 a = *br++; 575 b = *tr++; 576 c = tl[1]; 577 *c1 = (a + b + 1) >> 1; 578 b = bl[1]; 579 *(c1 += 384) = (a + c + 1) >> 1; /* c2 */ 580 c = tl[25]; 581 *(c1 += 384) = (a + b + 1) >> 1; /* c3 */ 582 b = tr[23]; 583 *(c1 += 384) = (a + c + 1) >> 1; /* c4 */ 584 c = tl[24]; 585 *(c1 += 384) = (a + b + 1) >> 1; /* c5 */ 586 b = *bl++; 587 *(c1 += 384) = (a + c + 1) >> 1; /* c6 */ 588 c = *tl++; 589 *(c1 += 384) = (a + b + 1) >> 1; /* c7 */ 590 *(c1 += 384) = (a + c + 1) >> 1; /* c8 */ 591 592 c1 += offset; 593 } 594 // advance to the next line, pitch is 24 595 tl += 8; 596 tr += 8; 597 bl += 8; 598 br += 8; 599 c1 += 8; 600 } 601 } 602 603 return ; 604 } 605 606 607 /* assuming cand always has a pitch of 24 */ 608 int SATD_MB(uint8 *cand, uint8 *cur, int dmin) 609 { 610 int cost; 611 612 613 dmin = (dmin << 16) | 24; 614 cost = AVCSAD_Macroblock_C(cand, cur, dmin, NULL); 615 616 return cost; 617 } 618 619 620 621 622 623