/* ------------------------------------------------------------------
 * Copyright (C) 1998-2009 PacketVideo
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 * -------------------------------------------------------------------
 */
#include "avcenc_lib.h"
/* 3/29/01 fast half-pel search based on neighboring guess */
/* value ranging from 0 to 4, high complexity (more accurate) to
   low complexity (less accurate) */
#define HP_DISTANCE_TH      5 // 2  /* half-pel distance threshold */

#define PREF_16_VEC 129     /* 1MV bias versus 4MVs */

static const int distance_tab[9][9] =   /* [hp_guess][k] */
{
    {0, 1, 1, 1, 1, 1, 1, 1, 1},
    {1, 0, 1, 2, 3, 4, 3, 2, 1},
    {1, 0, 0, 0, 1, 2, 3, 2, 1},
    {1, 2, 1, 0, 1, 2, 3, 4, 3},
    {1, 2, 1, 0, 0, 0, 1, 2, 3},
    {1, 4, 3, 2, 1, 0, 1, 2, 3},
    {1, 2, 3, 2, 1, 0, 0, 0, 1},
    {1, 2, 3, 4, 3, 2, 1, 0, 1},
    {1, 0, 1, 2, 3, 2, 1, 0, 0}
};

#define CLIP_RESULT(x)      if((uint)x > 0xFF){ \
                 x = 0xFF & (~(x>>31));}

#define CLIP_UPPER16(x)     if((uint)x >= 0x20000000){ \
        x = 0xFF0000 & (~(x>>31));} \
        else { \
        x = (x>>5)&0xFF0000; \
        }
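
/* Added commentary on the clipping macros above: CLIP_RESULT clips a filter
   output to the 8-bit range [0,255] without a second branch. When x falls
   outside the range, ~(x>>31) is 0 for negative x (the arithmetic shift gives
   all ones) and all ones for a positive overflow, so ANDing with 0xFF yields
   0 or 255. CLIP_UPPER16 does the same for a value carried in bits 16..31 of
   a packed register: on overflow it produces 0x000000 or 0xFF0000, otherwise
   it takes the normal rounding path (x>>5)&0xFF0000. */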

/*=====================================================================
    Function:   AVCFindHalfPelMB
    Date:       10/31/2007
    Purpose:    Find half pel resolution MV surrounding the full-pel MV
=====================================================================*/

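/* Added commentary on the search below: the candidate at the full-pel center
   and its eight half-pel neighbors (offsets xh[]/yh[], in quarter-pel units,
   i.e. +/-2) are scored with a rate-distortion style cost
       J = SATD(cand, cur) + MV_COST_S(lambda_motion, mvx, mvy, cmvx, cmvy),
   where (cmvx, cmvy) is the motion-vector predictor. The best half-pel
   position then seeds a second pass over its eight quarter-pel neighbors
   (xq[]/yq[], offsets of +/-1). The function returns the SATD of the winner
   with the motion-vector cost removed (satd_min). */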
int AVCFindHalfPelMB(AVCEncObject *encvid, uint8 *cur, AVCMV *mot, uint8 *ncand,
                     int xpos, int ypos, int hp_guess, int cmvx, int cmvy)
{
    AVCPictureData *currPic = encvid->common->currPic;
    int lx = currPic->pitch;
    int d, dmin, satd_min;
    uint8* cand;
    int lambda_motion = encvid->lambda_motion;
    uint8 *mvbits = encvid->mvbits;
    int mvcost;
    /* list of candidates to go through for the half-pel search */
    uint8 *subpel_pred = (uint8*) encvid->subpel_pred; // all 16 sub-pel positions
    uint8 **hpel_cand = (uint8**) encvid->hpel_cand; /* half-pel positions */

    int xh[9] = {0, 0, 2, 2, 2, 0, -2, -2, -2};
    int yh[9] = {0, -2, -2, 0, 2, 2, 2, 0, -2};
    int xq[8] = {0, 1, 1, 1, 0, -1, -1, -1};
    int yq[8] = { -1, -1, 0, 1, 1, 1, 0, -1};
    int h, hmin, q, qmin;

    OSCL_UNUSED_ARG(xpos);
    OSCL_UNUSED_ARG(ypos);
    OSCL_UNUSED_ARG(hp_guess);

    GenerateHalfPelPred(subpel_pred, ncand, lx);

    cur = encvid->currYMB; // pre-load current original MB

    cand = hpel_cand[0];

    // find cost for the current full-pel position
    dmin = SATD_MB(cand, cur, 65535); // get Hadamard transform SAD
    mvcost = MV_COST_S(lambda_motion, mot->x, mot->y, cmvx, cmvy);
    satd_min = dmin;
    dmin += mvcost;
    hmin = 0;

    /* find half-pel */
    for (h = 1; h < 9; h++)
    {
        d = SATD_MB(hpel_cand[h], cur, dmin);
        mvcost = MV_COST_S(lambda_motion, mot->x + xh[h], mot->y + yh[h], cmvx, cmvy);
        d += mvcost;

        if (d < dmin)
        {
            dmin = d;
            hmin = h;
            satd_min = d - mvcost;
        }
    }

    mot->sad = dmin;
    mot->x += xh[hmin];
    mot->y += yh[hmin];
    encvid->best_hpel_pos = hmin;

    /*** search for quarter-pel ****/
    GenerateQuartPelPred(encvid->bilin_base[hmin], &(encvid->qpel_cand[0][0]), hmin);

    encvid->best_qpel_pos = qmin = -1;

    for (q = 0; q < 8; q++)
    {
        d = SATD_MB(encvid->qpel_cand[q], cur, dmin);
        mvcost = MV_COST_S(lambda_motion, mot->x + xq[q], mot->y + yq[q], cmvx, cmvy);
        d += mvcost;
        if (d < dmin)
        {
            dmin = d;
            qmin = q;
            satd_min = d - mvcost;
        }
    }

    if (qmin != -1)
    {
        mot->sad = dmin;
        mot->x += xq[qmin];
        mot->y += yq[qmin];
        encvid->best_qpel_pos = qmin;
    }

    return satd_min;
}


/** This function generates the sub-pel predictions around the full-pel candidate.
Each sub-pel position array is 20 pixels wide (for word alignment) and 17 pixels tall. */
/** The sub-pel positions are labeled in a spiral manner from the center. */
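/* Added commentary: the half-pel samples below are produced with the H.264
   6-tap filter (1, -5, 20, 20, -5, 1). A single pass is rounded and scaled
   with (sum + 16) >> 5 before clipping to [0,255]; the center (half,half)
   position filters the unclipped horizontal intermediates a second time
   vertically, so the combined scale is 32*32 = 1024 and the rounding becomes
   (sum + 512) >> 10. The intermediates are kept at 16-bit precision in
   tmp_horz for exactly this reason. */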

void GenerateHalfPelPred(uint8* subpel_pred, uint8 *ncand, int lx)
{
    /* let's do it the straightforward way first */
    uint8 *ref;
    uint8 *dst;
    uint8 tmp8;
    int32 tmp32;
    int16 tmp_horz[18*22], *dst_16, *src_16;
    register int a = 0, b = 0, c = 0, d = 0, e = 0, f = 0; // temp registers
    int msk;
    int i, j;

    /* first copy the full-pel block into the first array */
    /* to be optimized later based on byte-offset loads */
    ref = ncand - 3 - lx - (lx << 1); /* move back (-3,-3) */
    dst = subpel_pred;

    dst -= 4; /* offset */
    for (j = 0; j < 22; j++) /* copy a 24x22 block */
    {
        i = 6;
        while (i > 0)
        {
            tmp32 = *ref++;
            tmp8 = *ref++;
            tmp32 |= (tmp8 << 8);
            tmp8 = *ref++;
            tmp32 |= (tmp8 << 16);
            tmp8 = *ref++;
            tmp32 |= (tmp8 << 24);
            *((uint32*)(dst += 4)) = tmp32;
            i--;
        }
        ref += (lx - 24);
    }

    /* from the first array, we do the horizontal interpolation */
    ref = subpel_pred + 2;
    dst_16 = tmp_horz; /* 17 x 22 */

    for (j = 4; j > 0; j--)
    {
        for (i = 16; i > 0; i -= 4)
        {
            a = ref[-2];
            b = ref[-1];
            c = ref[0];
            d = ref[1];
            e = ref[2];
            f = ref[3];
            *dst_16++ = a + f - 5 * (b + e) + 20 * (c + d);
            a = ref[4];
            *dst_16++ = b + a - 5 * (c + f) + 20 * (d + e);
            b = ref[5];
            *dst_16++ = c + b - 5 * (d + a) + 20 * (e + f);
            c = ref[6];
            *dst_16++ = d + c - 5 * (e + b) + 20 * (f + a);

            ref += 4;
        }
        /* do the 17th column here */
        d = ref[3];
        *dst_16 =  e + d - 5 * (f + c) + 20 * (a + b);
        dst_16 += 2; /* stride for tmp_horz is 18 */
        ref += 8;  /* stride for ref is 24 */
        if (j == 3)  // move 18 lines down
        {
            dst_16 += 324; // 18*18
            ref += 432; // 18*24
        }
    }
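
    /* Added commentary: the loop above fills only the top two and the bottom
       two rows of tmp_horz (after j==3 the pointers jump 18 rows down). Those
       margin rows are needed purely as context for the later vertical pass
       over the intermediates. The loop below covers the 18 middle rows and,
       at the same time, writes the clipped horizontal half-pel array
       (V0Q_H2Q). */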

    ref -= 480;  // 20*24
    dst_16 -= 360;  // 20*18
    dst = subpel_pred + V0Q_H2Q * SUBPEL_PRED_BLK_SIZE; /* go to the 14th array 17x18 */

    for (j = 18; j > 0; j--)
    {
        for (i = 16; i > 0; i -= 4)
        {
            a = ref[-2];
            b = ref[-1];
            c = ref[0];
            d = ref[1];
            e = ref[2];
            f = ref[3];
            tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            a = ref[4];
            tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            b = ref[5];
            tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            c = ref[6];
            tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
            *dst_16++ = tmp32;
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *dst++ = tmp32;

            ref += 4;
        }
        /* do the 17th column here */
        d = ref[3];
        tmp32 =  e + d - 5 * (f + c) + 20 * (a + b);
        *dst_16 = tmp32;
        tmp32 = (tmp32 + 16) >> 5;
        CLIP_RESULT(tmp32)
        *dst = tmp32;

        dst += 8;  /* stride for dst is 24 */
        dst_16 += 2; /* stride for tmp_horz is 18 */
        ref += 8;  /* stride for ref is 24 */
    }


    /* Do middle point filtering */
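    /* Added commentary: this pass runs the same 6-tap filter vertically over
       the 16-bit intermediates in tmp_horz (column stride 18). Because both
       the horizontal and vertical passes scale by 32, the combined rounding is
       (sum + 512) >> 10, giving the (half,half) position stored in the
       V2Q_H2Q array. */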
    src_16 = tmp_horz; /* 17 x 22 */
    dst = subpel_pred + V2Q_H2Q * SUBPEL_PRED_BLK_SIZE; /* 12th array 17x17 */
    dst -= 24; // offset
    for (i = 0; i < 17; i++)
    {
        for (j = 16; j > 0; j -= 4)
        {
            a = *src_16;
            b = *(src_16 += 18);
            c = *(src_16 += 18);
            d = *(src_16 += 18);
            e = *(src_16 += 18);
            f = *(src_16 += 18);

            tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
            tmp32 = (tmp32 + 512) >> 10;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            a = *(src_16 += 18);
            tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
            tmp32 = (tmp32 + 512) >> 10;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            b = *(src_16 += 18);
            tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
            tmp32 = (tmp32 + 512) >> 10;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            c = *(src_16 += 18);
            tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
            tmp32 = (tmp32 + 512) >> 10;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;

            src_16 -= (18 << 2);
        }

        d = src_16[90]; // 18*5
        tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
        tmp32 = (tmp32 + 512) >> 10;
        CLIP_RESULT(tmp32)
        dst[24] = tmp32;

        src_16 -= ((18 << 4) - 1);
        dst -= ((24 << 4) - 1);
    }

    /* do vertical interpolation */
    ref = subpel_pred + 2;
    dst = subpel_pred + V2Q_H0Q * SUBPEL_PRED_BLK_SIZE; /* 10th array 18x17 */
    dst -= 24; // offset

    for (i = 2; i > 0; i--)
    {
        for (j = 16; j > 0; j -= 4)
        {
            a = *ref;
            b = *(ref += 24);
            c = *(ref += 24);
            d = *(ref += 24);
            e = *(ref += 24);
            f = *(ref += 24);

            tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            a = *(ref += 24);
            tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            b = *(ref += 24);
            tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            c = *(ref += 24);
            tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            ref -= (24 << 2);
        }

        d = ref[120]; // 24*5
        tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
        tmp32 = (tmp32 + 16) >> 5;
        CLIP_RESULT(tmp32)
        dst[24] = tmp32;  // 10th

        dst -= ((24 << 4) - 1);
        ref -= ((24 << 4) - 1);
    }

    // note that using SIMD-style packing here doesn't help much; the cycle count stays almost the same
    // one can just use the above code and change the for(i=2 loop to for(i=18
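    // Added commentary on the packed loop below: each uint32 load picks up four
    // horizontally adjacent full-pel bytes; masking with 0xFF00FF splits them
    // into two 16-bit lanes (bytes 0 and 2 in a/c/e, bytes 1 and 3 in b/d/f),
    // so the 6-tap vertical filter runs on two pixels per register at once.
    // Adding 0x100010 applies the rounding constant 16 to both lanes,
    // CLIP_UPPER16 clips and shifts each lane, and the results are re-packed
    // before the two 16-bit stores. A negative or overflowing low lane can
    // contaminate the high lane, which appears to be what the commented-out
    // msk check and the VertInterpWClip fallback were meant to catch.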
    for (i = 16; i > 0; i -= 4)
    {
        msk = 0;
        for (j = 17; j > 0; j--)
        {
            a = *((uint32*)ref); /* load 4 bytes */
            b = (a >> 8) & 0xFF00FF; /* second and fourth byte */
            a &= 0xFF00FF;

            c = *((uint32*)(ref + 120));
            d = (c >> 8) & 0xFF00FF;
            c &= 0xFF00FF;

            a += c;
            b += d;

            e = *((uint32*)(ref + 72)); /* e, f */
            f = (e >> 8) & 0xFF00FF;
            e &= 0xFF00FF;

            c = *((uint32*)(ref + 48)); /* c, d */
            d = (c >> 8) & 0xFF00FF;
            c &= 0xFF00FF;

            c += e;
            d += f;

            a += 20 * c;
            b += 20 * d;
            a += 0x100010;
            b += 0x100010;

            e = *((uint32*)(ref += 24)); /* e, f */
            f = (e >> 8) & 0xFF00FF;
            e &= 0xFF00FF;

            c = *((uint32*)(ref + 72)); /* c, d */
            d = (c >> 8) & 0xFF00FF;
            c &= 0xFF00FF;

            c += e;
            d += f;

            a -= 5 * c;
            b -= 5 * d;

            c = a << 16;
            d = b << 16;
            CLIP_UPPER16(a)
            CLIP_UPPER16(c)
            CLIP_UPPER16(b)
            CLIP_UPPER16(d)

            a |= (c >> 16);
            b |= (d >> 16);
            //  a>>=5;
            //  b>>=5;
            /* clip */
            //  msk |= b;  msk|=a;
            //  a &= 0xFF00FF;
            //  b &= 0xFF00FF;
            a |= (b << 8);  /* pack it back */

            *((uint16*)(dst += 24)) = a & 0xFFFF; // dst is not word-aligned.
            *((uint16*)(dst + 2)) = a >> 16;

        }
        dst -= 404; // 24*17-4
        ref -= 404;
        /*      if(msk & 0xFF00FF00) // need clipping
                {
                    VertInterpWClip(dst,ref); // re-do 4 columns with clipping
                }*/
    }

    return ;
}

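/* Added commentary: VertInterpWClip is the scalar fallback for the packed
   vertical-interpolation loop above. It re-filters four columns with the
   plain 6-tap path, applying CLIP_RESULT per pixel, and is intended to be
   invoked from the (currently commented-out) overflow check on msk. */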
void VertInterpWClip(uint8 *dst, uint8 *ref)
{
    int i, j;
    int a, b, c, d, e, f;
    int32 tmp32;

    dst -= 4;
    ref -= 4;

    for (i = 4; i > 0; i--)
    {
        for (j = 16; j > 0; j -= 4)
        {
            a = *ref;
            b = *(ref += 24);
            c = *(ref += 24);
            d = *(ref += 24);
            e = *(ref += 24);
            f = *(ref += 24);

            tmp32 = a + f - 5 * (b + e) + 20 * (c + d);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            a = *(ref += 24);
            tmp32 = b + a - 5 * (c + f) + 20 * (d + e);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            b = *(ref += 24);
            tmp32 = c + b - 5 * (d + a) + 20 * (e + f);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            c = *(ref += 24);
            tmp32 = d + c - 5 * (e + b) + 20 * (f + a);
            tmp32 = (tmp32 + 16) >> 5;
            CLIP_RESULT(tmp32)
            *(dst += 24) = tmp32;  // 10th

            ref -= (24 << 2);
        }

        d = ref[120]; // 24*5
        tmp32 = e + d - 5 * (f + c) + 20 * (a + b);
        tmp32 = (tmp32 + 16) >> 5;
        CLIP_RESULT(tmp32)
        dst[24] = tmp32;  // 10th

        dst -= ((24 << 4) - 1);
        ref -= ((24 << 4) - 1);
    }

    return ;
}


void GenerateQuartPelPred(uint8 **bilin_base, uint8 *qpel_cand, int hpel_pos)
{
    // for even values of hpel_pos, use the diamond pattern; otherwise, use the star pattern
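    // Added commentary: each of the eight quarter-pel candidates is a rounded
    // bilinear average, (x + y + 1) >> 1, of two of the surrounding half-pel
    // (or full-pel) planes passed in via bilin_base[]. The candidate blocks
    // live 384 bytes apart (24 * 16, one candidate block) with a row pitch of
    // 24; 'offset' rewinds c1 from candidate 8 back to candidate 1 at the next
    // pixel. The diamond pattern is used when the best half-pel position index
    // is even, the star pattern when it is odd.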
    int i, j;

    uint8 *c1 = qpel_cand;
    uint8 *tl = bilin_base[0];
    uint8 *tr = bilin_base[1];
    uint8 *bl = bilin_base[2];
    uint8 *br = bilin_base[3];
    int a, b, c, d;
    int offset = 1 - (384 * 7);

    if (!(hpel_pos&1)) // diamond pattern
    {
        j = 16;
        while (j--)
        {
            i = 16;
            while (i--)
            {
                d = tr[24];
                a = *tr++;
                b = bl[1];
                c = *br++;

                *c1 = (c + a + 1) >> 1;
                *(c1 += 384) = (b + a + 1) >> 1; /* c2 */
                *(c1 += 384) = (b + c + 1) >> 1; /* c3 */
                *(c1 += 384) = (b + d + 1) >> 1; /* c4 */

                b = *bl++;

                *(c1 += 384) = (c + d + 1) >> 1;  /* c5 */
                *(c1 += 384) = (b + d + 1) >> 1;  /* c6 */
                *(c1 += 384) = (b + c + 1) >> 1;  /* c7 */
                *(c1 += 384) = (b + a + 1) >> 1;  /* c8 */

                c1 += offset;
            }
            // advance to the next line, pitch is 24
            tl += 8;
            tr += 8;
            bl += 8;
            br += 8;
            c1 += 8;
        }
    }
    else // star pattern
    {
        j = 16;
        while (j--)
        {
            i = 16;
            while (i--)
            {
                a = *br++;
                b = *tr++;
                c = tl[1];
                *c1 = (a + b + 1) >> 1;
                b = bl[1];
                *(c1 += 384) = (a + c + 1) >> 1; /* c2 */
                c = tl[25];
                *(c1 += 384) = (a + b + 1) >> 1; /* c3 */
                b = tr[23];
                *(c1 += 384) = (a + c + 1) >> 1; /* c4 */
                c = tl[24];
                *(c1 += 384) = (a + b + 1) >> 1; /* c5 */
                b = *bl++;
                *(c1 += 384) = (a + c + 1) >> 1; /* c6 */
                c = *tl++;
                *(c1 += 384) = (a + b + 1) >> 1; /* c7 */
                *(c1 += 384) = (a + c + 1) >> 1; /* c8 */

                c1 += offset;
            }
            // advance to the next line, pitch is 24
            tl += 8;
            tr += 8;
            bl += 8;
            br += 8;
            c1 += 8;
        }
    }

    return ;
}


/* assuming cand always has a pitch of 24 */
int SATD_MB(uint8 *cand, uint8 *cur, int dmin)
{
    int cost;

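    /* Added commentary: the SAD/SATD routine receives the current minimum and
       the candidate pitch packed into one argument; the minimum goes in the
       upper 16 bits (presumably for early termination) and the pitch, 24 here,
       in the lower bits. */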
    dmin = (dmin << 16) | 24;
    cost = AVCSAD_Macroblock_C(cand, cur, dmin, NULL);

    return cost;
}
