Home | History | Annotate | Download | only in source
      1 /*
      2  * Copyright (C) 2009 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 /*------------------------------------------------------------------------------
     18 
     19     Table of contents
     20 
     21      1. Include headers
     22      2. External compiler flags
     23      3. Module defines
     24      4. Local function prototypes
     25      5. Functions
     26 
     27 ------------------------------------------------------------------------------*/
     28 
     29 /*------------------------------------------------------------------------------
     30     1. Include headers
     31 ------------------------------------------------------------------------------*/
     32 
     33 #include "basetype.h"
     34 #include "h264bsd_reconstruct.h"
     35 #include "h264bsd_macroblock_layer.h"
     36 #include "h264bsd_image.h"
     37 #include "h264bsd_util.h"
     38 
     39 #ifdef H264DEC_OMXDL
     40 #include "omxtypes.h"
     41 #include "omxVC.h"
     42 #include "armVC.h"
     43 #endif /* H264DEC_OMXDL */
     44 
     45 /*------------------------------------------------------------------------------
     46     2. External compiler flags
     47 --------------------------------------------------------------------------------
     48 
     49 --------------------------------------------------------------------------------
     50     3. Module defines
     51 ------------------------------------------------------------------------------*/
     52 
     53 /* Switch off the following Lint messages for this file:
     54  * Info 701: Shift left of signed quantity (int)
     55  * Info 702: Shift right of signed quantity (int)
     56  */
     57 /*lint -e701 -e702 */
     58 
     59 /* Luma fractional-sample positions
     60  *
     61  *  G a b c H
     62  *  d e f g
     63  *  h i j k m
     64  *  n p q r
     65  *  M   s   N
     66  *
     67  *  G, H, M and N are integer sample positions
     68  *  a-s are fractional samples that need to be interpolated.
     69  */
     70 #ifndef H264DEC_OMXDL
     71 static const u32 lumaFracPos[4][4] = {
     72   /* G  d  h  n    a  e  i  p    b  f  j   q     c   g   k   r */
     73     {0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}, {12, 13, 14, 15}};
     74 #endif /* H264DEC_OMXDL */
     75 
     76 /* clipping table, defined in h264bsd_intra_prediction.c */
     77 extern const u8 h264bsdClip[];
     78 
     79 /*------------------------------------------------------------------------------
     80     4. Local function prototypes
     81 ------------------------------------------------------------------------------*/
     82 
     83 #ifndef H264DEC_OMXDL
     84 
     85 /*------------------------------------------------------------------------------
     86 
     87     Function: h264bsdInterpolateChromaHor
     88 
     89         Functional description:
     90           This function performs chroma interpolation in horizontal direction.
     91           Overfilling is done only if needed. Reference image (pRef) is
     92           read at correct position and the predicted part is written to
     93           macroblock's chrominance (predPartChroma)
     94         Inputs:
     95           pRef              pointer to reference frame Cb top-left corner
     96           x0                integer x-coordinate for prediction
     97           y0                integer y-coordinate for prediction
     98           width             width of the reference frame chrominance in pixels
     99           height            height of the reference frame chrominance in pixels
    100           xFrac             horizontal fraction for prediction in 1/8 pixels
    101           chromaPartWidth   width of the predicted part in pixels
    102           chromaPartHeight  height of the predicted part in pixels
    103         Outputs:
    104           predPartChroma    pointer where predicted part is written
    105 
    106 ------------------------------------------------------------------------------*/
    107 #ifndef H264DEC_ARM11
    108 void h264bsdInterpolateChromaHor(
    109   u8 *pRef,
    110   u8 *predPartChroma,
    111   i32 x0,
    112   i32 y0,
    113   u32 width,
    114   u32 height,
    115   u32 xFrac,
    116   u32 chromaPartWidth,
    117   u32 chromaPartHeight)
    118 {
    119 
    120 /* Variables */
    121 
    122     u32 x, y, tmp1, tmp2, tmp3, tmp4, c, val;
    123     u8 *ptrA, *cbr;
    124     u32 comp;
    125     u8 block[9*8*2];
    126 
    127 /* Code */
    128 
    129     ASSERT(predPartChroma);
    130     ASSERT(chromaPartWidth);
    131     ASSERT(chromaPartHeight);
    132     ASSERT(xFrac < 8);
    133     ASSERT(pRef);
    134 
    135     if ((x0 < 0) || ((u32)x0+chromaPartWidth+1 > width) ||
    136         (y0 < 0) || ((u32)y0+chromaPartHeight > height))
    137     {
    138         h264bsdFillBlock(pRef, block, x0, y0, width, height,
    139             chromaPartWidth + 1, chromaPartHeight, chromaPartWidth + 1);
    140         pRef += width * height;
    141         h264bsdFillBlock(pRef, block + (chromaPartWidth+1)*chromaPartHeight,
    142             x0, y0, width, height, chromaPartWidth + 1,
    143             chromaPartHeight, chromaPartWidth + 1);
    144 
    145         pRef = block;
    146         x0 = 0;
    147         y0 = 0;
    148         width = chromaPartWidth+1;
    149         height = chromaPartHeight;
    150     }
    151 
    152     val = 8 - xFrac;
    153 
    154     for (comp = 0; comp <= 1; comp++)
    155     {
    156 
    157         ptrA = pRef + (comp * height + (u32)y0) * width + x0;
    158         cbr = predPartChroma + comp * 8 * 8;
    159 
    160         /* 2x2 pels per iteration
    161          * bilinear horizontal interpolation */
    162         for (y = (chromaPartHeight >> 1); y; y--)
    163         {
    164             for (x = (chromaPartWidth >> 1); x; x--)
    165             {
    166                 tmp1 = ptrA[width];
    167                 tmp2 = *ptrA++;
    168                 tmp3 = ptrA[width];
    169                 tmp4 = *ptrA++;
    170                 c = ((val * tmp1 + xFrac * tmp3) << 3) + 32;
    171                 c >>= 6;
    172                 cbr[8] = (u8)c;
    173                 c = ((val * tmp2 + xFrac * tmp4) << 3) + 32;
    174                 c >>= 6;
    175                 *cbr++ = (u8)c;
    176                 tmp1 = ptrA[width];
    177                 tmp2 = *ptrA;
    178                 c = ((val * tmp3 + xFrac * tmp1) << 3) + 32;
    179                 c >>= 6;
    180                 cbr[8] = (u8)c;
    181                 c = ((val * tmp4 + xFrac * tmp2) << 3) + 32;
    182                 c >>= 6;
    183                 *cbr++ = (u8)c;
    184             }
    185             cbr += 2*8 - chromaPartWidth;
    186             ptrA += 2*width - chromaPartWidth;
    187         }
    188     }
    189 
    190 }
    191 
    192 /*------------------------------------------------------------------------------
    193 
    194     Function: h264bsdInterpolateChromaVer
    195 
    196         Functional description:
    197           This function performs chroma interpolation in vertical direction.
    198           Overfilling is done only if needed. Reference image (pRef) is
    199           read at correct position and the predicted part is written to
    200           macroblock's chrominance (predPartChroma)
    201 
    202 ------------------------------------------------------------------------------*/
    203 
    204 void h264bsdInterpolateChromaVer(
    205   u8 *pRef,
    206   u8 *predPartChroma,
    207   i32 x0,
    208   i32 y0,
    209   u32 width,
    210   u32 height,
    211   u32 yFrac,
    212   u32 chromaPartWidth,
    213   u32 chromaPartHeight)
    214 {
    215 
    216 /* Variables */
    217 
    218     u32 x, y, tmp1, tmp2, tmp3, c, val;
    219     u8 *ptrA, *cbr;
    220     u32 comp;
    221     u8 block[9*8*2];
    222 
    223 /* Code */
    224 
    225     ASSERT(predPartChroma);
    226     ASSERT(chromaPartWidth);
    227     ASSERT(chromaPartHeight);
    228     ASSERT(yFrac < 8);
    229     ASSERT(pRef);
    230 
    231     if ((x0 < 0) || ((u32)x0+chromaPartWidth > width) ||
    232         (y0 < 0) || ((u32)y0+chromaPartHeight+1 > height))
    233     {
    234         h264bsdFillBlock(pRef, block, x0, y0, width, height, chromaPartWidth,
    235             chromaPartHeight + 1, chromaPartWidth);
    236         pRef += width * height;
    237         h264bsdFillBlock(pRef, block + chromaPartWidth*(chromaPartHeight+1),
    238             x0, y0, width, height, chromaPartWidth,
    239             chromaPartHeight + 1, chromaPartWidth);
    240 
    241         pRef = block;
    242         x0 = 0;
    243         y0 = 0;
    244         width = chromaPartWidth;
    245         height = chromaPartHeight+1;
    246     }
    247 
    248     val = 8 - yFrac;
    249 
    250     for (comp = 0; comp <= 1; comp++)
    251     {
    252 
    253         ptrA = pRef + (comp * height + (u32)y0) * width + x0;
    254         cbr = predPartChroma + comp * 8 * 8;
    255 
    256         /* 2x2 pels per iteration
    257          * bilinear vertical interpolation */
    258         for (y = (chromaPartHeight >> 1); y; y--)
    259         {
    260             for (x = (chromaPartWidth >> 1); x; x--)
    261             {
    262                 tmp3 = ptrA[width*2];
    263                 tmp2 = ptrA[width];
    264                 tmp1 = *ptrA++;
    265                 c = ((val * tmp2 + yFrac * tmp3) << 3) + 32;
    266                 c >>= 6;
    267                 cbr[8] = (u8)c;
    268                 c = ((val * tmp1 + yFrac * tmp2) << 3) + 32;
    269                 c >>= 6;
    270                 *cbr++ = (u8)c;
    271                 tmp3 = ptrA[width*2];
    272                 tmp2 = ptrA[width];
    273                 tmp1 = *ptrA++;
    274                 c = ((val * tmp2 + yFrac * tmp3) << 3) + 32;
    275                 c >>= 6;
    276                 cbr[8] = (u8)c;
    277                 c = ((val * tmp1 + yFrac * tmp2) << 3) + 32;
    278                 c >>= 6;
    279                 *cbr++ = (u8)c;
    280             }
    281             cbr += 2*8 - chromaPartWidth;
    282             ptrA += 2*width - chromaPartWidth;
    283         }
    284     }
    285 
    286 }
    287 #endif
    288 /*------------------------------------------------------------------------------
    289 
    290     Function: h264bsdInterpolateChromaHorVer
    291 
    292         Functional description:
    293           This function performs chroma interpolation in horizontal and
    294           vertical direction. Overfilling is done only if needed. Reference
    295           image (ref) is read at correct position and the predicted part
    296           is written to macroblock's chrominance (predPartChroma)
    297 
    298 ------------------------------------------------------------------------------*/
    299 
    300 void h264bsdInterpolateChromaHorVer(
    301   u8 *ref,
    302   u8 *predPartChroma,
    303   i32 x0,
    304   i32 y0,
    305   u32 width,
    306   u32 height,
    307   u32 xFrac,
    308   u32 yFrac,
    309   u32 chromaPartWidth,
    310   u32 chromaPartHeight)
    311 {
    312     u8 block[9*9*2];
    313     u32 x, y, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, valX, valY, plus32 = 32;
    314     u32 comp;
    315     u8 *ptrA, *cbr;
    316 
    317 /* Code */
    318 
    319     ASSERT(predPartChroma);
    320     ASSERT(chromaPartWidth);
    321     ASSERT(chromaPartHeight);
    322     ASSERT(xFrac < 8);
    323     ASSERT(yFrac < 8);
    324     ASSERT(ref);
    325 
    326     if ((x0 < 0) || ((u32)x0+chromaPartWidth+1 > width) ||
    327         (y0 < 0) || ((u32)y0+chromaPartHeight+1 > height))
    328     {
    329         h264bsdFillBlock(ref, block, x0, y0, width, height,
    330             chromaPartWidth + 1, chromaPartHeight + 1, chromaPartWidth + 1);
    331         ref += width * height;
    332         h264bsdFillBlock(ref, block + (chromaPartWidth+1)*(chromaPartHeight+1),
    333             x0, y0, width, height, chromaPartWidth + 1,
    334             chromaPartHeight + 1, chromaPartWidth + 1);
    335 
    336         ref = block;
    337         x0 = 0;
    338         y0 = 0;
    339         width = chromaPartWidth+1;
    340         height = chromaPartHeight+1;
    341     }
    342 
    343     valX = 8 - xFrac;
    344     valY = 8 - yFrac;
    345 
    346     for (comp = 0; comp <= 1; comp++)
    347     {
    348 
    349         ptrA = ref + (comp * height + (u32)y0) * width + x0;
    350         cbr = predPartChroma + comp * 8 * 8;
    351 
    352         /* 2x2 pels per iteration
    353          * bilinear vertical and horizontal interpolation */
    354         for (y = (chromaPartHeight >> 1); y; y--)
    355         {
    356             tmp1 = *ptrA;
    357             tmp3 = ptrA[width];
    358             tmp5 = ptrA[width*2];
    359             tmp1 *= valY;
    360             tmp1 += tmp3 * yFrac;
    361             tmp3 *= valY;
    362             tmp3 += tmp5 * yFrac;
    363             for (x = (chromaPartWidth >> 1); x; x--)
    364             {
    365                 tmp2 = *++ptrA;
    366                 tmp4 = ptrA[width];
    367                 tmp6 = ptrA[width*2];
    368                 tmp2 *= valY;
    369                 tmp2 += tmp4 * yFrac;
    370                 tmp4 *= valY;
    371                 tmp4 += tmp6 * yFrac;
    372                 tmp1 = tmp1 * valX + plus32;
    373                 tmp3 = tmp3 * valX + plus32;
    374                 tmp1 += tmp2 * xFrac;
    375                 tmp1 >>= 6;
    376                 tmp3 += tmp4 * xFrac;
    377                 tmp3 >>= 6;
    378                 cbr[8] = (u8)tmp3;
    379                 *cbr++ = (u8)tmp1;
    380 
    381                 tmp1 = *++ptrA;
    382                 tmp3 = ptrA[width];
    383                 tmp5 = ptrA[width*2];
    384                 tmp1 *= valY;
    385                 tmp1 += tmp3 * yFrac;
    386                 tmp3 *= valY;
    387                 tmp3 += tmp5 * yFrac;
    388                 tmp2 = tmp2 * valX + plus32;
    389                 tmp4 = tmp4 * valX + plus32;
    390                 tmp2 += tmp1 * xFrac;
    391                 tmp2 >>= 6;
    392                 tmp4 += tmp3 * xFrac;
    393                 tmp4 >>= 6;
    394                 cbr[8] = (u8)tmp4;
    395                 *cbr++ = (u8)tmp2;
    396             }
    397             cbr += 2*8 - chromaPartWidth;
    398             ptrA += 2*width - chromaPartWidth;
    399         }
    400     }
    401 
    402 }
    403 
    404 /*------------------------------------------------------------------------------
    405 
    406     Function: PredictChroma
    407 
    408         Functional description:
    409           Top level chroma prediction function that calls the appropriate
    410           interpolation function. The output is written to macroblock array.
    411 
    412 ------------------------------------------------------------------------------*/
    413 
    414 static void PredictChroma(
    415   u8 *mbPartChroma,
    416   u32 xAL,
    417   u32 yAL,
    418   u32 partWidth,
    419   u32 partHeight,
    420   mv_t *mv,
    421   image_t *refPic)
    422 {
    423 
    424 /* Variables */
    425 
    426     u32 xFrac, yFrac, width, height, chromaPartWidth, chromaPartHeight;
    427     i32 xInt, yInt;
    428     u8 *ref;
    429 
    430 /* Code */
    431 
    432     ASSERT(mv);
    433     ASSERT(refPic);
    434     ASSERT(refPic->data);
    435     ASSERT(refPic->width);
    436     ASSERT(refPic->height);
    437 
    438     width  = 8 * refPic->width;
    439     height = 8 * refPic->height;
    440 
    441     xInt = (xAL >> 1) + (mv->hor >> 3);
    442     yInt = (yAL >> 1) + (mv->ver >> 3);
    443     xFrac = mv->hor & 0x7;
    444     yFrac = mv->ver & 0x7;
    445 
    446     chromaPartWidth  = partWidth >> 1;
    447     chromaPartHeight = partHeight >> 1;
    448     ref = refPic->data + 256 * refPic->width * refPic->height;
    449 
    450     if (xFrac && yFrac)
    451     {
    452         h264bsdInterpolateChromaHorVer(ref, mbPartChroma, xInt, yInt, width,
    453                 height, xFrac, yFrac, chromaPartWidth, chromaPartHeight);
    454     }
    455     else if (xFrac)
    456     {
    457         h264bsdInterpolateChromaHor(ref, mbPartChroma, xInt, yInt, width,
    458                 height, xFrac, chromaPartWidth, chromaPartHeight);
    459     }
    460     else if (yFrac)
    461     {
    462         h264bsdInterpolateChromaVer(ref, mbPartChroma, xInt, yInt, width,
    463                 height, yFrac, chromaPartWidth, chromaPartHeight);
    464     }
    465     else
    466     {
    467         h264bsdFillBlock(ref, mbPartChroma, xInt, yInt, width, height,
    468             chromaPartWidth, chromaPartHeight, 8);
    469         ref += width * height;
    470         h264bsdFillBlock(ref, mbPartChroma + 8*8, xInt, yInt, width, height,
    471             chromaPartWidth, chromaPartHeight, 8);
    472     }
    473 
    474 }
    475 
    476 
    477 /*------------------------------------------------------------------------------
    478 
    479     Function: h264bsdInterpolateVerHalf
    480 
    481         Functional description:
    482           Function to perform vertical interpolation of pixel position 'h'
    483           for a block. Overfilling is done only if needed. Reference
    484           image (ref) is read at correct position and the predicted part
    485           is written to macroblock array (mb)
    486 
    487 ------------------------------------------------------------------------------*/
    488 #ifndef H264DEC_ARM11
    489 void h264bsdInterpolateVerHalf(
    490   u8 *ref,
    491   u8 *mb,
    492   i32 x0,
    493   i32 y0,
    494   u32 width,
    495   u32 height,
    496   u32 partWidth,
    497   u32 partHeight)
    498 {
    499     u32 p1[21*21/4+1];
    500     u32 i, j;
    501     i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    502     u8 *ptrC, *ptrV;
    503     const u8 *clp = h264bsdClip + 512;
    504 
    505     /* Code */
    506 
    507     ASSERT(ref);
    508     ASSERT(mb);
    509 
    510     if ((x0 < 0) || ((u32)x0+partWidth > width) ||
    511         (y0 < 0) || ((u32)y0+partHeight+5 > height))
    512     {
    513         h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
    514                 partWidth, partHeight+5, partWidth);
    515 
    516         x0 = 0;
    517         y0 = 0;
    518         ref = (u8*)p1;
    519         width = partWidth;
    520     }
    521 
    522     ref += (u32)y0 * width + (u32)x0;
    523 
    524     ptrC = ref + width;
    525     ptrV = ptrC + 5*width;
    526 
    527     /* 4 pixels per iteration, interpolate using 5 vertical samples */
    528     for (i = (partHeight >> 2); i; i--)
    529     {
    530         /* h1 = (16 + A + 16(G+M) + 4(G+M) - 4(C+R) - (C+R) + T) >> 5 */
    531         for (j = partWidth; j; j--)
    532         {
    533             tmp4 = ptrV[-(i32)width*2];
    534             tmp5 = ptrV[-(i32)width];
    535             tmp1 = ptrV[width];
    536             tmp2 = ptrV[width*2];
    537             tmp6 = *ptrV++;
    538 
    539             tmp7 = tmp4 + tmp1;
    540             tmp2 -= (tmp7 << 2);
    541             tmp2 -= tmp7;
    542             tmp2 += 16;
    543             tmp7 = tmp5 + tmp6;
    544             tmp3 = ptrC[width*2];
    545             tmp2 += (tmp7 << 4);
    546             tmp2 += (tmp7 << 2);
    547             tmp2 += tmp3;
    548             tmp2 = clp[tmp2>>5];
    549             tmp1 += 16;
    550             mb[48] = (u8)tmp2;
    551 
    552             tmp7 = tmp3 + tmp6;
    553             tmp1 -= (tmp7 << 2);
    554             tmp1 -= tmp7;
    555             tmp7 = tmp4 + tmp5;
    556             tmp2 = ptrC[width];
    557             tmp1 += (tmp7 << 4);
    558             tmp1 += (tmp7 << 2);
    559             tmp1 += tmp2;
    560             tmp1 = clp[tmp1>>5];
    561             tmp6 += 16;
    562             mb[32] = (u8)tmp1;
    563 
    564             tmp7 = tmp2 + tmp5;
    565             tmp6 -= (tmp7 << 2);
    566             tmp6 -= tmp7;
    567             tmp7 = tmp4 + tmp3;
    568             tmp1 = *ptrC;
    569             tmp6 += (tmp7 << 4);
    570             tmp6 += (tmp7 << 2);
    571             tmp6 += tmp1;
    572             tmp6 = clp[tmp6>>5];
    573             tmp5 += 16;
    574             mb[16] = (u8)tmp6;
    575 
    576             tmp1 += tmp4;
    577             tmp5 -= (tmp1 << 2);
    578             tmp5 -= tmp1;
    579             tmp3 += tmp2;
    580             tmp6 = ptrC[-(i32)width];
    581             tmp5 += (tmp3 << 4);
    582             tmp5 += (tmp3 << 2);
    583             tmp5 += tmp6;
    584             tmp5 = clp[tmp5>>5];
    585             *mb++ = (u8)tmp5;
    586             ptrC++;
    587         }
    588         ptrC += 4*width - partWidth;
    589         ptrV += 4*width - partWidth;
    590         mb += 4*16 - partWidth;
    591     }
    592 
    593 }
    594 
    595 /*------------------------------------------------------------------------------
    596 
    597     Function: h264bsdInterpolateVerQuarter
    598 
    599         Functional description:
    600           Function to perform vertical interpolation of pixel position 'd'
    601           or 'n' for a block. Overfilling is done only if needed. Reference
    602           image (ref) is read at correct position and the predicted part
    603           is written to macroblock array (mb)
    604 
    605 ------------------------------------------------------------------------------*/
    606 
    607 void h264bsdInterpolateVerQuarter(
    608   u8 *ref,
    609   u8 *mb,
    610   i32 x0,
    611   i32 y0,
    612   u32 width,
    613   u32 height,
    614   u32 partWidth,
    615   u32 partHeight,
    616   u32 verOffset)    /* 0 for pixel d, 1 for pixel n */
    617 {
    618     u32 p1[21*21/4+1];
    619     u32 i, j;
    620     i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    621     u8 *ptrC, *ptrV, *ptrInt;
    622     const u8 *clp = h264bsdClip + 512;
    623 
    624     /* Code */
    625 
    626     ASSERT(ref);
    627     ASSERT(mb);
    628 
    629     if ((x0 < 0) || ((u32)x0+partWidth > width) ||
    630         (y0 < 0) || ((u32)y0+partHeight+5 > height))
    631     {
    632         h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
    633                 partWidth, partHeight+5, partWidth);
    634 
    635         x0 = 0;
    636         y0 = 0;
    637         ref = (u8*)p1;
    638         width = partWidth;
    639     }
    640 
    641     ref += (u32)y0 * width + (u32)x0;
    642 
    643     ptrC = ref + width;
    644     ptrV = ptrC + 5*width;
    645 
    646     /* Pointer to integer sample position, either M or R */
    647     ptrInt = ptrC + (2+verOffset)*width;
    648 
    649     /* 4 pixels per iteration
    650      * interpolate using 5 vertical samples and average between
    651      * interpolated value and integer sample value */
    652     for (i = (partHeight >> 2); i; i--)
    653     {
    654         /* h1 = (16 + A + 16(G+M) + 4(G+M) - 4(C+R) - (C+R) + T) >> 5 */
    655         for (j = partWidth; j; j--)
    656         {
    657             tmp4 = ptrV[-(i32)width*2];
    658             tmp5 = ptrV[-(i32)width];
    659             tmp1 = ptrV[width];
    660             tmp2 = ptrV[width*2];
    661             tmp6 = *ptrV++;
    662 
    663             tmp7 = tmp4 + tmp1;
    664             tmp2 -= (tmp7 << 2);
    665             tmp2 -= tmp7;
    666             tmp2 += 16;
    667             tmp7 = tmp5 + tmp6;
    668             tmp3 = ptrC[width*2];
    669             tmp2 += (tmp7 << 4);
    670             tmp2 += (tmp7 << 2);
    671             tmp2 += tmp3;
    672             tmp2 = clp[tmp2>>5];
    673             tmp7 = ptrInt[width*2];
    674             tmp1 += 16;
    675             tmp2++;
    676             mb[48] = (u8)((tmp2 + tmp7) >> 1);
    677 
    678             tmp7 = tmp3 + tmp6;
    679             tmp1 -= (tmp7 << 2);
    680             tmp1 -= tmp7;
    681             tmp7 = tmp4 + tmp5;
    682             tmp2 = ptrC[width];
    683             tmp1 += (tmp7 << 4);
    684             tmp1 += (tmp7 << 2);
    685             tmp1 += tmp2;
    686             tmp1 = clp[tmp1>>5];
    687             tmp7 = ptrInt[width];
    688             tmp6 += 16;
    689             tmp1++;
    690             mb[32] = (u8)((tmp1 + tmp7) >> 1);
    691 
    692             tmp7 = tmp2 + tmp5;
    693             tmp6 -= (tmp7 << 2);
    694             tmp6 -= tmp7;
    695             tmp7 = tmp4 + tmp3;
    696             tmp1 = *ptrC;
    697             tmp6 += (tmp7 << 4);
    698             tmp6 += (tmp7 << 2);
    699             tmp6 += tmp1;
    700             tmp6 = clp[tmp6>>5];
    701             tmp7 = *ptrInt;
    702             tmp5 += 16;
    703             tmp6++;
    704             mb[16] = (u8)((tmp6 + tmp7) >> 1);
    705 
    706             tmp1 += tmp4;
    707             tmp5 -= (tmp1 << 2);
    708             tmp5 -= tmp1;
    709             tmp3 += tmp2;
    710             tmp6 = ptrC[-(i32)width];
    711             tmp5 += (tmp3 << 4);
    712             tmp5 += (tmp3 << 2);
    713             tmp5 += tmp6;
    714             tmp5 = clp[tmp5>>5];
    715             tmp7 = ptrInt[-(i32)width];
    716             tmp5++;
    717             *mb++ = (u8)((tmp5 + tmp7) >> 1);
    718             ptrC++;
    719             ptrInt++;
    720         }
    721         ptrC += 4*width - partWidth;
    722         ptrV += 4*width - partWidth;
    723         ptrInt += 4*width - partWidth;
    724         mb += 4*16 - partWidth;
    725     }
    726 
    727 }
    728 
    729 /*------------------------------------------------------------------------------
    730 
    731     Function: h264bsdInterpolateHorHalf
    732 
    733         Functional description:
    734           Function to perform horizontal interpolation of pixel position 'b'
    735           for a block. Overfilling is done only if needed. Reference
    736           image (ref) is read at correct position and the predicted part
    737           is written to macroblock array (mb)
    738 
    739 ------------------------------------------------------------------------------*/
    740 
    741 void h264bsdInterpolateHorHalf(
    742   u8 *ref,
    743   u8 *mb,
    744   i32 x0,
    745   i32 y0,
    746   u32 width,
    747   u32 height,
    748   u32 partWidth,
    749   u32 partHeight)
    750 {
    751     u32 p1[21*21/4+1];
    752     u8 *ptrJ;
    753     u32 x, y;
    754     i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    755     const u8 *clp = h264bsdClip + 512;
    756 
    757     /* Code */
    758 
    759     ASSERT(ref);
    760     ASSERT(mb);
    761     ASSERT((partWidth&0x3) == 0);
    762     ASSERT((partHeight&0x3) == 0);
    763 
    764     if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
    765         (y0 < 0) || ((u32)y0+partHeight > height))
    766     {
    767         h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
    768                 partWidth+5, partHeight, partWidth+5);
    769 
    770         x0 = 0;
    771         y0 = 0;
    772         ref = (u8*)p1;
    773         width = partWidth + 5;
    774     }
    775 
    776     ref += (u32)y0 * width + (u32)x0;
    777 
    778     ptrJ = ref + 5;
    779 
    780     for (y = partHeight; y; y--)
    781     {
    782         tmp6 = *(ptrJ - 5);
    783         tmp5 = *(ptrJ - 4);
    784         tmp4 = *(ptrJ - 3);
    785         tmp3 = *(ptrJ - 2);
    786         tmp2 = *(ptrJ - 1);
    787 
    788         /* calculate 4 pels per iteration */
    789         for (x = (partWidth >> 2); x; x--)
    790         {
    791             /* First pixel */
    792             tmp6 += 16;
    793             tmp7 = tmp3 + tmp4;
    794             tmp6 += (tmp7 << 4);
    795             tmp6 += (tmp7 << 2);
    796             tmp7 = tmp2 + tmp5;
    797             tmp1 = *ptrJ++;
    798             tmp6 -= (tmp7 << 2);
    799             tmp6 -= tmp7;
    800             tmp6 += tmp1;
    801             tmp6 = clp[tmp6>>5];
    802             /* Second pixel */
    803             tmp5 += 16;
    804             tmp7 = tmp2 + tmp3;
    805             *mb++ = (u8)tmp6;
    806             tmp5 += (tmp7 << 4);
    807             tmp5 += (tmp7 << 2);
    808             tmp7 = tmp1 + tmp4;
    809             tmp6 = *ptrJ++;
    810             tmp5 -= (tmp7 << 2);
    811             tmp5 -= tmp7;
    812             tmp5 += tmp6;
    813             tmp5 = clp[tmp5>>5];
    814             /* Third pixel */
    815             tmp4 += 16;
    816             tmp7 = tmp1 + tmp2;
    817             *mb++ = (u8)tmp5;
    818             tmp4 += (tmp7 << 4);
    819             tmp4 += (tmp7 << 2);
    820             tmp7 = tmp6 + tmp3;
    821             tmp5 = *ptrJ++;
    822             tmp4 -= (tmp7 << 2);
    823             tmp4 -= tmp7;
    824             tmp4 += tmp5;
    825             tmp4 = clp[tmp4>>5];
    826             /* Fourth pixel */
    827             tmp3 += 16;
    828             tmp7 = tmp6 + tmp1;
    829             *mb++ = (u8)tmp4;
    830             tmp3 += (tmp7 << 4);
    831             tmp3 += (tmp7 << 2);
    832             tmp7 = tmp5 + tmp2;
    833             tmp4 = *ptrJ++;
    834             tmp3 -= (tmp7 << 2);
    835             tmp3 -= tmp7;
    836             tmp3 += tmp4;
    837             tmp3 = clp[tmp3>>5];
    838             tmp7 = tmp4;
    839             tmp4 = tmp6;
    840             tmp6 = tmp2;
    841             tmp2 = tmp7;
    842             *mb++ = (u8)tmp3;
    843             tmp3 = tmp5;
    844             tmp5 = tmp1;
    845         }
    846         ptrJ += width - partWidth;
    847         mb += 16 - partWidth;
    848     }
    849 
    850 }
    851 
    852 /*------------------------------------------------------------------------------
    853 
    854     Function: h264bsdInterpolateHorQuarter
    855 
    856         Functional description:
    857           Function to perform horizontal interpolation of pixel position 'a'
    858           or 'c' for a block. Overfilling is done only if needed. Reference
    859           image (ref) is read at correct position and the predicted part
    860           is written to macroblock array (mb)
    861 
    862 ------------------------------------------------------------------------------*/
    863 
    864 void h264bsdInterpolateHorQuarter(
    865   u8 *ref,
    866   u8 *mb,
    867   i32 x0,
    868   i32 y0,
    869   u32 width,
    870   u32 height,
    871   u32 partWidth,
    872   u32 partHeight,
    873   u32 horOffset) /* 0 for pixel a, 1 for pixel c */
    874 {
    875     u32 p1[21*21/4+1];
    876     u8 *ptrJ;
    877     u32 x, y;
    878     i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    879     const u8 *clp = h264bsdClip + 512;
    880 
    881     /* Code */
    882 
    883     ASSERT(ref);
    884     ASSERT(mb);
    885 
    886     if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
    887         (y0 < 0) || ((u32)y0+partHeight > height))
    888     {
    889         h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
    890                 partWidth+5, partHeight, partWidth+5);
    891 
    892         x0 = 0;
    893         y0 = 0;
    894         ref = (u8*)p1;
    895         width = partWidth + 5;
    896     }
    897 
    898     ref += (u32)y0 * width + (u32)x0;
    899 
    900     ptrJ = ref + 5;
    901 
    902     for (y = partHeight; y; y--)
    903     {
    904         tmp6 = *(ptrJ - 5);
    905         tmp5 = *(ptrJ - 4);
    906         tmp4 = *(ptrJ - 3);
    907         tmp3 = *(ptrJ - 2);
    908         tmp2 = *(ptrJ - 1);
    909 
    910         /* calculate 4 pels per iteration */
    911         for (x = (partWidth >> 2); x; x--)
    912         {
    913             /* First pixel */
    914             tmp6 += 16;
    915             tmp7 = tmp3 + tmp4;
    916             tmp6 += (tmp7 << 4);
    917             tmp6 += (tmp7 << 2);
    918             tmp7 = tmp2 + tmp5;
    919             tmp1 = *ptrJ++;
    920             tmp6 -= (tmp7 << 2);
    921             tmp6 -= tmp7;
    922             tmp6 += tmp1;
    923             tmp6 = clp[tmp6>>5];
    924             tmp5 += 16;
    925             if (!horOffset)
    926                 tmp6 += tmp4;
    927             else
    928                 tmp6 += tmp3;
    929             *mb++ = (u8)((tmp6 + 1) >> 1);
    930             /* Second pixel */
    931             tmp7 = tmp2 + tmp3;
    932             tmp5 += (tmp7 << 4);
    933             tmp5 += (tmp7 << 2);
    934             tmp7 = tmp1 + tmp4;
    935             tmp6 = *ptrJ++;
    936             tmp5 -= (tmp7 << 2);
    937             tmp5 -= tmp7;
    938             tmp5 += tmp6;
    939             tmp5 = clp[tmp5>>5];
    940             tmp4 += 16;
    941             if (!horOffset)
    942                 tmp5 += tmp3;
    943             else
    944                 tmp5 += tmp2;
    945             *mb++ = (u8)((tmp5 + 1) >> 1);
    946             /* Third pixel */
    947             tmp7 = tmp1 + tmp2;
    948             tmp4 += (tmp7 << 4);
    949             tmp4 += (tmp7 << 2);
    950             tmp7 = tmp6 + tmp3;
    951             tmp5 = *ptrJ++;
    952             tmp4 -= (tmp7 << 2);
    953             tmp4 -= tmp7;
    954             tmp4 += tmp5;
    955             tmp4 = clp[tmp4>>5];
    956             tmp3 += 16;
    957             if (!horOffset)
    958                 tmp4 += tmp2;
    959             else
    960                 tmp4 += tmp1;
    961             *mb++ = (u8)((tmp4 + 1) >> 1);
    962             /* Fourth pixel */
    963             tmp7 = tmp6 + tmp1;
    964             tmp3 += (tmp7 << 4);
    965             tmp3 += (tmp7 << 2);
    966             tmp7 = tmp5 + tmp2;
    967             tmp4 = *ptrJ++;
    968             tmp3 -= (tmp7 << 2);
    969             tmp3 -= tmp7;
    970             tmp3 += tmp4;
    971             tmp3 = clp[tmp3>>5];
    972             if (!horOffset)
    973                 tmp3 += tmp1;
    974             else
    975                 tmp3 += tmp6;
    976             *mb++ = (u8)((tmp3 + 1) >> 1);
    977             tmp3 = tmp5;
    978             tmp5 = tmp1;
    979             tmp7 = tmp4;
    980             tmp4 = tmp6;
    981             tmp6 = tmp2;
    982             tmp2 = tmp7;
    983         }
    984         ptrJ += width - partWidth;
    985         mb += 16 - partWidth;
    986     }
    987 
    988 }
    989 
    990 /*------------------------------------------------------------------------------
    991 
    992     Function: h264bsdInterpolateHorVerQuarter
    993 
    994         Functional description:
    995           Function to perform horizontal and vertical interpolation of pixel
    996           position 'e', 'g', 'p' or 'r' for a block. Overfilling is done only
    997           if needed. Reference image (ref) is read at correct position and
    998           the predicted part is written to macroblock array (mb)
    999 
   1000 ------------------------------------------------------------------------------*/
   1001 
   1002 void h264bsdInterpolateHorVerQuarter(
   1003   u8 *ref,
   1004   u8 *mb,
   1005   i32 x0,
   1006   i32 y0,
   1007   u32 width,
   1008   u32 height,
   1009   u32 partWidth,
   1010   u32 partHeight,
   1011   u32 horVerOffset) /* 0 for pixel e, 1 for pixel g,
   1012                        2 for pixel p, 3 for pixel r */
   1013 {
   1014     u32 p1[21*21/4+1];
   1015     u8 *ptrC, *ptrJ, *ptrV;
   1016     u32 x, y;
   1017     i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   1018     const u8 *clp = h264bsdClip + 512;
   1019 
   1020     /* Code */
   1021 
   1022     ASSERT(ref);
   1023     ASSERT(mb);
   1024 
   1025     if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
   1026         (y0 < 0) || ((u32)y0+partHeight+5 > height))
   1027     {
   1028         h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
   1029                 partWidth+5, partHeight+5, partWidth+5);
   1030 
   1031         x0 = 0;
   1032         y0 = 0;
   1033         ref = (u8*)p1;
   1034         width = partWidth+5;
   1035     }
   1036 
   1037     /* Ref points to G + (-2, -2) */
   1038     ref += (u32)y0 * width + (u32)x0;
   1039 
   1040     /* ptrJ points to either J or Q, depending on vertical offset */
   1041     ptrJ = ref + (((horVerOffset & 0x2) >> 1) + 2) * width + 5;
   1042 
   1043     /* ptrC points to either C or D, depending on horizontal offset */
   1044     ptrC = ref + width + 2 + (horVerOffset & 0x1);
   1045 
   1046     for (y = partHeight; y; y--)
   1047     {
   1048         tmp6 = *(ptrJ - 5);
   1049         tmp5 = *(ptrJ - 4);
   1050         tmp4 = *(ptrJ - 3);
   1051         tmp3 = *(ptrJ - 2);
   1052         tmp2 = *(ptrJ - 1);
   1053 
   1054         /* Horizontal interpolation, calculate 4 pels per iteration */
   1055         for (x = (partWidth >> 2); x; x--)
   1056         {
   1057             /* First pixel */
   1058             tmp6 += 16;
   1059             tmp7 = tmp3 + tmp4;
   1060             tmp6 += (tmp7 << 4);
   1061             tmp6 += (tmp7 << 2);
   1062             tmp7 = tmp2 + tmp5;
   1063             tmp1 = *ptrJ++;
   1064             tmp6 -= (tmp7 << 2);
   1065             tmp6 -= tmp7;
   1066             tmp6 += tmp1;
   1067             tmp6 = clp[tmp6>>5];
   1068             /* Second pixel */
   1069             tmp5 += 16;
   1070             tmp7 = tmp2 + tmp3;
   1071             *mb++ = (u8)tmp6;
   1072             tmp5 += (tmp7 << 4);
   1073             tmp5 += (tmp7 << 2);
   1074             tmp7 = tmp1 + tmp4;
   1075             tmp6 = *ptrJ++;
   1076             tmp5 -= (tmp7 << 2);
   1077             tmp5 -= tmp7;
   1078             tmp5 += tmp6;
   1079             tmp5 = clp[tmp5>>5];
   1080             /* Third pixel */
   1081             tmp4 += 16;
   1082             tmp7 = tmp1 + tmp2;
   1083             *mb++ = (u8)tmp5;
   1084             tmp4 += (tmp7 << 4);
   1085             tmp4 += (tmp7 << 2);
   1086             tmp7 = tmp6 + tmp3;
   1087             tmp5 = *ptrJ++;
   1088             tmp4 -= (tmp7 << 2);
   1089             tmp4 -= tmp7;
   1090             tmp4 += tmp5;
   1091             tmp4 = clp[tmp4>>5];
   1092             /* Fourth pixel */
   1093             tmp3 += 16;
   1094             tmp7 = tmp6 + tmp1;
   1095             *mb++ = (u8)tmp4;
   1096             tmp3 += (tmp7 << 4);
   1097             tmp3 += (tmp7 << 2);
   1098             tmp7 = tmp5 + tmp2;
   1099             tmp4 = *ptrJ++;
   1100             tmp3 -= (tmp7 << 2);
   1101             tmp3 -= tmp7;
   1102             tmp3 += tmp4;
   1103             tmp3 = clp[tmp3>>5];
   1104             tmp7 = tmp4;
   1105             tmp4 = tmp6;
   1106             tmp6 = tmp2;
   1107             tmp2 = tmp7;
   1108             *mb++ = (u8)tmp3;
   1109             tmp3 = tmp5;
   1110             tmp5 = tmp1;
   1111         }
   1112         ptrJ += width - partWidth;
   1113         mb += 16 - partWidth;
   1114     }
   1115 
   1116     mb -= 16*partHeight;
   1117     ptrV = ptrC + 5*width;
   1118 
   1119     for (y = (partHeight >> 2); y; y--)
   1120     {
   1121         /* Vertical interpolation and averaging, 4 pels per iteration */
   1122         for (x = partWidth; x; x--)
   1123         {
   1124             tmp4 = ptrV[-(i32)width*2];
   1125             tmp5 = ptrV[-(i32)width];
   1126             tmp1 = ptrV[width];
   1127             tmp2 = ptrV[width*2];
   1128             tmp6 = *ptrV++;
   1129 
   1130             tmp7 = tmp4 + tmp1;
   1131             tmp2 -= (tmp7 << 2);
   1132             tmp2 -= tmp7;
   1133             tmp2 += 16;
   1134             tmp7 = tmp5 + tmp6;
   1135             tmp3 = ptrC[width*2];
   1136             tmp2 += (tmp7 << 4);
   1137             tmp2 += (tmp7 << 2);
   1138             tmp2 += tmp3;
   1139             tmp7 = clp[tmp2>>5];
   1140             tmp2 = mb[48];
   1141             tmp1 += 16;
   1142             tmp7++;
   1143             mb[48] = (u8)((tmp2 + tmp7) >> 1);
   1144 
   1145             tmp7 = tmp3 + tmp6;
   1146             tmp1 -= (tmp7 << 2);
   1147             tmp1 -= tmp7;
   1148             tmp7 = tmp4 + tmp5;
   1149             tmp2 = ptrC[width];
   1150             tmp1 += (tmp7 << 4);
   1151             tmp1 += (tmp7 << 2);
   1152             tmp1 += tmp2;
   1153             tmp7 = clp[tmp1>>5];
   1154             tmp1 = mb[32];
   1155             tmp6 += 16;
   1156             tmp7++;
   1157             mb[32] = (u8)((tmp1 + tmp7) >> 1);
   1158 
   1159             tmp1 = *ptrC;
   1160             tmp7 = tmp2 + tmp5;
   1161             tmp6 -= (tmp7 << 2);
   1162             tmp6 -= tmp7;
   1163             tmp7 = tmp4 + tmp3;
   1164             tmp6 += (tmp7 << 4);
   1165             tmp6 += (tmp7 << 2);
   1166             tmp6 += tmp1;
   1167             tmp7 = clp[tmp6>>5];
   1168             tmp6 = mb[16];
   1169             tmp5 += 16;
   1170             tmp7++;
   1171             mb[16] = (u8)((tmp6 + tmp7) >> 1);
   1172 
   1173             tmp6 = ptrC[-(i32)width];
   1174             tmp1 += tmp4;
   1175             tmp5 -= (tmp1 << 2);
   1176             tmp5 -= tmp1;
   1177             tmp3 += tmp2;
   1178             tmp5 += (tmp3 << 4);
   1179             tmp5 += (tmp3 << 2);
   1180             tmp5 += tmp6;
   1181             tmp7 = clp[tmp5>>5];
   1182             tmp5 = *mb;
   1183             tmp7++;
   1184             *mb++ = (u8)((tmp5 + tmp7) >> 1);
   1185             ptrC++;
   1186 
   1187         }
   1188         ptrC += 4*width - partWidth;
   1189         ptrV += 4*width - partWidth;
   1190         mb += 4*16 - partWidth;
   1191     }
   1192 
   1193 }
   1194 #endif
   1195 
   1196 /*------------------------------------------------------------------------------
   1197 
   1198     Function: h264bsdInterpolateMidHalf
   1199 
   1200         Functional description:
   1201           Function to perform horizontal and vertical interpolation of pixel
   1202           position 'j' for a block. Overfilling is done only if needed.
   1203           Reference image (ref) is read at correct position and the predicted
   1204           part is written to macroblock array (mb)
   1205 
   1206 ------------------------------------------------------------------------------*/
   1207 
   1208 void h264bsdInterpolateMidHalf(
   1209   u8 *ref,
   1210   u8 *mb,
   1211   i32 x0,
   1212   i32 y0,
   1213   u32 width,
   1214   u32 height,
   1215   u32 partWidth,
   1216   u32 partHeight)
   1217 {
   1218     u32 p1[21*21/4+1];
   1219     u32 x, y;
   1220     i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   1221     i32 *ptrC, *ptrV, *b1;
   1222     u8  *ptrJ;
   1223     i32 table[21*16];
   1224     const u8 *clp = h264bsdClip + 512;
   1225 
   1226     /* Code */
   1227 
   1228     ASSERT(ref);
   1229     ASSERT(mb);
   1230 
   1231     if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
   1232         (y0 < 0) || ((u32)y0+partHeight+5 > height))
   1233     {
   1234         h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
   1235                 partWidth+5, partHeight+5, partWidth+5);
   1236 
   1237         x0 = 0;
   1238         y0 = 0;
   1239         ref = (u8*)p1;
   1240         width = partWidth+5;
   1241     }
   1242 
   1243     ref += (u32)y0 * width + (u32)x0;
   1244 
   1245     b1 = table;
   1246     ptrJ = ref + 5;
   1247 
   1248     /* First step: calculate intermediate values for
   1249      * horizontal interpolation */
   1250     for (y = partHeight + 5; y; y--)
   1251     {
   1252         tmp6 = *(ptrJ - 5);
   1253         tmp5 = *(ptrJ - 4);
   1254         tmp4 = *(ptrJ - 3);
   1255         tmp3 = *(ptrJ - 2);
   1256         tmp2 = *(ptrJ - 1);
   1257 
   1258         /* 4 pels per iteration */
   1259         for (x = (partWidth >> 2); x; x--)
   1260         {
   1261             /* First pixel */
   1262             tmp7 = tmp3 + tmp4;
   1263             tmp6 += (tmp7 << 4);
   1264             tmp6 += (tmp7 << 2);
   1265             tmp7 = tmp2 + tmp5;
   1266             tmp1 = *ptrJ++;
   1267             tmp6 -= (tmp7 << 2);
   1268             tmp6 -= tmp7;
   1269             tmp6 += tmp1;
   1270             *b1++ = tmp6;
   1271             /* Second pixel */
   1272             tmp7 = tmp2 + tmp3;
   1273             tmp5 += (tmp7 << 4);
   1274             tmp5 += (tmp7 << 2);
   1275             tmp7 = tmp1 + tmp4;
   1276             tmp6 = *ptrJ++;
   1277             tmp5 -= (tmp7 << 2);
   1278             tmp5 -= tmp7;
   1279             tmp5 += tmp6;
   1280             *b1++ = tmp5;
   1281             /* Third pixel */
   1282             tmp7 = tmp1 + tmp2;
   1283             tmp4 += (tmp7 << 4);
   1284             tmp4 += (tmp7 << 2);
   1285             tmp7 = tmp6 + tmp3;
   1286             tmp5 = *ptrJ++;
   1287             tmp4 -= (tmp7 << 2);
   1288             tmp4 -= tmp7;
   1289             tmp4 += tmp5;
   1290             *b1++ = tmp4;
   1291             /* Fourth pixel */
   1292             tmp7 = tmp6 + tmp1;
   1293             tmp3 += (tmp7 << 4);
   1294             tmp3 += (tmp7 << 2);
   1295             tmp7 = tmp5 + tmp2;
   1296             tmp4 = *ptrJ++;
   1297             tmp3 -= (tmp7 << 2);
   1298             tmp3 -= tmp7;
   1299             tmp3 += tmp4;
   1300             *b1++ = tmp3;
   1301             tmp7 = tmp4;
   1302             tmp4 = tmp6;
   1303             tmp6 = tmp2;
   1304             tmp2 = tmp7;
   1305             tmp3 = tmp5;
   1306             tmp5 = tmp1;
   1307         }
   1308         ptrJ += width - partWidth;
   1309     }
   1310 
   1311     /* Second step: calculate vertical interpolation */
   1312     ptrC = table + partWidth;
   1313     ptrV = ptrC + 5*partWidth;
   1314     for (y = (partHeight >> 2); y; y--)
   1315     {
   1316         /* 4 pels per iteration */
   1317         for (x = partWidth; x; x--)
   1318         {
   1319             tmp4 = ptrV[-(i32)partWidth*2];
   1320             tmp5 = ptrV[-(i32)partWidth];
   1321             tmp1 = ptrV[partWidth];
   1322             tmp2 = ptrV[partWidth*2];
   1323             tmp6 = *ptrV++;
   1324 
   1325             tmp7 = tmp4 + tmp1;
   1326             tmp2 -= (tmp7 << 2);
   1327             tmp2 -= tmp7;
   1328             tmp2 += 512;
   1329             tmp7 = tmp5 + tmp6;
   1330             tmp3 = ptrC[partWidth*2];
   1331             tmp2 += (tmp7 << 4);
   1332             tmp2 += (tmp7 << 2);
   1333             tmp2 += tmp3;
   1334             tmp7 = clp[tmp2>>10];
   1335             tmp1 += 512;
   1336             mb[48] = (u8)tmp7;
   1337 
   1338             tmp7 = tmp3 + tmp6;
   1339             tmp1 -= (tmp7 << 2);
   1340             tmp1 -= tmp7;
   1341             tmp7 = tmp4 + tmp5;
   1342             tmp2 = ptrC[partWidth];
   1343             tmp1 += (tmp7 << 4);
   1344             tmp1 += (tmp7 << 2);
   1345             tmp1 += tmp2;
   1346             tmp7 = clp[tmp1>>10];
   1347             tmp6 += 512;
   1348             mb[32] = (u8)tmp7;
   1349 
   1350             tmp1 = *ptrC;
   1351             tmp7 = tmp2 + tmp5;
   1352             tmp6 -= (tmp7 << 2);
   1353             tmp6 -= tmp7;
   1354             tmp7 = tmp4 + tmp3;
   1355             tmp6 += (tmp7 << 4);
   1356             tmp6 += (tmp7 << 2);
   1357             tmp6 += tmp1;
   1358             tmp7 = clp[tmp6>>10];
   1359             tmp5 += 512;
   1360             mb[16] = (u8)tmp7;
   1361 
   1362             tmp6 = ptrC[-(i32)partWidth];
   1363             tmp1 += tmp4;
   1364             tmp5 -= (tmp1 << 2);
   1365             tmp5 -= tmp1;
   1366             tmp3 += tmp2;
   1367             tmp5 += (tmp3 << 4);
   1368             tmp5 += (tmp3 << 2);
   1369             tmp5 += tmp6;
   1370             tmp7 = clp[tmp5>>10];
   1371             *mb++ = (u8)tmp7;
   1372             ptrC++;
   1373         }
   1374         mb += 4*16 - partWidth;
   1375         ptrC += 3*partWidth;
   1376         ptrV += 3*partWidth;
   1377     }
   1378 
   1379 }
   1380 
   1381 
   1382 /*------------------------------------------------------------------------------
   1383 
   1384     Function: h264bsdInterpolateMidVerQuarter
   1385 
   1386         Functional description:
   1387           Function to perform horizontal and vertical interpolation of pixel
   1388           position 'f' or 'q' for a block. Overfilling is done only if needed.
   1389           Reference image (ref) is read at correct position and the predicted
   1390           part is written to macroblock array (mb)
   1391 
   1392 ------------------------------------------------------------------------------*/
   1393 
   1394 void h264bsdInterpolateMidVerQuarter(
   1395   u8 *ref,
   1396   u8 *mb,
   1397   i32 x0,
   1398   i32 y0,
   1399   u32 width,
   1400   u32 height,
   1401   u32 partWidth,
   1402   u32 partHeight,
   1403   u32 verOffset)    /* 0 for pixel f, 1 for pixel q */
   1404 {
   1405     u32 p1[21*21/4+1];
   1406     u32 x, y;
   1407     i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   1408     i32 *ptrC, *ptrV, *ptrInt, *b1;
   1409     u8  *ptrJ;
   1410     i32 table[21*16];
   1411     const u8 *clp = h264bsdClip + 512;
   1412 
   1413     /* Code */
   1414 
   1415     ASSERT(ref);
   1416     ASSERT(mb);
   1417 
   1418     if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
   1419         (y0 < 0) || ((u32)y0+partHeight+5 > height))
   1420     {
   1421         h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
   1422                 partWidth+5, partHeight+5, partWidth+5);
   1423 
   1424         x0 = 0;
   1425         y0 = 0;
   1426         ref = (u8*)p1;
   1427         width = partWidth+5;
   1428     }
   1429 
   1430     ref += (u32)y0 * width + (u32)x0;
   1431 
   1432     b1 = table;
   1433     ptrJ = ref + 5;
   1434 
   1435     /* First step: calculate intermediate values for
   1436      * horizontal interpolation */
   1437     for (y = partHeight + 5; y; y--)
   1438     {
   1439         tmp6 = *(ptrJ - 5);
   1440         tmp5 = *(ptrJ - 4);
   1441         tmp4 = *(ptrJ - 3);
   1442         tmp3 = *(ptrJ - 2);
   1443         tmp2 = *(ptrJ - 1);
   1444         for (x = (partWidth >> 2); x; x--)
   1445         {
   1446             /* First pixel */
   1447             tmp7 = tmp3 + tmp4;
   1448             tmp6 += (tmp7 << 4);
   1449             tmp6 += (tmp7 << 2);
   1450             tmp7 = tmp2 + tmp5;
   1451             tmp1 = *ptrJ++;
   1452             tmp6 -= (tmp7 << 2);
   1453             tmp6 -= tmp7;
   1454             tmp6 += tmp1;
   1455             *b1++ = tmp6;
   1456             /* Second pixel */
   1457             tmp7 = tmp2 + tmp3;
   1458             tmp5 += (tmp7 << 4);
   1459             tmp5 += (tmp7 << 2);
   1460             tmp7 = tmp1 + tmp4;
   1461             tmp6 = *ptrJ++;
   1462             tmp5 -= (tmp7 << 2);
   1463             tmp5 -= tmp7;
   1464             tmp5 += tmp6;
   1465             *b1++ = tmp5;
   1466             /* Third pixel */
   1467             tmp7 = tmp1 + tmp2;
   1468             tmp4 += (tmp7 << 4);
   1469             tmp4 += (tmp7 << 2);
   1470             tmp7 = tmp6 + tmp3;
   1471             tmp5 = *ptrJ++;
   1472             tmp4 -= (tmp7 << 2);
   1473             tmp4 -= tmp7;
   1474             tmp4 += tmp5;
   1475             *b1++ = tmp4;
   1476             /* Fourth pixel */
   1477             tmp7 = tmp6 + tmp1;
   1478             tmp3 += (tmp7 << 4);
   1479             tmp3 += (tmp7 << 2);
   1480             tmp7 = tmp5 + tmp2;
   1481             tmp4 = *ptrJ++;
   1482             tmp3 -= (tmp7 << 2);
   1483             tmp3 -= tmp7;
   1484             tmp3 += tmp4;
   1485             *b1++ = tmp3;
   1486             tmp7 = tmp4;
   1487             tmp4 = tmp6;
   1488             tmp6 = tmp2;
   1489             tmp2 = tmp7;
   1490             tmp3 = tmp5;
   1491             tmp5 = tmp1;
   1492         }
   1493         ptrJ += width - partWidth;
   1494     }
   1495 
   1496     /* Second step: calculate vertical interpolation and average */
   1497     ptrC = table + partWidth;
   1498     ptrV = ptrC + 5*partWidth;
   1499     /* Pointer to integer sample position, either M or R */
   1500     ptrInt = ptrC + (2+verOffset)*partWidth;
   1501     for (y = (partHeight >> 2); y; y--)
   1502     {
   1503         for (x = partWidth; x; x--)
   1504         {
   1505             tmp4 = ptrV[-(i32)partWidth*2];
   1506             tmp5 = ptrV[-(i32)partWidth];
   1507             tmp1 = ptrV[partWidth];
   1508             tmp2 = ptrV[partWidth*2];
   1509             tmp6 = *ptrV++;
   1510 
   1511             tmp7 = tmp4 + tmp1;
   1512             tmp2 -= (tmp7 << 2);
   1513             tmp2 -= tmp7;
   1514             tmp2 += 512;
   1515             tmp7 = tmp5 + tmp6;
   1516             tmp3 = ptrC[partWidth*2];
   1517             tmp2 += (tmp7 << 4);
   1518             tmp2 += (tmp7 << 2);
   1519             tmp7 = ptrInt[partWidth*2];
   1520             tmp2 += tmp3;
   1521             tmp2 = clp[tmp2>>10];
   1522             tmp7 += 16;
   1523             tmp7 = clp[tmp7>>5];
   1524             tmp1 += 512;
   1525             tmp2++;
   1526             mb[48] = (u8)((tmp7 + tmp2) >> 1);
   1527 
   1528             tmp7 = tmp3 + tmp6;
   1529             tmp1 -= (tmp7 << 2);
   1530             tmp1 -= tmp7;
   1531             tmp7 = tmp4 + tmp5;
   1532             tmp2 = ptrC[partWidth];
   1533             tmp1 += (tmp7 << 4);
   1534             tmp1 += (tmp7 << 2);
   1535             tmp7 = ptrInt[partWidth];
   1536             tmp1 += tmp2;
   1537             tmp1 = clp[tmp1>>10];
   1538             tmp7 += 16;
   1539             tmp7 = clp[tmp7>>5];
   1540             tmp6 += 512;
   1541             tmp1++;
   1542             mb[32] = (u8)((tmp7 + tmp1) >> 1);
   1543 
   1544             tmp1 = *ptrC;
   1545             tmp7 = tmp2 + tmp5;
   1546             tmp6 -= (tmp7 << 2);
   1547             tmp6 -= tmp7;
   1548             tmp7 = tmp4 + tmp3;
   1549             tmp6 += (tmp7 << 4);
   1550             tmp6 += (tmp7 << 2);
   1551             tmp7 = *ptrInt;
   1552             tmp6 += tmp1;
   1553             tmp6 = clp[tmp6>>10];
   1554             tmp7 += 16;
   1555             tmp7 = clp[tmp7>>5];
   1556             tmp5 += 512;
   1557             tmp6++;
   1558             mb[16] = (u8)((tmp7 + tmp6) >> 1);
   1559 
   1560             tmp6 = ptrC[-(i32)partWidth];
   1561             tmp1 += tmp4;
   1562             tmp5 -= (tmp1 << 2);
   1563             tmp5 -= tmp1;
   1564             tmp3 += tmp2;
   1565             tmp5 += (tmp3 << 4);
   1566             tmp5 += (tmp3 << 2);
   1567             tmp7 = ptrInt[-(i32)partWidth];
   1568             tmp5 += tmp6;
   1569             tmp5 = clp[tmp5>>10];
   1570             tmp7 += 16;
   1571             tmp7 = clp[tmp7>>5];
   1572             tmp5++;
   1573             *mb++ = (u8)((tmp7 + tmp5) >> 1);
   1574             ptrC++;
   1575             ptrInt++;
   1576         }
   1577         mb += 4*16 - partWidth;
   1578         ptrC += 3*partWidth;
   1579         ptrV += 3*partWidth;
   1580         ptrInt += 3*partWidth;
   1581     }
   1582 
   1583 }
   1584 
   1585 
   1586 /*------------------------------------------------------------------------------
   1587 
   1588     Function: h264bsdInterpolateMidHorQuarter
   1589 
   1590         Functional description:
   1591           Function to perform horizontal and vertical interpolation of pixel
   1592           position 'i' or 'k' for a block. Overfilling is done only if needed.
   1593           Reference image (ref) is read at correct position and the predicted
   1594           part is written to macroblock array (mb)
   1595 
   1596 ------------------------------------------------------------------------------*/
   1597 
   1598 void h264bsdInterpolateMidHorQuarter(
   1599   u8 *ref,
   1600   u8 *mb,
   1601   i32 x0,
   1602   i32 y0,
   1603   u32 width,
   1604   u32 height,
   1605   u32 partWidth,
   1606   u32 partHeight,
   1607   u32 horOffset)    /* 0 for pixel i, 1 for pixel k */
   1608 {
   1609     u32 p1[21*21/4+1];
   1610     u32 x, y;
   1611     i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   1612     i32 *ptrJ, *ptrInt, *h1;
   1613     u8  *ptrC, *ptrV;
   1614     i32 table[21*16];
   1615     i32 tableWidth = (i32)partWidth+5;
   1616     const u8 *clp = h264bsdClip + 512;
   1617 
   1618     /* Code */
   1619 
   1620     ASSERT(ref);
   1621     ASSERT(mb);
   1622 
   1623     if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
   1624         (y0 < 0) || ((u32)y0+partHeight+5 > height))
   1625     {
   1626         h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
   1627                 partWidth+5, partHeight+5, partWidth+5);
   1628 
   1629         x0 = 0;
   1630         y0 = 0;
   1631         ref = (u8*)p1;
   1632         width = partWidth+5;
   1633     }
   1634 
   1635     ref += (u32)y0 * width + (u32)x0;
   1636 
   1637     h1 = table + tableWidth;
   1638     ptrC = ref + width;
   1639     ptrV = ptrC + 5*width;
   1640 
   1641     /* First step: calculate intermediate values for
   1642      * vertical interpolation */
   1643     for (y = (partHeight >> 2); y; y--)
   1644     {
   1645         for (x = (u32)tableWidth; x; x--)
   1646         {
   1647             tmp4 = ptrV[-(i32)width*2];
   1648             tmp5 = ptrV[-(i32)width];
   1649             tmp1 = ptrV[width];
   1650             tmp2 = ptrV[width*2];
   1651             tmp6 = *ptrV++;
   1652 
   1653             tmp7 = tmp4 + tmp1;
   1654             tmp2 -= (tmp7 << 2);
   1655             tmp2 -= tmp7;
   1656             tmp7 = tmp5 + tmp6;
   1657             tmp3 = ptrC[width*2];
   1658             tmp2 += (tmp7 << 4);
   1659             tmp2 += (tmp7 << 2);
   1660             tmp2 += tmp3;
   1661             h1[tableWidth*2] = tmp2;
   1662 
   1663             tmp7 = tmp3 + tmp6;
   1664             tmp1 -= (tmp7 << 2);
   1665             tmp1 -= tmp7;
   1666             tmp7 = tmp4 + tmp5;
   1667             tmp2 = ptrC[width];
   1668             tmp1 += (tmp7 << 4);
   1669             tmp1 += (tmp7 << 2);
   1670             tmp1 += tmp2;
   1671             h1[tableWidth] = tmp1;
   1672 
   1673             tmp1 = *ptrC;
   1674             tmp7 = tmp2 + tmp5;
   1675             tmp6 -= (tmp7 << 2);
   1676             tmp6 -= tmp7;
   1677             tmp7 = tmp4 + tmp3;
   1678             tmp6 += (tmp7 << 4);
   1679             tmp6 += (tmp7 << 2);
   1680             tmp6 += tmp1;
   1681             *h1 = tmp6;
   1682 
   1683             tmp6 = ptrC[-(i32)width];
   1684             tmp1 += tmp4;
   1685             tmp5 -= (tmp1 << 2);
   1686             tmp5 -= tmp1;
   1687             tmp3 += tmp2;
   1688             tmp5 += (tmp3 << 4);
   1689             tmp5 += (tmp3 << 2);
   1690             tmp5 += tmp6;
   1691             h1[-tableWidth] = tmp5;
   1692             h1++;
   1693             ptrC++;
   1694         }
   1695         ptrC += 4*width - partWidth - 5;
   1696         ptrV += 4*width - partWidth - 5;
   1697         h1 += 3*tableWidth;
   1698     }
   1699 
   1700     /* Second step: calculate horizontal interpolation and average */
   1701     ptrJ = table + 5;
   1702     /* Pointer to integer sample position, either G or H */
   1703     ptrInt = table + 2 + horOffset;
   1704     for (y = partHeight; y; y--)
   1705     {
   1706         tmp6 = *(ptrJ - 5);
   1707         tmp5 = *(ptrJ - 4);
   1708         tmp4 = *(ptrJ - 3);
   1709         tmp3 = *(ptrJ - 2);
   1710         tmp2 = *(ptrJ - 1);
   1711         for (x = (partWidth>>2); x; x--)
   1712         {
   1713             /* First pixel */
   1714             tmp6 += 512;
   1715             tmp7 = tmp3 + tmp4;
   1716             tmp6 += (tmp7 << 4);
   1717             tmp6 += (tmp7 << 2);
   1718             tmp7 = tmp2 + tmp5;
   1719             tmp1 = *ptrJ++;
   1720             tmp6 -= (tmp7 << 2);
   1721             tmp6 -= tmp7;
   1722             tmp7 = *ptrInt++;
   1723             tmp6 += tmp1;
   1724             tmp6 = clp[tmp6 >> 10];
   1725             tmp7 += 16;
   1726             tmp7 = clp[tmp7 >> 5];
   1727             tmp5 += 512;
   1728             tmp6++;
   1729             *mb++ = (u8)((tmp6 + tmp7) >> 1);
   1730             /* Second pixel */
   1731             tmp7 = tmp2 + tmp3;
   1732             tmp5 += (tmp7 << 4);
   1733             tmp5 += (tmp7 << 2);
   1734             tmp7 = tmp1 + tmp4;
   1735             tmp6 = *ptrJ++;
   1736             tmp5 -= (tmp7 << 2);
   1737             tmp5 -= tmp7;
   1738             tmp7 = *ptrInt++;
   1739             tmp5 += tmp6;
   1740             tmp5 = clp[tmp5 >> 10];
   1741             tmp7 += 16;
   1742             tmp7 = clp[tmp7 >> 5];
   1743             tmp4 += 512;
   1744             tmp5++;
   1745             *mb++ = (u8)((tmp5 + tmp7) >> 1);
   1746             /* Third pixel */
   1747             tmp7 = tmp1 + tmp2;
   1748             tmp4 += (tmp7 << 4);
   1749             tmp4 += (tmp7 << 2);
   1750             tmp7 = tmp6 + tmp3;
   1751             tmp5 = *ptrJ++;
   1752             tmp4 -= (tmp7 << 2);
   1753             tmp4 -= tmp7;
   1754             tmp7 = *ptrInt++;
   1755             tmp4 += tmp5;
   1756             tmp4 = clp[tmp4 >> 10];
   1757             tmp7 += 16;
   1758             tmp7 = clp[tmp7 >> 5];
   1759             tmp3 += 512;
   1760             tmp4++;
   1761             *mb++ = (u8)((tmp4 + tmp7) >> 1);
   1762             /* Fourth pixel */
   1763             tmp7 = tmp6 + tmp1;
   1764             tmp3 += (tmp7 << 4);
   1765             tmp3 += (tmp7 << 2);
   1766             tmp7 = tmp5 + tmp2;
   1767             tmp4 = *ptrJ++;
   1768             tmp3 -= (tmp7 << 2);
   1769             tmp3 -= tmp7;
   1770             tmp7 = *ptrInt++;
   1771             tmp3 += tmp4;
   1772             tmp3 = clp[tmp3 >> 10];
   1773             tmp7 += 16;
   1774             tmp7 = clp[tmp7 >> 5];
   1775             tmp3++;
   1776             *mb++ = (u8)((tmp3 + tmp7) >> 1);
   1777             tmp3 = tmp5;
   1778             tmp5 = tmp1;
   1779             tmp7 = tmp4;
   1780             tmp4 = tmp6;
   1781             tmp6 = tmp2;
   1782             tmp2 = tmp7;
   1783         }
   1784         ptrJ += 5;
   1785         ptrInt += 5;
   1786         mb += 16 - partWidth;
   1787     }
   1788 
   1789 }
   1790 
   1791 
   1792 /*------------------------------------------------------------------------------
   1793 
   1794     Function: h264bsdPredictSamples
   1795 
   1796         Functional description:
   1797           This function reconstructs a prediction for a macroblock partition.
   1798           The prediction is either copied or interpolated using the reference
   1799           frame and the motion vector. Both luminance and chrominance parts are
   1800           predicted. The prediction is stored in given macroblock array (data).
   1801         Inputs:
   1802           data          pointer to macroblock array (384 bytes) for output
   1803           mv            pointer to motion vector used for prediction
   1804           refPic        pointer to reference picture structure
   1805           xA            x-coordinate for current macroblock
   1806           yA            y-coordinate for current macroblock
   1807           partX         x-offset for partition in macroblock
   1808           partY         y-offset for partition in macroblock
   1809           partWidth     width of partition
   1810           partHeight    height of partition
   1811         Outputs:
   1812           data          macroblock array (16x16+8x8+8x8) where predicted
   1813                         partition is stored at correct position
   1814 
   1815 ------------------------------------------------------------------------------*/
   1816 
   1817 void h264bsdPredictSamples(
   1818   u8 *data,
   1819   mv_t *mv,
   1820   image_t *refPic,
   1821   u32 xA,
   1822   u32 yA,
   1823   u32 partX,
   1824   u32 partY,
   1825   u32 partWidth,
   1826   u32 partHeight)
   1827 
   1828 {
   1829 
   1830 /* Variables */
   1831 
   1832     u32 xFrac, yFrac, width, height;
   1833     i32 xInt, yInt;
   1834     u8 *lumaPartData;
   1835 
   1836 /* Code */
   1837 
   1838     ASSERT(data);
   1839     ASSERT(mv);
   1840     ASSERT(partWidth);
   1841     ASSERT(partHeight);
   1842     ASSERT(refPic);
   1843     ASSERT(refPic->data);
   1844     ASSERT(refPic->width);
   1845     ASSERT(refPic->height);
   1846 
   1847     /* luma */
   1848     lumaPartData = data + 16*partY + partX;
   1849 
   1850     xFrac = mv->hor & 0x3;
   1851     yFrac = mv->ver & 0x3;
   1852 
   1853     width = 16 * refPic->width;
   1854     height = 16 * refPic->height;
   1855 
   1856     xInt = (i32)xA + (i32)partX + (mv->hor >> 2);
   1857     yInt = (i32)yA + (i32)partY + (mv->ver >> 2);
   1858 
   1859     ASSERT(lumaFracPos[xFrac][yFrac] < 16);
   1860 
   1861     switch (lumaFracPos[xFrac][yFrac])
   1862     {
   1863         case 0: /* G */
   1864             h264bsdFillBlock(refPic->data, lumaPartData,
   1865                     xInt,yInt,width,height,partWidth,partHeight,16);
   1866             break;
   1867         case 1: /* d */
   1868             h264bsdInterpolateVerQuarter(refPic->data, lumaPartData,
   1869                     xInt, yInt-2, width, height, partWidth, partHeight, 0);
   1870             break;
   1871         case 2: /* h */
   1872             h264bsdInterpolateVerHalf(refPic->data, lumaPartData,
   1873                     xInt, yInt-2, width, height, partWidth, partHeight);
   1874             break;
   1875         case 3: /* n */
   1876             h264bsdInterpolateVerQuarter(refPic->data, lumaPartData,
   1877                     xInt, yInt-2, width, height, partWidth, partHeight, 1);
   1878             break;
   1879         case 4: /* a */
   1880             h264bsdInterpolateHorQuarter(refPic->data, lumaPartData,
   1881                     xInt-2, yInt, width, height, partWidth, partHeight, 0);
   1882             break;
   1883         case 5: /* e */
   1884             h264bsdInterpolateHorVerQuarter(refPic->data, lumaPartData,
   1885                     xInt-2, yInt-2, width, height, partWidth, partHeight, 0);
   1886             break;
   1887         case 6: /* i */
   1888             h264bsdInterpolateMidHorQuarter(refPic->data, lumaPartData,
   1889                     xInt-2, yInt-2, width, height, partWidth, partHeight, 0);
   1890             break;
   1891         case 7: /* p */
   1892             h264bsdInterpolateHorVerQuarter(refPic->data, lumaPartData,
   1893                     xInt-2, yInt-2, width, height, partWidth, partHeight, 2);
   1894             break;
   1895         case 8: /* b */
   1896             h264bsdInterpolateHorHalf(refPic->data, lumaPartData,
   1897                     xInt-2, yInt, width, height, partWidth, partHeight);
   1898             break;
   1899         case 9: /* f */
   1900             h264bsdInterpolateMidVerQuarter(refPic->data, lumaPartData,
   1901                     xInt-2, yInt-2, width, height, partWidth, partHeight, 0);
   1902             break;
   1903         case 10: /* j */
   1904             h264bsdInterpolateMidHalf(refPic->data, lumaPartData,
   1905                     xInt-2, yInt-2, width, height, partWidth, partHeight);
   1906             break;
   1907         case 11: /* q */
   1908             h264bsdInterpolateMidVerQuarter(refPic->data, lumaPartData,
   1909                     xInt-2, yInt-2, width, height, partWidth, partHeight, 1);
   1910             break;
   1911         case 12: /* c */
   1912             h264bsdInterpolateHorQuarter(refPic->data, lumaPartData,
   1913                     xInt-2, yInt, width, height, partWidth, partHeight, 1);
   1914             break;
   1915         case 13: /* g */
   1916             h264bsdInterpolateHorVerQuarter(refPic->data, lumaPartData,
   1917                     xInt-2, yInt-2, width, height, partWidth, partHeight, 1);
   1918             break;
   1919         case 14: /* k */
   1920             h264bsdInterpolateMidHorQuarter(refPic->data, lumaPartData,
   1921                     xInt-2, yInt-2, width, height, partWidth, partHeight, 1);
   1922             break;
   1923         default: /* case 15, r */
   1924             h264bsdInterpolateHorVerQuarter(refPic->data, lumaPartData,
   1925                     xInt-2, yInt-2, width, height, partWidth, partHeight, 3);
   1926             break;
   1927     }
   1928 
   1929     /* chroma */
   1930     PredictChroma(
   1931       data + 16*16 + (partY>>1)*8 + (partX>>1),
   1932       xA + partX,
   1933       yA + partY,
   1934       partWidth,
   1935       partHeight,
   1936       mv,
   1937       refPic);
   1938 
   1939 }
   1940 
   1941 #else /* H264DEC_OMXDL */
   1942 /*------------------------------------------------------------------------------
   1943 
   1944     Function: h264bsdPredictSamples
   1945 
   1946         Functional description:
   1947           This function reconstructs a prediction for a macroblock partition.
   1948           The prediction is either copied or interpolated using the reference
   1949           frame and the motion vector. Both luminance and chrominance parts are
   1950           predicted. The prediction is stored in given macroblock array (data).
   1951         Inputs:
   1952           data          pointer to macroblock array (384 bytes) for output
   1953           mv            pointer to motion vector used for prediction
   1954           refPic        pointer to reference picture structure
   1955           xA            x-coordinate for current macroblock
   1956           yA            y-coordinate for current macroblock
   1957           partX         x-offset for partition in macroblock
   1958           partY         y-offset for partition in macroblock
   1959           partWidth     width of partition
   1960           partHeight    height of partition
   1961         Outputs:
   1962           data          macroblock array (16x16+8x8+8x8) where predicted
   1963                         partition is stored at correct position
   1964 
   1965 ------------------------------------------------------------------------------*/
   1966 
   1967 /*lint -e{550} Symbol 'res' not accessed */
   1968 void h264bsdPredictSamples(
   1969   u8 *data,
   1970   mv_t *mv,
   1971   image_t *refPic,
   1972   u32 colAndRow,
   1973   u32 part,
   1974   u8 *pFill)
   1975 
   1976 {
   1977 
   1978 /* Variables */
   1979 
   1980     u32 xFrac, yFrac;
   1981     u32 width, height;
   1982     i32 xInt, yInt, x0, y0;
   1983     u8 *partData, *ref;
   1984     OMXSize roi;
   1985     u32 fillWidth;
   1986     u32 fillHeight;
   1987     OMXResult res;
   1988     u32 xA, yA;
   1989     u32 partX, partY;
   1990     u32 partWidth, partHeight;
   1991 
   1992 /* Code */
   1993 
   1994     ASSERT(data);
   1995     ASSERT(mv);
   1996     ASSERT(refPic);
   1997     ASSERT(refPic->data);
   1998     ASSERT(refPic->width);
   1999     ASSERT(refPic->height);
   2000 
   2001     xA = (colAndRow & 0xFFFF0000) >> 16;
   2002     yA = (colAndRow & 0x0000FFFF);
   2003 
   2004     partX = (part & 0xFF000000) >> 24;
   2005     partY = (part & 0x00FF0000) >> 16;
   2006     partWidth = (part & 0x0000FF00) >> 8;
   2007     partHeight = (part & 0x000000FF);
   2008 
   2009     ASSERT(partWidth);
   2010     ASSERT(partHeight);
   2011 
   2012     /* luma */
   2013     partData = data + 16*partY + partX;
   2014 
   2015     xFrac = mv->hor & 0x3;
   2016     yFrac = mv->ver & 0x3;
   2017 
   2018     width = 16 * refPic->width;
   2019     height = 16 * refPic->height;
   2020 
   2021     xInt = (i32)xA + (i32)partX + (mv->hor >> 2);
   2022     yInt = (i32)yA + (i32)partY + (mv->ver >> 2);
   2023 
   2024     x0 = (xFrac) ? xInt-2 : xInt;
   2025     y0 = (yFrac) ? yInt-2 : yInt;
   2026 
   2027     if (xFrac)
   2028     {
   2029         if (partWidth == 16)
   2030             fillWidth = 32;
   2031         else
   2032             fillWidth = 16;
   2033     }
   2034     else
   2035         fillWidth = (partWidth*2);
   2036     if (yFrac)
   2037         fillHeight = partHeight+5;
   2038     else
   2039         fillHeight = partHeight;
   2040 
   2041 
   2042     if ((x0 < 0) || ((u32)x0+fillWidth > width) ||
   2043         (y0 < 0) || ((u32)y0+fillHeight > height))
   2044     {
   2045         h264bsdFillBlock(refPic->data, (u8*)pFill, x0, y0, width, height,
   2046                 fillWidth, fillHeight, fillWidth);
   2047 
   2048         x0 = 0;
   2049         y0 = 0;
   2050         ref = pFill;
   2051         width = fillWidth;
   2052         if (yFrac)
   2053             ref += 2*width;
   2054         if (xFrac)
   2055             ref += 2;
   2056     }
   2057     else
   2058     {
   2059         /*lint --e(737) Loss of sign */
   2060         ref = refPic->data + yInt*width + xInt;
   2061     }
   2062     /* Luma interpolation */
   2063     roi.width = (i32)partWidth;
   2064     roi.height = (i32)partHeight;
   2065 
   2066     res = omxVCM4P10_InterpolateLuma(ref, (i32)width, partData, 16,
   2067                                         (i32)xFrac, (i32)yFrac, roi);
   2068     ASSERT(res == 0);
   2069 
   2070     /* Chroma */
   2071     width  = 8 * refPic->width;
   2072     height = 8 * refPic->height;
   2073 
   2074     x0 = ((xA + partX) >> 1) + (mv->hor >> 3);
   2075     y0 = ((yA + partY) >> 1) + (mv->ver >> 3);
   2076     xFrac = mv->hor & 0x7;
   2077     yFrac = mv->ver & 0x7;
   2078 
   2079     ref = refPic->data + 256 * refPic->width * refPic->height;
   2080 
   2081     roi.width = (i32)(partWidth >> 1);
   2082     fillWidth = ((partWidth >> 1) + 8) & ~0x7;
   2083     roi.height = (i32)(partHeight >> 1);
   2084     fillHeight = (partHeight >> 1) + 1;
   2085 
   2086     if ((x0 < 0) || ((u32)x0+fillWidth > width) ||
   2087         (y0 < 0) || ((u32)y0+fillHeight > height))
   2088     {
   2089         h264bsdFillBlock(ref, pFill, x0, y0, width, height,
   2090             fillWidth, fillHeight, fillWidth);
   2091         ref += width * height;
   2092         h264bsdFillBlock(ref, pFill + fillWidth*fillHeight,
   2093             x0, y0, width, height, fillWidth,
   2094             fillHeight, fillWidth);
   2095 
   2096         ref = pFill;
   2097         x0 = 0;
   2098         y0 = 0;
   2099         width = fillWidth;
   2100         height = fillHeight;
   2101     }
   2102 
   2103     partData = data + 16*16 + (partY>>1)*8 + (partX>>1);
   2104 
   2105     /* Chroma interpolation */
   2106     /*lint --e(737) Loss of sign */
   2107     ref += y0 * width + x0;
   2108     res = armVCM4P10_Interpolate_Chroma(ref, width, partData, 8,
   2109                             (u32)roi.width, (u32)roi.height, xFrac, yFrac);
   2110     ASSERT(res == 0);
   2111     partData += 8 * 8;
   2112     ref += height * width;
   2113     res = armVCM4P10_Interpolate_Chroma(ref, width, partData, 8,
   2114                             (u32)roi.width, (u32)roi.height, xFrac, yFrac);
   2115     ASSERT(res == 0);
   2116 
   2117 }
   2118 
   2119 #endif /* H264DEC_OMXDL */
   2120 
   2121 
   2122 /*------------------------------------------------------------------------------
   2123 
   2124     Function: FillRow1
   2125 
   2126         Functional description:
   2127           This function gets a row of reference pels in a 'normal' case when no
   2128           overfilling is necessary.
   2129 
   2130 ------------------------------------------------------------------------------*/
   2131 
   2132 static void FillRow1(
   2133   u8 *ref,
   2134   u8 *fill,
   2135   i32 left,
   2136   i32 center,
   2137   i32 right)
   2138 {
   2139 
   2140     ASSERT(ref);
   2141     ASSERT(fill);
   2142 
   2143     H264SwDecMemcpy(fill, ref, (u32)center);
   2144 
   2145     /*lint -e(715) */
   2146 }
   2147 
   2148 
   2149 /*------------------------------------------------------------------------------
   2150 
   2151     Function: h264bsdFillRow7
   2152 
   2153         Functional description:
   2154           This function gets a row of reference pels when horizontal coordinate
   2155           is partly negative or partly greater than reference picture width
   2156           (overfilling some pels on left and/or right edge).
   2157         Inputs:
   2158           ref       pointer to reference samples
   2159           left      amount of pixels to overfill on left-edge
   2160           center    amount of pixels to copy
   2161           right     amount of pixels to overfill on right-edge
   2162         Outputs:
   2163           fill      pointer where samples are stored
   2164 
   2165 ------------------------------------------------------------------------------*/
   2166 #ifndef H264DEC_NEON
   2167 void h264bsdFillRow7(
   2168   u8 *ref,
   2169   u8 *fill,
   2170   i32 left,
   2171   i32 center,
   2172   i32 right)
   2173 {
   2174     u8 tmp;
   2175 
   2176     ASSERT(ref);
   2177     ASSERT(fill);
   2178 
   2179     if (left)
   2180         tmp = *ref;
   2181 
   2182     for ( ; left; left--)
   2183         /*lint -esym(644,tmp)  tmp is initialized if used */
   2184         *fill++ = tmp;
   2185 
   2186     for ( ; center; center--)
   2187         *fill++ = *ref++;
   2188 
   2189     if (right)
   2190         tmp = ref[-1];
   2191 
   2192     for ( ; right; right--)
   2193         /*lint -esym(644,tmp)  tmp is initialized if used */
   2194         *fill++ = tmp;
   2195 }
   2196 #endif
   2197 /*------------------------------------------------------------------------------
   2198 
   2199     Function: h264bsdFillBlock
   2200 
   2201         Functional description:
   2202           This function gets a block of reference pels. It determines whether
   2203           overfilling is needed or not and repeatedly calls an appropriate
   2204           function (by using a function pointer) that fills one row the block.
   2205         Inputs:
   2206           ref               pointer to reference frame
   2207           x0                x-coordinate for block
   2208           y0                y-coordinate for block
   2209           width             width of reference frame
   2210           height            height of reference frame
   2211           blockWidth        width of block
   2212           blockHeight       height of block
   2213           fillScanLength    length of a line in output array (pixels)
   2214         Outputs:
   2215           fill              pointer to array where output block is written
   2216 
   2217 ------------------------------------------------------------------------------*/
   2218 
   2219 void h264bsdFillBlock(
   2220   u8 *ref,
   2221   u8 *fill,
   2222   i32 x0,
   2223   i32 y0,
   2224   u32 width,
   2225   u32 height,
   2226   u32 blockWidth,
   2227   u32 blockHeight,
   2228   u32 fillScanLength)
   2229 
   2230 {
   2231 
   2232 /* Variables */
   2233 
   2234     i32 xstop, ystop;
   2235     void (*fp)(u8*, u8*, i32, i32, i32);
   2236     i32 left, x, right;
   2237     i32 top, y, bottom;
   2238 
   2239 /* Code */
   2240 
   2241     ASSERT(ref);
   2242     ASSERT(fill);
   2243     ASSERT(width);
   2244     ASSERT(height);
   2245     ASSERT(fill);
   2246     ASSERT(blockWidth);
   2247     ASSERT(blockHeight);
   2248 
   2249     xstop = x0 + (i32)blockWidth;
   2250     ystop = y0 + (i32)blockHeight;
   2251 
   2252     /* Choose correct function whether overfilling on left-edge or right-edge
   2253      * is needed or not */
   2254     if (x0 >= 0 && xstop <= (i32)width)
   2255         fp = FillRow1;
   2256     else
   2257         fp = h264bsdFillRow7;
   2258 
   2259     if (ystop < 0)
   2260         y0 = -(i32)blockHeight;
   2261 
   2262     if (xstop < 0)
   2263         x0 = -(i32)blockWidth;
   2264 
   2265     if (y0 > (i32)height)
   2266         y0 = (i32)height;
   2267 
   2268     if (x0 > (i32)width)
   2269         x0 = (i32)width;
   2270 
   2271     xstop = x0 + (i32)blockWidth;
   2272     ystop = y0 + (i32)blockHeight;
   2273 
   2274     if (x0 > 0)
   2275         ref += x0;
   2276 
   2277     if (y0 > 0)
   2278         ref += y0 * (i32)width;
   2279 
   2280     left = x0 < 0 ? -x0 : 0;
   2281     right = xstop > (i32)width ? xstop - (i32)width : 0;
   2282     x = (i32)blockWidth - left - right;
   2283 
   2284     top = y0 < 0 ? -y0 : 0;
   2285     bottom = ystop > (i32)height ? ystop - (i32)height : 0;
   2286     y = (i32)blockHeight - top - bottom;
   2287 
   2288     /* Top-overfilling */
   2289     for ( ; top; top-- )
   2290     {
   2291         (*fp)(ref, fill, left, x, right);
   2292         fill += fillScanLength;
   2293     }
   2294 
   2295     /* Lines inside reference image */
   2296     for ( ; y; y-- )
   2297     {
   2298         (*fp)(ref, fill, left, x, right);
   2299         ref += width;
   2300         fill += fillScanLength;
   2301     }
   2302 
   2303     ref -= width;
   2304 
   2305     /* Bottom-overfilling */
   2306     for ( ; bottom; bottom-- )
   2307     {
   2308         (*fp)(ref, fill, left, x, right);
   2309         fill += fillScanLength;
   2310     }
   2311 }
   2312 
   2313 /*lint +e701 +e702 */
   2314 
   2315 
   2316