Home | History | Annotate | Download | only in source
      1 /*
      2  * Copyright (C) 2009 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 /*------------------------------------------------------------------------------
     18 
     19     Table of contents
     20 
     21      1. Include headers
     22      2. External compiler flags
     23      3. Module defines
     24      4. Local function prototypes
     25      5. Functions
     26 
     27 ------------------------------------------------------------------------------*/
     28 
     29 /*------------------------------------------------------------------------------
     30     1. Include headers
     31 ------------------------------------------------------------------------------*/
     32 
     33 #include "basetype.h"
     34 #include "h264bsd_reconstruct.h"
     35 #include "h264bsd_macroblock_layer.h"
     36 #include "h264bsd_image.h"
     37 #include "h264bsd_util.h"
     38 
     39 #ifdef H264DEC_OMXDL
     40 #include "omxtypes.h"
     41 #include "omxVC.h"
     42 #include "armVC.h"
     43 #endif /* H264DEC_OMXDL */
     44 
     45 #define UNUSED(x) (void)(x)
     46 
     47 /*------------------------------------------------------------------------------
     48     2. External compiler flags
     49 --------------------------------------------------------------------------------
     50 
     51 --------------------------------------------------------------------------------
     52     3. Module defines
     53 ------------------------------------------------------------------------------*/
     54 
     55 /* Switch off the following Lint messages for this file:
     56  * Info 701: Shift left of signed quantity (int)
     57  * Info 702: Shift right of signed quantity (int)
     58  */
     59 /*lint -e701 -e702 */
     60 
     61 /* Luma fractional-sample positions
     62  *
     63  *  G a b c H
     64  *  d e f g
     65  *  h i j k m
     66  *  n p q r
     67  *  M   s   N
     68  *
     69  *  G, H, M and N are integer sample positions
     70  *  a-s are fractional samples that need to be interpolated.
     71  */
     72 #ifndef H264DEC_OMXDL
     73 static const u32 lumaFracPos[4][4] = {
     74   /* G  d  h  n    a  e  i  p    b  f  j   q     c   g   k   r */
     75     {0, 1, 2, 3}, {4, 5, 6, 7}, {8, 9, 10, 11}, {12, 13, 14, 15}};
     76 #endif /* H264DEC_OMXDL */
     77 
     78 /* clipping table, defined in h264bsd_intra_prediction.c */
     79 extern const u8 h264bsdClip[];
     80 
     81 /*------------------------------------------------------------------------------
     82     4. Local function prototypes
     83 ------------------------------------------------------------------------------*/
     84 
     85 #ifndef H264DEC_OMXDL
     86 
     87 /*------------------------------------------------------------------------------
     88 
     89     Function: h264bsdInterpolateChromaHor
     90 
     91         Functional description:
     92           This function performs chroma interpolation in horizontal direction.
     93           Overfilling is done only if needed. Reference image (pRef) is
     94           read at correct position and the predicted part is written to
     95           macroblock's chrominance (predPartChroma)
     96         Inputs:
     97           pRef              pointer to reference frame Cb top-left corner
     98           x0                integer x-coordinate for prediction
     99           y0                integer y-coordinate for prediction
    100           width             width of the reference frame chrominance in pixels
    101           height            height of the reference frame chrominance in pixels
    102           xFrac             horizontal fraction for prediction in 1/8 pixels
    103           chromaPartWidth   width of the predicted part in pixels
    104           chromaPartHeight  height of the predicted part in pixels
    105         Outputs:
    106           predPartChroma    pointer where predicted part is written
    107 
    108 ------------------------------------------------------------------------------*/
    109 #ifndef H264DEC_ARM11
    110 void h264bsdInterpolateChromaHor(
    111   u8 *pRef,
    112   u8 *predPartChroma,
    113   i32 x0,
    114   i32 y0,
    115   u32 width,
    116   u32 height,
    117   u32 xFrac,
    118   u32 chromaPartWidth,
    119   u32 chromaPartHeight)
    120 {
    121 
    122 /* Variables */
    123 
    124     u32 x, y, tmp1, tmp2, tmp3, tmp4, c, val;
    125     u8 *ptrA, *cbr;
    126     u32 comp;
    127     u8 block[9*8*2];
    128 
    129 /* Code */
    130 
    131     ASSERT(predPartChroma);
    132     ASSERT(chromaPartWidth);
    133     ASSERT(chromaPartHeight);
    134     ASSERT(xFrac < 8);
    135     ASSERT(pRef);
    136 
    137     if ((x0 < 0) || ((u32)x0+chromaPartWidth+1 > width) ||
    138         (y0 < 0) || ((u32)y0+chromaPartHeight > height))
    139     {
    140         h264bsdFillBlock(pRef, block, x0, y0, width, height,
    141             chromaPartWidth + 1, chromaPartHeight, chromaPartWidth + 1);
    142         pRef += width * height;
    143         h264bsdFillBlock(pRef, block + (chromaPartWidth+1)*chromaPartHeight,
    144             x0, y0, width, height, chromaPartWidth + 1,
    145             chromaPartHeight, chromaPartWidth + 1);
    146 
    147         pRef = block;
    148         x0 = 0;
    149         y0 = 0;
    150         width = chromaPartWidth+1;
    151         height = chromaPartHeight;
    152     }
    153 
    154     val = 8 - xFrac;
    155 
    156     for (comp = 0; comp <= 1; comp++)
    157     {
    158 
    159         ptrA = pRef + (comp * height + (u32)y0) * width + x0;
    160         cbr = predPartChroma + comp * 8 * 8;
    161 
    162         /* 2x2 pels per iteration
    163          * bilinear horizontal interpolation */
    164         for (y = (chromaPartHeight >> 1); y; y--)
    165         {
    166             for (x = (chromaPartWidth >> 1); x; x--)
    167             {
    168                 tmp1 = ptrA[width];
    169                 tmp2 = *ptrA++;
    170                 tmp3 = ptrA[width];
    171                 tmp4 = *ptrA++;
    172                 c = ((val * tmp1 + xFrac * tmp3) << 3) + 32;
    173                 c >>= 6;
    174                 cbr[8] = (u8)c;
    175                 c = ((val * tmp2 + xFrac * tmp4) << 3) + 32;
    176                 c >>= 6;
    177                 *cbr++ = (u8)c;
    178                 tmp1 = ptrA[width];
    179                 tmp2 = *ptrA;
    180                 c = ((val * tmp3 + xFrac * tmp1) << 3) + 32;
    181                 c >>= 6;
    182                 cbr[8] = (u8)c;
    183                 c = ((val * tmp4 + xFrac * tmp2) << 3) + 32;
    184                 c >>= 6;
    185                 *cbr++ = (u8)c;
    186             }
    187             cbr += 2*8 - chromaPartWidth;
    188             ptrA += 2*width - chromaPartWidth;
    189         }
    190     }
    191 
    192 }
    193 
    194 /*------------------------------------------------------------------------------
    195 
    196     Function: h264bsdInterpolateChromaVer
    197 
    198         Functional description:
    199           This function performs chroma interpolation in vertical direction.
    200           Overfilling is done only if needed. Reference image (pRef) is
    201           read at correct position and the predicted part is written to
    202           macroblock's chrominance (predPartChroma)
    203 
    204 ------------------------------------------------------------------------------*/
    205 
    206 void h264bsdInterpolateChromaVer(
    207   u8 *pRef,
    208   u8 *predPartChroma,
    209   i32 x0,
    210   i32 y0,
    211   u32 width,
    212   u32 height,
    213   u32 yFrac,
    214   u32 chromaPartWidth,
    215   u32 chromaPartHeight)
    216 {
    217 
    218 /* Variables */
    219 
    220     u32 x, y, tmp1, tmp2, tmp3, c, val;
    221     u8 *ptrA, *cbr;
    222     u32 comp;
    223     u8 block[9*8*2];
    224 
    225 /* Code */
    226 
    227     ASSERT(predPartChroma);
    228     ASSERT(chromaPartWidth);
    229     ASSERT(chromaPartHeight);
    230     ASSERT(yFrac < 8);
    231     ASSERT(pRef);
    232 
    233     if ((x0 < 0) || ((u32)x0+chromaPartWidth > width) ||
    234         (y0 < 0) || ((u32)y0+chromaPartHeight+1 > height))
    235     {
    236         h264bsdFillBlock(pRef, block, x0, y0, width, height, chromaPartWidth,
    237             chromaPartHeight + 1, chromaPartWidth);
    238         pRef += width * height;
    239         h264bsdFillBlock(pRef, block + chromaPartWidth*(chromaPartHeight+1),
    240             x0, y0, width, height, chromaPartWidth,
    241             chromaPartHeight + 1, chromaPartWidth);
    242 
    243         pRef = block;
    244         x0 = 0;
    245         y0 = 0;
    246         width = chromaPartWidth;
    247         height = chromaPartHeight+1;
    248     }
    249 
    250     val = 8 - yFrac;
    251 
    252     for (comp = 0; comp <= 1; comp++)
    253     {
    254 
    255         ptrA = pRef + (comp * height + (u32)y0) * width + x0;
    256         cbr = predPartChroma + comp * 8 * 8;
    257 
    258         /* 2x2 pels per iteration
    259          * bilinear vertical interpolation */
    260         for (y = (chromaPartHeight >> 1); y; y--)
    261         {
    262             for (x = (chromaPartWidth >> 1); x; x--)
    263             {
    264                 tmp3 = ptrA[width*2];
    265                 tmp2 = ptrA[width];
    266                 tmp1 = *ptrA++;
    267                 c = ((val * tmp2 + yFrac * tmp3) << 3) + 32;
    268                 c >>= 6;
    269                 cbr[8] = (u8)c;
    270                 c = ((val * tmp1 + yFrac * tmp2) << 3) + 32;
    271                 c >>= 6;
    272                 *cbr++ = (u8)c;
    273                 tmp3 = ptrA[width*2];
    274                 tmp2 = ptrA[width];
    275                 tmp1 = *ptrA++;
    276                 c = ((val * tmp2 + yFrac * tmp3) << 3) + 32;
    277                 c >>= 6;
    278                 cbr[8] = (u8)c;
    279                 c = ((val * tmp1 + yFrac * tmp2) << 3) + 32;
    280                 c >>= 6;
    281                 *cbr++ = (u8)c;
    282             }
    283             cbr += 2*8 - chromaPartWidth;
    284             ptrA += 2*width - chromaPartWidth;
    285         }
    286     }
    287 
    288 }
    289 #endif
    290 /*------------------------------------------------------------------------------
    291 
    292     Function: h264bsdInterpolateChromaHorVer
    293 
    294         Functional description:
    295           This function performs chroma interpolation in horizontal and
    296           vertical direction. Overfilling is done only if needed. Reference
    297           image (ref) is read at correct position and the predicted part
    298           is written to macroblock's chrominance (predPartChroma)
    299 
    300 ------------------------------------------------------------------------------*/
    301 
    302 void h264bsdInterpolateChromaHorVer(
    303   u8 *ref,
    304   u8 *predPartChroma,
    305   i32 x0,
    306   i32 y0,
    307   u32 width,
    308   u32 height,
    309   u32 xFrac,
    310   u32 yFrac,
    311   u32 chromaPartWidth,
    312   u32 chromaPartHeight)
    313 {
    314     u8 block[9*9*2];
    315     u32 x, y, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, valX, valY, plus32 = 32;
    316     u32 comp;
    317     u8 *ptrA, *cbr;
    318 
    319 /* Code */
    320 
    321     ASSERT(predPartChroma);
    322     ASSERT(chromaPartWidth);
    323     ASSERT(chromaPartHeight);
    324     ASSERT(xFrac < 8);
    325     ASSERT(yFrac < 8);
    326     ASSERT(ref);
    327 
    328     if ((x0 < 0) || ((u32)x0+chromaPartWidth+1 > width) ||
    329         (y0 < 0) || ((u32)y0+chromaPartHeight+1 > height))
    330     {
    331         h264bsdFillBlock(ref, block, x0, y0, width, height,
    332             chromaPartWidth + 1, chromaPartHeight + 1, chromaPartWidth + 1);
    333         ref += width * height;
    334         h264bsdFillBlock(ref, block + (chromaPartWidth+1)*(chromaPartHeight+1),
    335             x0, y0, width, height, chromaPartWidth + 1,
    336             chromaPartHeight + 1, chromaPartWidth + 1);
    337 
    338         ref = block;
    339         x0 = 0;
    340         y0 = 0;
    341         width = chromaPartWidth+1;
    342         height = chromaPartHeight+1;
    343     }
    344 
    345     valX = 8 - xFrac;
    346     valY = 8 - yFrac;
    347 
    348     for (comp = 0; comp <= 1; comp++)
    349     {
    350 
    351         ptrA = ref + (comp * height + (u32)y0) * width + x0;
    352         cbr = predPartChroma + comp * 8 * 8;
    353 
    354         /* 2x2 pels per iteration
    355          * bilinear vertical and horizontal interpolation */
    356         for (y = (chromaPartHeight >> 1); y; y--)
    357         {
    358             tmp1 = *ptrA;
    359             tmp3 = ptrA[width];
    360             tmp5 = ptrA[width*2];
    361             tmp1 *= valY;
    362             tmp1 += tmp3 * yFrac;
    363             tmp3 *= valY;
    364             tmp3 += tmp5 * yFrac;
    365             for (x = (chromaPartWidth >> 1); x; x--)
    366             {
    367                 tmp2 = *++ptrA;
    368                 tmp4 = ptrA[width];
    369                 tmp6 = ptrA[width*2];
    370                 tmp2 *= valY;
    371                 tmp2 += tmp4 * yFrac;
    372                 tmp4 *= valY;
    373                 tmp4 += tmp6 * yFrac;
    374                 tmp1 = tmp1 * valX + plus32;
    375                 tmp3 = tmp3 * valX + plus32;
    376                 tmp1 += tmp2 * xFrac;
    377                 tmp1 >>= 6;
    378                 tmp3 += tmp4 * xFrac;
    379                 tmp3 >>= 6;
    380                 cbr[8] = (u8)tmp3;
    381                 *cbr++ = (u8)tmp1;
    382 
    383                 tmp1 = *++ptrA;
    384                 tmp3 = ptrA[width];
    385                 tmp5 = ptrA[width*2];
    386                 tmp1 *= valY;
    387                 tmp1 += tmp3 * yFrac;
    388                 tmp3 *= valY;
    389                 tmp3 += tmp5 * yFrac;
    390                 tmp2 = tmp2 * valX + plus32;
    391                 tmp4 = tmp4 * valX + plus32;
    392                 tmp2 += tmp1 * xFrac;
    393                 tmp2 >>= 6;
    394                 tmp4 += tmp3 * xFrac;
    395                 tmp4 >>= 6;
    396                 cbr[8] = (u8)tmp4;
    397                 *cbr++ = (u8)tmp2;
    398             }
    399             cbr += 2*8 - chromaPartWidth;
    400             ptrA += 2*width - chromaPartWidth;
    401         }
    402     }
    403 
    404 }
    405 
    406 /*------------------------------------------------------------------------------
    407 
    408     Function: PredictChroma
    409 
    410         Functional description:
    411           Top level chroma prediction function that calls the appropriate
    412           interpolation function. The output is written to macroblock array.
    413 
    414 ------------------------------------------------------------------------------*/
    415 
    416 static void PredictChroma(
    417   u8 *mbPartChroma,
    418   u32 xAL,
    419   u32 yAL,
    420   u32 partWidth,
    421   u32 partHeight,
    422   mv_t *mv,
    423   image_t *refPic)
    424 {
    425 
    426 /* Variables */
    427 
    428     u32 xFrac, yFrac, width, height, chromaPartWidth, chromaPartHeight;
    429     i32 xInt, yInt;
    430     u8 *ref;
    431 
    432 /* Code */
    433 
    434     ASSERT(mv);
    435     ASSERT(refPic);
    436     ASSERT(refPic->data);
    437     ASSERT(refPic->width);
    438     ASSERT(refPic->height);
    439 
    440     width  = 8 * refPic->width;
    441     height = 8 * refPic->height;
    442 
    443     xInt = (xAL >> 1) + (mv->hor >> 3);
    444     yInt = (yAL >> 1) + (mv->ver >> 3);
    445     xFrac = mv->hor & 0x7;
    446     yFrac = mv->ver & 0x7;
    447 
    448     chromaPartWidth  = partWidth >> 1;
    449     chromaPartHeight = partHeight >> 1;
    450     ref = refPic->data + 256 * refPic->width * refPic->height;
    451 
    452     if (xFrac && yFrac)
    453     {
    454         h264bsdInterpolateChromaHorVer(ref, mbPartChroma, xInt, yInt, width,
    455                 height, xFrac, yFrac, chromaPartWidth, chromaPartHeight);
    456     }
    457     else if (xFrac)
    458     {
    459         h264bsdInterpolateChromaHor(ref, mbPartChroma, xInt, yInt, width,
    460                 height, xFrac, chromaPartWidth, chromaPartHeight);
    461     }
    462     else if (yFrac)
    463     {
    464         h264bsdInterpolateChromaVer(ref, mbPartChroma, xInt, yInt, width,
    465                 height, yFrac, chromaPartWidth, chromaPartHeight);
    466     }
    467     else
    468     {
    469         h264bsdFillBlock(ref, mbPartChroma, xInt, yInt, width, height,
    470             chromaPartWidth, chromaPartHeight, 8);
    471         ref += width * height;
    472         h264bsdFillBlock(ref, mbPartChroma + 8*8, xInt, yInt, width, height,
    473             chromaPartWidth, chromaPartHeight, 8);
    474     }
    475 
    476 }
    477 
    478 
    479 /*------------------------------------------------------------------------------
    480 
    481     Function: h264bsdInterpolateVerHalf
    482 
    483         Functional description:
    484           Function to perform vertical interpolation of pixel position 'h'
    485           for a block. Overfilling is done only if needed. Reference
    486           image (ref) is read at correct position and the predicted part
    487           is written to macroblock array (mb)
    488 
    489 ------------------------------------------------------------------------------*/
    490 #ifndef H264DEC_ARM11
    491 void h264bsdInterpolateVerHalf(
    492   u8 *ref,
    493   u8 *mb,
    494   i32 x0,
    495   i32 y0,
    496   u32 width,
    497   u32 height,
    498   u32 partWidth,
    499   u32 partHeight)
    500 {
    501     u32 p1[21*21/4+1];
    502     u32 i, j;
    503     i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    504     u8 *ptrC, *ptrV;
    505     const u8 *clp = h264bsdClip + 512;
    506 
    507     /* Code */
    508 
    509     ASSERT(ref);
    510     ASSERT(mb);
    511 
    512     if ((x0 < 0) || ((u32)x0+partWidth > width) ||
    513         (y0 < 0) || ((u32)y0+partHeight+5 > height))
    514     {
    515         h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
    516                 partWidth, partHeight+5, partWidth);
    517 
    518         x0 = 0;
    519         y0 = 0;
    520         ref = (u8*)p1;
    521         width = partWidth;
    522     }
    523 
    524     ref += (u32)y0 * width + (u32)x0;
    525 
    526     ptrC = ref + width;
    527     ptrV = ptrC + 5*width;
    528 
    529     /* 4 pixels per iteration, interpolate using 5 vertical samples */
    530     for (i = (partHeight >> 2); i; i--)
    531     {
    532         /* h1 = (16 + A + 16(G+M) + 4(G+M) - 4(C+R) - (C+R) + T) >> 5 */
    533         for (j = partWidth; j; j--)
    534         {
    535             tmp4 = ptrV[-(i32)width*2];
    536             tmp5 = ptrV[-(i32)width];
    537             tmp1 = ptrV[width];
    538             tmp2 = ptrV[width*2];
    539             tmp6 = *ptrV++;
    540 
    541             tmp7 = tmp4 + tmp1;
    542             tmp2 -= (tmp7 << 2);
    543             tmp2 -= tmp7;
    544             tmp2 += 16;
    545             tmp7 = tmp5 + tmp6;
    546             tmp3 = ptrC[width*2];
    547             tmp2 += (tmp7 << 4);
    548             tmp2 += (tmp7 << 2);
    549             tmp2 += tmp3;
    550             tmp2 = clp[tmp2>>5];
    551             tmp1 += 16;
    552             mb[48] = (u8)tmp2;
    553 
    554             tmp7 = tmp3 + tmp6;
    555             tmp1 -= (tmp7 << 2);
    556             tmp1 -= tmp7;
    557             tmp7 = tmp4 + tmp5;
    558             tmp2 = ptrC[width];
    559             tmp1 += (tmp7 << 4);
    560             tmp1 += (tmp7 << 2);
    561             tmp1 += tmp2;
    562             tmp1 = clp[tmp1>>5];
    563             tmp6 += 16;
    564             mb[32] = (u8)tmp1;
    565 
    566             tmp7 = tmp2 + tmp5;
    567             tmp6 -= (tmp7 << 2);
    568             tmp6 -= tmp7;
    569             tmp7 = tmp4 + tmp3;
    570             tmp1 = *ptrC;
    571             tmp6 += (tmp7 << 4);
    572             tmp6 += (tmp7 << 2);
    573             tmp6 += tmp1;
    574             tmp6 = clp[tmp6>>5];
    575             tmp5 += 16;
    576             mb[16] = (u8)tmp6;
    577 
    578             tmp1 += tmp4;
    579             tmp5 -= (tmp1 << 2);
    580             tmp5 -= tmp1;
    581             tmp3 += tmp2;
    582             tmp6 = ptrC[-(i32)width];
    583             tmp5 += (tmp3 << 4);
    584             tmp5 += (tmp3 << 2);
    585             tmp5 += tmp6;
    586             tmp5 = clp[tmp5>>5];
    587             *mb++ = (u8)tmp5;
    588             ptrC++;
    589         }
    590         ptrC += 4*width - partWidth;
    591         ptrV += 4*width - partWidth;
    592         mb += 4*16 - partWidth;
    593     }
    594 
    595 }
    596 
    597 /*------------------------------------------------------------------------------
    598 
    599     Function: h264bsdInterpolateVerQuarter
    600 
    601         Functional description:
    602           Function to perform vertical interpolation of pixel position 'd'
    603           or 'n' for a block. Overfilling is done only if needed. Reference
    604           image (ref) is read at correct position and the predicted part
    605           is written to macroblock array (mb)
    606 
    607 ------------------------------------------------------------------------------*/
    608 
    609 void h264bsdInterpolateVerQuarter(
    610   u8 *ref,
    611   u8 *mb,
    612   i32 x0,
    613   i32 y0,
    614   u32 width,
    615   u32 height,
    616   u32 partWidth,
    617   u32 partHeight,
    618   u32 verOffset)    /* 0 for pixel d, 1 for pixel n */
    619 {
    620     u32 p1[21*21/4+1];
    621     u32 i, j;
    622     i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    623     u8 *ptrC, *ptrV, *ptrInt;
    624     const u8 *clp = h264bsdClip + 512;
    625 
    626     /* Code */
    627 
    628     ASSERT(ref);
    629     ASSERT(mb);
    630 
    631     if ((x0 < 0) || ((u32)x0+partWidth > width) ||
    632         (y0 < 0) || ((u32)y0+partHeight+5 > height))
    633     {
    634         h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
    635                 partWidth, partHeight+5, partWidth);
    636 
    637         x0 = 0;
    638         y0 = 0;
    639         ref = (u8*)p1;
    640         width = partWidth;
    641     }
    642 
    643     ref += (u32)y0 * width + (u32)x0;
    644 
    645     ptrC = ref + width;
    646     ptrV = ptrC + 5*width;
    647 
    648     /* Pointer to integer sample position, either M or R */
    649     ptrInt = ptrC + (2+verOffset)*width;
    650 
    651     /* 4 pixels per iteration
    652      * interpolate using 5 vertical samples and average between
    653      * interpolated value and integer sample value */
    654     for (i = (partHeight >> 2); i; i--)
    655     {
    656         /* h1 = (16 + A + 16(G+M) + 4(G+M) - 4(C+R) - (C+R) + T) >> 5 */
    657         for (j = partWidth; j; j--)
    658         {
    659             tmp4 = ptrV[-(i32)width*2];
    660             tmp5 = ptrV[-(i32)width];
    661             tmp1 = ptrV[width];
    662             tmp2 = ptrV[width*2];
    663             tmp6 = *ptrV++;
    664 
    665             tmp7 = tmp4 + tmp1;
    666             tmp2 -= (tmp7 << 2);
    667             tmp2 -= tmp7;
    668             tmp2 += 16;
    669             tmp7 = tmp5 + tmp6;
    670             tmp3 = ptrC[width*2];
    671             tmp2 += (tmp7 << 4);
    672             tmp2 += (tmp7 << 2);
    673             tmp2 += tmp3;
    674             tmp2 = clp[tmp2>>5];
    675             tmp7 = ptrInt[width*2];
    676             tmp1 += 16;
    677             tmp2++;
    678             mb[48] = (u8)((tmp2 + tmp7) >> 1);
    679 
    680             tmp7 = tmp3 + tmp6;
    681             tmp1 -= (tmp7 << 2);
    682             tmp1 -= tmp7;
    683             tmp7 = tmp4 + tmp5;
    684             tmp2 = ptrC[width];
    685             tmp1 += (tmp7 << 4);
    686             tmp1 += (tmp7 << 2);
    687             tmp1 += tmp2;
    688             tmp1 = clp[tmp1>>5];
    689             tmp7 = ptrInt[width];
    690             tmp6 += 16;
    691             tmp1++;
    692             mb[32] = (u8)((tmp1 + tmp7) >> 1);
    693 
    694             tmp7 = tmp2 + tmp5;
    695             tmp6 -= (tmp7 << 2);
    696             tmp6 -= tmp7;
    697             tmp7 = tmp4 + tmp3;
    698             tmp1 = *ptrC;
    699             tmp6 += (tmp7 << 4);
    700             tmp6 += (tmp7 << 2);
    701             tmp6 += tmp1;
    702             tmp6 = clp[tmp6>>5];
    703             tmp7 = *ptrInt;
    704             tmp5 += 16;
    705             tmp6++;
    706             mb[16] = (u8)((tmp6 + tmp7) >> 1);
    707 
    708             tmp1 += tmp4;
    709             tmp5 -= (tmp1 << 2);
    710             tmp5 -= tmp1;
    711             tmp3 += tmp2;
    712             tmp6 = ptrC[-(i32)width];
    713             tmp5 += (tmp3 << 4);
    714             tmp5 += (tmp3 << 2);
    715             tmp5 += tmp6;
    716             tmp5 = clp[tmp5>>5];
    717             tmp7 = ptrInt[-(i32)width];
    718             tmp5++;
    719             *mb++ = (u8)((tmp5 + tmp7) >> 1);
    720             ptrC++;
    721             ptrInt++;
    722         }
    723         ptrC += 4*width - partWidth;
    724         ptrV += 4*width - partWidth;
    725         ptrInt += 4*width - partWidth;
    726         mb += 4*16 - partWidth;
    727     }
    728 
    729 }
    730 
    731 /*------------------------------------------------------------------------------
    732 
    733     Function: h264bsdInterpolateHorHalf
    734 
    735         Functional description:
    736           Function to perform horizontal interpolation of pixel position 'b'
    737           for a block. Overfilling is done only if needed. Reference
    738           image (ref) is read at correct position and the predicted part
    739           is written to macroblock array (mb)
    740 
    741 ------------------------------------------------------------------------------*/
    742 
    743 void h264bsdInterpolateHorHalf(
    744   u8 *ref,
    745   u8 *mb,
    746   i32 x0,
    747   i32 y0,
    748   u32 width,
    749   u32 height,
    750   u32 partWidth,
    751   u32 partHeight)
    752 {
    753     u32 p1[21*21/4+1];
    754     u8 *ptrJ;
    755     u32 x, y;
    756     i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    757     const u8 *clp = h264bsdClip + 512;
    758 
    759     /* Code */
    760 
    761     ASSERT(ref);
    762     ASSERT(mb);
    763     ASSERT((partWidth&0x3) == 0);
    764     ASSERT((partHeight&0x3) == 0);
    765 
    766     if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
    767         (y0 < 0) || ((u32)y0+partHeight > height))
    768     {
    769         h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
    770                 partWidth+5, partHeight, partWidth+5);
    771 
    772         x0 = 0;
    773         y0 = 0;
    774         ref = (u8*)p1;
    775         width = partWidth + 5;
    776     }
    777 
    778     ref += (u32)y0 * width + (u32)x0;
    779 
    780     ptrJ = ref + 5;
    781 
    782     for (y = partHeight; y; y--)
    783     {
    784         tmp6 = *(ptrJ - 5);
    785         tmp5 = *(ptrJ - 4);
    786         tmp4 = *(ptrJ - 3);
    787         tmp3 = *(ptrJ - 2);
    788         tmp2 = *(ptrJ - 1);
    789 
    790         /* calculate 4 pels per iteration */
    791         for (x = (partWidth >> 2); x; x--)
    792         {
    793             /* First pixel */
    794             tmp6 += 16;
    795             tmp7 = tmp3 + tmp4;
    796             tmp6 += (tmp7 << 4);
    797             tmp6 += (tmp7 << 2);
    798             tmp7 = tmp2 + tmp5;
    799             tmp1 = *ptrJ++;
    800             tmp6 -= (tmp7 << 2);
    801             tmp6 -= tmp7;
    802             tmp6 += tmp1;
    803             tmp6 = clp[tmp6>>5];
    804             /* Second pixel */
    805             tmp5 += 16;
    806             tmp7 = tmp2 + tmp3;
    807             *mb++ = (u8)tmp6;
    808             tmp5 += (tmp7 << 4);
    809             tmp5 += (tmp7 << 2);
    810             tmp7 = tmp1 + tmp4;
    811             tmp6 = *ptrJ++;
    812             tmp5 -= (tmp7 << 2);
    813             tmp5 -= tmp7;
    814             tmp5 += tmp6;
    815             tmp5 = clp[tmp5>>5];
    816             /* Third pixel */
    817             tmp4 += 16;
    818             tmp7 = tmp1 + tmp2;
    819             *mb++ = (u8)tmp5;
    820             tmp4 += (tmp7 << 4);
    821             tmp4 += (tmp7 << 2);
    822             tmp7 = tmp6 + tmp3;
    823             tmp5 = *ptrJ++;
    824             tmp4 -= (tmp7 << 2);
    825             tmp4 -= tmp7;
    826             tmp4 += tmp5;
    827             tmp4 = clp[tmp4>>5];
    828             /* Fourth pixel */
    829             tmp3 += 16;
    830             tmp7 = tmp6 + tmp1;
    831             *mb++ = (u8)tmp4;
    832             tmp3 += (tmp7 << 4);
    833             tmp3 += (tmp7 << 2);
    834             tmp7 = tmp5 + tmp2;
    835             tmp4 = *ptrJ++;
    836             tmp3 -= (tmp7 << 2);
    837             tmp3 -= tmp7;
    838             tmp3 += tmp4;
    839             tmp3 = clp[tmp3>>5];
    840             tmp7 = tmp4;
    841             tmp4 = tmp6;
    842             tmp6 = tmp2;
    843             tmp2 = tmp7;
    844             *mb++ = (u8)tmp3;
    845             tmp3 = tmp5;
    846             tmp5 = tmp1;
    847         }
    848         ptrJ += width - partWidth;
    849         mb += 16 - partWidth;
    850     }
    851 
    852 }
    853 
    854 /*------------------------------------------------------------------------------
    855 
    856     Function: h264bsdInterpolateHorQuarter
    857 
    858         Functional description:
    859           Function to perform horizontal interpolation of pixel position 'a'
    860           or 'c' for a block. Overfilling is done only if needed. Reference
    861           image (ref) is read at correct position and the predicted part
    862           is written to macroblock array (mb)
    863 
    864 ------------------------------------------------------------------------------*/
    865 
    866 void h264bsdInterpolateHorQuarter(
    867   u8 *ref,
    868   u8 *mb,
    869   i32 x0,
    870   i32 y0,
    871   u32 width,
    872   u32 height,
    873   u32 partWidth,
    874   u32 partHeight,
    875   u32 horOffset) /* 0 for pixel a, 1 for pixel c */
    876 {
    877     u32 p1[21*21/4+1];
    878     u8 *ptrJ;
    879     u32 x, y;
    880     i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    881     const u8 *clp = h264bsdClip + 512;
    882 
    883     /* Code */
    884 
    885     ASSERT(ref);
    886     ASSERT(mb);
    887 
    888     if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
    889         (y0 < 0) || ((u32)y0+partHeight > height))
    890     {
    891         h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
    892                 partWidth+5, partHeight, partWidth+5);
    893 
    894         x0 = 0;
    895         y0 = 0;
    896         ref = (u8*)p1;
    897         width = partWidth + 5;
    898     }
    899 
    900     ref += (u32)y0 * width + (u32)x0;
    901 
    902     ptrJ = ref + 5;
    903 
    904     for (y = partHeight; y; y--)
    905     {
    906         tmp6 = *(ptrJ - 5);
    907         tmp5 = *(ptrJ - 4);
    908         tmp4 = *(ptrJ - 3);
    909         tmp3 = *(ptrJ - 2);
    910         tmp2 = *(ptrJ - 1);
    911 
    912         /* calculate 4 pels per iteration */
    913         for (x = (partWidth >> 2); x; x--)
    914         {
    915             /* First pixel */
    916             tmp6 += 16;
    917             tmp7 = tmp3 + tmp4;
    918             tmp6 += (tmp7 << 4);
    919             tmp6 += (tmp7 << 2);
    920             tmp7 = tmp2 + tmp5;
    921             tmp1 = *ptrJ++;
    922             tmp6 -= (tmp7 << 2);
    923             tmp6 -= tmp7;
    924             tmp6 += tmp1;
    925             tmp6 = clp[tmp6>>5];
    926             tmp5 += 16;
    927             if (!horOffset)
    928                 tmp6 += tmp4;
    929             else
    930                 tmp6 += tmp3;
    931             *mb++ = (u8)((tmp6 + 1) >> 1);
    932             /* Second pixel */
    933             tmp7 = tmp2 + tmp3;
    934             tmp5 += (tmp7 << 4);
    935             tmp5 += (tmp7 << 2);
    936             tmp7 = tmp1 + tmp4;
    937             tmp6 = *ptrJ++;
    938             tmp5 -= (tmp7 << 2);
    939             tmp5 -= tmp7;
    940             tmp5 += tmp6;
    941             tmp5 = clp[tmp5>>5];
    942             tmp4 += 16;
    943             if (!horOffset)
    944                 tmp5 += tmp3;
    945             else
    946                 tmp5 += tmp2;
    947             *mb++ = (u8)((tmp5 + 1) >> 1);
    948             /* Third pixel */
    949             tmp7 = tmp1 + tmp2;
    950             tmp4 += (tmp7 << 4);
    951             tmp4 += (tmp7 << 2);
    952             tmp7 = tmp6 + tmp3;
    953             tmp5 = *ptrJ++;
    954             tmp4 -= (tmp7 << 2);
    955             tmp4 -= tmp7;
    956             tmp4 += tmp5;
    957             tmp4 = clp[tmp4>>5];
    958             tmp3 += 16;
    959             if (!horOffset)
    960                 tmp4 += tmp2;
    961             else
    962                 tmp4 += tmp1;
    963             *mb++ = (u8)((tmp4 + 1) >> 1);
    964             /* Fourth pixel */
    965             tmp7 = tmp6 + tmp1;
    966             tmp3 += (tmp7 << 4);
    967             tmp3 += (tmp7 << 2);
    968             tmp7 = tmp5 + tmp2;
    969             tmp4 = *ptrJ++;
    970             tmp3 -= (tmp7 << 2);
    971             tmp3 -= tmp7;
    972             tmp3 += tmp4;
    973             tmp3 = clp[tmp3>>5];
    974             if (!horOffset)
    975                 tmp3 += tmp1;
    976             else
    977                 tmp3 += tmp6;
    978             *mb++ = (u8)((tmp3 + 1) >> 1);
    979             tmp3 = tmp5;
    980             tmp5 = tmp1;
    981             tmp7 = tmp4;
    982             tmp4 = tmp6;
    983             tmp6 = tmp2;
    984             tmp2 = tmp7;
    985         }
    986         ptrJ += width - partWidth;
    987         mb += 16 - partWidth;
    988     }
    989 
    990 }
    991 
    992 /*------------------------------------------------------------------------------
    993 
    994     Function: h264bsdInterpolateHorVerQuarter
    995 
    996         Functional description:
    997           Function to perform horizontal and vertical interpolation of pixel
    998           position 'e', 'g', 'p' or 'r' for a block. Overfilling is done only
    999           if needed. Reference image (ref) is read at correct position and
   1000           the predicted part is written to macroblock array (mb)
   1001 
   1002 ------------------------------------------------------------------------------*/
   1003 
   1004 void h264bsdInterpolateHorVerQuarter(
   1005   u8 *ref,
   1006   u8 *mb,
   1007   i32 x0,
   1008   i32 y0,
   1009   u32 width,
   1010   u32 height,
   1011   u32 partWidth,
   1012   u32 partHeight,
   1013   u32 horVerOffset) /* 0 for pixel e, 1 for pixel g,
   1014                        2 for pixel p, 3 for pixel r */
   1015 {
   1016     u32 p1[21*21/4+1];
   1017     u8 *ptrC, *ptrJ, *ptrV;
   1018     u32 x, y;
   1019     i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   1020     const u8 *clp = h264bsdClip + 512;
   1021 
   1022     /* Code */
   1023 
   1024     ASSERT(ref);
   1025     ASSERT(mb);
   1026 
   1027     if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
   1028         (y0 < 0) || ((u32)y0+partHeight+5 > height))
   1029     {
   1030         h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
   1031                 partWidth+5, partHeight+5, partWidth+5);
   1032 
   1033         x0 = 0;
   1034         y0 = 0;
   1035         ref = (u8*)p1;
   1036         width = partWidth+5;
   1037     }
   1038 
   1039     /* Ref points to G + (-2, -2) */
   1040     ref += (u32)y0 * width + (u32)x0;
   1041 
   1042     /* ptrJ points to either J or Q, depending on vertical offset */
   1043     ptrJ = ref + (((horVerOffset & 0x2) >> 1) + 2) * width + 5;
   1044 
   1045     /* ptrC points to either C or D, depending on horizontal offset */
   1046     ptrC = ref + width + 2 + (horVerOffset & 0x1);
   1047 
   1048     for (y = partHeight; y; y--)
   1049     {
   1050         tmp6 = *(ptrJ - 5);
   1051         tmp5 = *(ptrJ - 4);
   1052         tmp4 = *(ptrJ - 3);
   1053         tmp3 = *(ptrJ - 2);
   1054         tmp2 = *(ptrJ - 1);
   1055 
   1056         /* Horizontal interpolation, calculate 4 pels per iteration */
   1057         for (x = (partWidth >> 2); x; x--)
   1058         {
   1059             /* First pixel */
   1060             tmp6 += 16;
   1061             tmp7 = tmp3 + tmp4;
   1062             tmp6 += (tmp7 << 4);
   1063             tmp6 += (tmp7 << 2);
   1064             tmp7 = tmp2 + tmp5;
   1065             tmp1 = *ptrJ++;
   1066             tmp6 -= (tmp7 << 2);
   1067             tmp6 -= tmp7;
   1068             tmp6 += tmp1;
   1069             tmp6 = clp[tmp6>>5];
   1070             /* Second pixel */
   1071             tmp5 += 16;
   1072             tmp7 = tmp2 + tmp3;
   1073             *mb++ = (u8)tmp6;
   1074             tmp5 += (tmp7 << 4);
   1075             tmp5 += (tmp7 << 2);
   1076             tmp7 = tmp1 + tmp4;
   1077             tmp6 = *ptrJ++;
   1078             tmp5 -= (tmp7 << 2);
   1079             tmp5 -= tmp7;
   1080             tmp5 += tmp6;
   1081             tmp5 = clp[tmp5>>5];
   1082             /* Third pixel */
   1083             tmp4 += 16;
   1084             tmp7 = tmp1 + tmp2;
   1085             *mb++ = (u8)tmp5;
   1086             tmp4 += (tmp7 << 4);
   1087             tmp4 += (tmp7 << 2);
   1088             tmp7 = tmp6 + tmp3;
   1089             tmp5 = *ptrJ++;
   1090             tmp4 -= (tmp7 << 2);
   1091             tmp4 -= tmp7;
   1092             tmp4 += tmp5;
   1093             tmp4 = clp[tmp4>>5];
   1094             /* Fourth pixel */
   1095             tmp3 += 16;
   1096             tmp7 = tmp6 + tmp1;
   1097             *mb++ = (u8)tmp4;
   1098             tmp3 += (tmp7 << 4);
   1099             tmp3 += (tmp7 << 2);
   1100             tmp7 = tmp5 + tmp2;
   1101             tmp4 = *ptrJ++;
   1102             tmp3 -= (tmp7 << 2);
   1103             tmp3 -= tmp7;
   1104             tmp3 += tmp4;
   1105             tmp3 = clp[tmp3>>5];
   1106             tmp7 = tmp4;
   1107             tmp4 = tmp6;
   1108             tmp6 = tmp2;
   1109             tmp2 = tmp7;
   1110             *mb++ = (u8)tmp3;
   1111             tmp3 = tmp5;
   1112             tmp5 = tmp1;
   1113         }
   1114         ptrJ += width - partWidth;
   1115         mb += 16 - partWidth;
   1116     }
   1117 
   1118     mb -= 16*partHeight;
   1119     ptrV = ptrC + 5*width;
   1120 
   1121     for (y = (partHeight >> 2); y; y--)
   1122     {
   1123         /* Vertical interpolation and averaging, 4 pels per iteration */
   1124         for (x = partWidth; x; x--)
   1125         {
   1126             tmp4 = ptrV[-(i32)width*2];
   1127             tmp5 = ptrV[-(i32)width];
   1128             tmp1 = ptrV[width];
   1129             tmp2 = ptrV[width*2];
   1130             tmp6 = *ptrV++;
   1131 
   1132             tmp7 = tmp4 + tmp1;
   1133             tmp2 -= (tmp7 << 2);
   1134             tmp2 -= tmp7;
   1135             tmp2 += 16;
   1136             tmp7 = tmp5 + tmp6;
   1137             tmp3 = ptrC[width*2];
   1138             tmp2 += (tmp7 << 4);
   1139             tmp2 += (tmp7 << 2);
   1140             tmp2 += tmp3;
   1141             tmp7 = clp[tmp2>>5];
   1142             tmp2 = mb[48];
   1143             tmp1 += 16;
   1144             tmp7++;
   1145             mb[48] = (u8)((tmp2 + tmp7) >> 1);
   1146 
   1147             tmp7 = tmp3 + tmp6;
   1148             tmp1 -= (tmp7 << 2);
   1149             tmp1 -= tmp7;
   1150             tmp7 = tmp4 + tmp5;
   1151             tmp2 = ptrC[width];
   1152             tmp1 += (tmp7 << 4);
   1153             tmp1 += (tmp7 << 2);
   1154             tmp1 += tmp2;
   1155             tmp7 = clp[tmp1>>5];
   1156             tmp1 = mb[32];
   1157             tmp6 += 16;
   1158             tmp7++;
   1159             mb[32] = (u8)((tmp1 + tmp7) >> 1);
   1160 
   1161             tmp1 = *ptrC;
   1162             tmp7 = tmp2 + tmp5;
   1163             tmp6 -= (tmp7 << 2);
   1164             tmp6 -= tmp7;
   1165             tmp7 = tmp4 + tmp3;
   1166             tmp6 += (tmp7 << 4);
   1167             tmp6 += (tmp7 << 2);
   1168             tmp6 += tmp1;
   1169             tmp7 = clp[tmp6>>5];
   1170             tmp6 = mb[16];
   1171             tmp5 += 16;
   1172             tmp7++;
   1173             mb[16] = (u8)((tmp6 + tmp7) >> 1);
   1174 
   1175             tmp6 = ptrC[-(i32)width];
   1176             tmp1 += tmp4;
   1177             tmp5 -= (tmp1 << 2);
   1178             tmp5 -= tmp1;
   1179             tmp3 += tmp2;
   1180             tmp5 += (tmp3 << 4);
   1181             tmp5 += (tmp3 << 2);
   1182             tmp5 += tmp6;
   1183             tmp7 = clp[tmp5>>5];
   1184             tmp5 = *mb;
   1185             tmp7++;
   1186             *mb++ = (u8)((tmp5 + tmp7) >> 1);
   1187             ptrC++;
   1188 
   1189         }
   1190         ptrC += 4*width - partWidth;
   1191         ptrV += 4*width - partWidth;
   1192         mb += 4*16 - partWidth;
   1193     }
   1194 
   1195 }
   1196 #endif
   1197 
   1198 /*------------------------------------------------------------------------------
   1199 
   1200     Function: h264bsdInterpolateMidHalf
   1201 
   1202         Functional description:
   1203           Function to perform horizontal and vertical interpolation of pixel
   1204           position 'j' for a block. Overfilling is done only if needed.
   1205           Reference image (ref) is read at correct position and the predicted
   1206           part is written to macroblock array (mb)
   1207 
   1208 ------------------------------------------------------------------------------*/
   1209 
   1210 void h264bsdInterpolateMidHalf(
   1211   u8 *ref,
   1212   u8 *mb,
   1213   i32 x0,
   1214   i32 y0,
   1215   u32 width,
   1216   u32 height,
   1217   u32 partWidth,
   1218   u32 partHeight)
   1219 {
   1220     u32 p1[21*21/4+1];
   1221     u32 x, y;
   1222     i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   1223     i32 *ptrC, *ptrV, *b1;
   1224     u8  *ptrJ;
   1225     i32 table[21*16];
   1226     const u8 *clp = h264bsdClip + 512;
   1227 
   1228     /* Code */
   1229 
   1230     ASSERT(ref);
   1231     ASSERT(mb);
   1232 
   1233     if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
   1234         (y0 < 0) || ((u32)y0+partHeight+5 > height))
   1235     {
   1236         h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
   1237                 partWidth+5, partHeight+5, partWidth+5);
   1238 
   1239         x0 = 0;
   1240         y0 = 0;
   1241         ref = (u8*)p1;
   1242         width = partWidth+5;
   1243     }
   1244 
   1245     ref += (u32)y0 * width + (u32)x0;
   1246 
   1247     b1 = table;
   1248     ptrJ = ref + 5;
   1249 
   1250     /* First step: calculate intermediate values for
   1251      * horizontal interpolation */
   1252     for (y = partHeight + 5; y; y--)
   1253     {
   1254         tmp6 = *(ptrJ - 5);
   1255         tmp5 = *(ptrJ - 4);
   1256         tmp4 = *(ptrJ - 3);
   1257         tmp3 = *(ptrJ - 2);
   1258         tmp2 = *(ptrJ - 1);
   1259 
   1260         /* 4 pels per iteration */
   1261         for (x = (partWidth >> 2); x; x--)
   1262         {
   1263             /* First pixel */
   1264             tmp7 = tmp3 + tmp4;
   1265             tmp6 += (tmp7 << 4);
   1266             tmp6 += (tmp7 << 2);
   1267             tmp7 = tmp2 + tmp5;
   1268             tmp1 = *ptrJ++;
   1269             tmp6 -= (tmp7 << 2);
   1270             tmp6 -= tmp7;
   1271             tmp6 += tmp1;
   1272             *b1++ = tmp6;
   1273             /* Second pixel */
   1274             tmp7 = tmp2 + tmp3;
   1275             tmp5 += (tmp7 << 4);
   1276             tmp5 += (tmp7 << 2);
   1277             tmp7 = tmp1 + tmp4;
   1278             tmp6 = *ptrJ++;
   1279             tmp5 -= (tmp7 << 2);
   1280             tmp5 -= tmp7;
   1281             tmp5 += tmp6;
   1282             *b1++ = tmp5;
   1283             /* Third pixel */
   1284             tmp7 = tmp1 + tmp2;
   1285             tmp4 += (tmp7 << 4);
   1286             tmp4 += (tmp7 << 2);
   1287             tmp7 = tmp6 + tmp3;
   1288             tmp5 = *ptrJ++;
   1289             tmp4 -= (tmp7 << 2);
   1290             tmp4 -= tmp7;
   1291             tmp4 += tmp5;
   1292             *b1++ = tmp4;
   1293             /* Fourth pixel */
   1294             tmp7 = tmp6 + tmp1;
   1295             tmp3 += (tmp7 << 4);
   1296             tmp3 += (tmp7 << 2);
   1297             tmp7 = tmp5 + tmp2;
   1298             tmp4 = *ptrJ++;
   1299             tmp3 -= (tmp7 << 2);
   1300             tmp3 -= tmp7;
   1301             tmp3 += tmp4;
   1302             *b1++ = tmp3;
   1303             tmp7 = tmp4;
   1304             tmp4 = tmp6;
   1305             tmp6 = tmp2;
   1306             tmp2 = tmp7;
   1307             tmp3 = tmp5;
   1308             tmp5 = tmp1;
   1309         }
   1310         ptrJ += width - partWidth;
   1311     }
   1312 
   1313     /* Second step: calculate vertical interpolation */
   1314     ptrC = table + partWidth;
   1315     ptrV = ptrC + 5*partWidth;
   1316     for (y = (partHeight >> 2); y; y--)
   1317     {
   1318         /* 4 pels per iteration */
   1319         for (x = partWidth; x; x--)
   1320         {
   1321             tmp4 = ptrV[-(i32)partWidth*2];
   1322             tmp5 = ptrV[-(i32)partWidth];
   1323             tmp1 = ptrV[partWidth];
   1324             tmp2 = ptrV[partWidth*2];
   1325             tmp6 = *ptrV++;
   1326 
   1327             tmp7 = tmp4 + tmp1;
   1328             tmp2 -= (tmp7 << 2);
   1329             tmp2 -= tmp7;
   1330             tmp2 += 512;
   1331             tmp7 = tmp5 + tmp6;
   1332             tmp3 = ptrC[partWidth*2];
   1333             tmp2 += (tmp7 << 4);
   1334             tmp2 += (tmp7 << 2);
   1335             tmp2 += tmp3;
   1336             tmp7 = clp[tmp2>>10];
   1337             tmp1 += 512;
   1338             mb[48] = (u8)tmp7;
   1339 
   1340             tmp7 = tmp3 + tmp6;
   1341             tmp1 -= (tmp7 << 2);
   1342             tmp1 -= tmp7;
   1343             tmp7 = tmp4 + tmp5;
   1344             tmp2 = ptrC[partWidth];
   1345             tmp1 += (tmp7 << 4);
   1346             tmp1 += (tmp7 << 2);
   1347             tmp1 += tmp2;
   1348             tmp7 = clp[tmp1>>10];
   1349             tmp6 += 512;
   1350             mb[32] = (u8)tmp7;
   1351 
   1352             tmp1 = *ptrC;
   1353             tmp7 = tmp2 + tmp5;
   1354             tmp6 -= (tmp7 << 2);
   1355             tmp6 -= tmp7;
   1356             tmp7 = tmp4 + tmp3;
   1357             tmp6 += (tmp7 << 4);
   1358             tmp6 += (tmp7 << 2);
   1359             tmp6 += tmp1;
   1360             tmp7 = clp[tmp6>>10];
   1361             tmp5 += 512;
   1362             mb[16] = (u8)tmp7;
   1363 
   1364             tmp6 = ptrC[-(i32)partWidth];
   1365             tmp1 += tmp4;
   1366             tmp5 -= (tmp1 << 2);
   1367             tmp5 -= tmp1;
   1368             tmp3 += tmp2;
   1369             tmp5 += (tmp3 << 4);
   1370             tmp5 += (tmp3 << 2);
   1371             tmp5 += tmp6;
   1372             tmp7 = clp[tmp5>>10];
   1373             *mb++ = (u8)tmp7;
   1374             ptrC++;
   1375         }
   1376         mb += 4*16 - partWidth;
   1377         ptrC += 3*partWidth;
   1378         ptrV += 3*partWidth;
   1379     }
   1380 
   1381 }
   1382 
   1383 
   1384 /*------------------------------------------------------------------------------
   1385 
   1386     Function: h264bsdInterpolateMidVerQuarter
   1387 
   1388         Functional description:
   1389           Function to perform horizontal and vertical interpolation of pixel
   1390           position 'f' or 'q' for a block. Overfilling is done only if needed.
   1391           Reference image (ref) is read at correct position and the predicted
   1392           part is written to macroblock array (mb)
   1393 
   1394 ------------------------------------------------------------------------------*/
   1395 
   1396 void h264bsdInterpolateMidVerQuarter(
   1397   u8 *ref,
   1398   u8 *mb,
   1399   i32 x0,
   1400   i32 y0,
   1401   u32 width,
   1402   u32 height,
   1403   u32 partWidth,
   1404   u32 partHeight,
   1405   u32 verOffset)    /* 0 for pixel f, 1 for pixel q */
   1406 {
   1407     u32 p1[21*21/4+1];
   1408     u32 x, y;
   1409     i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   1410     i32 *ptrC, *ptrV, *ptrInt, *b1;
   1411     u8  *ptrJ;
   1412     i32 table[21*16];
   1413     const u8 *clp = h264bsdClip + 512;
   1414 
   1415     /* Code */
   1416 
   1417     ASSERT(ref);
   1418     ASSERT(mb);
   1419 
   1420     if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
   1421         (y0 < 0) || ((u32)y0+partHeight+5 > height))
   1422     {
   1423         h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
   1424                 partWidth+5, partHeight+5, partWidth+5);
   1425 
   1426         x0 = 0;
   1427         y0 = 0;
   1428         ref = (u8*)p1;
   1429         width = partWidth+5;
   1430     }
   1431 
   1432     ref += (u32)y0 * width + (u32)x0;
   1433 
   1434     b1 = table;
   1435     ptrJ = ref + 5;
   1436 
   1437     /* First step: calculate intermediate values for
   1438      * horizontal interpolation */
   1439     for (y = partHeight + 5; y; y--)
   1440     {
   1441         tmp6 = *(ptrJ - 5);
   1442         tmp5 = *(ptrJ - 4);
   1443         tmp4 = *(ptrJ - 3);
   1444         tmp3 = *(ptrJ - 2);
   1445         tmp2 = *(ptrJ - 1);
   1446         for (x = (partWidth >> 2); x; x--)
   1447         {
   1448             /* First pixel */
   1449             tmp7 = tmp3 + tmp4;
   1450             tmp6 += (tmp7 << 4);
   1451             tmp6 += (tmp7 << 2);
   1452             tmp7 = tmp2 + tmp5;
   1453             tmp1 = *ptrJ++;
   1454             tmp6 -= (tmp7 << 2);
   1455             tmp6 -= tmp7;
   1456             tmp6 += tmp1;
   1457             *b1++ = tmp6;
   1458             /* Second pixel */
   1459             tmp7 = tmp2 + tmp3;
   1460             tmp5 += (tmp7 << 4);
   1461             tmp5 += (tmp7 << 2);
   1462             tmp7 = tmp1 + tmp4;
   1463             tmp6 = *ptrJ++;
   1464             tmp5 -= (tmp7 << 2);
   1465             tmp5 -= tmp7;
   1466             tmp5 += tmp6;
   1467             *b1++ = tmp5;
   1468             /* Third pixel */
   1469             tmp7 = tmp1 + tmp2;
   1470             tmp4 += (tmp7 << 4);
   1471             tmp4 += (tmp7 << 2);
   1472             tmp7 = tmp6 + tmp3;
   1473             tmp5 = *ptrJ++;
   1474             tmp4 -= (tmp7 << 2);
   1475             tmp4 -= tmp7;
   1476             tmp4 += tmp5;
   1477             *b1++ = tmp4;
   1478             /* Fourth pixel */
   1479             tmp7 = tmp6 + tmp1;
   1480             tmp3 += (tmp7 << 4);
   1481             tmp3 += (tmp7 << 2);
   1482             tmp7 = tmp5 + tmp2;
   1483             tmp4 = *ptrJ++;
   1484             tmp3 -= (tmp7 << 2);
   1485             tmp3 -= tmp7;
   1486             tmp3 += tmp4;
   1487             *b1++ = tmp3;
   1488             tmp7 = tmp4;
   1489             tmp4 = tmp6;
   1490             tmp6 = tmp2;
   1491             tmp2 = tmp7;
   1492             tmp3 = tmp5;
   1493             tmp5 = tmp1;
   1494         }
   1495         ptrJ += width - partWidth;
   1496     }
   1497 
   1498     /* Second step: calculate vertical interpolation and average */
   1499     ptrC = table + partWidth;
   1500     ptrV = ptrC + 5*partWidth;
   1501     /* Pointer to integer sample position, either M or R */
   1502     ptrInt = ptrC + (2+verOffset)*partWidth;
   1503     for (y = (partHeight >> 2); y; y--)
   1504     {
   1505         for (x = partWidth; x; x--)
   1506         {
   1507             tmp4 = ptrV[-(i32)partWidth*2];
   1508             tmp5 = ptrV[-(i32)partWidth];
   1509             tmp1 = ptrV[partWidth];
   1510             tmp2 = ptrV[partWidth*2];
   1511             tmp6 = *ptrV++;
   1512 
   1513             tmp7 = tmp4 + tmp1;
   1514             tmp2 -= (tmp7 << 2);
   1515             tmp2 -= tmp7;
   1516             tmp2 += 512;
   1517             tmp7 = tmp5 + tmp6;
   1518             tmp3 = ptrC[partWidth*2];
   1519             tmp2 += (tmp7 << 4);
   1520             tmp2 += (tmp7 << 2);
   1521             tmp7 = ptrInt[partWidth*2];
   1522             tmp2 += tmp3;
   1523             tmp2 = clp[tmp2>>10];
   1524             tmp7 += 16;
   1525             tmp7 = clp[tmp7>>5];
   1526             tmp1 += 512;
   1527             tmp2++;
   1528             mb[48] = (u8)((tmp7 + tmp2) >> 1);
   1529 
   1530             tmp7 = tmp3 + tmp6;
   1531             tmp1 -= (tmp7 << 2);
   1532             tmp1 -= tmp7;
   1533             tmp7 = tmp4 + tmp5;
   1534             tmp2 = ptrC[partWidth];
   1535             tmp1 += (tmp7 << 4);
   1536             tmp1 += (tmp7 << 2);
   1537             tmp7 = ptrInt[partWidth];
   1538             tmp1 += tmp2;
   1539             tmp1 = clp[tmp1>>10];
   1540             tmp7 += 16;
   1541             tmp7 = clp[tmp7>>5];
   1542             tmp6 += 512;
   1543             tmp1++;
   1544             mb[32] = (u8)((tmp7 + tmp1) >> 1);
   1545 
   1546             tmp1 = *ptrC;
   1547             tmp7 = tmp2 + tmp5;
   1548             tmp6 -= (tmp7 << 2);
   1549             tmp6 -= tmp7;
   1550             tmp7 = tmp4 + tmp3;
   1551             tmp6 += (tmp7 << 4);
   1552             tmp6 += (tmp7 << 2);
   1553             tmp7 = *ptrInt;
   1554             tmp6 += tmp1;
   1555             tmp6 = clp[tmp6>>10];
   1556             tmp7 += 16;
   1557             tmp7 = clp[tmp7>>5];
   1558             tmp5 += 512;
   1559             tmp6++;
   1560             mb[16] = (u8)((tmp7 + tmp6) >> 1);
   1561 
   1562             tmp6 = ptrC[-(i32)partWidth];
   1563             tmp1 += tmp4;
   1564             tmp5 -= (tmp1 << 2);
   1565             tmp5 -= tmp1;
   1566             tmp3 += tmp2;
   1567             tmp5 += (tmp3 << 4);
   1568             tmp5 += (tmp3 << 2);
   1569             tmp7 = ptrInt[-(i32)partWidth];
   1570             tmp5 += tmp6;
   1571             tmp5 = clp[tmp5>>10];
   1572             tmp7 += 16;
   1573             tmp7 = clp[tmp7>>5];
   1574             tmp5++;
   1575             *mb++ = (u8)((tmp7 + tmp5) >> 1);
   1576             ptrC++;
   1577             ptrInt++;
   1578         }
   1579         mb += 4*16 - partWidth;
   1580         ptrC += 3*partWidth;
   1581         ptrV += 3*partWidth;
   1582         ptrInt += 3*partWidth;
   1583     }
   1584 
   1585 }
   1586 
   1587 
   1588 /*------------------------------------------------------------------------------
   1589 
   1590     Function: h264bsdInterpolateMidHorQuarter
   1591 
   1592         Functional description:
   1593           Function to perform horizontal and vertical interpolation of pixel
   1594           position 'i' or 'k' for a block. Overfilling is done only if needed.
   1595           Reference image (ref) is read at correct position and the predicted
   1596           part is written to macroblock array (mb)
   1597 
   1598 ------------------------------------------------------------------------------*/
   1599 
   1600 void h264bsdInterpolateMidHorQuarter(
   1601   u8 *ref,
   1602   u8 *mb,
   1603   i32 x0,
   1604   i32 y0,
   1605   u32 width,
   1606   u32 height,
   1607   u32 partWidth,
   1608   u32 partHeight,
   1609   u32 horOffset)    /* 0 for pixel i, 1 for pixel k */
   1610 {
   1611     u32 p1[21*21/4+1];
   1612     u32 x, y;
   1613     i32 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   1614     i32 *ptrJ, *ptrInt, *h1;
   1615     u8  *ptrC, *ptrV;
   1616     i32 table[21*16];
   1617     i32 tableWidth = (i32)partWidth+5;
   1618     const u8 *clp = h264bsdClip + 512;
   1619 
   1620     /* Code */
   1621 
   1622     ASSERT(ref);
   1623     ASSERT(mb);
   1624 
   1625     if ((x0 < 0) || ((u32)x0+partWidth+5 > width) ||
   1626         (y0 < 0) || ((u32)y0+partHeight+5 > height))
   1627     {
   1628         h264bsdFillBlock(ref, (u8*)p1, x0, y0, width, height,
   1629                 partWidth+5, partHeight+5, partWidth+5);
   1630 
   1631         x0 = 0;
   1632         y0 = 0;
   1633         ref = (u8*)p1;
   1634         width = partWidth+5;
   1635     }
   1636 
   1637     ref += (u32)y0 * width + (u32)x0;
   1638 
   1639     h1 = table + tableWidth;
   1640     ptrC = ref + width;
   1641     ptrV = ptrC + 5*width;
   1642 
   1643     /* First step: calculate intermediate values for
   1644      * vertical interpolation */
   1645     for (y = (partHeight >> 2); y; y--)
   1646     {
   1647         for (x = (u32)tableWidth; x; x--)
   1648         {
   1649             tmp4 = ptrV[-(i32)width*2];
   1650             tmp5 = ptrV[-(i32)width];
   1651             tmp1 = ptrV[width];
   1652             tmp2 = ptrV[width*2];
   1653             tmp6 = *ptrV++;
   1654 
   1655             tmp7 = tmp4 + tmp1;
   1656             tmp2 -= (tmp7 << 2);
   1657             tmp2 -= tmp7;
   1658             tmp7 = tmp5 + tmp6;
   1659             tmp3 = ptrC[width*2];
   1660             tmp2 += (tmp7 << 4);
   1661             tmp2 += (tmp7 << 2);
   1662             tmp2 += tmp3;
   1663             h1[tableWidth*2] = tmp2;
   1664 
   1665             tmp7 = tmp3 + tmp6;
   1666             tmp1 -= (tmp7 << 2);
   1667             tmp1 -= tmp7;
   1668             tmp7 = tmp4 + tmp5;
   1669             tmp2 = ptrC[width];
   1670             tmp1 += (tmp7 << 4);
   1671             tmp1 += (tmp7 << 2);
   1672             tmp1 += tmp2;
   1673             h1[tableWidth] = tmp1;
   1674 
   1675             tmp1 = *ptrC;
   1676             tmp7 = tmp2 + tmp5;
   1677             tmp6 -= (tmp7 << 2);
   1678             tmp6 -= tmp7;
   1679             tmp7 = tmp4 + tmp3;
   1680             tmp6 += (tmp7 << 4);
   1681             tmp6 += (tmp7 << 2);
   1682             tmp6 += tmp1;
   1683             *h1 = tmp6;
   1684 
   1685             tmp6 = ptrC[-(i32)width];
   1686             tmp1 += tmp4;
   1687             tmp5 -= (tmp1 << 2);
   1688             tmp5 -= tmp1;
   1689             tmp3 += tmp2;
   1690             tmp5 += (tmp3 << 4);
   1691             tmp5 += (tmp3 << 2);
   1692             tmp5 += tmp6;
   1693             h1[-tableWidth] = tmp5;
   1694             h1++;
   1695             ptrC++;
   1696         }
   1697         ptrC += 4*width - partWidth - 5;
   1698         ptrV += 4*width - partWidth - 5;
   1699         h1 += 3*tableWidth;
   1700     }
   1701 
   1702     /* Second step: calculate horizontal interpolation and average */
   1703     ptrJ = table + 5;
   1704     /* Pointer to integer sample position, either G or H */
   1705     ptrInt = table + 2 + horOffset;
   1706     for (y = partHeight; y; y--)
   1707     {
   1708         tmp6 = *(ptrJ - 5);
   1709         tmp5 = *(ptrJ - 4);
   1710         tmp4 = *(ptrJ - 3);
   1711         tmp3 = *(ptrJ - 2);
   1712         tmp2 = *(ptrJ - 1);
   1713         for (x = (partWidth>>2); x; x--)
   1714         {
   1715             /* First pixel */
   1716             tmp6 += 512;
   1717             tmp7 = tmp3 + tmp4;
   1718             tmp6 += (tmp7 << 4);
   1719             tmp6 += (tmp7 << 2);
   1720             tmp7 = tmp2 + tmp5;
   1721             tmp1 = *ptrJ++;
   1722             tmp6 -= (tmp7 << 2);
   1723             tmp6 -= tmp7;
   1724             tmp7 = *ptrInt++;
   1725             tmp6 += tmp1;
   1726             tmp6 = clp[tmp6 >> 10];
   1727             tmp7 += 16;
   1728             tmp7 = clp[tmp7 >> 5];
   1729             tmp5 += 512;
   1730             tmp6++;
   1731             *mb++ = (u8)((tmp6 + tmp7) >> 1);
   1732             /* Second pixel */
   1733             tmp7 = tmp2 + tmp3;
   1734             tmp5 += (tmp7 << 4);
   1735             tmp5 += (tmp7 << 2);
   1736             tmp7 = tmp1 + tmp4;
   1737             tmp6 = *ptrJ++;
   1738             tmp5 -= (tmp7 << 2);
   1739             tmp5 -= tmp7;
   1740             tmp7 = *ptrInt++;
   1741             tmp5 += tmp6;
   1742             tmp5 = clp[tmp5 >> 10];
   1743             tmp7 += 16;
   1744             tmp7 = clp[tmp7 >> 5];
   1745             tmp4 += 512;
   1746             tmp5++;
   1747             *mb++ = (u8)((tmp5 + tmp7) >> 1);
   1748             /* Third pixel */
   1749             tmp7 = tmp1 + tmp2;
   1750             tmp4 += (tmp7 << 4);
   1751             tmp4 += (tmp7 << 2);
   1752             tmp7 = tmp6 + tmp3;
   1753             tmp5 = *ptrJ++;
   1754             tmp4 -= (tmp7 << 2);
   1755             tmp4 -= tmp7;
   1756             tmp7 = *ptrInt++;
   1757             tmp4 += tmp5;
   1758             tmp4 = clp[tmp4 >> 10];
   1759             tmp7 += 16;
   1760             tmp7 = clp[tmp7 >> 5];
   1761             tmp3 += 512;
   1762             tmp4++;
   1763             *mb++ = (u8)((tmp4 + tmp7) >> 1);
   1764             /* Fourth pixel */
   1765             tmp7 = tmp6 + tmp1;
   1766             tmp3 += (tmp7 << 4);
   1767             tmp3 += (tmp7 << 2);
   1768             tmp7 = tmp5 + tmp2;
   1769             tmp4 = *ptrJ++;
   1770             tmp3 -= (tmp7 << 2);
   1771             tmp3 -= tmp7;
   1772             tmp7 = *ptrInt++;
   1773             tmp3 += tmp4;
   1774             tmp3 = clp[tmp3 >> 10];
   1775             tmp7 += 16;
   1776             tmp7 = clp[tmp7 >> 5];
   1777             tmp3++;
   1778             *mb++ = (u8)((tmp3 + tmp7) >> 1);
   1779             tmp3 = tmp5;
   1780             tmp5 = tmp1;
   1781             tmp7 = tmp4;
   1782             tmp4 = tmp6;
   1783             tmp6 = tmp2;
   1784             tmp2 = tmp7;
   1785         }
   1786         ptrJ += 5;
   1787         ptrInt += 5;
   1788         mb += 16 - partWidth;
   1789     }
   1790 
   1791 }
   1792 
   1793 
   1794 /*------------------------------------------------------------------------------
   1795 
   1796     Function: h264bsdPredictSamples
   1797 
   1798         Functional description:
   1799           This function reconstructs a prediction for a macroblock partition.
   1800           The prediction is either copied or interpolated using the reference
   1801           frame and the motion vector. Both luminance and chrominance parts are
   1802           predicted. The prediction is stored in given macroblock array (data).
   1803         Inputs:
   1804           data          pointer to macroblock array (384 bytes) for output
   1805           mv            pointer to motion vector used for prediction
   1806           refPic        pointer to reference picture structure
   1807           xA            x-coordinate for current macroblock
   1808           yA            y-coordinate for current macroblock
   1809           partX         x-offset for partition in macroblock
   1810           partY         y-offset for partition in macroblock
   1811           partWidth     width of partition
   1812           partHeight    height of partition
   1813         Outputs:
   1814           data          macroblock array (16x16+8x8+8x8) where predicted
   1815                         partition is stored at correct position
   1816 
   1817 ------------------------------------------------------------------------------*/
   1818 
   1819 void h264bsdPredictSamples(
   1820   u8 *data,
   1821   mv_t *mv,
   1822   image_t *refPic,
   1823   u32 xA,
   1824   u32 yA,
   1825   u32 partX,
   1826   u32 partY,
   1827   u32 partWidth,
   1828   u32 partHeight)
   1829 
   1830 {
   1831 
   1832 /* Variables */
   1833 
   1834     u32 xFrac, yFrac, width, height;
   1835     i32 xInt, yInt;
   1836     u8 *lumaPartData;
   1837 
   1838 /* Code */
   1839 
   1840     ASSERT(data);
   1841     ASSERT(mv);
   1842     ASSERT(partWidth);
   1843     ASSERT(partHeight);
   1844     ASSERT(refPic);
   1845     ASSERT(refPic->data);
   1846     ASSERT(refPic->width);
   1847     ASSERT(refPic->height);
   1848 
   1849     /* luma */
   1850     lumaPartData = data + 16*partY + partX;
   1851 
   1852     xFrac = mv->hor & 0x3;
   1853     yFrac = mv->ver & 0x3;
   1854 
   1855     width = 16 * refPic->width;
   1856     height = 16 * refPic->height;
   1857 
   1858     xInt = (i32)xA + (i32)partX + (mv->hor >> 2);
   1859     yInt = (i32)yA + (i32)partY + (mv->ver >> 2);
   1860 
   1861     ASSERT(lumaFracPos[xFrac][yFrac] < 16);
   1862 
   1863     switch (lumaFracPos[xFrac][yFrac])
   1864     {
   1865         case 0: /* G */
   1866             h264bsdFillBlock(refPic->data, lumaPartData,
   1867                     xInt,yInt,width,height,partWidth,partHeight,16);
   1868             break;
   1869         case 1: /* d */
   1870             h264bsdInterpolateVerQuarter(refPic->data, lumaPartData,
   1871                     xInt, yInt-2, width, height, partWidth, partHeight, 0);
   1872             break;
   1873         case 2: /* h */
   1874             h264bsdInterpolateVerHalf(refPic->data, lumaPartData,
   1875                     xInt, yInt-2, width, height, partWidth, partHeight);
   1876             break;
   1877         case 3: /* n */
   1878             h264bsdInterpolateVerQuarter(refPic->data, lumaPartData,
   1879                     xInt, yInt-2, width, height, partWidth, partHeight, 1);
   1880             break;
   1881         case 4: /* a */
   1882             h264bsdInterpolateHorQuarter(refPic->data, lumaPartData,
   1883                     xInt-2, yInt, width, height, partWidth, partHeight, 0);
   1884             break;
   1885         case 5: /* e */
   1886             h264bsdInterpolateHorVerQuarter(refPic->data, lumaPartData,
   1887                     xInt-2, yInt-2, width, height, partWidth, partHeight, 0);
   1888             break;
   1889         case 6: /* i */
   1890             h264bsdInterpolateMidHorQuarter(refPic->data, lumaPartData,
   1891                     xInt-2, yInt-2, width, height, partWidth, partHeight, 0);
   1892             break;
   1893         case 7: /* p */
   1894             h264bsdInterpolateHorVerQuarter(refPic->data, lumaPartData,
   1895                     xInt-2, yInt-2, width, height, partWidth, partHeight, 2);
   1896             break;
   1897         case 8: /* b */
   1898             h264bsdInterpolateHorHalf(refPic->data, lumaPartData,
   1899                     xInt-2, yInt, width, height, partWidth, partHeight);
   1900             break;
   1901         case 9: /* f */
   1902             h264bsdInterpolateMidVerQuarter(refPic->data, lumaPartData,
   1903                     xInt-2, yInt-2, width, height, partWidth, partHeight, 0);
   1904             break;
   1905         case 10: /* j */
   1906             h264bsdInterpolateMidHalf(refPic->data, lumaPartData,
   1907                     xInt-2, yInt-2, width, height, partWidth, partHeight);
   1908             break;
   1909         case 11: /* q */
   1910             h264bsdInterpolateMidVerQuarter(refPic->data, lumaPartData,
   1911                     xInt-2, yInt-2, width, height, partWidth, partHeight, 1);
   1912             break;
   1913         case 12: /* c */
   1914             h264bsdInterpolateHorQuarter(refPic->data, lumaPartData,
   1915                     xInt-2, yInt, width, height, partWidth, partHeight, 1);
   1916             break;
   1917         case 13: /* g */
   1918             h264bsdInterpolateHorVerQuarter(refPic->data, lumaPartData,
   1919                     xInt-2, yInt-2, width, height, partWidth, partHeight, 1);
   1920             break;
   1921         case 14: /* k */
   1922             h264bsdInterpolateMidHorQuarter(refPic->data, lumaPartData,
   1923                     xInt-2, yInt-2, width, height, partWidth, partHeight, 1);
   1924             break;
   1925         default: /* case 15, r */
   1926             h264bsdInterpolateHorVerQuarter(refPic->data, lumaPartData,
   1927                     xInt-2, yInt-2, width, height, partWidth, partHeight, 3);
   1928             break;
   1929     }
   1930 
   1931     /* chroma */
   1932     PredictChroma(
   1933       data + 16*16 + (partY>>1)*8 + (partX>>1),
   1934       xA + partX,
   1935       yA + partY,
   1936       partWidth,
   1937       partHeight,
   1938       mv,
   1939       refPic);
   1940 
   1941 }
   1942 
   1943 #else /* H264DEC_OMXDL */
   1944 /*------------------------------------------------------------------------------
   1945 
   1946     Function: h264bsdPredictSamples
   1947 
   1948         Functional description:
   1949           This function reconstructs a prediction for a macroblock partition.
   1950           The prediction is either copied or interpolated using the reference
   1951           frame and the motion vector. Both luminance and chrominance parts are
   1952           predicted. The prediction is stored in given macroblock array (data).
   1953         Inputs:
   1954           data          pointer to macroblock array (384 bytes) for output
   1955           mv            pointer to motion vector used for prediction
   1956           refPic        pointer to reference picture structure
   1957           xA            x-coordinate for current macroblock
   1958           yA            y-coordinate for current macroblock
   1959           partX         x-offset for partition in macroblock
   1960           partY         y-offset for partition in macroblock
   1961           partWidth     width of partition
   1962           partHeight    height of partition
   1963         Outputs:
   1964           data          macroblock array (16x16+8x8+8x8) where predicted
   1965                         partition is stored at correct position
   1966 
   1967 ------------------------------------------------------------------------------*/
   1968 
   1969 /*lint -e{550} Symbol 'res' not accessed */
   1970 void h264bsdPredictSamples(
   1971   u8 *data,
   1972   mv_t *mv,
   1973   image_t *refPic,
   1974   u32 colAndRow,
   1975   u32 part,
   1976   u8 *pFill)
   1977 
   1978 {
   1979 
   1980 /* Variables */
   1981 
   1982     u32 xFrac, yFrac;
   1983     u32 width, height;
   1984     i32 xInt, yInt, x0, y0;
   1985     u8 *partData, *ref;
   1986     OMXSize roi;
   1987     u32 fillWidth;
   1988     u32 fillHeight;
   1989     OMXResult res;
   1990     u32 xA, yA;
   1991     u32 partX, partY;
   1992     u32 partWidth, partHeight;
   1993 
   1994 /* Code */
   1995 
   1996     ASSERT(data);
   1997     ASSERT(mv);
   1998     ASSERT(refPic);
   1999     ASSERT(refPic->data);
   2000     ASSERT(refPic->width);
   2001     ASSERT(refPic->height);
   2002 
   2003     xA = (colAndRow & 0xFFFF0000) >> 16;
   2004     yA = (colAndRow & 0x0000FFFF);
   2005 
   2006     partX = (part & 0xFF000000) >> 24;
   2007     partY = (part & 0x00FF0000) >> 16;
   2008     partWidth = (part & 0x0000FF00) >> 8;
   2009     partHeight = (part & 0x000000FF);
   2010 
   2011     ASSERT(partWidth);
   2012     ASSERT(partHeight);
   2013 
   2014     /* luma */
   2015     partData = data + 16*partY + partX;
   2016 
   2017     xFrac = mv->hor & 0x3;
   2018     yFrac = mv->ver & 0x3;
   2019 
   2020     width = 16 * refPic->width;
   2021     height = 16 * refPic->height;
   2022 
   2023     xInt = (i32)xA + (i32)partX + (mv->hor >> 2);
   2024     yInt = (i32)yA + (i32)partY + (mv->ver >> 2);
   2025 
   2026     x0 = (xFrac) ? xInt-2 : xInt;
   2027     y0 = (yFrac) ? yInt-2 : yInt;
   2028 
   2029     if (xFrac)
   2030     {
   2031         if (partWidth == 16)
   2032             fillWidth = 32;
   2033         else
   2034             fillWidth = 16;
   2035     }
   2036     else
   2037         fillWidth = (partWidth*2);
   2038     if (yFrac)
   2039         fillHeight = partHeight+5;
   2040     else
   2041         fillHeight = partHeight;
   2042 
   2043 
   2044     if ((x0 < 0) || ((u32)x0+fillWidth > width) ||
   2045         (y0 < 0) || ((u32)y0+fillHeight > height))
   2046     {
   2047         h264bsdFillBlock(refPic->data, (u8*)pFill, x0, y0, width, height,
   2048                 fillWidth, fillHeight, fillWidth);
   2049 
   2050         x0 = 0;
   2051         y0 = 0;
   2052         ref = pFill;
   2053         width = fillWidth;
   2054         if (yFrac)
   2055             ref += 2*width;
   2056         if (xFrac)
   2057             ref += 2;
   2058     }
   2059     else
   2060     {
   2061         /*lint --e(737) Loss of sign */
   2062         ref = refPic->data + yInt*width + xInt;
   2063     }
   2064     /* Luma interpolation */
   2065     roi.width = (i32)partWidth;
   2066     roi.height = (i32)partHeight;
   2067 
   2068     res = omxVCM4P10_InterpolateLuma(ref, (i32)width, partData, 16,
   2069                                         (i32)xFrac, (i32)yFrac, roi);
   2070     ASSERT(res == 0);
   2071 
   2072     /* Chroma */
   2073     width  = 8 * refPic->width;
   2074     height = 8 * refPic->height;
   2075 
   2076     x0 = ((xA + partX) >> 1) + (mv->hor >> 3);
   2077     y0 = ((yA + partY) >> 1) + (mv->ver >> 3);
   2078     xFrac = mv->hor & 0x7;
   2079     yFrac = mv->ver & 0x7;
   2080 
   2081     ref = refPic->data + 256 * refPic->width * refPic->height;
   2082 
   2083     roi.width = (i32)(partWidth >> 1);
   2084     fillWidth = ((partWidth >> 1) + 8) & ~0x7;
   2085     roi.height = (i32)(partHeight >> 1);
   2086     fillHeight = (partHeight >> 1) + 1;
   2087 
   2088     if ((x0 < 0) || ((u32)x0+fillWidth > width) ||
   2089         (y0 < 0) || ((u32)y0+fillHeight > height))
   2090     {
   2091         h264bsdFillBlock(ref, pFill, x0, y0, width, height,
   2092             fillWidth, fillHeight, fillWidth);
   2093         ref += width * height;
   2094         h264bsdFillBlock(ref, pFill + fillWidth*fillHeight,
   2095             x0, y0, width, height, fillWidth,
   2096             fillHeight, fillWidth);
   2097 
   2098         ref = pFill;
   2099         x0 = 0;
   2100         y0 = 0;
   2101         width = fillWidth;
   2102         height = fillHeight;
   2103     }
   2104 
   2105     partData = data + 16*16 + (partY>>1)*8 + (partX>>1);
   2106 
   2107     /* Chroma interpolation */
   2108     /*lint --e(737) Loss of sign */
   2109     ref += y0 * width + x0;
   2110     res = armVCM4P10_Interpolate_Chroma(ref, width, partData, 8,
   2111                             (u32)roi.width, (u32)roi.height, xFrac, yFrac);
   2112     ASSERT(res == 0);
   2113     partData += 8 * 8;
   2114     ref += height * width;
   2115     res = armVCM4P10_Interpolate_Chroma(ref, width, partData, 8,
   2116                             (u32)roi.width, (u32)roi.height, xFrac, yFrac);
   2117     ASSERT(res == 0);
   2118 
   2119 }
   2120 
   2121 #endif /* H264DEC_OMXDL */
   2122 
   2123 
   2124 /*------------------------------------------------------------------------------
   2125 
   2126     Function: FillRow1
   2127 
   2128         Functional description:
   2129           This function gets a row of reference pels in a 'normal' case when no
   2130           overfilling is necessary.
   2131 
   2132 ------------------------------------------------------------------------------*/
   2133 
   2134 static void FillRow1(
   2135   u8 *ref,
   2136   u8 *fill,
   2137   i32 left,
   2138   i32 center,
   2139   i32 right)
   2140 {
   2141     UNUSED(left);
   2142     UNUSED(right);
   2143     ASSERT(ref);
   2144     ASSERT(fill);
   2145 
   2146     H264SwDecMemcpy(fill, ref, (u32)center);
   2147 
   2148     /*lint -e(715) */
   2149 }
   2150 
   2151 
   2152 /*------------------------------------------------------------------------------
   2153 
   2154     Function: h264bsdFillRow7
   2155 
   2156         Functional description:
   2157           This function gets a row of reference pels when horizontal coordinate
   2158           is partly negative or partly greater than reference picture width
   2159           (overfilling some pels on left and/or right edge).
   2160         Inputs:
   2161           ref       pointer to reference samples
   2162           left      amount of pixels to overfill on left-edge
   2163           center    amount of pixels to copy
   2164           right     amount of pixels to overfill on right-edge
   2165         Outputs:
   2166           fill      pointer where samples are stored
   2167 
   2168 ------------------------------------------------------------------------------*/
   2169 #ifndef H264DEC_NEON
   2170 void h264bsdFillRow7(
   2171   u8 *ref,
   2172   u8 *fill,
   2173   i32 left,
   2174   i32 center,
   2175   i32 right)
   2176 {
   2177     u8 tmp;
   2178 
   2179     ASSERT(ref);
   2180     ASSERT(fill);
   2181 
   2182     if (left)
   2183         tmp = *ref;
   2184 
   2185     for ( ; left; left--)
   2186         /*lint -esym(644,tmp)  tmp is initialized if used */
   2187         *fill++ = tmp;
   2188 
   2189     for ( ; center; center--)
   2190         *fill++ = *ref++;
   2191 
   2192     if (right)
   2193         tmp = ref[-1];
   2194 
   2195     for ( ; right; right--)
   2196         /*lint -esym(644,tmp)  tmp is initialized if used */
   2197         *fill++ = tmp;
   2198 }
   2199 #endif
   2200 /*------------------------------------------------------------------------------
   2201 
   2202     Function: h264bsdFillBlock
   2203 
   2204         Functional description:
   2205           This function gets a block of reference pels. It determines whether
   2206           overfilling is needed or not and repeatedly calls an appropriate
   2207           function (by using a function pointer) that fills one row the block.
   2208         Inputs:
   2209           ref               pointer to reference frame
   2210           x0                x-coordinate for block
   2211           y0                y-coordinate for block
   2212           width             width of reference frame
   2213           height            height of reference frame
   2214           blockWidth        width of block
   2215           blockHeight       height of block
   2216           fillScanLength    length of a line in output array (pixels)
   2217         Outputs:
   2218           fill              pointer to array where output block is written
   2219 
   2220 ------------------------------------------------------------------------------*/
   2221 
   2222 void h264bsdFillBlock(
   2223   u8 *ref,
   2224   u8 *fill,
   2225   i32 x0,
   2226   i32 y0,
   2227   u32 width,
   2228   u32 height,
   2229   u32 blockWidth,
   2230   u32 blockHeight,
   2231   u32 fillScanLength)
   2232 
   2233 {
   2234 
   2235 /* Variables */
   2236 
   2237     i32 xstop, ystop;
   2238     void (*fp)(u8*, u8*, i32, i32, i32);
   2239     i32 left, x, right;
   2240     i32 top, y, bottom;
   2241 
   2242 /* Code */
   2243 
   2244     ASSERT(ref);
   2245     ASSERT(fill);
   2246     ASSERT(width);
   2247     ASSERT(height);
   2248     ASSERT(fill);
   2249     ASSERT(blockWidth);
   2250     ASSERT(blockHeight);
   2251 
   2252     xstop = x0 + (i32)blockWidth;
   2253     ystop = y0 + (i32)blockHeight;
   2254 
   2255     /* Choose correct function whether overfilling on left-edge or right-edge
   2256      * is needed or not */
   2257     if (x0 >= 0 && xstop <= (i32)width)
   2258         fp = FillRow1;
   2259     else
   2260         fp = h264bsdFillRow7;
   2261 
   2262     if (ystop < 0)
   2263         y0 = -(i32)blockHeight;
   2264 
   2265     if (xstop < 0)
   2266         x0 = -(i32)blockWidth;
   2267 
   2268     if (y0 > (i32)height)
   2269         y0 = (i32)height;
   2270 
   2271     if (x0 > (i32)width)
   2272         x0 = (i32)width;
   2273 
   2274     xstop = x0 + (i32)blockWidth;
   2275     ystop = y0 + (i32)blockHeight;
   2276 
   2277     if (x0 > 0)
   2278         ref += x0;
   2279 
   2280     if (y0 > 0)
   2281         ref += y0 * (i32)width;
   2282 
   2283     left = x0 < 0 ? -x0 : 0;
   2284     right = xstop > (i32)width ? xstop - (i32)width : 0;
   2285     x = (i32)blockWidth - left - right;
   2286 
   2287     top = y0 < 0 ? -y0 : 0;
   2288     bottom = ystop > (i32)height ? ystop - (i32)height : 0;
   2289     y = (i32)blockHeight - top - bottom;
   2290 
   2291     /* Top-overfilling */
   2292     for ( ; top; top-- )
   2293     {
   2294         (*fp)(ref, fill, left, x, right);
   2295         fill += fillScanLength;
   2296     }
   2297 
   2298     /* Lines inside reference image */
   2299     for ( ; y; y-- )
   2300     {
   2301         (*fp)(ref, fill, left, x, right);
   2302         ref += width;
   2303         fill += fillScanLength;
   2304     }
   2305 
   2306     ref -= width;
   2307 
   2308     /* Bottom-overfilling */
   2309     for ( ; bottom; bottom-- )
   2310     {
   2311         (*fp)(ref, fill, left, x, right);
   2312         fill += fillScanLength;
   2313     }
   2314 }
   2315 
   2316 /*lint +e701 +e702 */
   2317 
   2318 
   2319