Home | History | Annotate | Download | only in src
      1 /* ------------------------------------------------------------------
      2  * Copyright (C) 1998-2009 PacketVideo
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
     13  * express or implied.
     14  * See the License for the specific language governing permissions
     15  * and limitations under the License.
     16  * -------------------------------------------------------------------
     17  */
     18 #include "avcdec_lib.h"
     19 
     20 
     21 #define CLIP_RESULT(x)      if((uint)x > 0xFF){ \
     22                  x = 0xFF & (~(x>>31));}
     23 
     24 /* (blkwidth << 2) + (dy << 1) + dx */
     25 static void (*const ChromaMC_SIMD[8])(uint8 *, int , int , int , uint8 *, int, int , int) =
     26 {
     27     &ChromaFullMC_SIMD,
     28     &ChromaHorizontalMC_SIMD,
     29     &ChromaVerticalMC_SIMD,
     30     &ChromaDiagonalMC_SIMD,
     31     &ChromaFullMC_SIMD,
     32     &ChromaHorizontalMC2_SIMD,
     33     &ChromaVerticalMC2_SIMD,
     34     &ChromaDiagonalMC2_SIMD
     35 };
     36 /* Perform motion prediction and compensation with residue if exist. */
     37 void InterMBPrediction(AVCCommonObj *video)
     38 {
     39     AVCMacroblock *currMB = video->currMB;
     40     AVCPictureData *currPic = video->currPic;
     41     int mbPartIdx, subMbPartIdx;
     42     int ref_idx;
     43     int offset_MbPart_indx = 0;
     44     int16 *mv;
     45     uint32 x_pos, y_pos;
     46     uint8 *curL, *curCb, *curCr;
     47     uint8 *ref_l, *ref_Cb, *ref_Cr;
     48     uint8 *predBlock, *predCb, *predCr;
     49     int block_x, block_y, offset_x, offset_y, offsetP, offset;
     50     int x_position = (video->mb_x << 4);
     51     int y_position = (video->mb_y << 4);
     52     int MbHeight, MbWidth, mbPartIdx_X, mbPartIdx_Y, offset_indx;
     53     int picWidth = currPic->pitch;
     54     int picHeight = currPic->height;
     55     int16 *dataBlock;
     56     uint32 cbp4x4;
     57     uint32 tmp_word;
     58 
     59     tmp_word = y_position * picWidth;
     60     curL = currPic->Sl + tmp_word + x_position;
     61     offset = (tmp_word >> 2) + (x_position >> 1);
     62     curCb = currPic->Scb + offset;
     63     curCr = currPic->Scr + offset;
     64 
     65 #ifdef USE_PRED_BLOCK
     66     predBlock = video->pred + 84;
     67     predCb = video->pred + 452;
     68     predCr = video->pred + 596;
     69 #else
     70     predBlock = curL;
     71     predCb = curCb;
     72     predCr = curCr;
     73 #endif
     74 
     75     GetMotionVectorPredictor(video, false);
     76 
     77     for (mbPartIdx = 0; mbPartIdx < currMB->NumMbPart; mbPartIdx++)
     78     {
     79         MbHeight = currMB->SubMbPartHeight[mbPartIdx];
     80         MbWidth = currMB->SubMbPartWidth[mbPartIdx];
     81         mbPartIdx_X = ((mbPartIdx + offset_MbPart_indx) & 1);
     82         mbPartIdx_Y = (mbPartIdx + offset_MbPart_indx) >> 1;
     83         ref_idx = currMB->ref_idx_L0[(mbPartIdx_Y << 1) + mbPartIdx_X];
     84         offset_indx = 0;
     85 
     86         ref_l = video->RefPicList0[ref_idx]->Sl;
     87         ref_Cb = video->RefPicList0[ref_idx]->Scb;
     88         ref_Cr = video->RefPicList0[ref_idx]->Scr;
     89 
     90         for (subMbPartIdx = 0; subMbPartIdx < currMB->NumSubMbPart[mbPartIdx]; subMbPartIdx++)
     91         {
     92             block_x = (mbPartIdx_X << 1) + ((subMbPartIdx + offset_indx) & 1);  // check this
     93             block_y = (mbPartIdx_Y << 1) + (((subMbPartIdx + offset_indx) >> 1) & 1);
     94             mv = (int16*)(currMB->mvL0 + block_x + (block_y << 2));
     95             offset_x = x_position + (block_x << 2);
     96             offset_y = y_position + (block_y << 2);
     97             x_pos = (offset_x << 2) + *mv++;   /*quarter pel */
     98             y_pos = (offset_y << 2) + *mv;   /*quarter pel */
     99 
    100             //offset = offset_y * currPic->width;
    101             //offsetC = (offset >> 2) + (offset_x >> 1);
    102 #ifdef USE_PRED_BLOCK
    103             offsetP = (block_y * 80) + (block_x << 2);
    104             LumaMotionComp(ref_l, picWidth, picHeight, x_pos, y_pos,
    105                            /*comp_Sl + offset + offset_x,*/
    106                            predBlock + offsetP, 20, MbWidth, MbHeight);
    107 #else
    108             offsetP = (block_y << 2) * picWidth + (block_x << 2);
    109             LumaMotionComp(ref_l, picWidth, picHeight, x_pos, y_pos,
    110                            /*comp_Sl + offset + offset_x,*/
    111                            predBlock + offsetP, picWidth, MbWidth, MbHeight);
    112 #endif
    113 
    114 #ifdef USE_PRED_BLOCK
    115             offsetP = (block_y * 24) + (block_x << 1);
    116             ChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
    117                              /*comp_Scb +  offsetC,*/
    118                              predCb + offsetP, 12, MbWidth >> 1, MbHeight >> 1);
    119             ChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
    120                              /*comp_Scr +  offsetC,*/
    121                              predCr + offsetP, 12, MbWidth >> 1, MbHeight >> 1);
    122 #else
    123             offsetP = (block_y * picWidth) + (block_x << 1);
    124             ChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
    125                              /*comp_Scb +  offsetC,*/
    126                              predCb + offsetP, picWidth >> 1, MbWidth >> 1, MbHeight >> 1);
    127             ChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
    128                              /*comp_Scr +  offsetC,*/
    129                              predCr + offsetP, picWidth >> 1, MbWidth >> 1, MbHeight >> 1);
    130 #endif
    131 
    132             offset_indx = currMB->SubMbPartWidth[mbPartIdx] >> 3;
    133         }
    134         offset_MbPart_indx = currMB->MbPartWidth >> 4;
    135     }
    136 
    137     /* used in decoder, used to be if(!encFlag)  */
    138 
    139     /* transform in raster scan order */
    140     dataBlock = video->block;
    141     cbp4x4 = video->cbp4x4;
    142     /* luma */
    143     for (block_y = 4; block_y > 0; block_y--)
    144     {
    145         for (block_x = 4; block_x > 0; block_x--)
    146         {
    147 #ifdef USE_PRED_BLOCK
    148             if (cbp4x4&1)
    149             {
    150                 itrans(dataBlock, predBlock, predBlock, 20);
    151             }
    152 #else
    153             if (cbp4x4&1)
    154             {
    155                 itrans(dataBlock, curL, curL, picWidth);
    156             }
    157 #endif
    158             cbp4x4 >>= 1;
    159             dataBlock += 4;
    160 #ifdef USE_PRED_BLOCK
    161             predBlock += 4;
    162 #else
    163             curL += 4;
    164 #endif
    165         }
    166         dataBlock += 48;
    167 #ifdef USE_PRED_BLOCK
    168         predBlock += 64;
    169 #else
    170         curL += ((picWidth << 2) - 16);
    171 #endif
    172     }
    173 
    174     /* chroma */
    175     picWidth = (picWidth >> 1);
    176     for (block_y = 2; block_y > 0; block_y--)
    177     {
    178         for (block_x = 2; block_x > 0; block_x--)
    179         {
    180 #ifdef USE_PRED_BLOCK
    181             if (cbp4x4&1)
    182             {
    183                 ictrans(dataBlock, predCb, predCb, 12);
    184             }
    185 #else
    186             if (cbp4x4&1)
    187             {
    188                 ictrans(dataBlock, curCb, curCb, picWidth);
    189             }
    190 #endif
    191             cbp4x4 >>= 1;
    192             dataBlock += 4;
    193 #ifdef USE_PRED_BLOCK
    194             predCb += 4;
    195 #else
    196             curCb += 4;
    197 #endif
    198         }
    199         for (block_x = 2; block_x > 0; block_x--)
    200         {
    201 #ifdef USE_PRED_BLOCK
    202             if (cbp4x4&1)
    203             {
    204                 ictrans(dataBlock, predCr, predCr, 12);
    205             }
    206 #else
    207             if (cbp4x4&1)
    208             {
    209                 ictrans(dataBlock, curCr, curCr, picWidth);
    210             }
    211 #endif
    212             cbp4x4 >>= 1;
    213             dataBlock += 4;
    214 #ifdef USE_PRED_BLOCK
    215             predCr += 4;
    216 #else
    217             curCr += 4;
    218 #endif
    219         }
    220         dataBlock += 48;
    221 #ifdef USE_PRED_BLOCK
    222         predCb += 40;
    223         predCr += 40;
    224 #else
    225         curCb += ((picWidth << 2) - 8);
    226         curCr += ((picWidth << 2) - 8);
    227 #endif
    228     }
    229 
    230 #ifdef MB_BASED_DEBLOCK
    231     SaveNeighborForIntraPred(video, offset);
    232 #endif
    233 
    234     return ;
    235 }
    236 
    237 
    238 /* preform the actual  motion comp here */
    239 void LumaMotionComp(uint8 *ref, int picwidth, int picheight,
    240                     int x_pos, int y_pos,
    241                     uint8 *pred, int pred_pitch,
    242                     int blkwidth, int blkheight)
    243 {
    244     int dx, dy;
    245     uint8 temp[24][24]; /* for padding, make the size multiple of 4 for packing */
    246     int temp2[21][21]; /* for intermediate results */
    247     uint8 *ref2;
    248 
    249     dx = x_pos & 3;
    250     dy = y_pos & 3;
    251     x_pos = x_pos >> 2;  /* round it to full-pel resolution */
    252     y_pos = y_pos >> 2;
    253 
    254     /* perform actual motion compensation */
    255     if (dx == 0 && dy == 0)
    256     {  /* fullpel position *//* G */
    257         if (x_pos >= 0 && x_pos + blkwidth <= picwidth && y_pos >= 0 && y_pos + blkheight <= picheight)
    258         {
    259             ref += y_pos * picwidth + x_pos;
    260             FullPelMC(ref, picwidth, pred, pred_pitch, blkwidth, blkheight);
    261         }
    262         else
    263         {
    264             CreatePad(ref, picwidth, picheight, x_pos, y_pos, &temp[0][0], blkwidth, blkheight);
    265             FullPelMC(&temp[0][0], 24, pred, pred_pitch, blkwidth, blkheight);
    266         }
    267 
    268     }   /* other positions */
    269     else  if (dy == 0)
    270     { /* no vertical interpolation *//* a,b,c*/
    271 
    272         if (x_pos - 2 >= 0 && x_pos + 3 + blkwidth <= picwidth && y_pos >= 0 && y_pos + blkheight <= picheight)
    273         {
    274             ref += y_pos * picwidth + x_pos;
    275 
    276             HorzInterp1MC(ref, picwidth, pred, pred_pitch, blkwidth, blkheight, dx);
    277         }
    278         else  /* need padding */
    279         {
    280             CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos, &temp[0][0], blkwidth + 5, blkheight);
    281 
    282             HorzInterp1MC(&temp[0][2], 24, pred, pred_pitch, blkwidth, blkheight, dx);
    283         }
    284     }
    285     else if (dx == 0)
    286     { /*no horizontal interpolation *//* d,h,n */
    287 
    288         if (x_pos >= 0 && x_pos + blkwidth <= picwidth && y_pos - 2 >= 0 && y_pos + 3 + blkheight <= picheight)
    289         {
    290             ref += y_pos * picwidth + x_pos;
    291 
    292             VertInterp1MC(ref, picwidth, pred, pred_pitch, blkwidth, blkheight, dy);
    293         }
    294         else  /* need padding */
    295         {
    296             CreatePad(ref, picwidth, picheight, x_pos, y_pos - 2, &temp[0][0], blkwidth, blkheight + 5);
    297 
    298             VertInterp1MC(&temp[2][0], 24, pred, pred_pitch, blkwidth, blkheight, dy);
    299         }
    300     }
    301     else if (dy == 2)
    302     {  /* horizontal cross *//* i, j, k */
    303 
    304         if (x_pos - 2 >= 0 && x_pos + 3 + blkwidth <= picwidth && y_pos - 2 >= 0 && y_pos + 3 + blkheight <= picheight)
    305         {
    306             ref += y_pos * picwidth + x_pos - 2; /* move to the left 2 pixels */
    307 
    308             VertInterp2MC(ref, picwidth, &temp2[0][0], 21, blkwidth + 5, blkheight);
    309 
    310             HorzInterp2MC(&temp2[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx);
    311         }
    312         else /* need padding */
    313         {
    314             CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &temp[0][0], blkwidth + 5, blkheight + 5);
    315 
    316             VertInterp2MC(&temp[2][0], 24, &temp2[0][0], 21, blkwidth + 5, blkheight);
    317 
    318             HorzInterp2MC(&temp2[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx);
    319         }
    320     }
    321     else if (dx == 2)
    322     { /* vertical cross */ /* f,q */
    323 
    324         if (x_pos - 2 >= 0 && x_pos + 3 + blkwidth <= picwidth && y_pos - 2 >= 0 && y_pos + 3 + blkheight <= picheight)
    325         {
    326             ref += (y_pos - 2) * picwidth + x_pos; /* move to up 2 lines */
    327 
    328             HorzInterp3MC(ref, picwidth, &temp2[0][0], 21, blkwidth, blkheight + 5);
    329             VertInterp3MC(&temp2[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy);
    330         }
    331         else  /* need padding */
    332         {
    333             CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &temp[0][0], blkwidth + 5, blkheight + 5);
    334             HorzInterp3MC(&temp[0][2], 24, &temp2[0][0], 21, blkwidth, blkheight + 5);
    335             VertInterp3MC(&temp2[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy);
    336         }
    337     }
    338     else
    339     { /* diagonal *//* e,g,p,r */
    340 
    341         if (x_pos - 2 >= 0 && x_pos + 3 + (dx / 2) + blkwidth <= picwidth &&
    342                 y_pos - 2 >= 0 && y_pos + 3 + blkheight + (dy / 2) <= picheight)
    343         {
    344             ref2 = ref + (y_pos + (dy / 2)) * picwidth + x_pos;
    345 
    346             ref += (y_pos * picwidth) + x_pos + (dx / 2);
    347 
    348             DiagonalInterpMC(ref2, ref, picwidth, pred, pred_pitch, blkwidth, blkheight);
    349         }
    350         else  /* need padding */
    351         {
    352             CreatePad(ref, picwidth, picheight, x_pos - 2, y_pos - 2, &temp[0][0], blkwidth + 5 + (dx / 2), blkheight + 5 + (dy / 2));
    353 
    354             ref2 = &temp[2 + (dy/2)][2];
    355 
    356             ref = &temp[2][2 + (dx/2)];
    357 
    358             DiagonalInterpMC(ref2, ref, 24, pred, pred_pitch, blkwidth, blkheight);
    359         }
    360     }
    361 
    362     return ;
    363 }
    364 
    365 void CreateAlign(uint8 *ref, int picwidth, int y_pos,
    366                  uint8 *out, int blkwidth, int blkheight)
    367 {
    368     int i, j;
    369     int offset, out_offset;
    370     uint32 prev_pix, result, pix1, pix2, pix4;
    371 
    372     out_offset = 24 - blkwidth;
    373 
    374     //switch(x_pos&0x3){
    375     switch (((uint32)ref)&0x3)
    376     {
    377         case 1:
    378             ref += y_pos * picwidth;
    379             offset =  picwidth - blkwidth - 3;
    380             for (j = 0; j < blkheight; j++)
    381             {
    382                 pix1 = *ref++;
    383                 pix2 = *((uint16*)ref);
    384                 ref += 2;
    385                 result = (pix2 << 8) | pix1;
    386 
    387                 for (i = 3; i < blkwidth; i += 4)
    388                 {
    389                     pix4 = *((uint32*)ref);
    390                     ref += 4;
    391                     prev_pix = (pix4 << 24) & 0xFF000000; /* mask out byte belong to previous word */
    392                     result |= prev_pix;
    393                     *((uint32*)out) = result;  /* write 4 bytes */
    394                     out += 4;
    395                     result = pix4 >> 8; /* for the next loop */
    396                 }
    397                 ref += offset;
    398                 out += out_offset;
    399             }
    400             break;
    401         case 2:
    402             ref += y_pos * picwidth;
    403             offset =  picwidth - blkwidth - 2;
    404             for (j = 0; j < blkheight; j++)
    405             {
    406                 result = *((uint16*)ref);
    407                 ref += 2;
    408                 for (i = 2; i < blkwidth; i += 4)
    409                 {
    410                     pix4 = *((uint32*)ref);
    411                     ref += 4;
    412                     prev_pix = (pix4 << 16) & 0xFFFF0000; /* mask out byte belong to previous word */
    413                     result |= prev_pix;
    414                     *((uint32*)out) = result;  /* write 4 bytes */
    415                     out += 4;
    416                     result = pix4 >> 16; /* for the next loop */
    417                 }
    418                 ref += offset;
    419                 out += out_offset;
    420             }
    421             break;
    422         case 3:
    423             ref += y_pos * picwidth;
    424             offset =  picwidth - blkwidth - 1;
    425             for (j = 0; j < blkheight; j++)
    426             {
    427                 result = *ref++;
    428                 for (i = 1; i < blkwidth; i += 4)
    429                 {
    430                     pix4 = *((uint32*)ref);
    431                     ref += 4;
    432                     prev_pix = (pix4 << 8) & 0xFFFFFF00; /* mask out byte belong to previous word */
    433                     result |= prev_pix;
    434                     *((uint32*)out) = result;  /* write 4 bytes */
    435                     out += 4;
    436                     result = pix4 >> 24; /* for the next loop */
    437                 }
    438                 ref += offset;
    439                 out += out_offset;
    440             }
    441             break;
    442     }
    443 }
    444 
    445 void CreatePad(uint8 *ref, int picwidth, int picheight, int x_pos, int y_pos,
    446                uint8 *out, int blkwidth, int blkheight)
    447 {
    448     int x_inc0, x_mid;
    449     int y_inc, y_inc0, y_inc1, y_mid;
    450     int i, j;
    451     int offset;
    452 
    453     if (x_pos < 0)
    454     {
    455         x_inc0 = 0;  /* increment for the first part */
    456         x_mid = ((blkwidth + x_pos > 0) ? -x_pos : blkwidth);  /* stopping point */
    457         x_pos = 0;
    458     }
    459     else if (x_pos + blkwidth > picwidth)
    460     {
    461         x_inc0 = 1;  /* increasing */
    462         x_mid = ((picwidth > x_pos) ? picwidth - x_pos - 1 : 0);  /* clip negative to zero, encode fool proof! */
    463     }
    464     else    /* normal case */
    465     {
    466         x_inc0 = 1;
    467         x_mid = blkwidth; /* just one run */
    468     }
    469 
    470 
    471     /* boundary for y_pos, taking the result from x_pos into account */
    472     if (y_pos < 0)
    473     {
    474         y_inc0 = (x_inc0 ? - x_mid : -blkwidth + x_mid); /* offset depending on x_inc1 and x_inc0 */
    475         y_inc1 = picwidth + y_inc0;
    476         y_mid = ((blkheight + y_pos > 0) ? -y_pos : blkheight); /* clip to prevent memory corruption */
    477         y_pos = 0;
    478     }
    479     else  if (y_pos + blkheight > picheight)
    480     {
    481         y_inc1 = (x_inc0 ? - x_mid : -blkwidth + x_mid); /* saturate */
    482         y_inc0 = picwidth + y_inc1;                 /* increasing */
    483         y_mid = ((picheight > y_pos) ? picheight - 1 - y_pos : 0);
    484     }
    485     else  /* normal case */
    486     {
    487         y_inc1 = (x_inc0 ? - x_mid : -blkwidth + x_mid);
    488         y_inc0 = picwidth + y_inc1;
    489         y_mid = blkheight;
    490     }
    491 
    492     /* clip y_pos and x_pos */
    493     if (y_pos > picheight - 1) y_pos = picheight - 1;
    494     if (x_pos > picwidth - 1) x_pos = picwidth - 1;
    495 
    496     ref += y_pos * picwidth + x_pos;
    497 
    498     y_inc = y_inc0;  /* start with top half */
    499 
    500     offset = 24 - blkwidth; /* to use in offset out */
    501     blkwidth -= x_mid; /* to use in the loop limit */
    502 
    503     if (x_inc0 == 0)
    504     {
    505         for (j = 0; j < blkheight; j++)
    506         {
    507             if (j == y_mid)  /* put a check here to reduce the code size (for unrolling the loop) */
    508             {
    509                 y_inc = y_inc1;  /* switch to lower half */
    510             }
    511             for (i = x_mid; i > 0; i--)   /* first or third quarter */
    512             {
    513                 *out++ = *ref;
    514             }
    515             for (i = blkwidth; i > 0; i--)  /* second or fourth quarter */
    516             {
    517                 *out++ = *ref++;
    518             }
    519             out += offset;
    520             ref += y_inc;
    521         }
    522     }
    523     else
    524     {
    525         for (j = 0; j < blkheight; j++)
    526         {
    527             if (j == y_mid)  /* put a check here to reduce the code size (for unrolling the loop) */
    528             {
    529                 y_inc = y_inc1;  /* switch to lower half */
    530             }
    531             for (i = x_mid; i > 0; i--)   /* first or third quarter */
    532             {
    533                 *out++ = *ref++;
    534             }
    535             for (i = blkwidth; i > 0; i--)  /* second or fourth quarter */
    536             {
    537                 *out++ = *ref;
    538             }
    539             out += offset;
    540             ref += y_inc;
    541         }
    542     }
    543 
    544     return ;
    545 }
    546 
    547 void HorzInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
    548                    int blkwidth, int blkheight, int dx)
    549 {
    550     uint8 *p_ref;
    551     uint32 *p_cur;
    552     uint32 tmp, pkres;
    553     int result, curr_offset, ref_offset;
    554     int j;
    555     int32 r0, r1, r2, r3, r4, r5;
    556     int32 r13, r6;
    557 
    558     p_cur = (uint32*)out; /* assume it's word aligned */
    559     curr_offset = (outpitch - blkwidth) >> 2;
    560     p_ref = in;
    561     ref_offset = inpitch - blkwidth;
    562 
    563     if (dx&1)
    564     {
    565         dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */
    566         p_ref -= 2;
    567         r13 = 0;
    568         for (j = blkheight; j > 0; j--)
    569         {
    570             tmp = (uint32)(p_ref + blkwidth);
    571             r0 = p_ref[0];
    572             r1 = p_ref[2];
    573             r0 |= (r1 << 16);           /* 0,c,0,a */
    574             r1 = p_ref[1];
    575             r2 = p_ref[3];
    576             r1 |= (r2 << 16);           /* 0,d,0,b */
    577             while ((uint32)p_ref < tmp)
    578             {
    579                 r2 = *(p_ref += 4); /* move pointer to e */
    580                 r3 = p_ref[2];
    581                 r2 |= (r3 << 16);           /* 0,g,0,e */
    582                 r3 = p_ref[1];
    583                 r4 = p_ref[3];
    584                 r3 |= (r4 << 16);           /* 0,h,0,f */
    585 
    586                 r4 = r0 + r3;       /* c+h, a+f */
    587                 r5 = r0 + r1;   /* c+d, a+b */
    588                 r6 = r2 + r3;   /* g+h, e+f */
    589                 r5 >>= 16;
    590                 r5 |= (r6 << 16);   /* e+f, c+d */
    591                 r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
    592                 r4 += 0x100010; /* +16, +16 */
    593                 r5 = r1 + r2;       /* d+g, b+e */
    594                 r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
    595                 r4 >>= 5;
    596                 r13 |= r4;      /* check clipping */
    597 
    598                 r5 = p_ref[dx+2];
    599                 r6 = p_ref[dx+4];
    600                 r5 |= (r6 << 16);
    601                 r4 += r5;
    602                 r4 += 0x10001;
    603                 r4 = (r4 >> 1) & 0xFF00FF;
    604 
    605                 r5 = p_ref[4];  /* i */
    606                 r6 = (r5 << 16);
    607                 r5 = r6 | (r2 >> 16);/* 0,i,0,g */
    608                 r5 += r1;       /* d+i, b+g */ /* r5 not free */
    609                 r1 >>= 16;
    610                 r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
    611                 r1 += r2;       /* f+g, d+e */
    612                 r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
    613                 r0 >>= 16;
    614                 r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
    615                 r0 += r3;       /* e+h, c+f */
    616                 r5 += 0x100010; /* 16,16 */
    617                 r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
    618                 r5 >>= 5;
    619                 r13 |= r5;      /* check clipping */
    620 
    621                 r0 = p_ref[dx+3];
    622                 r1 = p_ref[dx+5];
    623                 r0 |= (r1 << 16);
    624                 r5 += r0;
    625                 r5 += 0x10001;
    626                 r5 = (r5 >> 1) & 0xFF00FF;
    627 
    628                 r4 |= (r5 << 8);    /* pack them together */
    629                 *p_cur++ = r4;
    630                 r1 = r3;
    631                 r0 = r2;
    632             }
    633             p_cur += curr_offset; /* move to the next line */
    634             p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
    635 
    636             if (r13&0xFF000700) /* need clipping */
    637             {
    638                 /* move back to the beginning of the line */
    639                 p_ref -= (ref_offset + blkwidth);   /* input */
    640                 p_cur -= (outpitch >> 2);
    641 
    642                 tmp = (uint32)(p_ref + blkwidth);
    643                 for (; (uint32)p_ref < tmp;)
    644                 {
    645 
    646                     r0 = *p_ref++;
    647                     r1 = *p_ref++;
    648                     r2 = *p_ref++;
    649                     r3 = *p_ref++;
    650                     r4 = *p_ref++;
    651                     /* first pixel */
    652                     r5 = *p_ref++;
    653                     result = (r0 + r5);
    654                     r0 = (r1 + r4);
    655                     result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
    656                     r0 = (r2 + r3);
    657                     result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
    658                     result = (result + 16) >> 5;
    659                     CLIP_RESULT(result)
    660                     /* 3/4 pel,  no need to clip */
    661                     result = (result + p_ref[dx] + 1);
    662                     pkres = (result >> 1) ;
    663                     /* second pixel */
    664                     r0 = *p_ref++;
    665                     result = (r1 + r0);
    666                     r1 = (r2 + r5);
    667                     result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
    668                     r1 = (r3 + r4);
    669                     result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
    670                     result = (result + 16) >> 5;
    671                     CLIP_RESULT(result)
    672                     /* 3/4 pel,  no need to clip */
    673                     result = (result + p_ref[dx] + 1);
    674                     result = (result >> 1);
    675                     pkres  |= (result << 8);
    676                     /* third pixel */
    677                     r1 = *p_ref++;
    678                     result = (r2 + r1);
    679                     r2 = (r3 + r0);
    680                     result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
    681                     r2 = (r4 + r5);
    682                     result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
    683                     result = (result + 16) >> 5;
    684                     CLIP_RESULT(result)
    685                     /* 3/4 pel,  no need to clip */
    686                     result = (result + p_ref[dx] + 1);
    687                     result = (result >> 1);
    688                     pkres  |= (result << 16);
    689                     /* fourth pixel */
    690                     r2 = *p_ref++;
    691                     result = (r3 + r2);
    692                     r3 = (r4 + r1);
    693                     result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
    694                     r3 = (r5 + r0);
    695                     result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
    696                     result = (result + 16) >> 5;
    697                     CLIP_RESULT(result)
    698                     /* 3/4 pel,  no need to clip */
    699                     result = (result + p_ref[dx] + 1);
    700                     result = (result >> 1);
    701                     pkres  |= (result << 24);
    702                     *p_cur++ = pkres; /* write 4 pixels */
    703                     p_ref -= 5;  /* offset back to the middle of filter */
    704                 }
    705                 p_cur += curr_offset;  /* move to the next line */
    706                 p_ref += ref_offset;    /* move to the next line */
    707             }
    708         }
    709     }
    710     else
    711     {
    712         p_ref -= 2;
    713         r13 = 0;
    714         for (j = blkheight; j > 0; j--)
    715         {
    716             tmp = (uint32)(p_ref + blkwidth);
    717             r0 = p_ref[0];
    718             r1 = p_ref[2];
    719             r0 |= (r1 << 16);           /* 0,c,0,a */
    720             r1 = p_ref[1];
    721             r2 = p_ref[3];
    722             r1 |= (r2 << 16);           /* 0,d,0,b */
    723             while ((uint32)p_ref < tmp)
    724             {
    725                 r2 = *(p_ref += 4); /* move pointer to e */
    726                 r3 = p_ref[2];
    727                 r2 |= (r3 << 16);           /* 0,g,0,e */
    728                 r3 = p_ref[1];
    729                 r4 = p_ref[3];
    730                 r3 |= (r4 << 16);           /* 0,h,0,f */
    731 
    732                 r4 = r0 + r3;       /* c+h, a+f */
    733                 r5 = r0 + r1;   /* c+d, a+b */
    734                 r6 = r2 + r3;   /* g+h, e+f */
    735                 r5 >>= 16;
    736                 r5 |= (r6 << 16);   /* e+f, c+d */
    737                 r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
    738                 r4 += 0x100010; /* +16, +16 */
    739                 r5 = r1 + r2;       /* d+g, b+e */
    740                 r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
    741                 r4 >>= 5;
    742                 r13 |= r4;      /* check clipping */
    743                 r4 &= 0xFF00FF; /* mask */
    744 
    745                 r5 = p_ref[4];  /* i */
    746                 r6 = (r5 << 16);
    747                 r5 = r6 | (r2 >> 16);/* 0,i,0,g */
    748                 r5 += r1;       /* d+i, b+g */ /* r5 not free */
    749                 r1 >>= 16;
    750                 r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
    751                 r1 += r2;       /* f+g, d+e */
    752                 r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
    753                 r0 >>= 16;
    754                 r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
    755                 r0 += r3;       /* e+h, c+f */
    756                 r5 += 0x100010; /* 16,16 */
    757                 r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
    758                 r5 >>= 5;
    759                 r13 |= r5;      /* check clipping */
    760                 r5 &= 0xFF00FF; /* mask */
    761 
    762                 r4 |= (r5 << 8);    /* pack them together */
    763                 *p_cur++ = r4;
    764                 r1 = r3;
    765                 r0 = r2;
    766             }
    767             p_cur += curr_offset; /* move to the next line */
    768             p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
    769 
    770             if (r13&0xFF000700) /* need clipping */
    771             {
    772                 /* move back to the beginning of the line */
    773                 p_ref -= (ref_offset + blkwidth);   /* input */
    774                 p_cur -= (outpitch >> 2);
    775 
    776                 tmp = (uint32)(p_ref + blkwidth);
    777                 for (; (uint32)p_ref < tmp;)
    778                 {
    779 
    780                     r0 = *p_ref++;
    781                     r1 = *p_ref++;
    782                     r2 = *p_ref++;
    783                     r3 = *p_ref++;
    784                     r4 = *p_ref++;
    785                     /* first pixel */
    786                     r5 = *p_ref++;
    787                     result = (r0 + r5);
    788                     r0 = (r1 + r4);
    789                     result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
    790                     r0 = (r2 + r3);
    791                     result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
    792                     result = (result + 16) >> 5;
    793                     CLIP_RESULT(result)
    794                     pkres  = result;
    795                     /* second pixel */
    796                     r0 = *p_ref++;
    797                     result = (r1 + r0);
    798                     r1 = (r2 + r5);
    799                     result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
    800                     r1 = (r3 + r4);
    801                     result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
    802                     result = (result + 16) >> 5;
    803                     CLIP_RESULT(result)
    804                     pkres  |= (result << 8);
    805                     /* third pixel */
    806                     r1 = *p_ref++;
    807                     result = (r2 + r1);
    808                     r2 = (r3 + r0);
    809                     result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
    810                     r2 = (r4 + r5);
    811                     result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
    812                     result = (result + 16) >> 5;
    813                     CLIP_RESULT(result)
    814                     pkres  |= (result << 16);
    815                     /* fourth pixel */
    816                     r2 = *p_ref++;
    817                     result = (r3 + r2);
    818                     r3 = (r4 + r1);
    819                     result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
    820                     r3 = (r5 + r0);
    821                     result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
    822                     result = (result + 16) >> 5;
    823                     CLIP_RESULT(result)
    824                     pkres  |= (result << 24);
    825                     *p_cur++ = pkres;   /* write 4 pixels */
    826                     p_ref -= 5;
    827                 }
    828                 p_cur += curr_offset; /* move to the next line */
    829                 p_ref += ref_offset;
    830             }
    831         }
    832     }
    833 
    834     return ;
    835 }
    836 
    837 void HorzInterp2MC(int *in, int inpitch, uint8 *out, int outpitch,
    838                    int blkwidth, int blkheight, int dx)
    839 {
    840     int *p_ref;
    841     uint32 *p_cur;
    842     uint32 tmp, pkres;
    843     int result, result2, curr_offset, ref_offset;
    844     int j, r0, r1, r2, r3, r4, r5;
    845 
    846     p_cur = (uint32*)out; /* assume it's word aligned */
    847     curr_offset = (outpitch - blkwidth) >> 2;
    848     p_ref = in;
    849     ref_offset = inpitch - blkwidth;
    850 
    851     if (dx&1)
    852     {
    853         dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */
    854 
    855         for (j = blkheight; j > 0 ; j--)
    856         {
    857             tmp = (uint32)(p_ref + blkwidth);
    858             for (; (uint32)p_ref < tmp;)
    859             {
    860 
    861                 r0 = p_ref[-2];
    862                 r1 = p_ref[-1];
    863                 r2 = *p_ref++;
    864                 r3 = *p_ref++;
    865                 r4 = *p_ref++;
    866                 /* first pixel */
    867                 r5 = *p_ref++;
    868                 result = (r0 + r5);
    869                 r0 = (r1 + r4);
    870                 result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
    871                 r0 = (r2 + r3);
    872                 result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
    873                 result = (result + 512) >> 10;
    874                 CLIP_RESULT(result)
    875                 result2 = ((p_ref[dx] + 16) >> 5);
    876                 CLIP_RESULT(result2)
    877                 /* 3/4 pel,  no need to clip */
    878                 result = (result + result2 + 1);
    879                 pkres = (result >> 1);
    880                 /* second pixel */
    881                 r0 = *p_ref++;
    882                 result = (r1 + r0);
    883                 r1 = (r2 + r5);
    884                 result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
    885                 r1 = (r3 + r4);
    886                 result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
    887                 result = (result + 512) >> 10;
    888                 CLIP_RESULT(result)
    889                 result2 = ((p_ref[dx] + 16) >> 5);
    890                 CLIP_RESULT(result2)
    891                 /* 3/4 pel,  no need to clip */
    892                 result = (result + result2 + 1);
    893                 result = (result >> 1);
    894                 pkres  |= (result << 8);
    895                 /* third pixel */
    896                 r1 = *p_ref++;
    897                 result = (r2 + r1);
    898                 r2 = (r3 + r0);
    899                 result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
    900                 r2 = (r4 + r5);
    901                 result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
    902                 result = (result + 512) >> 10;
    903                 CLIP_RESULT(result)
    904                 result2 = ((p_ref[dx] + 16) >> 5);
    905                 CLIP_RESULT(result2)
    906                 /* 3/4 pel,  no need to clip */
    907                 result = (result + result2 + 1);
    908                 result = (result >> 1);
    909                 pkres  |= (result << 16);
    910                 /* fourth pixel */
    911                 r2 = *p_ref++;
    912                 result = (r3 + r2);
    913                 r3 = (r4 + r1);
    914                 result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
    915                 r3 = (r5 + r0);
    916                 result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
    917                 result = (result + 512) >> 10;
    918                 CLIP_RESULT(result)
    919                 result2 = ((p_ref[dx] + 16) >> 5);
    920                 CLIP_RESULT(result2)
    921                 /* 3/4 pel,  no need to clip */
    922                 result = (result + result2 + 1);
    923                 result = (result >> 1);
    924                 pkres  |= (result << 24);
    925                 *p_cur++ = pkres; /* write 4 pixels */
    926                 p_ref -= 3;  /* offset back to the middle of filter */
    927             }
    928             p_cur += curr_offset;  /* move to the next line */
    929             p_ref += ref_offset;    /* move to the next line */
    930         }
    931     }
    932     else
    933     {
    934         for (j = blkheight; j > 0 ; j--)
    935         {
    936             tmp = (uint32)(p_ref + blkwidth);
    937             for (; (uint32)p_ref < tmp;)
    938             {
    939 
    940                 r0 = p_ref[-2];
    941                 r1 = p_ref[-1];
    942                 r2 = *p_ref++;
    943                 r3 = *p_ref++;
    944                 r4 = *p_ref++;
    945                 /* first pixel */
    946                 r5 = *p_ref++;
    947                 result = (r0 + r5);
    948                 r0 = (r1 + r4);
    949                 result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
    950                 r0 = (r2 + r3);
    951                 result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
    952                 result = (result + 512) >> 10;
    953                 CLIP_RESULT(result)
    954                 pkres  = result;
    955                 /* second pixel */
    956                 r0 = *p_ref++;
    957                 result = (r1 + r0);
    958                 r1 = (r2 + r5);
    959                 result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
    960                 r1 = (r3 + r4);
    961                 result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
    962                 result = (result + 512) >> 10;
    963                 CLIP_RESULT(result)
    964                 pkres  |= (result << 8);
    965                 /* third pixel */
    966                 r1 = *p_ref++;
    967                 result = (r2 + r1);
    968                 r2 = (r3 + r0);
    969                 result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
    970                 r2 = (r4 + r5);
    971                 result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
    972                 result = (result + 512) >> 10;
    973                 CLIP_RESULT(result)
    974                 pkres  |= (result << 16);
    975                 /* fourth pixel */
    976                 r2 = *p_ref++;
    977                 result = (r3 + r2);
    978                 r3 = (r4 + r1);
    979                 result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
    980                 r3 = (r5 + r0);
    981                 result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
    982                 result = (result + 512) >> 10;
    983                 CLIP_RESULT(result)
    984                 pkres  |= (result << 24);
    985                 *p_cur++ = pkres; /* write 4 pixels */
    986                 p_ref -= 3;  /* offset back to the middle of filter */
    987             }
    988             p_cur += curr_offset;  /* move to the next line */
    989             p_ref += ref_offset;    /* move to the next line */
    990         }
    991     }
    992 
    993     return ;
    994 }
    995 
    996 void HorzInterp3MC(uint8 *in, int inpitch, int *out, int outpitch,
    997                    int blkwidth, int blkheight)
    998 {
    999     uint8 *p_ref;
   1000     int   *p_cur;
   1001     uint32 tmp;
   1002     int result, curr_offset, ref_offset;
   1003     int j, r0, r1, r2, r3, r4, r5;
   1004 
   1005     p_cur = out;
   1006     curr_offset = (outpitch - blkwidth);
   1007     p_ref = in;
   1008     ref_offset = inpitch - blkwidth;
   1009 
   1010     for (j = blkheight; j > 0 ; j--)
   1011     {
   1012         tmp = (uint32)(p_ref + blkwidth);
   1013         for (; (uint32)p_ref < tmp;)
   1014         {
   1015 
   1016             r0 = p_ref[-2];
   1017             r1 = p_ref[-1];
   1018             r2 = *p_ref++;
   1019             r3 = *p_ref++;
   1020             r4 = *p_ref++;
   1021             /* first pixel */
   1022             r5 = *p_ref++;
   1023             result = (r0 + r5);
   1024             r0 = (r1 + r4);
   1025             result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
   1026             r0 = (r2 + r3);
   1027             result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
   1028             *p_cur++ = result;
   1029             /* second pixel */
   1030             r0 = *p_ref++;
   1031             result = (r1 + r0);
   1032             r1 = (r2 + r5);
   1033             result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
   1034             r1 = (r3 + r4);
   1035             result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
   1036             *p_cur++ = result;
   1037             /* third pixel */
   1038             r1 = *p_ref++;
   1039             result = (r2 + r1);
   1040             r2 = (r3 + r0);
   1041             result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
   1042             r2 = (r4 + r5);
   1043             result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
   1044             *p_cur++ = result;
   1045             /* fourth pixel */
   1046             r2 = *p_ref++;
   1047             result = (r3 + r2);
   1048             r3 = (r4 + r1);
   1049             result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
   1050             r3 = (r5 + r0);
   1051             result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
   1052             *p_cur++ = result;
   1053             p_ref -= 3; /* move back to the middle of the filter */
   1054         }
   1055         p_cur += curr_offset; /* move to the next line */
   1056         p_ref += ref_offset;
   1057     }
   1058 
   1059     return ;
   1060 }
   1061 void VertInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
   1062                    int blkwidth, int blkheight, int dy)
   1063 {
   1064     uint8 *p_cur, *p_ref;
   1065     uint32 tmp;
   1066     int result, curr_offset, ref_offset;
   1067     int j, i;
   1068     int32 r0, r1, r2, r3, r4, r5, r6, r7, r8, r13;
   1069     uint8  tmp_in[24][24];
   1070 
   1071     /* not word-aligned */
   1072     if (((uint32)in)&0x3)
   1073     {
   1074         CreateAlign(in, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5);
   1075         in = &tmp_in[2][0];
   1076         inpitch = 24;
   1077     }
   1078     p_cur = out;
   1079     curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
   1080     ref_offset = blkheight * inpitch; /* for limit */
   1081 
   1082     curr_offset += 3;
   1083 
   1084     if (dy&1)
   1085     {
   1086         dy = (dy >> 1) ? 0 : -inpitch;
   1087 
   1088         for (j = 0; j < blkwidth; j += 4, in += 4)
   1089         {
   1090             r13 = 0;
   1091             p_ref = in;
   1092             p_cur -= outpitch;  /* compensate for the first offset */
   1093             tmp = (uint32)(p_ref + ref_offset); /* limit */
   1094             while ((uint32)p_ref < tmp)  /* the loop un-rolled  */
   1095             {
   1096                 r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
   1097                 p_ref += inpitch;
   1098                 r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
   1099                 r0 &= 0xFF00FF;
   1100 
   1101                 r1 = *((uint32*)(p_ref + (inpitch << 1)));  /* r1, r7, ref[3] */
   1102                 r7 = (r1 >> 8) & 0xFF00FF;
   1103                 r1 &= 0xFF00FF;
   1104 
   1105                 r0 += r1;
   1106                 r6 += r7;
   1107 
   1108                 r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
   1109                 r8 = (r2 >> 8) & 0xFF00FF;
   1110                 r2 &= 0xFF00FF;
   1111 
   1112                 r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
   1113                 r7 = (r1 >> 8) & 0xFF00FF;
   1114                 r1 &= 0xFF00FF;
   1115                 r1 += r2;
   1116 
   1117                 r7 += r8;
   1118 
   1119                 r0 += 20 * r1;
   1120                 r6 += 20 * r7;
   1121                 r0 += 0x100010;
   1122                 r6 += 0x100010;
   1123 
   1124                 r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
   1125                 r8 = (r2 >> 8) & 0xFF00FF;
   1126                 r2 &= 0xFF00FF;
   1127 
   1128                 r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
   1129                 r7 = (r1 >> 8) & 0xFF00FF;
   1130                 r1 &= 0xFF00FF;
   1131                 r1 += r2;
   1132 
   1133                 r7 += r8;
   1134 
   1135                 r0 -= 5 * r1;
   1136                 r6 -= 5 * r7;
   1137 
   1138                 r0 >>= 5;
   1139                 r6 >>= 5;
   1140                 /* clip */
   1141                 r13 |= r6;
   1142                 r13 |= r0;
   1143                 //CLIPPACK(r6,result)
   1144 
   1145                 r1 = *((uint32*)(p_ref + dy));
   1146                 r2 = (r1 >> 8) & 0xFF00FF;
   1147                 r1 &= 0xFF00FF;
   1148                 r0 += r1;
   1149                 r6 += r2;
   1150                 r0 += 0x10001;
   1151                 r6 += 0x10001;
   1152                 r0 = (r0 >> 1) & 0xFF00FF;
   1153                 r6 = (r6 >> 1) & 0xFF00FF;
   1154 
   1155                 r0 |= (r6 << 8);  /* pack it back */
   1156                 *((uint32*)(p_cur += outpitch)) = r0;
   1157             }
   1158             p_cur += curr_offset; /* offset to the next pixel */
   1159             if (r13 & 0xFF000700) /* this column need clipping */
   1160             {
   1161                 p_cur -= 4;
   1162                 for (i = 0; i < 4; i++)
   1163                 {
   1164                     p_ref = in + i;
   1165                     p_cur -= outpitch;  /* compensate for the first offset */
   1166 
   1167                     tmp = (uint32)(p_ref + ref_offset); /* limit */
   1168                     while ((uint32)p_ref < tmp)
   1169                     {                           /* loop un-rolled */
   1170                         r0 = *(p_ref - (inpitch << 1));
   1171                         r1 = *(p_ref - inpitch);
   1172                         r2 = *p_ref;
   1173                         r3 = *(p_ref += inpitch);  /* modify pointer before loading */
   1174                         r4 = *(p_ref += inpitch);
   1175                         /* first pixel */
   1176                         r5 = *(p_ref += inpitch);
   1177                         result = (r0 + r5);
   1178                         r0 = (r1 + r4);
   1179                         result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
   1180                         r0 = (r2 + r3);
   1181                         result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
   1182                         result = (result + 16) >> 5;
   1183                         CLIP_RESULT(result)
   1184                         /* 3/4 pel,  no need to clip */
   1185                         result = (result + p_ref[dy-(inpitch<<1)] + 1);
   1186                         result = (result >> 1);
   1187                         *(p_cur += outpitch) = result;
   1188                         /* second pixel */
   1189                         r0 = *(p_ref += inpitch);
   1190                         result = (r1 + r0);
   1191                         r1 = (r2 + r5);
   1192                         result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
   1193                         r1 = (r3 + r4);
   1194                         result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
   1195                         result = (result + 16) >> 5;
   1196                         CLIP_RESULT(result)
   1197                         /* 3/4 pel,  no need to clip */
   1198                         result = (result + p_ref[dy-(inpitch<<1)] + 1);
   1199                         result = (result >> 1);
   1200                         *(p_cur += outpitch) = result;
   1201                         /* third pixel */
   1202                         r1 = *(p_ref += inpitch);
   1203                         result = (r2 + r1);
   1204                         r2 = (r3 + r0);
   1205                         result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
   1206                         r2 = (r4 + r5);
   1207                         result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
   1208                         result = (result + 16) >> 5;
   1209                         CLIP_RESULT(result)
   1210                         /* 3/4 pel,  no need to clip */
   1211                         result = (result + p_ref[dy-(inpitch<<1)] + 1);
   1212                         result = (result >> 1);
   1213                         *(p_cur += outpitch) = result;
   1214                         /* fourth pixel */
   1215                         r2 = *(p_ref += inpitch);
   1216                         result = (r3 + r2);
   1217                         r3 = (r4 + r1);
   1218                         result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
   1219                         r3 = (r5 + r0);
   1220                         result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
   1221                         result = (result + 16) >> 5;
   1222                         CLIP_RESULT(result)
   1223                         /* 3/4 pel,  no need to clip */
   1224                         result = (result + p_ref[dy-(inpitch<<1)] + 1);
   1225                         result = (result >> 1);
   1226                         *(p_cur += outpitch) = result;
   1227                         p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
   1228                     }
   1229                     p_cur += (curr_offset - 3);
   1230                 }
   1231             }
   1232         }
   1233     }
   1234     else
   1235     {
   1236         for (j = 0; j < blkwidth; j += 4, in += 4)
   1237         {
   1238             r13 = 0;
   1239             p_ref = in;
   1240             p_cur -= outpitch;  /* compensate for the first offset */
   1241             tmp = (uint32)(p_ref + ref_offset); /* limit */
   1242             while ((uint32)p_ref < tmp)  /* the loop un-rolled  */
   1243             {
   1244                 r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
   1245                 p_ref += inpitch;
   1246                 r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
   1247                 r0 &= 0xFF00FF;
   1248 
   1249                 r1 = *((uint32*)(p_ref + (inpitch << 1)));  /* r1, r7, ref[3] */
   1250                 r7 = (r1 >> 8) & 0xFF00FF;
   1251                 r1 &= 0xFF00FF;
   1252 
   1253                 r0 += r1;
   1254                 r6 += r7;
   1255 
   1256                 r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
   1257                 r8 = (r2 >> 8) & 0xFF00FF;
   1258                 r2 &= 0xFF00FF;
   1259 
   1260                 r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
   1261                 r7 = (r1 >> 8) & 0xFF00FF;
   1262                 r1 &= 0xFF00FF;
   1263                 r1 += r2;
   1264 
   1265                 r7 += r8;
   1266 
   1267                 r0 += 20 * r1;
   1268                 r6 += 20 * r7;
   1269                 r0 += 0x100010;
   1270                 r6 += 0x100010;
   1271 
   1272                 r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
   1273                 r8 = (r2 >> 8) & 0xFF00FF;
   1274                 r2 &= 0xFF00FF;
   1275 
   1276                 r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
   1277                 r7 = (r1 >> 8) & 0xFF00FF;
   1278                 r1 &= 0xFF00FF;
   1279                 r1 += r2;
   1280 
   1281                 r7 += r8;
   1282 
   1283                 r0 -= 5 * r1;
   1284                 r6 -= 5 * r7;
   1285 
   1286                 r0 >>= 5;
   1287                 r6 >>= 5;
   1288                 /* clip */
   1289                 r13 |= r6;
   1290                 r13 |= r0;
   1291                 //CLIPPACK(r6,result)
   1292                 r0 &= 0xFF00FF;
   1293                 r6 &= 0xFF00FF;
   1294                 r0 |= (r6 << 8);  /* pack it back */
   1295                 *((uint32*)(p_cur += outpitch)) = r0;
   1296             }
   1297             p_cur += curr_offset; /* offset to the next pixel */
   1298             if (r13 & 0xFF000700) /* this column need clipping */
   1299             {
   1300                 p_cur -= 4;
   1301                 for (i = 0; i < 4; i++)
   1302                 {
   1303                     p_ref = in + i;
   1304                     p_cur -= outpitch;  /* compensate for the first offset */
   1305                     tmp = (uint32)(p_ref + ref_offset); /* limit */
   1306                     while ((uint32)p_ref < tmp)
   1307                     {                           /* loop un-rolled */
   1308                         r0 = *(p_ref - (inpitch << 1));
   1309                         r1 = *(p_ref - inpitch);
   1310                         r2 = *p_ref;
   1311                         r3 = *(p_ref += inpitch);  /* modify pointer before loading */
   1312                         r4 = *(p_ref += inpitch);
   1313                         /* first pixel */
   1314                         r5 = *(p_ref += inpitch);
   1315                         result = (r0 + r5);
   1316                         r0 = (r1 + r4);
   1317                         result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
   1318                         r0 = (r2 + r3);
   1319                         result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
   1320                         result = (result + 16) >> 5;
   1321                         CLIP_RESULT(result)
   1322                         *(p_cur += outpitch) = result;
   1323                         /* second pixel */
   1324                         r0 = *(p_ref += inpitch);
   1325                         result = (r1 + r0);
   1326                         r1 = (r2 + r5);
   1327                         result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
   1328                         r1 = (r3 + r4);
   1329                         result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
   1330                         result = (result + 16) >> 5;
   1331                         CLIP_RESULT(result)
   1332                         *(p_cur += outpitch) = result;
   1333                         /* third pixel */
   1334                         r1 = *(p_ref += inpitch);
   1335                         result = (r2 + r1);
   1336                         r2 = (r3 + r0);
   1337                         result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
   1338                         r2 = (r4 + r5);
   1339                         result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
   1340                         result = (result + 16) >> 5;
   1341                         CLIP_RESULT(result)
   1342                         *(p_cur += outpitch) = result;
   1343                         /* fourth pixel */
   1344                         r2 = *(p_ref += inpitch);
   1345                         result = (r3 + r2);
   1346                         r3 = (r4 + r1);
   1347                         result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
   1348                         r3 = (r5 + r0);
   1349                         result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
   1350                         result = (result + 16) >> 5;
   1351                         CLIP_RESULT(result)
   1352                         *(p_cur += outpitch) = result;
   1353                         p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
   1354                     }
   1355                     p_cur += (curr_offset - 3);
   1356                 }
   1357             }
   1358         }
   1359     }
   1360 
   1361     return ;
   1362 }
   1363 
   1364 void VertInterp2MC(uint8 *in, int inpitch, int *out, int outpitch,
   1365                    int blkwidth, int blkheight)
   1366 {
   1367     int *p_cur;
   1368     uint8 *p_ref;
   1369     uint32 tmp;
   1370     int result, curr_offset, ref_offset;
   1371     int j, r0, r1, r2, r3, r4, r5;
   1372 
   1373     p_cur = out;
   1374     curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
   1375     ref_offset = blkheight * inpitch; /* for limit */
   1376 
   1377     for (j = 0; j < blkwidth; j++)
   1378     {
   1379         p_cur -= outpitch; /* compensate for the first offset */
   1380         p_ref = in++;
   1381 
   1382         tmp = (uint32)(p_ref + ref_offset); /* limit */
   1383         while ((uint32)p_ref < tmp)
   1384         {                           /* loop un-rolled */
   1385             r0 = *(p_ref - (inpitch << 1));
   1386             r1 = *(p_ref - inpitch);
   1387             r2 = *p_ref;
   1388             r3 = *(p_ref += inpitch);  /* modify pointer before loading */
   1389             r4 = *(p_ref += inpitch);
   1390             /* first pixel */
   1391             r5 = *(p_ref += inpitch);
   1392             result = (r0 + r5);
   1393             r0 = (r1 + r4);
   1394             result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
   1395             r0 = (r2 + r3);
   1396             result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
   1397             *(p_cur += outpitch) = result;
   1398             /* second pixel */
   1399             r0 = *(p_ref += inpitch);
   1400             result = (r1 + r0);
   1401             r1 = (r2 + r5);
   1402             result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
   1403             r1 = (r3 + r4);
   1404             result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
   1405             *(p_cur += outpitch) = result;
   1406             /* third pixel */
   1407             r1 = *(p_ref += inpitch);
   1408             result = (r2 + r1);
   1409             r2 = (r3 + r0);
   1410             result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
   1411             r2 = (r4 + r5);
   1412             result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
   1413             *(p_cur += outpitch) = result;
   1414             /* fourth pixel */
   1415             r2 = *(p_ref += inpitch);
   1416             result = (r3 + r2);
   1417             r3 = (r4 + r1);
   1418             result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
   1419             r3 = (r5 + r0);
   1420             result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
   1421             *(p_cur += outpitch) = result;
   1422             p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
   1423         }
   1424         p_cur += curr_offset;
   1425     }
   1426 
   1427     return ;
   1428 }
   1429 
   1430 void VertInterp3MC(int *in, int inpitch, uint8 *out, int outpitch,
   1431                    int blkwidth, int blkheight, int dy)
   1432 {
   1433     uint8 *p_cur;
   1434     int *p_ref;
   1435     uint32 tmp;
   1436     int result, result2, curr_offset, ref_offset;
   1437     int j, r0, r1, r2, r3, r4, r5;
   1438 
   1439     p_cur = out;
   1440     curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
   1441     ref_offset = blkheight * inpitch; /* for limit */
   1442 
   1443     if (dy&1)
   1444     {
   1445         dy = (dy >> 1) ? -(inpitch << 1) : -(inpitch << 1) - inpitch;
   1446 
   1447         for (j = 0; j < blkwidth; j++)
   1448         {
   1449             p_cur -= outpitch; /* compensate for the first offset */
   1450             p_ref = in++;
   1451 
   1452             tmp = (uint32)(p_ref + ref_offset); /* limit */
   1453             while ((uint32)p_ref < tmp)
   1454             {                           /* loop un-rolled */
   1455                 r0 = *(p_ref - (inpitch << 1));
   1456                 r1 = *(p_ref - inpitch);
   1457                 r2 = *p_ref;
   1458                 r3 = *(p_ref += inpitch);  /* modify pointer before loading */
   1459                 r4 = *(p_ref += inpitch);
   1460                 /* first pixel */
   1461                 r5 = *(p_ref += inpitch);
   1462                 result = (r0 + r5);
   1463                 r0 = (r1 + r4);
   1464                 result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
   1465                 r0 = (r2 + r3);
   1466                 result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
   1467                 result = (result + 512) >> 10;
   1468                 CLIP_RESULT(result)
   1469                 result2 = ((p_ref[dy] + 16) >> 5);
   1470                 CLIP_RESULT(result2)
   1471                 /* 3/4 pel,  no need to clip */
   1472                 result = (result + result2 + 1);
   1473                 result = (result >> 1);
   1474                 *(p_cur += outpitch) = result;
   1475                 /* second pixel */
   1476                 r0 = *(p_ref += inpitch);
   1477                 result = (r1 + r0);
   1478                 r1 = (r2 + r5);
   1479                 result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
   1480                 r1 = (r3 + r4);
   1481                 result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
   1482                 result = (result + 512) >> 10;
   1483                 CLIP_RESULT(result)
   1484                 result2 = ((p_ref[dy] + 16) >> 5);
   1485                 CLIP_RESULT(result2)
   1486                 /* 3/4 pel,  no need to clip */
   1487                 result = (result + result2 + 1);
   1488                 result = (result >> 1);
   1489                 *(p_cur += outpitch) = result;
   1490                 /* third pixel */
   1491                 r1 = *(p_ref += inpitch);
   1492                 result = (r2 + r1);
   1493                 r2 = (r3 + r0);
   1494                 result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
   1495                 r2 = (r4 + r5);
   1496                 result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
   1497                 result = (result + 512) >> 10;
   1498                 CLIP_RESULT(result)
   1499                 result2 = ((p_ref[dy] + 16) >> 5);
   1500                 CLIP_RESULT(result2)
   1501                 /* 3/4 pel,  no need to clip */
   1502                 result = (result + result2 + 1);
   1503                 result = (result >> 1);
   1504                 *(p_cur += outpitch) = result;
   1505                 /* fourth pixel */
   1506                 r2 = *(p_ref += inpitch);
   1507                 result = (r3 + r2);
   1508                 r3 = (r4 + r1);
   1509                 result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
   1510                 r3 = (r5 + r0);
   1511                 result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
   1512                 result = (result + 512) >> 10;
   1513                 CLIP_RESULT(result)
   1514                 result2 = ((p_ref[dy] + 16) >> 5);
   1515                 CLIP_RESULT(result2)
   1516                 /* 3/4 pel,  no need to clip */
   1517                 result = (result + result2 + 1);
   1518                 result = (result >> 1);
   1519                 *(p_cur += outpitch) = result;
   1520                 p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
   1521             }
   1522             p_cur += curr_offset;
   1523         }
   1524     }
   1525     else
   1526     {
   1527         for (j = 0; j < blkwidth; j++)
   1528         {
   1529             p_cur -= outpitch; /* compensate for the first offset */
   1530             p_ref = in++;
   1531 
   1532             tmp = (uint32)(p_ref + ref_offset); /* limit */
   1533             while ((uint32)p_ref < tmp)
   1534             {                           /* loop un-rolled */
   1535                 r0 = *(p_ref - (inpitch << 1));
   1536                 r1 = *(p_ref - inpitch);
   1537                 r2 = *p_ref;
   1538                 r3 = *(p_ref += inpitch);  /* modify pointer before loading */
   1539                 r4 = *(p_ref += inpitch);
   1540                 /* first pixel */
   1541                 r5 = *(p_ref += inpitch);
   1542                 result = (r0 + r5);
   1543                 r0 = (r1 + r4);
   1544                 result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
   1545                 r0 = (r2 + r3);
   1546                 result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
   1547                 result = (result + 512) >> 10;
   1548                 CLIP_RESULT(result)
   1549                 *(p_cur += outpitch) = result;
   1550                 /* second pixel */
   1551                 r0 = *(p_ref += inpitch);
   1552                 result = (r1 + r0);
   1553                 r1 = (r2 + r5);
   1554                 result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
   1555                 r1 = (r3 + r4);
   1556                 result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
   1557                 result = (result + 512) >> 10;
   1558                 CLIP_RESULT(result)
   1559                 *(p_cur += outpitch) = result;
   1560                 /* third pixel */
   1561                 r1 = *(p_ref += inpitch);
   1562                 result = (r2 + r1);
   1563                 r2 = (r3 + r0);
   1564                 result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
   1565                 r2 = (r4 + r5);
   1566                 result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
   1567                 result = (result + 512) >> 10;
   1568                 CLIP_RESULT(result)
   1569                 *(p_cur += outpitch) = result;
   1570                 /* fourth pixel */
   1571                 r2 = *(p_ref += inpitch);
   1572                 result = (r3 + r2);
   1573                 r3 = (r4 + r1);
   1574                 result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
   1575                 r3 = (r5 + r0);
   1576                 result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
   1577                 result = (result + 512) >> 10;
   1578                 CLIP_RESULT(result)
   1579                 *(p_cur += outpitch) = result;
   1580                 p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
   1581             }
   1582             p_cur += curr_offset;
   1583         }
   1584     }
   1585 
   1586     return ;
   1587 }
   1588 
   1589 void DiagonalInterpMC(uint8 *in1, uint8 *in2, int inpitch,
   1590                       uint8 *out, int outpitch,
   1591                       int blkwidth, int blkheight)
   1592 {
   1593     int j, i;
   1594     int result;
   1595     uint8 *p_cur, *p_ref, *p_tmp8;
   1596     int curr_offset, ref_offset;
   1597     uint8 tmp_res[24][24], tmp_in[24][24];
   1598     uint32 *p_tmp;
   1599     uint32 tmp, pkres, tmp_result;
   1600     int32 r0, r1, r2, r3, r4, r5;
   1601     int32 r6, r7, r8, r9, r10, r13;
   1602 
   1603     ref_offset = inpitch - blkwidth;
   1604     p_ref = in1 - 2;
   1605     /* perform horizontal interpolation */
   1606     /* not word-aligned */
   1607     /* It is faster to read 1 byte at time to avoid calling CreateAlign */
   1608     /*  if(((uint32)p_ref)&0x3)
   1609         {
   1610             CreateAlign(p_ref,inpitch,0,&tmp_in[0][0],blkwidth+8,blkheight);
   1611             p_ref = &tmp_in[0][0];
   1612             ref_offset = 24-blkwidth;
   1613         }*/
   1614 
   1615     p_tmp = (uint32*) & (tmp_res[0][0]);
   1616     for (j = blkheight; j > 0; j--)
   1617     {
   1618         r13 = 0;
   1619         tmp = (uint32)(p_ref + blkwidth);
   1620 
   1621         //r0 = *((uint32*)p_ref);   /* d,c,b,a */
   1622         //r1 = (r0>>8)&0xFF00FF;    /* 0,d,0,b */
   1623         //r0 &= 0xFF00FF;           /* 0,c,0,a */
   1624         /* It is faster to read 1 byte at a time,  */
   1625         r0 = p_ref[0];
   1626         r1 = p_ref[2];
   1627         r0 |= (r1 << 16);           /* 0,c,0,a */
   1628         r1 = p_ref[1];
   1629         r2 = p_ref[3];
   1630         r1 |= (r2 << 16);           /* 0,d,0,b */
   1631 
   1632         while ((uint32)p_ref < tmp)
   1633         {
   1634             //r2 = *((uint32*)(p_ref+=4));/* h,g,f,e */
   1635             //r3 = (r2>>8)&0xFF00FF;  /* 0,h,0,f */
   1636             //r2 &= 0xFF00FF;           /* 0,g,0,e */
   1637             /* It is faster to read 1 byte at a time,  */
   1638             r2 = *(p_ref += 4);
   1639             r3 = p_ref[2];
   1640             r2 |= (r3 << 16);           /* 0,g,0,e */
   1641             r3 = p_ref[1];
   1642             r4 = p_ref[3];
   1643             r3 |= (r4 << 16);           /* 0,h,0,f */
   1644 
   1645             r4 = r0 + r3;       /* c+h, a+f */
   1646             r5 = r0 + r1;   /* c+d, a+b */
   1647             r6 = r2 + r3;   /* g+h, e+f */
   1648             r5 >>= 16;
   1649             r5 |= (r6 << 16);   /* e+f, c+d */
   1650             r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
   1651             r4 += 0x100010; /* +16, +16 */
   1652             r5 = r1 + r2;       /* d+g, b+e */
   1653             r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
   1654             r4 >>= 5;
   1655             r13 |= r4;      /* check clipping */
   1656             r4 &= 0xFF00FF; /* mask */
   1657 
   1658             r5 = p_ref[4];  /* i */
   1659             r6 = (r5 << 16);
   1660             r5 = r6 | (r2 >> 16);/* 0,i,0,g */
   1661             r5 += r1;       /* d+i, b+g */ /* r5 not free */
   1662             r1 >>= 16;
   1663             r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
   1664             r1 += r2;       /* f+g, d+e */
   1665             r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
   1666             r0 >>= 16;
   1667             r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
   1668             r0 += r3;       /* e+h, c+f */
   1669             r5 += 0x100010; /* 16,16 */
   1670             r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
   1671             r5 >>= 5;
   1672             r13 |= r5;      /* check clipping */
   1673             r5 &= 0xFF00FF; /* mask */
   1674 
   1675             r4 |= (r5 << 8);    /* pack them together */
   1676             *p_tmp++ = r4;
   1677             r1 = r3;
   1678             r0 = r2;
   1679         }
   1680         p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */
   1681         p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
   1682 
   1683         if (r13&0xFF000700) /* need clipping */
   1684         {
   1685             /* move back to the beginning of the line */
   1686             p_ref -= (ref_offset + blkwidth);   /* input */
   1687             p_tmp -= 6; /* intermediate output */
   1688             tmp = (uint32)(p_ref + blkwidth);
   1689             while ((uint32)p_ref < tmp)
   1690             {
   1691                 r0 = *p_ref++;
   1692                 r1 = *p_ref++;
   1693                 r2 = *p_ref++;
   1694                 r3 = *p_ref++;
   1695                 r4 = *p_ref++;
   1696                 /* first pixel */
   1697                 r5 = *p_ref++;
   1698                 result = (r0 + r5);
   1699                 r0 = (r1 + r4);
   1700                 result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
   1701                 r0 = (r2 + r3);
   1702                 result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
   1703                 result = (result + 16) >> 5;
   1704                 CLIP_RESULT(result)
   1705                 pkres = result;
   1706                 /* second pixel */
   1707                 r0 = *p_ref++;
   1708                 result = (r1 + r0);
   1709                 r1 = (r2 + r5);
   1710                 result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
   1711                 r1 = (r3 + r4);
   1712                 result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
   1713                 result = (result + 16) >> 5;
   1714                 CLIP_RESULT(result)
   1715                 pkres |= (result << 8);
   1716                 /* third pixel */
   1717                 r1 = *p_ref++;
   1718                 result = (r2 + r1);
   1719                 r2 = (r3 + r0);
   1720                 result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
   1721                 r2 = (r4 + r5);
   1722                 result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
   1723                 result = (result + 16) >> 5;
   1724                 CLIP_RESULT(result)
   1725                 pkres |= (result << 16);
   1726                 /* fourth pixel */
   1727                 r2 = *p_ref++;
   1728                 result = (r3 + r2);
   1729                 r3 = (r4 + r1);
   1730                 result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
   1731                 r3 = (r5 + r0);
   1732                 result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
   1733                 result = (result + 16) >> 5;
   1734                 CLIP_RESULT(result)
   1735                 pkres |= (result << 24);
   1736 
   1737                 *p_tmp++ = pkres; /* write 4 pixel */
   1738                 p_ref -= 5;
   1739             }
   1740             p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */
   1741             p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
   1742         }
   1743     }
   1744 
   1745     /*  perform vertical interpolation */
   1746     /* not word-aligned */
   1747     if (((uint32)in2)&0x3)
   1748     {
   1749         CreateAlign(in2, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5);
   1750         in2 = &tmp_in[2][0];
   1751         inpitch = 24;
   1752     }
   1753 
   1754     p_cur = out;
   1755     curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically up and one pixel right */
   1756     pkres = blkheight * inpitch; /* reuse it for limit */
   1757 
   1758     curr_offset += 3;
   1759 
   1760     for (j = 0; j < blkwidth; j += 4, in2 += 4)
   1761     {
   1762         r13 = 0;
   1763         p_ref = in2;
   1764         p_tmp8 = &(tmp_res[0][j]); /* intermediate result */
   1765         p_tmp8 -= 24;  /* compensate for the first offset */
   1766         p_cur -= outpitch;  /* compensate for the first offset */
   1767         tmp = (uint32)(p_ref + pkres); /* limit */
   1768         while ((uint32)p_ref < tmp)  /* the loop un-rolled  */
   1769         {
   1770             /* Read 1 byte at a time is too slow, too many read and pack ops, need to call CreateAlign,  */
   1771             /*p_ref8 = p_ref-(inpitch<<1);          r0 = p_ref8[0];         r1 = p_ref8[2];
   1772             r0 |= (r1<<16);         r6 = p_ref8[1];         r1 = p_ref8[3];
   1773             r6 |= (r1<<16);         p_ref+=inpitch; */
   1774             r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
   1775             p_ref += inpitch;
   1776             r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
   1777             r0 &= 0xFF00FF;
   1778 
   1779             /*p_ref8 = p_ref+(inpitch<<1);
   1780             r1 = p_ref8[0];         r7 = p_ref8[2];         r1 |= (r7<<16);
   1781             r7 = p_ref8[1];         r2 = p_ref8[3];         r7 |= (r2<<16);*/
   1782             r1 = *((uint32*)(p_ref + (inpitch << 1)));  /* r1, r7, ref[3] */
   1783             r7 = (r1 >> 8) & 0xFF00FF;
   1784             r1 &= 0xFF00FF;
   1785 
   1786             r0 += r1;
   1787             r6 += r7;
   1788 
   1789             /*r2 = p_ref[0];            r8 = p_ref[2];          r2 |= (r8<<16);
   1790             r8 = p_ref[1];          r1 = p_ref[3];          r8 |= (r1<<16);*/
   1791             r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
   1792             r8 = (r2 >> 8) & 0xFF00FF;
   1793             r2 &= 0xFF00FF;
   1794 
   1795             /*p_ref8 = p_ref-inpitch;           r1 = p_ref8[0];         r7 = p_ref8[2];
   1796             r1 |= (r7<<16);         r1 += r2;           r7 = p_ref8[1];
   1797             r2 = p_ref8[3];         r7 |= (r2<<16);*/
   1798             r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
   1799             r7 = (r1 >> 8) & 0xFF00FF;
   1800             r1 &= 0xFF00FF;
   1801             r1 += r2;
   1802 
   1803             r7 += r8;
   1804 
   1805             r0 += 20 * r1;
   1806             r6 += 20 * r7;
   1807             r0 += 0x100010;
   1808             r6 += 0x100010;
   1809 
   1810             /*p_ref8 = p_ref-(inpitch<<1);          r2 = p_ref8[0];         r8 = p_ref8[2];
   1811             r2 |= (r8<<16);         r8 = p_ref8[1];         r1 = p_ref8[3];         r8 |= (r1<<16);*/
   1812             r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
   1813             r8 = (r2 >> 8) & 0xFF00FF;
   1814             r2 &= 0xFF00FF;
   1815 
   1816             /*p_ref8 = p_ref+inpitch;           r1 = p_ref8[0];         r7 = p_ref8[2];
   1817             r1 |= (r7<<16);         r1 += r2;           r7 = p_ref8[1];
   1818             r2 = p_ref8[3];         r7 |= (r2<<16);*/
   1819             r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
   1820             r7 = (r1 >> 8) & 0xFF00FF;
   1821             r1 &= 0xFF00FF;
   1822             r1 += r2;
   1823 
   1824             r7 += r8;
   1825 
   1826             r0 -= 5 * r1;
   1827             r6 -= 5 * r7;
   1828 
   1829             r0 >>= 5;
   1830             r6 >>= 5;
   1831             /* clip */
   1832             r13 |= r6;
   1833             r13 |= r0;
   1834             //CLIPPACK(r6,result)
   1835             /* add with horizontal results */
   1836             r10 = *((uint32*)(p_tmp8 += 24));
   1837             r9 = (r10 >> 8) & 0xFF00FF;
   1838             r10 &= 0xFF00FF;
   1839 
   1840             r0 += r10;
   1841             r0 += 0x10001;
   1842             r0 = (r0 >> 1) & 0xFF00FF;   /* mask to 8 bytes */
   1843 
   1844             r6 += r9;
   1845             r6 += 0x10001;
   1846             r6 = (r6 >> 1) & 0xFF00FF;   /* mask to 8 bytes */
   1847 
   1848             r0 |= (r6 << 8);  /* pack it back */
   1849             *((uint32*)(p_cur += outpitch)) = r0;
   1850         }
   1851         p_cur += curr_offset; /* offset to the next pixel */
   1852         if (r13 & 0xFF000700) /* this column need clipping */
   1853         {
   1854             p_cur -= 4;
   1855             for (i = 0; i < 4; i++)
   1856             {
   1857                 p_ref = in2 + i;
   1858                 p_tmp8 = &(tmp_res[0][j+i]); /* intermediate result */
   1859                 p_tmp8 -= 24;  /* compensate for the first offset */
   1860                 p_cur -= outpitch;  /* compensate for the first offset */
   1861                 tmp = (uint32)(p_ref + pkres); /* limit */
   1862                 while ((uint32)p_ref < tmp)  /* the loop un-rolled  */
   1863                 {
   1864                     r0 = *(p_ref - (inpitch << 1));
   1865                     r1 = *(p_ref - inpitch);
   1866                     r2 = *p_ref;
   1867                     r3 = *(p_ref += inpitch);  /* modify pointer before loading */
   1868                     r4 = *(p_ref += inpitch);
   1869                     /* first pixel */
   1870                     r5 = *(p_ref += inpitch);
   1871                     result = (r0 + r5);
   1872                     r0 = (r1 + r4);
   1873                     result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
   1874                     r0 = (r2 + r3);
   1875                     result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
   1876                     result = (result + 16) >> 5;
   1877                     CLIP_RESULT(result)
   1878                     tmp_result = *(p_tmp8 += 24);  /* modify pointer before loading */
   1879                     result = (result + tmp_result + 1);  /* no clip */
   1880                     result = (result >> 1);
   1881                     *(p_cur += outpitch) = result;
   1882                     /* second pixel */
   1883                     r0 = *(p_ref += inpitch);
   1884                     result = (r1 + r0);
   1885                     r1 = (r2 + r5);
   1886                     result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
   1887                     r1 = (r3 + r4);
   1888                     result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
   1889                     result = (result + 16) >> 5;
   1890                     CLIP_RESULT(result)
   1891                     tmp_result = *(p_tmp8 += 24);  /* intermediate result */
   1892                     result = (result + tmp_result + 1);  /* no clip */
   1893                     result = (result >> 1);
   1894                     *(p_cur += outpitch) = result;
   1895                     /* third pixel */
   1896                     r1 = *(p_ref += inpitch);
   1897                     result = (r2 + r1);
   1898                     r2 = (r3 + r0);
   1899                     result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
   1900                     r2 = (r4 + r5);
   1901                     result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
   1902                     result = (result + 16) >> 5;
   1903                     CLIP_RESULT(result)
   1904                     tmp_result = *(p_tmp8 += 24);  /* intermediate result */
   1905                     result = (result + tmp_result + 1);  /* no clip */
   1906                     result = (result >> 1);
   1907                     *(p_cur += outpitch) = result;
   1908                     /* fourth pixel */
   1909                     r2 = *(p_ref += inpitch);
   1910                     result = (r3 + r2);
   1911                     r3 = (r4 + r1);
   1912                     result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
   1913                     r3 = (r5 + r0);
   1914                     result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
   1915                     result = (result + 16) >> 5;
   1916                     CLIP_RESULT(result)
   1917                     tmp_result = *(p_tmp8 += 24);  /* intermediate result */
   1918                     result = (result + tmp_result + 1);  /* no clip */
   1919                     result = (result >> 1);
   1920                     *(p_cur += outpitch) = result;
   1921                     p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
   1922                 }
   1923                 p_cur += (curr_offset - 3);
   1924             }
   1925         }
   1926     }
   1927 
   1928     return ;
   1929 }
   1930 
   1931 /* position G */
   1932 void FullPelMC(uint8 *in, int inpitch, uint8 *out, int outpitch,
   1933                int blkwidth, int blkheight)
   1934 {
   1935     int i, j;
   1936     int offset_in = inpitch - blkwidth;
   1937     int offset_out = outpitch - blkwidth;
   1938     uint32 temp;
   1939     uint8 byte;
   1940 
   1941     if (((uint32)in)&3)
   1942     {
   1943         for (j = blkheight; j > 0; j--)
   1944         {
   1945             for (i = blkwidth; i > 0; i -= 4)
   1946             {
   1947                 temp = *in++;
   1948                 byte = *in++;
   1949                 temp |= (byte << 8);
   1950                 byte = *in++;
   1951                 temp |= (byte << 16);
   1952                 byte = *in++;
   1953                 temp |= (byte << 24);
   1954 
   1955                 *((uint32*)out) = temp; /* write 4 bytes */
   1956                 out += 4;
   1957             }
   1958             out += offset_out;
   1959             in += offset_in;
   1960         }
   1961     }
   1962     else
   1963     {
   1964         for (j = blkheight; j > 0; j--)
   1965         {
   1966             for (i = blkwidth; i > 0; i -= 4)
   1967             {
   1968                 temp = *((uint32*)in);
   1969                 *((uint32*)out) = temp;
   1970                 in += 4;
   1971                 out += 4;
   1972             }
   1973             out += offset_out;
   1974             in += offset_in;
   1975         }
   1976     }
   1977     return ;
   1978 }
   1979 
   1980 void ChromaMotionComp(uint8 *ref, int picwidth, int picheight,
   1981                       int x_pos, int y_pos,
   1982                       uint8 *pred, int pred_pitch,
   1983                       int blkwidth, int blkheight)
   1984 {
   1985     int dx, dy;
   1986     int offset_dx, offset_dy;
   1987     int index;
   1988     uint8 temp[24][24];
   1989 
   1990     dx = x_pos & 7;
   1991     dy = y_pos & 7;
   1992     offset_dx = (dx + 7) >> 3;
   1993     offset_dy = (dy + 7) >> 3;
   1994     x_pos = x_pos >> 3;  /* round it to full-pel resolution */
   1995     y_pos = y_pos >> 3;
   1996 
   1997     if ((x_pos >= 0 && x_pos + blkwidth + offset_dx <= picwidth) && (y_pos >= 0 && y_pos + blkheight + offset_dy <= picheight))
   1998     {
   1999         ref += y_pos * picwidth + x_pos;
   2000     }
   2001     else
   2002     {
   2003         CreatePad(ref, picwidth, picheight, x_pos, y_pos, &temp[0][0], blkwidth + offset_dx, blkheight + offset_dy);
   2004         ref = &temp[0][0];
   2005         picwidth = 24;
   2006     }
   2007 
   2008     index = offset_dx + (offset_dy << 1) + ((blkwidth << 1) & 0x7);
   2009 
   2010     (*(ChromaMC_SIMD[index]))(ref, picwidth , dx, dy, pred, pred_pitch, blkwidth, blkheight);
   2011     return ;
   2012 }
   2013 
   2014 
   2015 /* SIMD routines, unroll the loops in vertical direction, decreasing loops (things to be done)  */
   2016 void ChromaDiagonalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
   2017                            uint8 *pOut, int predPitch, int blkwidth, int blkheight)
   2018 {
   2019     int32 r0, r1, r2, r3, result0, result1;
   2020     uint8 temp[288];
   2021     uint8 *ref, *out;
   2022     int i, j;
   2023     int dx_8 = 8 - dx;
   2024     int dy_8 = 8 - dy;
   2025 
   2026     /* horizontal first */
   2027     out = temp;
   2028     for (i = 0; i < blkheight + 1; i++)
   2029     {
   2030         ref = pRef;
   2031         r0 = ref[0];
   2032         for (j = 0; j < blkwidth; j += 4)
   2033         {
   2034             r0 |= (ref[2] << 16);
   2035             result0 = dx_8 * r0;
   2036 
   2037             r1 = ref[1] | (ref[3] << 16);
   2038             result0 += dx * r1;
   2039             *(int32 *)out = result0;
   2040 
   2041             result0 = dx_8 * r1;
   2042 
   2043             r2 = ref[4];
   2044             r0 = r0 >> 16;
   2045             r1 = r0 | (r2 << 16);
   2046             result0 += dx * r1;
   2047             *(int32 *)(out + 16) = result0;
   2048 
   2049             ref += 4;
   2050             out += 4;
   2051             r0 = r2;
   2052         }
   2053         pRef += srcPitch;
   2054         out += (32 - blkwidth);
   2055     }
   2056 
   2057 //  pRef -= srcPitch*(blkheight+1);
   2058     ref = temp;
   2059 
   2060     for (j = 0; j < blkwidth; j += 4)
   2061     {
   2062         r0 = *(int32 *)ref;
   2063         r1 = *(int32 *)(ref + 16);
   2064         ref += 32;
   2065         out = pOut;
   2066         for (i = 0; i < (blkheight >> 1); i++)
   2067         {
   2068             result0 = dy_8 * r0 + 0x00200020;
   2069             r2 = *(int32 *)ref;
   2070             result0 += dy * r2;
   2071             result0 >>= 6;
   2072             result0 &= 0x00FF00FF;
   2073             r0 = r2;
   2074 
   2075             result1 = dy_8 * r1 + 0x00200020;
   2076             r3 = *(int32 *)(ref + 16);
   2077             result1 += dy * r3;
   2078             result1 >>= 6;
   2079             result1 &= 0x00FF00FF;
   2080             r1 = r3;
   2081             *(int32 *)out = result0 | (result1 << 8);
   2082             out += predPitch;
   2083             ref += 32;
   2084 
   2085             result0 = dy_8 * r0 + 0x00200020;
   2086             r2 = *(int32 *)ref;
   2087             result0 += dy * r2;
   2088             result0 >>= 6;
   2089             result0 &= 0x00FF00FF;
   2090             r0 = r2;
   2091 
   2092             result1 = dy_8 * r1 + 0x00200020;
   2093             r3 = *(int32 *)(ref + 16);
   2094             result1 += dy * r3;
   2095             result1 >>= 6;
   2096             result1 &= 0x00FF00FF;
   2097             r1 = r3;
   2098             *(int32 *)out = result0 | (result1 << 8);
   2099             out += predPitch;
   2100             ref += 32;
   2101         }
   2102         pOut += 4;
   2103         ref = temp + 4; /* since it can only iterate twice max  */
   2104     }
   2105     return;
   2106 }
   2107 
   2108 void ChromaHorizontalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
   2109                              uint8 *pOut, int predPitch, int blkwidth, int blkheight)
   2110 {
   2111     OSCL_UNUSED_ARG(dy);
   2112     int32 r0, r1, r2, result0, result1;
   2113     uint8 *ref, *out;
   2114     int i, j;
   2115     int dx_8 = 8 - dx;
   2116 
   2117     /* horizontal first */
   2118     for (i = 0; i < blkheight; i++)
   2119     {
   2120         ref = pRef;
   2121         out = pOut;
   2122 
   2123         r0 = ref[0];
   2124         for (j = 0; j < blkwidth; j += 4)
   2125         {
   2126             r0 |= (ref[2] << 16);
   2127             result0 = dx_8 * r0 + 0x00040004;
   2128 
   2129             r1 = ref[1] | (ref[3] << 16);
   2130             result0 += dx * r1;
   2131             result0 >>= 3;
   2132             result0 &= 0x00FF00FF;
   2133 
   2134             result1 = dx_8 * r1 + 0x00040004;
   2135 
   2136             r2 = ref[4];
   2137             r0 = r0 >> 16;
   2138             r1 = r0 | (r2 << 16);
   2139             result1 += dx * r1;
   2140             result1 >>= 3;
   2141             result1 &= 0x00FF00FF;
   2142 
   2143             *(int32 *)out = result0 | (result1 << 8);
   2144 
   2145             ref += 4;
   2146             out += 4;
   2147             r0 = r2;
   2148         }
   2149 
   2150         pRef += srcPitch;
   2151         pOut += predPitch;
   2152     }
   2153     return;
   2154 }
   2155 
   2156 void ChromaVerticalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
   2157                            uint8 *pOut, int predPitch, int blkwidth, int blkheight)
   2158 {
   2159     OSCL_UNUSED_ARG(dx);
   2160     int32 r0, r1, r2, r3, result0, result1;
   2161     int i, j;
   2162     uint8 *ref, *out;
   2163     int dy_8 = 8 - dy;
   2164     /* vertical first */
   2165     for (i = 0; i < blkwidth; i += 4)
   2166     {
   2167         ref = pRef;
   2168         out = pOut;
   2169 
   2170         r0 = ref[0] | (ref[2] << 16);
   2171         r1 = ref[1] | (ref[3] << 16);
   2172         ref += srcPitch;
   2173         for (j = 0; j < blkheight; j++)
   2174         {
   2175             result0 = dy_8 * r0 + 0x00040004;
   2176             r2 = ref[0] | (ref[2] << 16);
   2177             result0 += dy * r2;
   2178             result0 >>= 3;
   2179             result0 &= 0x00FF00FF;
   2180             r0 = r2;
   2181 
   2182             result1 = dy_8 * r1 + 0x00040004;
   2183             r3 = ref[1] | (ref[3] << 16);
   2184             result1 += dy * r3;
   2185             result1 >>= 3;
   2186             result1 &= 0x00FF00FF;
   2187             r1 = r3;
   2188             *(int32 *)out = result0 | (result1 << 8);
   2189             ref += srcPitch;
   2190             out += predPitch;
   2191         }
   2192         pOut += 4;
   2193         pRef += 4;
   2194     }
   2195     return;
   2196 }
   2197 
   2198 void ChromaDiagonalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
   2199                             uint8 *pOut,  int predPitch, int blkwidth, int blkheight)
   2200 {
   2201     OSCL_UNUSED_ARG(blkwidth);
   2202     int32 r0, r1, temp0, temp1, result;
   2203     int32 temp[9];
   2204     int32 *out;
   2205     int i, r_temp;
   2206     int dy_8 = 8 - dy;
   2207 
   2208     /* horizontal first */
   2209     out = temp;
   2210     for (i = 0; i < blkheight + 1; i++)
   2211     {
   2212         r_temp = pRef[1];
   2213         temp0 = (pRef[0] << 3) + dx * (r_temp - pRef[0]);
   2214         temp1 = (r_temp << 3) + dx * (pRef[2] - r_temp);
   2215         r0 = temp0 | (temp1 << 16);
   2216         *out++ = r0;
   2217         pRef += srcPitch;
   2218     }
   2219 
   2220     pRef -= srcPitch * (blkheight + 1);
   2221 
   2222     out = temp;
   2223 
   2224     r0 = *out++;
   2225 
   2226     for (i = 0; i < blkheight; i++)
   2227     {
   2228         result = dy_8 * r0 + 0x00200020;
   2229         r1 = *out++;
   2230         result += dy * r1;
   2231         result >>= 6;
   2232         result &= 0x00FF00FF;
   2233         *(int16 *)pOut = (result >> 8) | (result & 0xFF);
   2234         r0 = r1;
   2235         pOut += predPitch;
   2236     }
   2237     return;
   2238 }
   2239 
   2240 void ChromaHorizontalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
   2241                               uint8 *pOut, int predPitch, int blkwidth, int blkheight)
   2242 {
   2243     OSCL_UNUSED_ARG(dy);
   2244     OSCL_UNUSED_ARG(blkwidth);
   2245     int i, temp, temp0, temp1;
   2246 
   2247     /* horizontal first */
   2248     for (i = 0; i < blkheight; i++)
   2249     {
   2250         temp = pRef[1];
   2251         temp0 = ((pRef[0] << 3) + dx * (temp - pRef[0]) + 4) >> 3;
   2252         temp1 = ((temp << 3) + dx * (pRef[2] - temp) + 4) >> 3;
   2253 
   2254         *(int16 *)pOut = temp0 | (temp1 << 8);
   2255         pRef += srcPitch;
   2256         pOut += predPitch;
   2257 
   2258     }
   2259     return;
   2260 }
   2261 void ChromaVerticalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
   2262                             uint8 *pOut, int predPitch, int blkwidth, int blkheight)
   2263 {
   2264     OSCL_UNUSED_ARG(dx);
   2265     OSCL_UNUSED_ARG(blkwidth);
   2266     int32 r0, r1, result;
   2267     int i;
   2268     int dy_8 = 8 - dy;
   2269     r0 = pRef[0] | (pRef[1] << 16);
   2270     pRef += srcPitch;
   2271     for (i = 0; i < blkheight; i++)
   2272     {
   2273         result = dy_8 * r0 + 0x00040004;
   2274         r1 = pRef[0] | (pRef[1] << 16);
   2275         result += dy * r1;
   2276         result >>= 3;
   2277         result &= 0x00FF00FF;
   2278         *(int16 *)pOut = (result >> 8) | (result & 0xFF);
   2279         r0 = r1;
   2280         pRef += srcPitch;
   2281         pOut += predPitch;
   2282     }
   2283     return;
   2284 }
   2285 
   2286 void ChromaFullMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
   2287                        uint8 *pOut, int predPitch, int blkwidth, int blkheight)
   2288 {
   2289     OSCL_UNUSED_ARG(dx);
   2290     OSCL_UNUSED_ARG(dy);
   2291     int i, j;
   2292     int offset_in = srcPitch - blkwidth;
   2293     int offset_out = predPitch - blkwidth;
   2294     uint16 temp;
   2295     uint8 byte;
   2296 
   2297     if (((uint32)pRef)&1)
   2298     {
   2299         for (j = blkheight; j > 0; j--)
   2300         {
   2301             for (i = blkwidth; i > 0; i -= 2)
   2302             {
   2303                 temp = *pRef++;
   2304                 byte = *pRef++;
   2305                 temp |= (byte << 8);
   2306                 *((uint16*)pOut) = temp; /* write 2 bytes */
   2307                 pOut += 2;
   2308             }
   2309             pOut += offset_out;
   2310             pRef += offset_in;
   2311         }
   2312     }
   2313     else
   2314     {
   2315         for (j = blkheight; j > 0; j--)
   2316         {
   2317             for (i = blkwidth; i > 0; i -= 2)
   2318             {
   2319                 temp = *((uint16*)pRef);
   2320                 *((uint16*)pOut) = temp;
   2321                 pRef += 2;
   2322                 pOut += 2;
   2323             }
   2324             pOut += offset_out;
   2325             pRef += offset_in;
   2326         }
   2327     }
   2328     return ;
   2329 }
   2330