Home | History | Annotate | Download | only in src
      1 /* ------------------------------------------------------------------
      2  * Copyright (C) 1998-2009 PacketVideo
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
     13  * express or implied.
     14  * See the License for the specific language governing permissions
     15  * and limitations under the License.
     16  * -------------------------------------------------------------------
     17  */
     18 #include "avcenc_lib.h"
     19 #include "avcenc_int.h"
     20 
     21 
     22 #define CLIP_RESULT(x)      if((uint)x > 0xFF){ \
     23                  x = 0xFF & (~(x>>31));}
     24 
     25 /* (blkwidth << 2) + (dy << 1) + dx */
     26 static void (*const eChromaMC_SIMD[8])(uint8 *, int , int , int , uint8 *, int, int , int) =
     27 {
     28     &eChromaFullMC_SIMD,
     29     &eChromaHorizontalMC_SIMD,
     30     &eChromaVerticalMC_SIMD,
     31     &eChromaDiagonalMC_SIMD,
     32     &eChromaFullMC_SIMD,
     33     &eChromaHorizontalMC2_SIMD,
     34     &eChromaVerticalMC2_SIMD,
     35     &eChromaDiagonalMC2_SIMD
     36 };
     37 /* Perform motion prediction and compensation with residue if exist. */
     38 void AVCMBMotionComp(AVCEncObject *encvid, AVCCommonObj *video)
     39 {
     40     (void)(encvid);
     41 
     42     AVCMacroblock *currMB = video->currMB;
     43     AVCPictureData *currPic = video->currPic;
     44     int mbPartIdx, subMbPartIdx;
     45     int ref_idx;
     46     int offset_MbPart_indx = 0;
     47     int16 *mv;
     48     uint32 x_pos, y_pos;
     49     uint8 *curL, *curCb, *curCr;
     50     uint8 *ref_l, *ref_Cb, *ref_Cr;
     51     uint8 *predBlock, *predCb, *predCr;
     52     int block_x, block_y, offset_x, offset_y, offsetP, offset;
     53     int x_position = (video->mb_x << 4);
     54     int y_position = (video->mb_y << 4);
     55     int MbHeight, MbWidth, mbPartIdx_X, mbPartIdx_Y, offset_indx;
     56     int picWidth = currPic->width;
     57     int picPitch = currPic->pitch;
     58     int picHeight = currPic->height;
     59     uint32 tmp_word;
     60 
     61     tmp_word = y_position * picPitch;
     62     curL = currPic->Sl + tmp_word + x_position;
     63     offset = (tmp_word >> 2) + (x_position >> 1);
     64     curCb = currPic->Scb + offset;
     65     curCr = currPic->Scr + offset;
     66 
     67     predBlock = curL;
     68     predCb = curCb;
     69     predCr = curCr;
     70 
     71     GetMotionVectorPredictor(video, 1);
     72 
     73     for (mbPartIdx = 0; mbPartIdx < currMB->NumMbPart; mbPartIdx++)
     74     {
     75         MbHeight = currMB->SubMbPartHeight[mbPartIdx];
     76         MbWidth = currMB->SubMbPartWidth[mbPartIdx];
     77         mbPartIdx_X = ((mbPartIdx + offset_MbPart_indx) & 1);
     78         mbPartIdx_Y = (mbPartIdx + offset_MbPart_indx) >> 1;
     79         ref_idx = currMB->ref_idx_L0[(mbPartIdx_Y << 1) + mbPartIdx_X];
     80         offset_indx = 0;
     81 
     82         ref_l = video->RefPicList0[ref_idx]->Sl;
     83         ref_Cb = video->RefPicList0[ref_idx]->Scb;
     84         ref_Cr = video->RefPicList0[ref_idx]->Scr;
     85 
     86         for (subMbPartIdx = 0; subMbPartIdx < currMB->NumSubMbPart[mbPartIdx]; subMbPartIdx++)
     87         {
     88             block_x = (mbPartIdx_X << 1) + ((subMbPartIdx + offset_indx) & 1);
     89             block_y = (mbPartIdx_Y << 1) + (((subMbPartIdx + offset_indx) >> 1) & 1);
     90             mv = (int16*)(currMB->mvL0 + block_x + (block_y << 2));
     91             offset_x = x_position + (block_x << 2);
     92             offset_y = y_position + (block_y << 2);
     93             x_pos = (offset_x << 2) + *mv++;   /*quarter pel */
     94             y_pos = (offset_y << 2) + *mv;   /*quarter pel */
     95 
     96             //offset = offset_y * currPic->width;
     97             //offsetC = (offset >> 2) + (offset_x >> 1);
     98             offsetP = (block_y << 2) * picPitch + (block_x << 2);
     99             eLumaMotionComp(ref_l, picPitch, picHeight, x_pos, y_pos,
    100                             /*comp_Sl + offset + offset_x,*/
    101                             predBlock + offsetP, picPitch, MbWidth, MbHeight);
    102 
    103             offsetP = (block_y * picWidth) + (block_x << 1);
    104             eChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
    105                               /*comp_Scb +  offsetC,*/
    106                               predCb + offsetP, picPitch >> 1, MbWidth >> 1, MbHeight >> 1);
    107             eChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
    108                               /*comp_Scr +  offsetC,*/
    109                               predCr + offsetP, picPitch >> 1, MbWidth >> 1, MbHeight >> 1);
    110 
    111             offset_indx = currMB->SubMbPartWidth[mbPartIdx] >> 3;
    112         }
    113         offset_MbPart_indx = currMB->MbPartWidth >> 4;
    114     }
    115 
    116     return ;
    117 }
    118 
    119 
    120 /* preform the actual  motion comp here */
    121 void eLumaMotionComp(uint8 *ref, int picpitch, int picheight,
    122                      int x_pos, int y_pos,
    123                      uint8 *pred, int pred_pitch,
    124                      int blkwidth, int blkheight)
    125 {
    126     (void)(picheight);
    127 
    128     int dx, dy;
    129     int temp2[21][21]; /* for intermediate results */
    130     uint8 *ref2;
    131 
    132     dx = x_pos & 3;
    133     dy = y_pos & 3;
    134     x_pos = x_pos >> 2;  /* round it to full-pel resolution */
    135     y_pos = y_pos >> 2;
    136 
    137     /* perform actual motion compensation */
    138     if (dx == 0 && dy == 0)
    139     {  /* fullpel position *//* G */
    140 
    141         ref += y_pos * picpitch + x_pos;
    142 
    143         eFullPelMC(ref, picpitch, pred, pred_pitch, blkwidth, blkheight);
    144 
    145     }   /* other positions */
    146     else  if (dy == 0)
    147     { /* no vertical interpolation *//* a,b,c*/
    148 
    149         ref += y_pos * picpitch + x_pos;
    150 
    151         eHorzInterp1MC(ref, picpitch, pred, pred_pitch, blkwidth, blkheight, dx);
    152     }
    153     else if (dx == 0)
    154     { /*no horizontal interpolation *//* d,h,n */
    155 
    156         ref += y_pos * picpitch + x_pos;
    157 
    158         eVertInterp1MC(ref, picpitch, pred, pred_pitch, blkwidth, blkheight, dy);
    159     }
    160     else if (dy == 2)
    161     {  /* horizontal cross *//* i, j, k */
    162 
    163         ref += y_pos * picpitch + x_pos - 2; /* move to the left 2 pixels */
    164 
    165         eVertInterp2MC(ref, picpitch, &temp2[0][0], 21, blkwidth + 5, blkheight);
    166 
    167         eHorzInterp2MC(&temp2[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx);
    168     }
    169     else if (dx == 2)
    170     { /* vertical cross */ /* f,q */
    171 
    172         ref += (y_pos - 2) * picpitch + x_pos; /* move to up 2 lines */
    173 
    174         eHorzInterp3MC(ref, picpitch, &temp2[0][0], 21, blkwidth, blkheight + 5);
    175         eVertInterp3MC(&temp2[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy);
    176     }
    177     else
    178     { /* diagonal *//* e,g,p,r */
    179 
    180         ref2 = ref + (y_pos + (dy / 2)) * picpitch + x_pos;
    181 
    182         ref += (y_pos * picpitch) + x_pos + (dx / 2);
    183 
    184         eDiagonalInterpMC(ref2, ref, picpitch, pred, pred_pitch, blkwidth, blkheight);
    185     }
    186 
    187     return ;
    188 }
    189 
    190 void eCreateAlign(uint8 *ref, int picpitch, int y_pos,
    191                   uint8 *out, int blkwidth, int blkheight)
    192 {
    193     int i, j;
    194     int offset, out_offset;
    195     uint32 prev_pix, result, pix1, pix2, pix4;
    196 
    197     ref += y_pos * picpitch;// + x_pos;
    198     out_offset = 24 - blkwidth;
    199 
    200     //switch(x_pos&0x3){
    201     switch (((uint32)ref)&0x3)
    202     {
    203         case 1:
    204             offset =  picpitch - blkwidth - 3;
    205             for (j = 0; j < blkheight; j++)
    206             {
    207                 pix1 = *ref++;
    208                 pix2 = *((uint16*)ref);
    209                 ref += 2;
    210                 result = (pix2 << 8) | pix1;
    211 
    212                 for (i = 3; i < blkwidth; i += 4)
    213                 {
    214                     pix4 = *((uint32*)ref);
    215                     ref += 4;
    216                     prev_pix = (pix4 << 24) & 0xFF000000; /* mask out byte belong to previous word */
    217                     result |= prev_pix;
    218                     *((uint32*)out) = result;  /* write 4 bytes */
    219                     out += 4;
    220                     result = pix4 >> 8; /* for the next loop */
    221                 }
    222                 ref += offset;
    223                 out += out_offset;
    224             }
    225             break;
    226         case 2:
    227             offset =  picpitch - blkwidth - 2;
    228             for (j = 0; j < blkheight; j++)
    229             {
    230                 result = *((uint16*)ref);
    231                 ref += 2;
    232                 for (i = 2; i < blkwidth; i += 4)
    233                 {
    234                     pix4 = *((uint32*)ref);
    235                     ref += 4;
    236                     prev_pix = (pix4 << 16) & 0xFFFF0000; /* mask out byte belong to previous word */
    237                     result |= prev_pix;
    238                     *((uint32*)out) = result;  /* write 4 bytes */
    239                     out += 4;
    240                     result = pix4 >> 16; /* for the next loop */
    241                 }
    242                 ref += offset;
    243                 out += out_offset;
    244             }
    245             break;
    246         case 3:
    247             offset =  picpitch - blkwidth - 1;
    248             for (j = 0; j < blkheight; j++)
    249             {
    250                 result = *ref++;
    251                 for (i = 1; i < blkwidth; i += 4)
    252                 {
    253                     pix4 = *((uint32*)ref);
    254                     ref += 4;
    255                     prev_pix = (pix4 << 8) & 0xFFFFFF00; /* mask out byte belong to previous word */
    256                     result |= prev_pix;
    257                     *((uint32*)out) = result;  /* write 4 bytes */
    258                     out += 4;
    259                     result = pix4 >> 24; /* for the next loop */
    260                 }
    261                 ref += offset;
    262                 out += out_offset;
    263             }
    264             break;
    265     }
    266 }
    267 
    268 void eHorzInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
    269                     int blkwidth, int blkheight, int dx)
    270 {
    271     uint8 *p_ref;
    272     uint32 *p_cur;
    273     uint32 tmp, pkres;
    274     int result, curr_offset, ref_offset;
    275     int j;
    276     int32 r0, r1, r2, r3, r4, r5;
    277     int32 r13, r6;
    278 
    279     p_cur = (uint32*)out; /* assume it's word aligned */
    280     curr_offset = (outpitch - blkwidth) >> 2;
    281     p_ref = in;
    282     ref_offset = inpitch - blkwidth;
    283 
    284     if (dx&1)
    285     {
    286         dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */
    287         p_ref -= 2;
    288         r13 = 0;
    289         for (j = blkheight; j > 0; j--)
    290         {
    291             tmp = (uint32)(p_ref + blkwidth);
    292             r0 = p_ref[0];
    293             r1 = p_ref[2];
    294             r0 |= (r1 << 16);           /* 0,c,0,a */
    295             r1 = p_ref[1];
    296             r2 = p_ref[3];
    297             r1 |= (r2 << 16);           /* 0,d,0,b */
    298             while ((uint32)p_ref < tmp)
    299             {
    300                 r2 = *(p_ref += 4); /* move pointer to e */
    301                 r3 = p_ref[2];
    302                 r2 |= (r3 << 16);           /* 0,g,0,e */
    303                 r3 = p_ref[1];
    304                 r4 = p_ref[3];
    305                 r3 |= (r4 << 16);           /* 0,h,0,f */
    306 
    307                 r4 = r0 + r3;       /* c+h, a+f */
    308                 r5 = r0 + r1;   /* c+d, a+b */
    309                 r6 = r2 + r3;   /* g+h, e+f */
    310                 r5 >>= 16;
    311                 r5 |= (r6 << 16);   /* e+f, c+d */
    312                 r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
    313                 r4 += 0x100010; /* +16, +16 */
    314                 r5 = r1 + r2;       /* d+g, b+e */
    315                 r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
    316                 r4 >>= 5;
    317                 r13 |= r4;      /* check clipping */
    318 
    319                 r5 = p_ref[dx+2];
    320                 r6 = p_ref[dx+4];
    321                 r5 |= (r6 << 16);
    322                 r4 += r5;
    323                 r4 += 0x10001;
    324                 r4 = (r4 >> 1) & 0xFF00FF;
    325 
    326                 r5 = p_ref[4];  /* i */
    327                 r6 = (r5 << 16);
    328                 r5 = r6 | (r2 >> 16);/* 0,i,0,g */
    329                 r5 += r1;       /* d+i, b+g */ /* r5 not free */
    330                 r1 >>= 16;
    331                 r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
    332                 r1 += r2;       /* f+g, d+e */
    333                 r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
    334                 r0 >>= 16;
    335                 r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
    336                 r0 += r3;       /* e+h, c+f */
    337                 r5 += 0x100010; /* 16,16 */
    338                 r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
    339                 r5 >>= 5;
    340                 r13 |= r5;      /* check clipping */
    341 
    342                 r0 = p_ref[dx+3];
    343                 r1 = p_ref[dx+5];
    344                 r0 |= (r1 << 16);
    345                 r5 += r0;
    346                 r5 += 0x10001;
    347                 r5 = (r5 >> 1) & 0xFF00FF;
    348 
    349                 r4 |= (r5 << 8);    /* pack them together */
    350                 *p_cur++ = r4;
    351                 r1 = r3;
    352                 r0 = r2;
    353             }
    354             p_cur += curr_offset; /* move to the next line */
    355             p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
    356 
    357             if (r13&0xFF000700) /* need clipping */
    358             {
    359                 /* move back to the beginning of the line */
    360                 p_ref -= (ref_offset + blkwidth);   /* input */
    361                 p_cur -= (outpitch >> 2);
    362 
    363                 tmp = (uint32)(p_ref + blkwidth);
    364                 for (; (uint32)p_ref < tmp;)
    365                 {
    366 
    367                     r0 = *p_ref++;
    368                     r1 = *p_ref++;
    369                     r2 = *p_ref++;
    370                     r3 = *p_ref++;
    371                     r4 = *p_ref++;
    372                     /* first pixel */
    373                     r5 = *p_ref++;
    374                     result = (r0 + r5);
    375                     r0 = (r1 + r4);
    376                     result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
    377                     r0 = (r2 + r3);
    378                     result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
    379                     result = (result + 16) >> 5;
    380                     CLIP_RESULT(result)
    381                     /* 3/4 pel,  no need to clip */
    382                     result = (result + p_ref[dx] + 1);
    383                     pkres = (result >> 1) ;
    384                     /* second pixel */
    385                     r0 = *p_ref++;
    386                     result = (r1 + r0);
    387                     r1 = (r2 + r5);
    388                     result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
    389                     r1 = (r3 + r4);
    390                     result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
    391                     result = (result + 16) >> 5;
    392                     CLIP_RESULT(result)
    393                     /* 3/4 pel,  no need to clip */
    394                     result = (result + p_ref[dx] + 1);
    395                     result = (result >> 1);
    396                     pkres  |= (result << 8);
    397                     /* third pixel */
    398                     r1 = *p_ref++;
    399                     result = (r2 + r1);
    400                     r2 = (r3 + r0);
    401                     result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
    402                     r2 = (r4 + r5);
    403                     result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
    404                     result = (result + 16) >> 5;
    405                     CLIP_RESULT(result)
    406                     /* 3/4 pel,  no need to clip */
    407                     result = (result + p_ref[dx] + 1);
    408                     result = (result >> 1);
    409                     pkres  |= (result << 16);
    410                     /* fourth pixel */
    411                     r2 = *p_ref++;
    412                     result = (r3 + r2);
    413                     r3 = (r4 + r1);
    414                     result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
    415                     r3 = (r5 + r0);
    416                     result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
    417                     result = (result + 16) >> 5;
    418                     CLIP_RESULT(result)
    419                     /* 3/4 pel,  no need to clip */
    420                     result = (result + p_ref[dx] + 1);
    421                     result = (result >> 1);
    422                     pkres  |= (result << 24);
    423                     *p_cur++ = pkres; /* write 4 pixels */
    424                     p_ref -= 5;  /* offset back to the middle of filter */
    425                 }
    426                 p_cur += curr_offset;  /* move to the next line */
    427                 p_ref += ref_offset;    /* move to the next line */
    428             }
    429         }
    430     }
    431     else
    432     {
    433         p_ref -= 2;
    434         r13 = 0;
    435         for (j = blkheight; j > 0; j--)
    436         {
    437             tmp = (uint32)(p_ref + blkwidth);
    438             r0 = p_ref[0];
    439             r1 = p_ref[2];
    440             r0 |= (r1 << 16);           /* 0,c,0,a */
    441             r1 = p_ref[1];
    442             r2 = p_ref[3];
    443             r1 |= (r2 << 16);           /* 0,d,0,b */
    444             while ((uint32)p_ref < tmp)
    445             {
    446                 r2 = *(p_ref += 4); /* move pointer to e */
    447                 r3 = p_ref[2];
    448                 r2 |= (r3 << 16);           /* 0,g,0,e */
    449                 r3 = p_ref[1];
    450                 r4 = p_ref[3];
    451                 r3 |= (r4 << 16);           /* 0,h,0,f */
    452 
    453                 r4 = r0 + r3;       /* c+h, a+f */
    454                 r5 = r0 + r1;   /* c+d, a+b */
    455                 r6 = r2 + r3;   /* g+h, e+f */
    456                 r5 >>= 16;
    457                 r5 |= (r6 << 16);   /* e+f, c+d */
    458                 r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
    459                 r4 += 0x100010; /* +16, +16 */
    460                 r5 = r1 + r2;       /* d+g, b+e */
    461                 r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
    462                 r4 >>= 5;
    463                 r13 |= r4;      /* check clipping */
    464                 r4 &= 0xFF00FF; /* mask */
    465 
    466                 r5 = p_ref[4];  /* i */
    467                 r6 = (r5 << 16);
    468                 r5 = r6 | (r2 >> 16);/* 0,i,0,g */
    469                 r5 += r1;       /* d+i, b+g */ /* r5 not free */
    470                 r1 >>= 16;
    471                 r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
    472                 r1 += r2;       /* f+g, d+e */
    473                 r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
    474                 r0 >>= 16;
    475                 r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
    476                 r0 += r3;       /* e+h, c+f */
    477                 r5 += 0x100010; /* 16,16 */
    478                 r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
    479                 r5 >>= 5;
    480                 r13 |= r5;      /* check clipping */
    481                 r5 &= 0xFF00FF; /* mask */
    482 
    483                 r4 |= (r5 << 8);    /* pack them together */
    484                 *p_cur++ = r4;
    485                 r1 = r3;
    486                 r0 = r2;
    487             }
    488             p_cur += curr_offset; /* move to the next line */
    489             p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
    490 
    491             if (r13&0xFF000700) /* need clipping */
    492             {
    493                 /* move back to the beginning of the line */
    494                 p_ref -= (ref_offset + blkwidth);   /* input */
    495                 p_cur -= (outpitch >> 2);
    496 
    497                 tmp = (uint32)(p_ref + blkwidth);
    498                 for (; (uint32)p_ref < tmp;)
    499                 {
    500 
    501                     r0 = *p_ref++;
    502                     r1 = *p_ref++;
    503                     r2 = *p_ref++;
    504                     r3 = *p_ref++;
    505                     r4 = *p_ref++;
    506                     /* first pixel */
    507                     r5 = *p_ref++;
    508                     result = (r0 + r5);
    509                     r0 = (r1 + r4);
    510                     result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
    511                     r0 = (r2 + r3);
    512                     result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
    513                     result = (result + 16) >> 5;
    514                     CLIP_RESULT(result)
    515                     pkres  = result;
    516                     /* second pixel */
    517                     r0 = *p_ref++;
    518                     result = (r1 + r0);
    519                     r1 = (r2 + r5);
    520                     result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
    521                     r1 = (r3 + r4);
    522                     result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
    523                     result = (result + 16) >> 5;
    524                     CLIP_RESULT(result)
    525                     pkres  |= (result << 8);
    526                     /* third pixel */
    527                     r1 = *p_ref++;
    528                     result = (r2 + r1);
    529                     r2 = (r3 + r0);
    530                     result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
    531                     r2 = (r4 + r5);
    532                     result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
    533                     result = (result + 16) >> 5;
    534                     CLIP_RESULT(result)
    535                     pkres  |= (result << 16);
    536                     /* fourth pixel */
    537                     r2 = *p_ref++;
    538                     result = (r3 + r2);
    539                     r3 = (r4 + r1);
    540                     result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
    541                     r3 = (r5 + r0);
    542                     result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
    543                     result = (result + 16) >> 5;
    544                     CLIP_RESULT(result)
    545                     pkres  |= (result << 24);
    546                     *p_cur++ = pkres;   /* write 4 pixels */
    547                     p_ref -= 5;
    548                 }
    549                 p_cur += curr_offset; /* move to the next line */
    550                 p_ref += ref_offset;
    551             }
    552         }
    553     }
    554 
    555     return ;
    556 }
    557 
    558 void eHorzInterp2MC(int *in, int inpitch, uint8 *out, int outpitch,
    559                     int blkwidth, int blkheight, int dx)
    560 {
    561     int *p_ref;
    562     uint32 *p_cur;
    563     uint32 tmp, pkres;
    564     int result, result2, curr_offset, ref_offset;
    565     int j, r0, r1, r2, r3, r4, r5;
    566 
    567     p_cur = (uint32*)out; /* assume it's word aligned */
    568     curr_offset = (outpitch - blkwidth) >> 2;
    569     p_ref = in;
    570     ref_offset = inpitch - blkwidth;
    571 
    572     if (dx&1)
    573     {
    574         dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */
    575 
    576         for (j = blkheight; j > 0 ; j--)
    577         {
    578             tmp = (uint32)(p_ref + blkwidth);
    579             for (; (uint32)p_ref < tmp;)
    580             {
    581 
    582                 r0 = p_ref[-2];
    583                 r1 = p_ref[-1];
    584                 r2 = *p_ref++;
    585                 r3 = *p_ref++;
    586                 r4 = *p_ref++;
    587                 /* first pixel */
    588                 r5 = *p_ref++;
    589                 result = (r0 + r5);
    590                 r0 = (r1 + r4);
    591                 result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
    592                 r0 = (r2 + r3);
    593                 result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
    594                 result = (result + 512) >> 10;
    595                 CLIP_RESULT(result)
    596                 result2 = ((p_ref[dx] + 16) >> 5);
    597                 CLIP_RESULT(result2)
    598                 /* 3/4 pel,  no need to clip */
    599                 result = (result + result2 + 1);
    600                 pkres = (result >> 1);
    601                 /* second pixel */
    602                 r0 = *p_ref++;
    603                 result = (r1 + r0);
    604                 r1 = (r2 + r5);
    605                 result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
    606                 r1 = (r3 + r4);
    607                 result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
    608                 result = (result + 512) >> 10;
    609                 CLIP_RESULT(result)
    610                 result2 = ((p_ref[dx] + 16) >> 5);
    611                 CLIP_RESULT(result2)
    612                 /* 3/4 pel,  no need to clip */
    613                 result = (result + result2 + 1);
    614                 result = (result >> 1);
    615                 pkres  |= (result << 8);
    616                 /* third pixel */
    617                 r1 = *p_ref++;
    618                 result = (r2 + r1);
    619                 r2 = (r3 + r0);
    620                 result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
    621                 r2 = (r4 + r5);
    622                 result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
    623                 result = (result + 512) >> 10;
    624                 CLIP_RESULT(result)
    625                 result2 = ((p_ref[dx] + 16) >> 5);
    626                 CLIP_RESULT(result2)
    627                 /* 3/4 pel,  no need to clip */
    628                 result = (result + result2 + 1);
    629                 result = (result >> 1);
    630                 pkres  |= (result << 16);
    631                 /* fourth pixel */
    632                 r2 = *p_ref++;
    633                 result = (r3 + r2);
    634                 r3 = (r4 + r1);
    635                 result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
    636                 r3 = (r5 + r0);
    637                 result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
    638                 result = (result + 512) >> 10;
    639                 CLIP_RESULT(result)
    640                 result2 = ((p_ref[dx] + 16) >> 5);
    641                 CLIP_RESULT(result2)
    642                 /* 3/4 pel,  no need to clip */
    643                 result = (result + result2 + 1);
    644                 result = (result >> 1);
    645                 pkres  |= (result << 24);
    646                 *p_cur++ = pkres; /* write 4 pixels */
    647                 p_ref -= 3;  /* offset back to the middle of filter */
    648             }
    649             p_cur += curr_offset;  /* move to the next line */
    650             p_ref += ref_offset;    /* move to the next line */
    651         }
    652     }
    653     else
    654     {
    655         for (j = blkheight; j > 0 ; j--)
    656         {
    657             tmp = (uint32)(p_ref + blkwidth);
    658             for (; (uint32)p_ref < tmp;)
    659             {
    660 
    661                 r0 = p_ref[-2];
    662                 r1 = p_ref[-1];
    663                 r2 = *p_ref++;
    664                 r3 = *p_ref++;
    665                 r4 = *p_ref++;
    666                 /* first pixel */
    667                 r5 = *p_ref++;
    668                 result = (r0 + r5);
    669                 r0 = (r1 + r4);
    670                 result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
    671                 r0 = (r2 + r3);
    672                 result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
    673                 result = (result + 512) >> 10;
    674                 CLIP_RESULT(result)
    675                 pkres  = result;
    676                 /* second pixel */
    677                 r0 = *p_ref++;
    678                 result = (r1 + r0);
    679                 r1 = (r2 + r5);
    680                 result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
    681                 r1 = (r3 + r4);
    682                 result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
    683                 result = (result + 512) >> 10;
    684                 CLIP_RESULT(result)
    685                 pkres  |= (result << 8);
    686                 /* third pixel */
    687                 r1 = *p_ref++;
    688                 result = (r2 + r1);
    689                 r2 = (r3 + r0);
    690                 result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
    691                 r2 = (r4 + r5);
    692                 result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
    693                 result = (result + 512) >> 10;
    694                 CLIP_RESULT(result)
    695                 pkres  |= (result << 16);
    696                 /* fourth pixel */
    697                 r2 = *p_ref++;
    698                 result = (r3 + r2);
    699                 r3 = (r4 + r1);
    700                 result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
    701                 r3 = (r5 + r0);
    702                 result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
    703                 result = (result + 512) >> 10;
    704                 CLIP_RESULT(result)
    705                 pkres  |= (result << 24);
    706                 *p_cur++ = pkres; /* write 4 pixels */
    707                 p_ref -= 3;  /* offset back to the middle of filter */
    708             }
    709             p_cur += curr_offset;  /* move to the next line */
    710             p_ref += ref_offset;    /* move to the next line */
    711         }
    712     }
    713 
    714     return ;
    715 }
    716 
    717 void eHorzInterp3MC(uint8 *in, int inpitch, int *out, int outpitch,
    718                     int blkwidth, int blkheight)
    719 {
    720     uint8 *p_ref;
    721     int   *p_cur;
    722     uint32 tmp;
    723     int result, curr_offset, ref_offset;
    724     int j, r0, r1, r2, r3, r4, r5;
    725 
    726     p_cur = out;
    727     curr_offset = (outpitch - blkwidth);
    728     p_ref = in;
    729     ref_offset = inpitch - blkwidth;
    730 
    731     for (j = blkheight; j > 0 ; j--)
    732     {
    733         tmp = (uint32)(p_ref + blkwidth);
    734         for (; (uint32)p_ref < tmp;)
    735         {
    736 
    737             r0 = p_ref[-2];
    738             r1 = p_ref[-1];
    739             r2 = *p_ref++;
    740             r3 = *p_ref++;
    741             r4 = *p_ref++;
    742             /* first pixel */
    743             r5 = *p_ref++;
    744             result = (r0 + r5);
    745             r0 = (r1 + r4);
    746             result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
    747             r0 = (r2 + r3);
    748             result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
    749             *p_cur++ = result;
    750             /* second pixel */
    751             r0 = *p_ref++;
    752             result = (r1 + r0);
    753             r1 = (r2 + r5);
    754             result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
    755             r1 = (r3 + r4);
    756             result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
    757             *p_cur++ = result;
    758             /* third pixel */
    759             r1 = *p_ref++;
    760             result = (r2 + r1);
    761             r2 = (r3 + r0);
    762             result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
    763             r2 = (r4 + r5);
    764             result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
    765             *p_cur++ = result;
    766             /* fourth pixel */
    767             r2 = *p_ref++;
    768             result = (r3 + r2);
    769             r3 = (r4 + r1);
    770             result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
    771             r3 = (r5 + r0);
    772             result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
    773             *p_cur++ = result;
    774             p_ref -= 3; /* move back to the middle of the filter */
    775         }
    776         p_cur += curr_offset; /* move to the next line */
    777         p_ref += ref_offset;
    778     }
    779 
    780     return ;
    781 }
    782 void eVertInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
    783                     int blkwidth, int blkheight, int dy)
    784 {
    785     uint8 *p_cur, *p_ref;
    786     uint32 tmp;
    787     int result, curr_offset, ref_offset;
    788     int j, i;
    789     int32 r0, r1, r2, r3, r4, r5, r6, r7, r8, r13;
    790     uint8  tmp_in[24][24];
    791 
    792     /* not word-aligned */
    793     if (((uint32)in)&0x3)
    794     {
    795         eCreateAlign(in, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5);
    796         in = &tmp_in[2][0];
    797         inpitch = 24;
    798     }
    799     p_cur = out;
    800     curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
    801     ref_offset = blkheight * inpitch; /* for limit */
    802 
    803     curr_offset += 3;
    804 
    805     if (dy&1)
    806     {
    807         dy = (dy >> 1) ? 0 : -inpitch;
    808 
    809         for (j = 0; j < blkwidth; j += 4, in += 4)
    810         {
    811             r13 = 0;
    812             p_ref = in;
    813             p_cur -= outpitch;  /* compensate for the first offset */
    814             tmp = (uint32)(p_ref + ref_offset); /* limit */
    815             while ((uint32)p_ref < tmp)  /* the loop un-rolled  */
    816             {
    817                 r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
    818                 p_ref += inpitch;
    819                 r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
    820                 r0 &= 0xFF00FF;
    821 
    822                 r1 = *((uint32*)(p_ref + (inpitch << 1)));  /* r1, r7, ref[3] */
    823                 r7 = (r1 >> 8) & 0xFF00FF;
    824                 r1 &= 0xFF00FF;
    825 
    826                 r0 += r1;
    827                 r6 += r7;
    828 
    829                 r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
    830                 r8 = (r2 >> 8) & 0xFF00FF;
    831                 r2 &= 0xFF00FF;
    832 
    833                 r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
    834                 r7 = (r1 >> 8) & 0xFF00FF;
    835                 r1 &= 0xFF00FF;
    836                 r1 += r2;
    837 
    838                 r7 += r8;
    839 
    840                 r0 += 20 * r1;
    841                 r6 += 20 * r7;
    842                 r0 += 0x100010;
    843                 r6 += 0x100010;
    844 
    845                 r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
    846                 r8 = (r2 >> 8) & 0xFF00FF;
    847                 r2 &= 0xFF00FF;
    848 
    849                 r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
    850                 r7 = (r1 >> 8) & 0xFF00FF;
    851                 r1 &= 0xFF00FF;
    852                 r1 += r2;
    853 
    854                 r7 += r8;
    855 
    856                 r0 -= 5 * r1;
    857                 r6 -= 5 * r7;
    858 
    859                 r0 >>= 5;
    860                 r6 >>= 5;
    861                 /* clip */
    862                 r13 |= r6;
    863                 r13 |= r0;
    864                 //CLIPPACK(r6,result)
    865 
    866                 r1 = *((uint32*)(p_ref + dy));
    867                 r2 = (r1 >> 8) & 0xFF00FF;
    868                 r1 &= 0xFF00FF;
    869                 r0 += r1;
    870                 r6 += r2;
    871                 r0 += 0x10001;
    872                 r6 += 0x10001;
    873                 r0 = (r0 >> 1) & 0xFF00FF;
    874                 r6 = (r6 >> 1) & 0xFF00FF;
    875 
    876                 r0 |= (r6 << 8);  /* pack it back */
    877                 *((uint32*)(p_cur += outpitch)) = r0;
    878             }
    879             p_cur += curr_offset; /* offset to the next pixel */
    880             if (r13 & 0xFF000700) /* this column need clipping */
    881             {
    882                 p_cur -= 4;
    883                 for (i = 0; i < 4; i++)
    884                 {
    885                     p_ref = in + i;
    886                     p_cur -= outpitch;  /* compensate for the first offset */
    887 
    888                     tmp = (uint32)(p_ref + ref_offset); /* limit */
    889                     while ((uint32)p_ref < tmp)
    890                     {                           /* loop un-rolled */
    891                         r0 = *(p_ref - (inpitch << 1));
    892                         r1 = *(p_ref - inpitch);
    893                         r2 = *p_ref;
    894                         r3 = *(p_ref += inpitch);  /* modify pointer before loading */
    895                         r4 = *(p_ref += inpitch);
    896                         /* first pixel */
    897                         r5 = *(p_ref += inpitch);
    898                         result = (r0 + r5);
    899                         r0 = (r1 + r4);
    900                         result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
    901                         r0 = (r2 + r3);
    902                         result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
    903                         result = (result + 16) >> 5;
    904                         CLIP_RESULT(result)
    905                         /* 3/4 pel,  no need to clip */
    906                         result = (result + p_ref[dy-(inpitch<<1)] + 1);
    907                         result = (result >> 1);
    908                         *(p_cur += outpitch) = result;
    909                         /* second pixel */
    910                         r0 = *(p_ref += inpitch);
    911                         result = (r1 + r0);
    912                         r1 = (r2 + r5);
    913                         result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
    914                         r1 = (r3 + r4);
    915                         result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
    916                         result = (result + 16) >> 5;
    917                         CLIP_RESULT(result)
    918                         /* 3/4 pel,  no need to clip */
    919                         result = (result + p_ref[dy-(inpitch<<1)] + 1);
    920                         result = (result >> 1);
    921                         *(p_cur += outpitch) = result;
    922                         /* third pixel */
    923                         r1 = *(p_ref += inpitch);
    924                         result = (r2 + r1);
    925                         r2 = (r3 + r0);
    926                         result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
    927                         r2 = (r4 + r5);
    928                         result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
    929                         result = (result + 16) >> 5;
    930                         CLIP_RESULT(result)
    931                         /* 3/4 pel,  no need to clip */
    932                         result = (result + p_ref[dy-(inpitch<<1)] + 1);
    933                         result = (result >> 1);
    934                         *(p_cur += outpitch) = result;
    935                         /* fourth pixel */
    936                         r2 = *(p_ref += inpitch);
    937                         result = (r3 + r2);
    938                         r3 = (r4 + r1);
    939                         result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
    940                         r3 = (r5 + r0);
    941                         result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
    942                         result = (result + 16) >> 5;
    943                         CLIP_RESULT(result)
    944                         /* 3/4 pel,  no need to clip */
    945                         result = (result + p_ref[dy-(inpitch<<1)] + 1);
    946                         result = (result >> 1);
    947                         *(p_cur += outpitch) = result;
    948                         p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
    949                     }
    950                     p_cur += (curr_offset - 3);
    951                 }
    952             }
    953         }
    954     }
    955     else
    956     {
    957         for (j = 0; j < blkwidth; j += 4, in += 4)
    958         {
    959             r13 = 0;
    960             p_ref = in;
    961             p_cur -= outpitch;  /* compensate for the first offset */
    962             tmp = (uint32)(p_ref + ref_offset); /* limit */
    963             while ((uint32)p_ref < tmp)  /* the loop un-rolled  */
    964             {
    965                 r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
    966                 p_ref += inpitch;
    967                 r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
    968                 r0 &= 0xFF00FF;
    969 
    970                 r1 = *((uint32*)(p_ref + (inpitch << 1)));  /* r1, r7, ref[3] */
    971                 r7 = (r1 >> 8) & 0xFF00FF;
    972                 r1 &= 0xFF00FF;
    973 
    974                 r0 += r1;
    975                 r6 += r7;
    976 
    977                 r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
    978                 r8 = (r2 >> 8) & 0xFF00FF;
    979                 r2 &= 0xFF00FF;
    980 
    981                 r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
    982                 r7 = (r1 >> 8) & 0xFF00FF;
    983                 r1 &= 0xFF00FF;
    984                 r1 += r2;
    985 
    986                 r7 += r8;
    987 
    988                 r0 += 20 * r1;
    989                 r6 += 20 * r7;
    990                 r0 += 0x100010;
    991                 r6 += 0x100010;
    992 
    993                 r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
    994                 r8 = (r2 >> 8) & 0xFF00FF;
    995                 r2 &= 0xFF00FF;
    996 
    997                 r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
    998                 r7 = (r1 >> 8) & 0xFF00FF;
    999                 r1 &= 0xFF00FF;
   1000                 r1 += r2;
   1001 
   1002                 r7 += r8;
   1003 
   1004                 r0 -= 5 * r1;
   1005                 r6 -= 5 * r7;
   1006 
   1007                 r0 >>= 5;
   1008                 r6 >>= 5;
   1009                 /* clip */
   1010                 r13 |= r6;
   1011                 r13 |= r0;
   1012                 //CLIPPACK(r6,result)
   1013                 r0 &= 0xFF00FF;
   1014                 r6 &= 0xFF00FF;
   1015                 r0 |= (r6 << 8);  /* pack it back */
   1016                 *((uint32*)(p_cur += outpitch)) = r0;
   1017             }
   1018             p_cur += curr_offset; /* offset to the next pixel */
   1019             if (r13 & 0xFF000700) /* this column need clipping */
   1020             {
   1021                 p_cur -= 4;
   1022                 for (i = 0; i < 4; i++)
   1023                 {
   1024                     p_ref = in + i;
   1025                     p_cur -= outpitch;  /* compensate for the first offset */
   1026                     tmp = (uint32)(p_ref + ref_offset); /* limit */
   1027                     while ((uint32)p_ref < tmp)
   1028                     {                           /* loop un-rolled */
   1029                         r0 = *(p_ref - (inpitch << 1));
   1030                         r1 = *(p_ref - inpitch);
   1031                         r2 = *p_ref;
   1032                         r3 = *(p_ref += inpitch);  /* modify pointer before loading */
   1033                         r4 = *(p_ref += inpitch);
   1034                         /* first pixel */
   1035                         r5 = *(p_ref += inpitch);
   1036                         result = (r0 + r5);
   1037                         r0 = (r1 + r4);
   1038                         result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
   1039                         r0 = (r2 + r3);
   1040                         result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
   1041                         result = (result + 16) >> 5;
   1042                         CLIP_RESULT(result)
   1043                         *(p_cur += outpitch) = result;
   1044                         /* second pixel */
   1045                         r0 = *(p_ref += inpitch);
   1046                         result = (r1 + r0);
   1047                         r1 = (r2 + r5);
   1048                         result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
   1049                         r1 = (r3 + r4);
   1050                         result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
   1051                         result = (result + 16) >> 5;
   1052                         CLIP_RESULT(result)
   1053                         *(p_cur += outpitch) = result;
   1054                         /* third pixel */
   1055                         r1 = *(p_ref += inpitch);
   1056                         result = (r2 + r1);
   1057                         r2 = (r3 + r0);
   1058                         result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
   1059                         r2 = (r4 + r5);
   1060                         result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
   1061                         result = (result + 16) >> 5;
   1062                         CLIP_RESULT(result)
   1063                         *(p_cur += outpitch) = result;
   1064                         /* fourth pixel */
   1065                         r2 = *(p_ref += inpitch);
   1066                         result = (r3 + r2);
   1067                         r3 = (r4 + r1);
   1068                         result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
   1069                         r3 = (r5 + r0);
   1070                         result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
   1071                         result = (result + 16) >> 5;
   1072                         CLIP_RESULT(result)
   1073                         *(p_cur += outpitch) = result;
   1074                         p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
   1075                     }
   1076                     p_cur += (curr_offset - 3);
   1077                 }
   1078             }
   1079         }
   1080     }
   1081 
   1082     return ;
   1083 }
   1084 
   1085 void eVertInterp2MC(uint8 *in, int inpitch, int *out, int outpitch,
   1086                     int blkwidth, int blkheight)
   1087 {
   1088     int *p_cur;
   1089     uint8 *p_ref;
   1090     uint32 tmp;
   1091     int result, curr_offset, ref_offset;
   1092     int j, r0, r1, r2, r3, r4, r5;
   1093 
   1094     p_cur = out;
   1095     curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
   1096     ref_offset = blkheight * inpitch; /* for limit */
   1097 
   1098     for (j = 0; j < blkwidth; j++)
   1099     {
   1100         p_cur -= outpitch; /* compensate for the first offset */
   1101         p_ref = in++;
   1102 
   1103         tmp = (uint32)(p_ref + ref_offset); /* limit */
   1104         while ((uint32)p_ref < tmp)
   1105         {                           /* loop un-rolled */
   1106             r0 = *(p_ref - (inpitch << 1));
   1107             r1 = *(p_ref - inpitch);
   1108             r2 = *p_ref;
   1109             r3 = *(p_ref += inpitch);  /* modify pointer before loading */
   1110             r4 = *(p_ref += inpitch);
   1111             /* first pixel */
   1112             r5 = *(p_ref += inpitch);
   1113             result = (r0 + r5);
   1114             r0 = (r1 + r4);
   1115             result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
   1116             r0 = (r2 + r3);
   1117             result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
   1118             *(p_cur += outpitch) = result;
   1119             /* second pixel */
   1120             r0 = *(p_ref += inpitch);
   1121             result = (r1 + r0);
   1122             r1 = (r2 + r5);
   1123             result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
   1124             r1 = (r3 + r4);
   1125             result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
   1126             *(p_cur += outpitch) = result;
   1127             /* third pixel */
   1128             r1 = *(p_ref += inpitch);
   1129             result = (r2 + r1);
   1130             r2 = (r3 + r0);
   1131             result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
   1132             r2 = (r4 + r5);
   1133             result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
   1134             *(p_cur += outpitch) = result;
   1135             /* fourth pixel */
   1136             r2 = *(p_ref += inpitch);
   1137             result = (r3 + r2);
   1138             r3 = (r4 + r1);
   1139             result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
   1140             r3 = (r5 + r0);
   1141             result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
   1142             *(p_cur += outpitch) = result;
   1143             p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
   1144         }
   1145         p_cur += curr_offset;
   1146     }
   1147 
   1148     return ;
   1149 }
   1150 
   1151 void eVertInterp3MC(int *in, int inpitch, uint8 *out, int outpitch,
   1152                     int blkwidth, int blkheight, int dy)
   1153 {
   1154     uint8 *p_cur;
   1155     int *p_ref;
   1156     uint32 tmp;
   1157     int result, result2, curr_offset, ref_offset;
   1158     int j, r0, r1, r2, r3, r4, r5;
   1159 
   1160     p_cur = out;
   1161     curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
   1162     ref_offset = blkheight * inpitch; /* for limit */
   1163 
   1164     if (dy&1)
   1165     {
   1166         dy = (dy >> 1) ? -(inpitch << 1) : -(inpitch << 1) - inpitch;
   1167 
   1168         for (j = 0; j < blkwidth; j++)
   1169         {
   1170             p_cur -= outpitch; /* compensate for the first offset */
   1171             p_ref = in++;
   1172 
   1173             tmp = (uint32)(p_ref + ref_offset); /* limit */
   1174             while ((uint32)p_ref < tmp)
   1175             {                           /* loop un-rolled */
   1176                 r0 = *(p_ref - (inpitch << 1));
   1177                 r1 = *(p_ref - inpitch);
   1178                 r2 = *p_ref;
   1179                 r3 = *(p_ref += inpitch);  /* modify pointer before loading */
   1180                 r4 = *(p_ref += inpitch);
   1181                 /* first pixel */
   1182                 r5 = *(p_ref += inpitch);
   1183                 result = (r0 + r5);
   1184                 r0 = (r1 + r4);
   1185                 result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
   1186                 r0 = (r2 + r3);
   1187                 result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
   1188                 result = (result + 512) >> 10;
   1189                 CLIP_RESULT(result)
   1190                 result2 = ((p_ref[dy] + 16) >> 5);
   1191                 CLIP_RESULT(result2)
   1192                 /* 3/4 pel,  no need to clip */
   1193                 result = (result + result2 + 1);
   1194                 result = (result >> 1);
   1195                 *(p_cur += outpitch) = result;
   1196                 /* second pixel */
   1197                 r0 = *(p_ref += inpitch);
   1198                 result = (r1 + r0);
   1199                 r1 = (r2 + r5);
   1200                 result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
   1201                 r1 = (r3 + r4);
   1202                 result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
   1203                 result = (result + 512) >> 10;
   1204                 CLIP_RESULT(result)
   1205                 result2 = ((p_ref[dy] + 16) >> 5);
   1206                 CLIP_RESULT(result2)
   1207                 /* 3/4 pel,  no need to clip */
   1208                 result = (result + result2 + 1);
   1209                 result = (result >> 1);
   1210                 *(p_cur += outpitch) = result;
   1211                 /* third pixel */
   1212                 r1 = *(p_ref += inpitch);
   1213                 result = (r2 + r1);
   1214                 r2 = (r3 + r0);
   1215                 result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
   1216                 r2 = (r4 + r5);
   1217                 result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
   1218                 result = (result + 512) >> 10;
   1219                 CLIP_RESULT(result)
   1220                 result2 = ((p_ref[dy] + 16) >> 5);
   1221                 CLIP_RESULT(result2)
   1222                 /* 3/4 pel,  no need to clip */
   1223                 result = (result + result2 + 1);
   1224                 result = (result >> 1);
   1225                 *(p_cur += outpitch) = result;
   1226                 /* fourth pixel */
   1227                 r2 = *(p_ref += inpitch);
   1228                 result = (r3 + r2);
   1229                 r3 = (r4 + r1);
   1230                 result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
   1231                 r3 = (r5 + r0);
   1232                 result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
   1233                 result = (result + 512) >> 10;
   1234                 CLIP_RESULT(result)
   1235                 result2 = ((p_ref[dy] + 16) >> 5);
   1236                 CLIP_RESULT(result2)
   1237                 /* 3/4 pel,  no need to clip */
   1238                 result = (result + result2 + 1);
   1239                 result = (result >> 1);
   1240                 *(p_cur += outpitch) = result;
   1241                 p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
   1242             }
   1243             p_cur += curr_offset;
   1244         }
   1245     }
   1246     else
   1247     {
   1248         for (j = 0; j < blkwidth; j++)
   1249         {
   1250             p_cur -= outpitch; /* compensate for the first offset */
   1251             p_ref = in++;
   1252 
   1253             tmp = (uint32)(p_ref + ref_offset); /* limit */
   1254             while ((uint32)p_ref < tmp)
   1255             {                           /* loop un-rolled */
   1256                 r0 = *(p_ref - (inpitch << 1));
   1257                 r1 = *(p_ref - inpitch);
   1258                 r2 = *p_ref;
   1259                 r3 = *(p_ref += inpitch);  /* modify pointer before loading */
   1260                 r4 = *(p_ref += inpitch);
   1261                 /* first pixel */
   1262                 r5 = *(p_ref += inpitch);
   1263                 result = (r0 + r5);
   1264                 r0 = (r1 + r4);
   1265                 result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
   1266                 r0 = (r2 + r3);
   1267                 result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
   1268                 result = (result + 512) >> 10;
   1269                 CLIP_RESULT(result)
   1270                 *(p_cur += outpitch) = result;
   1271                 /* second pixel */
   1272                 r0 = *(p_ref += inpitch);
   1273                 result = (r1 + r0);
   1274                 r1 = (r2 + r5);
   1275                 result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
   1276                 r1 = (r3 + r4);
   1277                 result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
   1278                 result = (result + 512) >> 10;
   1279                 CLIP_RESULT(result)
   1280                 *(p_cur += outpitch) = result;
   1281                 /* third pixel */
   1282                 r1 = *(p_ref += inpitch);
   1283                 result = (r2 + r1);
   1284                 r2 = (r3 + r0);
   1285                 result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
   1286                 r2 = (r4 + r5);
   1287                 result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
   1288                 result = (result + 512) >> 10;
   1289                 CLIP_RESULT(result)
   1290                 *(p_cur += outpitch) = result;
   1291                 /* fourth pixel */
   1292                 r2 = *(p_ref += inpitch);
   1293                 result = (r3 + r2);
   1294                 r3 = (r4 + r1);
   1295                 result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
   1296                 r3 = (r5 + r0);
   1297                 result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
   1298                 result = (result + 512) >> 10;
   1299                 CLIP_RESULT(result)
   1300                 *(p_cur += outpitch) = result;
   1301                 p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
   1302             }
   1303             p_cur += curr_offset;
   1304         }
   1305     }
   1306 
   1307     return ;
   1308 }
   1309 
   1310 void eDiagonalInterpMC(uint8 *in1, uint8 *in2, int inpitch,
   1311                        uint8 *out, int outpitch,
   1312                        int blkwidth, int blkheight)
   1313 {
   1314     int j, i;
   1315     int result;
   1316     uint8 *p_cur, *p_ref, *p_tmp8;
   1317     int curr_offset, ref_offset;
   1318     uint8 tmp_res[24][24], tmp_in[24][24];
   1319     uint32 *p_tmp;
   1320     uint32 tmp, pkres, tmp_result;
   1321     int32 r0, r1, r2, r3, r4, r5;
   1322     int32 r6, r7, r8, r9, r10, r13;
   1323 
   1324     ref_offset = inpitch - blkwidth;
   1325     p_ref = in1 - 2;
   1326     /* perform horizontal interpolation */
   1327     /* not word-aligned */
   1328     /* It is faster to read 1 byte at time to avoid calling CreateAlign */
   1329     /*  if(((uint32)p_ref)&0x3)
   1330         {
   1331             CreateAlign(p_ref,inpitch,0,&tmp_in[0][0],blkwidth+8,blkheight);
   1332             p_ref = &tmp_in[0][0];
   1333             ref_offset = 24-blkwidth;
   1334         }*/
   1335 
   1336     p_tmp = (uint32*) & (tmp_res[0][0]);
   1337     for (j = blkheight; j > 0; j--)
   1338     {
   1339         r13 = 0;
   1340         tmp = (uint32)(p_ref + blkwidth);
   1341 
   1342         //r0 = *((uint32*)p_ref);   /* d,c,b,a */
   1343         //r1 = (r0>>8)&0xFF00FF;    /* 0,d,0,b */
   1344         //r0 &= 0xFF00FF;           /* 0,c,0,a */
   1345         /* It is faster to read 1 byte at a time */
   1346         r0 = p_ref[0];
   1347         r1 = p_ref[2];
   1348         r0 |= (r1 << 16);           /* 0,c,0,a */
   1349         r1 = p_ref[1];
   1350         r2 = p_ref[3];
   1351         r1 |= (r2 << 16);           /* 0,d,0,b */
   1352 
   1353         while ((uint32)p_ref < tmp)
   1354         {
   1355             //r2 = *((uint32*)(p_ref+=4));/* h,g,f,e */
   1356             //r3 = (r2>>8)&0xFF00FF;  /* 0,h,0,f */
   1357             //r2 &= 0xFF00FF;           /* 0,g,0,e */
   1358             /* It is faster to read 1 byte at a time */
   1359             r2 = *(p_ref += 4);
   1360             r3 = p_ref[2];
   1361             r2 |= (r3 << 16);           /* 0,g,0,e */
   1362             r3 = p_ref[1];
   1363             r4 = p_ref[3];
   1364             r3 |= (r4 << 16);           /* 0,h,0,f */
   1365 
   1366             r4 = r0 + r3;       /* c+h, a+f */
   1367             r5 = r0 + r1;   /* c+d, a+b */
   1368             r6 = r2 + r3;   /* g+h, e+f */
   1369             r5 >>= 16;
   1370             r5 |= (r6 << 16);   /* e+f, c+d */
   1371             r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
   1372             r4 += 0x100010; /* +16, +16 */
   1373             r5 = r1 + r2;       /* d+g, b+e */
   1374             r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
   1375             r4 >>= 5;
   1376             r13 |= r4;      /* check clipping */
   1377             r4 &= 0xFF00FF; /* mask */
   1378 
   1379             r5 = p_ref[4];  /* i */
   1380             r6 = (r5 << 16);
   1381             r5 = r6 | (r2 >> 16);/* 0,i,0,g */
   1382             r5 += r1;       /* d+i, b+g */ /* r5 not free */
   1383             r1 >>= 16;
   1384             r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
   1385             r1 += r2;       /* f+g, d+e */
   1386             r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
   1387             r0 >>= 16;
   1388             r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
   1389             r0 += r3;       /* e+h, c+f */
   1390             r5 += 0x100010; /* 16,16 */
   1391             r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
   1392             r5 >>= 5;
   1393             r13 |= r5;      /* check clipping */
   1394             r5 &= 0xFF00FF; /* mask */
   1395 
   1396             r4 |= (r5 << 8);    /* pack them together */
   1397             *p_tmp++ = r4;
   1398             r1 = r3;
   1399             r0 = r2;
   1400         }
   1401         p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */
   1402         p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
   1403 
   1404         if (r13&0xFF000700) /* need clipping */
   1405         {
   1406             /* move back to the beginning of the line */
   1407             p_ref -= (ref_offset + blkwidth);   /* input */
   1408             p_tmp -= 6; /* intermediate output */
   1409             tmp = (uint32)(p_ref + blkwidth);
   1410             while ((uint32)p_ref < tmp)
   1411             {
   1412                 r0 = *p_ref++;
   1413                 r1 = *p_ref++;
   1414                 r2 = *p_ref++;
   1415                 r3 = *p_ref++;
   1416                 r4 = *p_ref++;
   1417                 /* first pixel */
   1418                 r5 = *p_ref++;
   1419                 result = (r0 + r5);
   1420                 r0 = (r1 + r4);
   1421                 result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
   1422                 r0 = (r2 + r3);
   1423                 result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
   1424                 result = (result + 16) >> 5;
   1425                 CLIP_RESULT(result)
   1426                 pkres = result;
   1427                 /* second pixel */
   1428                 r0 = *p_ref++;
   1429                 result = (r1 + r0);
   1430                 r1 = (r2 + r5);
   1431                 result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
   1432                 r1 = (r3 + r4);
   1433                 result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
   1434                 result = (result + 16) >> 5;
   1435                 CLIP_RESULT(result)
   1436                 pkres |= (result << 8);
   1437                 /* third pixel */
   1438                 r1 = *p_ref++;
   1439                 result = (r2 + r1);
   1440                 r2 = (r3 + r0);
   1441                 result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
   1442                 r2 = (r4 + r5);
   1443                 result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
   1444                 result = (result + 16) >> 5;
   1445                 CLIP_RESULT(result)
   1446                 pkres |= (result << 16);
   1447                 /* fourth pixel */
   1448                 r2 = *p_ref++;
   1449                 result = (r3 + r2);
   1450                 r3 = (r4 + r1);
   1451                 result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
   1452                 r3 = (r5 + r0);
   1453                 result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
   1454                 result = (result + 16) >> 5;
   1455                 CLIP_RESULT(result)
   1456                 pkres |= (result << 24);
   1457 
   1458                 *p_tmp++ = pkres; /* write 4 pixel */
   1459                 p_ref -= 5;
   1460             }
   1461             p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */
   1462             p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
   1463         }
   1464     }
   1465 
   1466     /*  perform vertical interpolation */
   1467     /* not word-aligned */
   1468     if (((uint32)in2)&0x3)
   1469     {
   1470         eCreateAlign(in2, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5);
   1471         in2 = &tmp_in[2][0];
   1472         inpitch = 24;
   1473     }
   1474 
   1475     p_cur = out;
   1476     curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically up and one pixel right */
   1477     pkres = blkheight * inpitch; /* reuse it for limit */
   1478 
   1479     curr_offset += 3;
   1480 
   1481     for (j = 0; j < blkwidth; j += 4, in2 += 4)
   1482     {
   1483         r13 = 0;
   1484         p_ref = in2;
   1485         p_tmp8 = &(tmp_res[0][j]); /* intermediate result */
   1486         p_tmp8 -= 24;  /* compensate for the first offset */
   1487         p_cur -= outpitch;  /* compensate for the first offset */
   1488         tmp = (uint32)(p_ref + pkres); /* limit */
   1489         while ((uint32)p_ref < tmp)  /* the loop un-rolled  */
   1490         {
   1491             /* Read 1 byte at a time is too slow, too many read and pack ops, need to call CreateAlign */
   1492             /*p_ref8 = p_ref-(inpitch<<1);          r0 = p_ref8[0];         r1 = p_ref8[2];
   1493             r0 |= (r1<<16);         r6 = p_ref8[1];         r1 = p_ref8[3];
   1494             r6 |= (r1<<16);         p_ref+=inpitch; */
   1495             r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
   1496             p_ref += inpitch;
   1497             r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
   1498             r0 &= 0xFF00FF;
   1499 
   1500             /*p_ref8 = p_ref+(inpitch<<1);
   1501             r1 = p_ref8[0];         r7 = p_ref8[2];         r1 |= (r7<<16);
   1502             r7 = p_ref8[1];         r2 = p_ref8[3];         r7 |= (r2<<16);*/
   1503             r1 = *((uint32*)(p_ref + (inpitch << 1)));  /* r1, r7, ref[3] */
   1504             r7 = (r1 >> 8) & 0xFF00FF;
   1505             r1 &= 0xFF00FF;
   1506 
   1507             r0 += r1;
   1508             r6 += r7;
   1509 
   1510             /*r2 = p_ref[0];            r8 = p_ref[2];          r2 |= (r8<<16);
   1511             r8 = p_ref[1];          r1 = p_ref[3];          r8 |= (r1<<16);*/
   1512             r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
   1513             r8 = (r2 >> 8) & 0xFF00FF;
   1514             r2 &= 0xFF00FF;
   1515 
   1516             /*p_ref8 = p_ref-inpitch;           r1 = p_ref8[0];         r7 = p_ref8[2];
   1517             r1 |= (r7<<16);         r1 += r2;           r7 = p_ref8[1];
   1518             r2 = p_ref8[3];         r7 |= (r2<<16);*/
   1519             r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
   1520             r7 = (r1 >> 8) & 0xFF00FF;
   1521             r1 &= 0xFF00FF;
   1522             r1 += r2;
   1523 
   1524             r7 += r8;
   1525 
   1526             r0 += 20 * r1;
   1527             r6 += 20 * r7;
   1528             r0 += 0x100010;
   1529             r6 += 0x100010;
   1530 
   1531             /*p_ref8 = p_ref-(inpitch<<1);          r2 = p_ref8[0];         r8 = p_ref8[2];
   1532             r2 |= (r8<<16);         r8 = p_ref8[1];         r1 = p_ref8[3];         r8 |= (r1<<16);*/
   1533             r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
   1534             r8 = (r2 >> 8) & 0xFF00FF;
   1535             r2 &= 0xFF00FF;
   1536 
   1537             /*p_ref8 = p_ref+inpitch;           r1 = p_ref8[0];         r7 = p_ref8[2];
   1538             r1 |= (r7<<16);         r1 += r2;           r7 = p_ref8[1];
   1539             r2 = p_ref8[3];         r7 |= (r2<<16);*/
   1540             r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
   1541             r7 = (r1 >> 8) & 0xFF00FF;
   1542             r1 &= 0xFF00FF;
   1543             r1 += r2;
   1544 
   1545             r7 += r8;
   1546 
   1547             r0 -= 5 * r1;
   1548             r6 -= 5 * r7;
   1549 
   1550             r0 >>= 5;
   1551             r6 >>= 5;
   1552             /* clip */
   1553             r13 |= r6;
   1554             r13 |= r0;
   1555             //CLIPPACK(r6,result)
   1556             /* add with horizontal results */
   1557             r10 = *((uint32*)(p_tmp8 += 24));
   1558             r9 = (r10 >> 8) & 0xFF00FF;
   1559             r10 &= 0xFF00FF;
   1560 
   1561             r0 += r10;
   1562             r0 += 0x10001;
   1563             r0 = (r0 >> 1) & 0xFF00FF;   /* mask to 8 bytes */
   1564 
   1565             r6 += r9;
   1566             r6 += 0x10001;
   1567             r6 = (r6 >> 1) & 0xFF00FF;   /* mask to 8 bytes */
   1568 
   1569             r0 |= (r6 << 8);  /* pack it back */
   1570             *((uint32*)(p_cur += outpitch)) = r0;
   1571         }
   1572         p_cur += curr_offset; /* offset to the next pixel */
   1573         if (r13 & 0xFF000700) /* this column need clipping */
   1574         {
   1575             p_cur -= 4;
   1576             for (i = 0; i < 4; i++)
   1577             {
   1578                 p_ref = in2 + i;
   1579                 p_tmp8 = &(tmp_res[0][j+i]); /* intermediate result */
   1580                 p_tmp8 -= 24;  /* compensate for the first offset */
   1581                 p_cur -= outpitch;  /* compensate for the first offset */
   1582                 tmp = (uint32)(p_ref + pkres); /* limit */
   1583                 while ((uint32)p_ref < tmp)  /* the loop un-rolled  */
   1584                 {
   1585                     r0 = *(p_ref - (inpitch << 1));
   1586                     r1 = *(p_ref - inpitch);
   1587                     r2 = *p_ref;
   1588                     r3 = *(p_ref += inpitch);  /* modify pointer before loading */
   1589                     r4 = *(p_ref += inpitch);
   1590                     /* first pixel */
   1591                     r5 = *(p_ref += inpitch);
   1592                     result = (r0 + r5);
   1593                     r0 = (r1 + r4);
   1594                     result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
   1595                     r0 = (r2 + r3);
   1596                     result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
   1597                     result = (result + 16) >> 5;
   1598                     CLIP_RESULT(result)
   1599                     tmp_result = *(p_tmp8 += 24);  /* modify pointer before loading */
   1600                     result = (result + tmp_result + 1);  /* no clip */
   1601                     result = (result >> 1);
   1602                     *(p_cur += outpitch) = result;
   1603                     /* second pixel */
   1604                     r0 = *(p_ref += inpitch);
   1605                     result = (r1 + r0);
   1606                     r1 = (r2 + r5);
   1607                     result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
   1608                     r1 = (r3 + r4);
   1609                     result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
   1610                     result = (result + 16) >> 5;
   1611                     CLIP_RESULT(result)
   1612                     tmp_result = *(p_tmp8 += 24);  /* intermediate result */
   1613                     result = (result + tmp_result + 1);  /* no clip */
   1614                     result = (result >> 1);
   1615                     *(p_cur += outpitch) = result;
   1616                     /* third pixel */
   1617                     r1 = *(p_ref += inpitch);
   1618                     result = (r2 + r1);
   1619                     r2 = (r3 + r0);
   1620                     result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
   1621                     r2 = (r4 + r5);
   1622                     result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
   1623                     result = (result + 16) >> 5;
   1624                     CLIP_RESULT(result)
   1625                     tmp_result = *(p_tmp8 += 24);  /* intermediate result */
   1626                     result = (result + tmp_result + 1);  /* no clip */
   1627                     result = (result >> 1);
   1628                     *(p_cur += outpitch) = result;
   1629                     /* fourth pixel */
   1630                     r2 = *(p_ref += inpitch);
   1631                     result = (r3 + r2);
   1632                     r3 = (r4 + r1);
   1633                     result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
   1634                     r3 = (r5 + r0);
   1635                     result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
   1636                     result = (result + 16) >> 5;
   1637                     CLIP_RESULT(result)
   1638                     tmp_result = *(p_tmp8 += 24);  /* intermediate result */
   1639                     result = (result + tmp_result + 1);  /* no clip */
   1640                     result = (result >> 1);
   1641                     *(p_cur += outpitch) = result;
   1642                     p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
   1643                 }
   1644                 p_cur += (curr_offset - 3);
   1645             }
   1646         }
   1647     }
   1648 
   1649     return ;
   1650 }
   1651 
   1652 /* position G */
   1653 void eFullPelMC(uint8 *in, int inpitch, uint8 *out, int outpitch,
   1654                 int blkwidth, int blkheight)
   1655 {
   1656     int i, j;
   1657     int offset_in = inpitch - blkwidth;
   1658     int offset_out = outpitch - blkwidth;
   1659     uint32 temp;
   1660     uint8 byte;
   1661 
   1662     if (((uint32)in)&3)
   1663     {
   1664         for (j = blkheight; j > 0; j--)
   1665         {
   1666             for (i = blkwidth; i > 0; i -= 4)
   1667             {
   1668                 temp = *in++;
   1669                 byte = *in++;
   1670                 temp |= (byte << 8);
   1671                 byte = *in++;
   1672                 temp |= (byte << 16);
   1673                 byte = *in++;
   1674                 temp |= (byte << 24);
   1675 
   1676                 *((uint32*)out) = temp; /* write 4 bytes */
   1677                 out += 4;
   1678             }
   1679             out += offset_out;
   1680             in += offset_in;
   1681         }
   1682     }
   1683     else
   1684     {
   1685         for (j = blkheight; j > 0; j--)
   1686         {
   1687             for (i = blkwidth; i > 0; i -= 4)
   1688             {
   1689                 temp = *((uint32*)in);
   1690                 *((uint32*)out) = temp;
   1691                 in += 4;
   1692                 out += 4;
   1693             }
   1694             out += offset_out;
   1695             in += offset_in;
   1696         }
   1697     }
   1698     return ;
   1699 }
   1700 
   1701 void ePadChroma(uint8 *ref, int picwidth, int picheight, int picpitch, int x_pos, int y_pos)
   1702 {
   1703     int pad_height;
   1704     int pad_width;
   1705     uint8 *start;
   1706     uint32 word1, word2, word3;
   1707     int offset, j;
   1708 
   1709 
   1710     pad_height = 8 + ((y_pos & 7) ? 1 : 0);
   1711     pad_width = 8 + ((x_pos & 7) ? 1 : 0);
   1712 
   1713     y_pos >>= 3;
   1714     x_pos >>= 3;
   1715     // pad vertical first
   1716     if (y_pos < 0) // need to pad up
   1717     {
   1718         if (x_pos < -8) start = ref - 8;
   1719         else if (x_pos + pad_width > picwidth + 7) start = ref + picwidth + 7 - pad_width;
   1720         else start = ref + x_pos;
   1721 
   1722         /* word-align start */
   1723         offset = (uint32)start & 0x3;
   1724         if (offset) start -= offset;
   1725 
   1726         word1 = *((uint32*)start);
   1727         word2 = *((uint32*)(start + 4));
   1728         word3 = *((uint32*)(start + 8));
   1729 
   1730         /* pad up N rows */
   1731         j = -y_pos;
   1732         if (j > 8) j = 8;
   1733         while (j--)
   1734         {
   1735             *((uint32*)(start -= picpitch)) = word1;
   1736             *((uint32*)(start + 4)) = word2;
   1737             *((uint32*)(start + 8)) = word3;
   1738         }
   1739 
   1740     }
   1741     else if (y_pos + pad_height >= picheight) /* pad down */
   1742     {
   1743         if (x_pos < -8) start = ref + picpitch * (picheight - 1) - 8;
   1744         else if (x_pos + pad_width > picwidth + 7) start = ref + picpitch * (picheight - 1) +
   1745                     picwidth + 7 - pad_width;
   1746         else    start = ref + picpitch * (picheight - 1) + x_pos;
   1747 
   1748         /* word-align start */
   1749         offset = (uint32)start & 0x3;
   1750         if (offset) start -= offset;
   1751 
   1752         word1 = *((uint32*)start);
   1753         word2 = *((uint32*)(start + 4));
   1754         word3 = *((uint32*)(start + 8));
   1755 
   1756         /* pad down N rows */
   1757         j = y_pos + pad_height - picheight;
   1758         if (j > 8) j = 8;
   1759         while (j--)
   1760         {
   1761             *((uint32*)(start += picpitch)) = word1;
   1762             *((uint32*)(start + 4)) = word2;
   1763             *((uint32*)(start + 8)) = word3;
   1764         }
   1765     }
   1766 
   1767     /* now pad horizontal */
   1768     if (x_pos < 0) // pad left
   1769     {
   1770         if (y_pos < -8) start = ref - (picpitch << 3);
   1771         else if (y_pos + pad_height > picheight + 7) start = ref + (picheight + 7 - pad_height) * picpitch;
   1772         else start = ref + y_pos * picpitch;
   1773 
   1774         // now pad left 8 pixels for pad_height rows */
   1775         j = pad_height;
   1776         start -= picpitch;
   1777         while (j--)
   1778         {
   1779             word1 = *(start += picpitch);
   1780             word1 |= (word1 << 8);
   1781             word1 |= (word1 << 16);
   1782             *((uint32*)(start - 8)) = word1;
   1783             *((uint32*)(start - 4)) = word1;
   1784         }
   1785     }
   1786     else if (x_pos + pad_width >= picwidth) /* pad right */
   1787     {
   1788         if (y_pos < -8) start = ref - (picpitch << 3) + picwidth - 1;
   1789         else if (y_pos + pad_height > picheight + 7) start = ref + (picheight + 7 - pad_height) * picpitch + picwidth - 1;
   1790         else start = ref + y_pos * picpitch + picwidth - 1;
   1791 
   1792         // now pad right 8 pixels for pad_height rows */
   1793         j = pad_height;
   1794         start -= picpitch;
   1795         while (j--)
   1796         {
   1797             word1 = *(start += picpitch);
   1798             word1 |= (word1 << 8);
   1799             word1 |= (word1 << 16);
   1800             *((uint32*)(start + 1)) = word1;
   1801             *((uint32*)(start + 5)) = word1;
   1802         }
   1803     }
   1804 
   1805     return ;
   1806 }
   1807 
   1808 
   1809 void eChromaMotionComp(uint8 *ref, int picwidth, int picheight,
   1810                        int x_pos, int y_pos,
   1811                        uint8 *pred, int picpitch,
   1812                        int blkwidth, int blkheight)
   1813 {
   1814     int dx, dy;
   1815     int offset_dx, offset_dy;
   1816     int index;
   1817 
   1818     ePadChroma(ref, picwidth, picheight, picpitch, x_pos, y_pos);
   1819 
   1820     dx = x_pos & 7;
   1821     dy = y_pos & 7;
   1822     offset_dx = (dx + 7) >> 3;
   1823     offset_dy = (dy + 7) >> 3;
   1824     x_pos = x_pos >> 3;  /* round it to full-pel resolution */
   1825     y_pos = y_pos >> 3;
   1826 
   1827     ref += y_pos * picpitch + x_pos;
   1828 
   1829     index = offset_dx + (offset_dy << 1) + ((blkwidth << 1) & 0x7);
   1830 
   1831     (*(eChromaMC_SIMD[index]))(ref, picpitch , dx, dy, pred, picpitch, blkwidth, blkheight);
   1832     return ;
   1833 }
   1834 
   1835 
   1836 /* SIMD routines, unroll the loops in vertical direction, decreasing loops (things to be done) */
   1837 void eChromaDiagonalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
   1838                             uint8 *pOut, int predPitch, int blkwidth, int blkheight)
   1839 {
   1840     int32 r0, r1, r2, r3, result0, result1;
   1841     uint8 temp[288];
   1842     uint8 *ref, *out;
   1843     int i, j;
   1844     int dx_8 = 8 - dx;
   1845     int dy_8 = 8 - dy;
   1846 
   1847     /* horizontal first */
   1848     out = temp;
   1849     for (i = 0; i < blkheight + 1; i++)
   1850     {
   1851         ref = pRef;
   1852         r0 = ref[0];
   1853         for (j = 0; j < blkwidth; j += 4)
   1854         {
   1855             r0 |= (ref[2] << 16);
   1856             result0 = dx_8 * r0;
   1857 
   1858             r1 = ref[1] | (ref[3] << 16);
   1859             result0 += dx * r1;
   1860             *(int32 *)out = result0;
   1861 
   1862             result0 = dx_8 * r1;
   1863 
   1864             r2 = ref[4];
   1865             r0 = r0 >> 16;
   1866             r1 = r0 | (r2 << 16);
   1867             result0 += dx * r1;
   1868             *(int32 *)(out + 16) = result0;
   1869 
   1870             ref += 4;
   1871             out += 4;
   1872             r0 = r2;
   1873         }
   1874         pRef += srcPitch;
   1875         out += (32 - blkwidth);
   1876     }
   1877 
   1878 //  pRef -= srcPitch*(blkheight+1);
   1879     ref = temp;
   1880 
   1881     for (j = 0; j < blkwidth; j += 4)
   1882     {
   1883         r0 = *(int32 *)ref;
   1884         r1 = *(int32 *)(ref + 16);
   1885         ref += 32;
   1886         out = pOut;
   1887         for (i = 0; i < (blkheight >> 1); i++)
   1888         {
   1889             result0 = dy_8 * r0 + 0x00200020;
   1890             r2 = *(int32 *)ref;
   1891             result0 += dy * r2;
   1892             result0 >>= 6;
   1893             result0 &= 0x00FF00FF;
   1894             r0 = r2;
   1895 
   1896             result1 = dy_8 * r1 + 0x00200020;
   1897             r3 = *(int32 *)(ref + 16);
   1898             result1 += dy * r3;
   1899             result1 >>= 6;
   1900             result1 &= 0x00FF00FF;
   1901             r1 = r3;
   1902             *(int32 *)out = result0 | (result1 << 8);
   1903             out += predPitch;
   1904             ref += 32;
   1905 
   1906             result0 = dy_8 * r0 + 0x00200020;
   1907             r2 = *(int32 *)ref;
   1908             result0 += dy * r2;
   1909             result0 >>= 6;
   1910             result0 &= 0x00FF00FF;
   1911             r0 = r2;
   1912 
   1913             result1 = dy_8 * r1 + 0x00200020;
   1914             r3 = *(int32 *)(ref + 16);
   1915             result1 += dy * r3;
   1916             result1 >>= 6;
   1917             result1 &= 0x00FF00FF;
   1918             r1 = r3;
   1919             *(int32 *)out = result0 | (result1 << 8);
   1920             out += predPitch;
   1921             ref += 32;
   1922         }
   1923         pOut += 4;
   1924         ref = temp + 4; /* since it can only iterate twice max */
   1925     }
   1926     return;
   1927 }
   1928 
   1929 void eChromaHorizontalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
   1930                               uint8 *pOut, int predPitch, int blkwidth, int blkheight)
   1931 {
   1932     (void)(dy);
   1933 
   1934     int32 r0, r1, r2, result0, result1;
   1935     uint8 *ref, *out;
   1936     int i, j;
   1937     int dx_8 = 8 - dx;
   1938 
   1939     /* horizontal first */
   1940     for (i = 0; i < blkheight; i++)
   1941     {
   1942         ref = pRef;
   1943         out = pOut;
   1944 
   1945         r0 = ref[0];
   1946         for (j = 0; j < blkwidth; j += 4)
   1947         {
   1948             r0 |= (ref[2] << 16);
   1949             result0 = dx_8 * r0 + 0x00040004;
   1950 
   1951             r1 = ref[1] | (ref[3] << 16);
   1952             result0 += dx * r1;
   1953             result0 >>= 3;
   1954             result0 &= 0x00FF00FF;
   1955 
   1956             result1 = dx_8 * r1 + 0x00040004;
   1957 
   1958             r2 = ref[4];
   1959             r0 = r0 >> 16;
   1960             r1 = r0 | (r2 << 16);
   1961             result1 += dx * r1;
   1962             result1 >>= 3;
   1963             result1 &= 0x00FF00FF;
   1964 
   1965             *(int32 *)out = result0 | (result1 << 8);
   1966 
   1967             ref += 4;
   1968             out += 4;
   1969             r0 = r2;
   1970         }
   1971 
   1972         pRef += srcPitch;
   1973         pOut += predPitch;
   1974     }
   1975     return;
   1976 }
   1977 
   1978 void eChromaVerticalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
   1979                             uint8 *pOut, int predPitch, int blkwidth, int blkheight)
   1980 {
   1981     (void)(dx);
   1982 
   1983     int32 r0, r1, r2, r3, result0, result1;
   1984     int i, j;
   1985     uint8 *ref, *out;
   1986     int dy_8 = 8 - dy;
   1987     /* vertical first */
   1988     for (i = 0; i < blkwidth; i += 4)
   1989     {
   1990         ref = pRef;
   1991         out = pOut;
   1992 
   1993         r0 = ref[0] | (ref[2] << 16);
   1994         r1 = ref[1] | (ref[3] << 16);
   1995         ref += srcPitch;
   1996         for (j = 0; j < blkheight; j++)
   1997         {
   1998             result0 = dy_8 * r0 + 0x00040004;
   1999             r2 = ref[0] | (ref[2] << 16);
   2000             result0 += dy * r2;
   2001             result0 >>= 3;
   2002             result0 &= 0x00FF00FF;
   2003             r0 = r2;
   2004 
   2005             result1 = dy_8 * r1 + 0x00040004;
   2006             r3 = ref[1] | (ref[3] << 16);
   2007             result1 += dy * r3;
   2008             result1 >>= 3;
   2009             result1 &= 0x00FF00FF;
   2010             r1 = r3;
   2011             *(int32 *)out = result0 | (result1 << 8);
   2012             ref += srcPitch;
   2013             out += predPitch;
   2014         }
   2015         pOut += 4;
   2016         pRef += 4;
   2017     }
   2018     return;
   2019 }
   2020 
   2021 void eChromaDiagonalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
   2022                              uint8 *pOut,  int predPitch, int blkwidth, int blkheight)
   2023 {
   2024     (void)(blkwidth);
   2025 
   2026     int32 r0, r1, temp0, temp1, result;
   2027     int32 temp[9];
   2028     int32 *out;
   2029     int i, r_temp;
   2030     int dy_8 = 8 - dy;
   2031 
   2032     /* horizontal first */
   2033     out = temp;
   2034     for (i = 0; i < blkheight + 1; i++)
   2035     {
   2036         r_temp = pRef[1];
   2037         temp0 = (pRef[0] << 3) + dx * (r_temp - pRef[0]);
   2038         temp1 = (r_temp << 3) + dx * (pRef[2] - r_temp);
   2039         r0 = temp0 | (temp1 << 16);
   2040         *out++ = r0;
   2041         pRef += srcPitch;
   2042     }
   2043 
   2044     pRef -= srcPitch * (blkheight + 1);
   2045 
   2046     out = temp;
   2047 
   2048     r0 = *out++;
   2049 
   2050     for (i = 0; i < blkheight; i++)
   2051     {
   2052         result = dy_8 * r0 + 0x00200020;
   2053         r1 = *out++;
   2054         result += dy * r1;
   2055         result >>= 6;
   2056         result &= 0x00FF00FF;
   2057         *(int16 *)pOut = (result >> 8) | (result & 0xFF);
   2058         r0 = r1;
   2059         pOut += predPitch;
   2060     }
   2061     return;
   2062 }
   2063 
   2064 void eChromaHorizontalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
   2065                                uint8 *pOut, int predPitch, int blkwidth, int blkheight)
   2066 {
   2067     (void)(dy);
   2068     (void)(blkwidth);
   2069 
   2070     int i, temp, temp0, temp1;
   2071 
   2072     /* horizontal first */
   2073     for (i = 0; i < blkheight; i++)
   2074     {
   2075         temp = pRef[1];
   2076         temp0 = ((pRef[0] << 3) + dx * (temp - pRef[0]) + 4) >> 3;
   2077         temp1 = ((temp << 3) + dx * (pRef[2] - temp) + 4) >> 3;
   2078 
   2079         *(int16 *)pOut = temp0 | (temp1 << 8);
   2080         pRef += srcPitch;
   2081         pOut += predPitch;
   2082 
   2083     }
   2084     return;
   2085 }
   2086 void eChromaVerticalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
   2087                              uint8 *pOut, int predPitch, int blkwidth, int blkheight)
   2088 {
   2089     (void)(dx);
   2090     (void)(blkwidth);
   2091 
   2092     int32 r0, r1, result;
   2093     int i;
   2094     int dy_8 = 8 - dy;
   2095     r0 = pRef[0] | (pRef[1] << 16);
   2096     pRef += srcPitch;
   2097     for (i = 0; i < blkheight; i++)
   2098     {
   2099         result = dy_8 * r0 + 0x00040004;
   2100         r1 = pRef[0] | (pRef[1] << 16);
   2101         result += dy * r1;
   2102         result >>= 3;
   2103         result &= 0x00FF00FF;
   2104         *(int16 *)pOut = (result >> 8) | (result & 0xFF);
   2105         r0 = r1;
   2106         pRef += srcPitch;
   2107         pOut += predPitch;
   2108     }
   2109     return;
   2110 }
   2111 
   2112 void eChromaFullMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
   2113                         uint8 *pOut, int predPitch, int blkwidth, int blkheight)
   2114 {
   2115     (void)(dx);
   2116     (void)(dy);
   2117 
   2118     int i, j;
   2119     int offset_in = srcPitch - blkwidth;
   2120     int offset_out = predPitch - blkwidth;
   2121     uint16 temp;
   2122     uint8 byte;
   2123 
   2124     if (((uint32)pRef)&1)
   2125     {
   2126         for (j = blkheight; j > 0; j--)
   2127         {
   2128             for (i = blkwidth; i > 0; i -= 2)
   2129             {
   2130                 temp = *pRef++;
   2131                 byte = *pRef++;
   2132                 temp |= (byte << 8);
   2133                 *((uint16*)pOut) = temp; /* write 2 bytes */
   2134                 pOut += 2;
   2135             }
   2136             pOut += offset_out;
   2137             pRef += offset_in;
   2138         }
   2139     }
   2140     else
   2141     {
   2142         for (j = blkheight; j > 0; j--)
   2143         {
   2144             for (i = blkwidth; i > 0; i -= 2)
   2145             {
   2146                 temp = *((uint16*)pRef);
   2147                 *((uint16*)pOut) = temp;
   2148                 pRef += 2;
   2149                 pOut += 2;
   2150             }
   2151             pOut += offset_out;
   2152             pRef += offset_in;
   2153         }
   2154     }
   2155     return ;
   2156 }
   2157