Home | History | Annotate | Download | only in src
      1 /* ------------------------------------------------------------------
      2  * Copyright (C) 1998-2009 PacketVideo
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
     13  * express or implied.
     14  * See the License for the specific language governing permissions
     15  * and limitations under the License.
     16  * -------------------------------------------------------------------
     17  */
     18 #include "avcenc_lib.h"
     19 #include "avcenc_int.h"
     20 
     21 
     22 #define CLIP_RESULT(x)      if((uint)x > 0xFF){ \
     23                  x = 0xFF & (~(x>>31));}
     24 
     25 /* (blkwidth << 2) + (dy << 1) + dx */
     26 static void (*const eChromaMC_SIMD[8])(uint8 *, int , int , int , uint8 *, int, int , int) =
     27 {
     28     &eChromaFullMC_SIMD,
     29     &eChromaHorizontalMC_SIMD,
     30     &eChromaVerticalMC_SIMD,
     31     &eChromaDiagonalMC_SIMD,
     32     &eChromaFullMC_SIMD,
     33     &eChromaHorizontalMC2_SIMD,
     34     &eChromaVerticalMC2_SIMD,
     35     &eChromaDiagonalMC2_SIMD
     36 };
     37 /* Perform motion prediction and compensation with residue if exist. */
     38 void AVCMBMotionComp(AVCEncObject *encvid, AVCCommonObj *video)
     39 {
     40     (void)(encvid);
     41 
     42     AVCMacroblock *currMB = video->currMB;
     43     AVCPictureData *currPic = video->currPic;
     44     int mbPartIdx, subMbPartIdx;
     45     int ref_idx;
     46     int offset_MbPart_indx = 0;
     47     int16 *mv;
     48     uint32 x_pos, y_pos;
     49     uint8 *curL, *curCb, *curCr;
     50     uint8 *ref_l, *ref_Cb, *ref_Cr;
     51     uint8 *predBlock, *predCb, *predCr;
     52     int block_x, block_y, offset_x, offset_y, offsetP, offset;
     53     int x_position = (video->mb_x << 4);
     54     int y_position = (video->mb_y << 4);
     55     int MbHeight, MbWidth, mbPartIdx_X, mbPartIdx_Y, offset_indx;
     56     int picWidth = currPic->width;
     57     int picPitch = currPic->pitch;
     58     int picHeight = currPic->height;
     59     uint32 tmp_word;
     60 
     61     tmp_word = y_position * picPitch;
     62     curL = currPic->Sl + tmp_word + x_position;
     63     offset = (tmp_word >> 2) + (x_position >> 1);
     64     curCb = currPic->Scb + offset;
     65     curCr = currPic->Scr + offset;
     66 
     67     predBlock = curL;
     68     predCb = curCb;
     69     predCr = curCr;
     70 
     71     GetMotionVectorPredictor(video, 1);
     72 
     73     for (mbPartIdx = 0; mbPartIdx < currMB->NumMbPart; mbPartIdx++)
     74     {
     75         MbHeight = currMB->SubMbPartHeight[mbPartIdx];
     76         MbWidth = currMB->SubMbPartWidth[mbPartIdx];
     77         mbPartIdx_X = ((mbPartIdx + offset_MbPart_indx) & 1);
     78         mbPartIdx_Y = (mbPartIdx + offset_MbPart_indx) >> 1;
     79         ref_idx = currMB->ref_idx_L0[(mbPartIdx_Y << 1) + mbPartIdx_X];
     80         offset_indx = 0;
     81 
     82         ref_l = video->RefPicList0[ref_idx]->Sl;
     83         ref_Cb = video->RefPicList0[ref_idx]->Scb;
     84         ref_Cr = video->RefPicList0[ref_idx]->Scr;
     85 
     86         for (subMbPartIdx = 0; subMbPartIdx < currMB->NumSubMbPart[mbPartIdx]; subMbPartIdx++)
     87         {
     88             block_x = (mbPartIdx_X << 1) + ((subMbPartIdx + offset_indx) & 1);
     89             block_y = (mbPartIdx_Y << 1) + (((subMbPartIdx + offset_indx) >> 1) & 1);
     90             mv = (int16*)(currMB->mvL0 + block_x + (block_y << 2));
     91             offset_x = x_position + (block_x << 2);
     92             offset_y = y_position + (block_y << 2);
     93             x_pos = (offset_x << 2) + *mv++;   /*quarter pel */
     94             y_pos = (offset_y << 2) + *mv;   /*quarter pel */
     95 
     96             //offset = offset_y * currPic->width;
     97             //offsetC = (offset >> 2) + (offset_x >> 1);
     98             offsetP = (block_y << 2) * picPitch + (block_x << 2);
     99             eLumaMotionComp(ref_l, picPitch, picHeight, x_pos, y_pos,
    100                             /*comp_Sl + offset + offset_x,*/
    101                             predBlock + offsetP, picPitch, MbWidth, MbHeight);
    102 
    103             offsetP = (block_y * picWidth) + (block_x << 1);
    104             eChromaMotionComp(ref_Cb, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
    105                               /*comp_Scb +  offsetC,*/
    106                               predCb + offsetP, picPitch >> 1, MbWidth >> 1, MbHeight >> 1);
    107             eChromaMotionComp(ref_Cr, picWidth >> 1, picHeight >> 1, x_pos, y_pos,
    108                               /*comp_Scr +  offsetC,*/
    109                               predCr + offsetP, picPitch >> 1, MbWidth >> 1, MbHeight >> 1);
    110 
    111             offset_indx = currMB->SubMbPartWidth[mbPartIdx] >> 3;
    112         }
    113         offset_MbPart_indx = currMB->MbPartWidth >> 4;
    114     }
    115 
    116     return ;
    117 }
    118 
    119 
    120 /* preform the actual  motion comp here */
    121 void eLumaMotionComp(uint8 *ref, int picpitch, int picheight,
    122                      int x_pos, int y_pos,
    123                      uint8 *pred, int pred_pitch,
    124                      int blkwidth, int blkheight)
    125 {
    126     (void)(picheight);
    127 
    128     int dx, dy;
    129     int temp2[21][21]; /* for intermediate results */
    130     uint8 *ref2;
    131 
    132     dx = x_pos & 3;
    133     dy = y_pos & 3;
    134     x_pos = x_pos >> 2;  /* round it to full-pel resolution */
    135     y_pos = y_pos >> 2;
    136 
    137     /* perform actual motion compensation */
    138     if (dx == 0 && dy == 0)
    139     {  /* fullpel position *//* G */
    140 
    141         ref += y_pos * picpitch + x_pos;
    142 
    143         eFullPelMC(ref, picpitch, pred, pred_pitch, blkwidth, blkheight);
    144 
    145     }   /* other positions */
    146     else  if (dy == 0)
    147     { /* no vertical interpolation *//* a,b,c*/
    148 
    149         ref += y_pos * picpitch + x_pos;
    150 
    151         eHorzInterp1MC(ref, picpitch, pred, pred_pitch, blkwidth, blkheight, dx);
    152     }
    153     else if (dx == 0)
    154     { /*no horizontal interpolation *//* d,h,n */
    155 
    156         ref += y_pos * picpitch + x_pos;
    157 
    158         eVertInterp1MC(ref, picpitch, pred, pred_pitch, blkwidth, blkheight, dy);
    159     }
    160     else if (dy == 2)
    161     {  /* horizontal cross *//* i, j, k */
    162 
    163         ref += y_pos * picpitch + x_pos - 2; /* move to the left 2 pixels */
    164 
    165         eVertInterp2MC(ref, picpitch, &temp2[0][0], 21, blkwidth + 5, blkheight);
    166 
    167         eHorzInterp2MC(&temp2[0][2], 21, pred, pred_pitch, blkwidth, blkheight, dx);
    168     }
    169     else if (dx == 2)
    170     { /* vertical cross */ /* f,q */
    171 
    172         ref += (y_pos - 2) * picpitch + x_pos; /* move to up 2 lines */
    173 
    174         eHorzInterp3MC(ref, picpitch, &temp2[0][0], 21, blkwidth, blkheight + 5);
    175         eVertInterp3MC(&temp2[2][0], 21, pred, pred_pitch, blkwidth, blkheight, dy);
    176     }
    177     else
    178     { /* diagonal *//* e,g,p,r */
    179 
    180         ref2 = ref + (y_pos + (dy / 2)) * picpitch + x_pos;
    181 
    182         ref += (y_pos * picpitch) + x_pos + (dx / 2);
    183 
    184         eDiagonalInterpMC(ref2, ref, picpitch, pred, pred_pitch, blkwidth, blkheight);
    185     }
    186 
    187     return ;
    188 }
    189 
    190 void eCreateAlign(uint8 *ref, int picpitch, int y_pos,
    191                   uint8 *out, int blkwidth, int blkheight)
    192 {
    193     int i, j;
    194     int offset, out_offset;
    195     uint32 prev_pix, result, pix1, pix2, pix4;
    196 
    197     ref += y_pos * picpitch;// + x_pos;
    198     out_offset = 24 - blkwidth;
    199 
    200     //switch(x_pos&0x3){
    201     switch (((intptr_t)ref)&0x3)
    202     {
    203         case 1:
    204             offset =  picpitch - blkwidth - 3;
    205             for (j = 0; j < blkheight; j++)
    206             {
    207                 pix1 = *ref++;
    208                 pix2 = *((uint16*)ref);
    209                 ref += 2;
    210                 result = (pix2 << 8) | pix1;
    211 
    212                 for (i = 3; i < blkwidth; i += 4)
    213                 {
    214                     pix4 = *((uint32*)ref);
    215                     ref += 4;
    216                     prev_pix = (pix4 << 24) & 0xFF000000; /* mask out byte belong to previous word */
    217                     result |= prev_pix;
    218                     *((uint32*)out) = result;  /* write 4 bytes */
    219                     out += 4;
    220                     result = pix4 >> 8; /* for the next loop */
    221                 }
    222                 ref += offset;
    223                 out += out_offset;
    224             }
    225             break;
    226         case 2:
    227             offset =  picpitch - blkwidth - 2;
    228             for (j = 0; j < blkheight; j++)
    229             {
    230                 result = *((uint16*)ref);
    231                 ref += 2;
    232                 for (i = 2; i < blkwidth; i += 4)
    233                 {
    234                     pix4 = *((uint32*)ref);
    235                     ref += 4;
    236                     prev_pix = (pix4 << 16) & 0xFFFF0000; /* mask out byte belong to previous word */
    237                     result |= prev_pix;
    238                     *((uint32*)out) = result;  /* write 4 bytes */
    239                     out += 4;
    240                     result = pix4 >> 16; /* for the next loop */
    241                 }
    242                 ref += offset;
    243                 out += out_offset;
    244             }
    245             break;
    246         case 3:
    247             offset =  picpitch - blkwidth - 1;
    248             for (j = 0; j < blkheight; j++)
    249             {
    250                 result = *ref++;
    251                 for (i = 1; i < blkwidth; i += 4)
    252                 {
    253                     pix4 = *((uint32*)ref);
    254                     ref += 4;
    255                     prev_pix = (pix4 << 8) & 0xFFFFFF00; /* mask out byte belong to previous word */
    256                     result |= prev_pix;
    257                     *((uint32*)out) = result;  /* write 4 bytes */
    258                     out += 4;
    259                     result = pix4 >> 24; /* for the next loop */
    260                 }
    261                 ref += offset;
    262                 out += out_offset;
    263             }
    264             break;
    265     }
    266 }
    267 
    268 void eHorzInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
    269                     int blkwidth, int blkheight, int dx)
    270 {
    271     uint8 *p_ref, *tmp;
    272     uint32 *p_cur;
    273     uint32 pkres;
    274     int result, curr_offset, ref_offset;
    275     int j;
    276     int32 r0, r1, r2, r3, r4, r5;
    277     int32 r13, r6;
    278 
    279     p_cur = (uint32*)out; /* assume it's word aligned */
    280     curr_offset = (outpitch - blkwidth) >> 2;
    281     p_ref = in;
    282     ref_offset = inpitch - blkwidth;
    283 
    284     if (dx&1)
    285     {
    286         dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */
    287         p_ref -= 2;
    288         r13 = 0;
    289         for (j = blkheight; j > 0; j--)
    290         {
    291             tmp = p_ref + blkwidth;
    292             r0 = p_ref[0];
    293             r1 = p_ref[2];
    294             r0 |= (r1 << 16);           /* 0,c,0,a */
    295             r1 = p_ref[1];
    296             r2 = p_ref[3];
    297             r1 |= (r2 << 16);           /* 0,d,0,b */
    298             while (p_ref < tmp)
    299             {
    300                 r2 = *(p_ref += 4); /* move pointer to e */
    301                 r3 = p_ref[2];
    302                 r2 |= (r3 << 16);           /* 0,g,0,e */
    303                 r3 = p_ref[1];
    304                 r4 = p_ref[3];
    305                 r3 |= (r4 << 16);           /* 0,h,0,f */
    306 
    307                 r4 = r0 + r3;       /* c+h, a+f */
    308                 r5 = r0 + r1;   /* c+d, a+b */
    309                 r6 = r2 + r3;   /* g+h, e+f */
    310                 r5 >>= 16;
    311                 r5 |= (r6 << 16);   /* e+f, c+d */
    312                 r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
    313                 r4 += 0x100010; /* +16, +16 */
    314                 r5 = r1 + r2;       /* d+g, b+e */
    315                 r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
    316                 r4 >>= 5;
    317                 r13 |= r4;      /* check clipping */
    318 
    319                 r5 = p_ref[dx+2];
    320                 r6 = p_ref[dx+4];
    321                 r5 |= (r6 << 16);
    322                 r4 += r5;
    323                 r4 += 0x10001;
    324                 r4 = (r4 >> 1) & 0xFF00FF;
    325 
    326                 r5 = p_ref[4];  /* i */
    327                 r6 = (r5 << 16);
    328                 r5 = r6 | (r2 >> 16);/* 0,i,0,g */
    329                 r5 += r1;       /* d+i, b+g */ /* r5 not free */
    330                 r1 >>= 16;
    331                 r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
    332                 r1 += r2;       /* f+g, d+e */
    333                 r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
    334                 r0 >>= 16;
    335                 r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
    336                 r0 += r3;       /* e+h, c+f */
    337                 r5 += 0x100010; /* 16,16 */
    338                 r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
    339                 r5 >>= 5;
    340                 r13 |= r5;      /* check clipping */
    341 
    342                 r0 = p_ref[dx+3];
    343                 r1 = p_ref[dx+5];
    344                 r0 |= (r1 << 16);
    345                 r5 += r0;
    346                 r5 += 0x10001;
    347                 r5 = (r5 >> 1) & 0xFF00FF;
    348 
    349                 r4 |= (r5 << 8);    /* pack them together */
    350                 *p_cur++ = r4;
    351                 r1 = r3;
    352                 r0 = r2;
    353             }
    354             p_cur += curr_offset; /* move to the next line */
    355             p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
    356 
    357             if (r13&0xFF000700) /* need clipping */
    358             {
    359                 /* move back to the beginning of the line */
    360                 p_ref -= (ref_offset + blkwidth);   /* input */
    361                 p_cur -= (outpitch >> 2);
    362 
    363                 tmp = p_ref + blkwidth;
    364                 for (; p_ref < tmp;)
    365                 {
    366 
    367                     r0 = *p_ref++;
    368                     r1 = *p_ref++;
    369                     r2 = *p_ref++;
    370                     r3 = *p_ref++;
    371                     r4 = *p_ref++;
    372                     /* first pixel */
    373                     r5 = *p_ref++;
    374                     result = (r0 + r5);
    375                     r0 = (r1 + r4);
    376                     result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
    377                     r0 = (r2 + r3);
    378                     result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
    379                     result = (result + 16) >> 5;
    380                     CLIP_RESULT(result)
    381                     /* 3/4 pel,  no need to clip */
    382                     result = (result + p_ref[dx] + 1);
    383                     pkres = (result >> 1) ;
    384                     /* second pixel */
    385                     r0 = *p_ref++;
    386                     result = (r1 + r0);
    387                     r1 = (r2 + r5);
    388                     result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
    389                     r1 = (r3 + r4);
    390                     result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
    391                     result = (result + 16) >> 5;
    392                     CLIP_RESULT(result)
    393                     /* 3/4 pel,  no need to clip */
    394                     result = (result + p_ref[dx] + 1);
    395                     result = (result >> 1);
    396                     pkres  |= (result << 8);
    397                     /* third pixel */
    398                     r1 = *p_ref++;
    399                     result = (r2 + r1);
    400                     r2 = (r3 + r0);
    401                     result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
    402                     r2 = (r4 + r5);
    403                     result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
    404                     result = (result + 16) >> 5;
    405                     CLIP_RESULT(result)
    406                     /* 3/4 pel,  no need to clip */
    407                     result = (result + p_ref[dx] + 1);
    408                     result = (result >> 1);
    409                     pkres  |= (result << 16);
    410                     /* fourth pixel */
    411                     r2 = *p_ref++;
    412                     result = (r3 + r2);
    413                     r3 = (r4 + r1);
    414                     result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
    415                     r3 = (r5 + r0);
    416                     result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
    417                     result = (result + 16) >> 5;
    418                     CLIP_RESULT(result)
    419                     /* 3/4 pel,  no need to clip */
    420                     result = (result + p_ref[dx] + 1);
    421                     result = (result >> 1);
    422                     pkres  |= (result << 24);
    423                     *p_cur++ = pkres; /* write 4 pixels */
    424                     p_ref -= 5;  /* offset back to the middle of filter */
    425                 }
    426                 p_cur += curr_offset;  /* move to the next line */
    427                 p_ref += ref_offset;    /* move to the next line */
    428             }
    429         }
    430     }
    431     else
    432     {
    433         p_ref -= 2;
    434         r13 = 0;
    435         for (j = blkheight; j > 0; j--)
    436         {
    437             tmp = p_ref + blkwidth;
    438             r0 = p_ref[0];
    439             r1 = p_ref[2];
    440             r0 |= (r1 << 16);           /* 0,c,0,a */
    441             r1 = p_ref[1];
    442             r2 = p_ref[3];
    443             r1 |= (r2 << 16);           /* 0,d,0,b */
    444             while (p_ref < tmp)
    445             {
    446                 r2 = *(p_ref += 4); /* move pointer to e */
    447                 r3 = p_ref[2];
    448                 r2 |= (r3 << 16);           /* 0,g,0,e */
    449                 r3 = p_ref[1];
    450                 r4 = p_ref[3];
    451                 r3 |= (r4 << 16);           /* 0,h,0,f */
    452 
    453                 r4 = r0 + r3;       /* c+h, a+f */
    454                 r5 = r0 + r1;   /* c+d, a+b */
    455                 r6 = r2 + r3;   /* g+h, e+f */
    456                 r5 >>= 16;
    457                 r5 |= (r6 << 16);   /* e+f, c+d */
    458                 r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
    459                 r4 += 0x100010; /* +16, +16 */
    460                 r5 = r1 + r2;       /* d+g, b+e */
    461                 r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
    462                 r4 >>= 5;
    463                 r13 |= r4;      /* check clipping */
    464                 r4 &= 0xFF00FF; /* mask */
    465 
    466                 r5 = p_ref[4];  /* i */
    467                 r6 = (r5 << 16);
    468                 r5 = r6 | (r2 >> 16);/* 0,i,0,g */
    469                 r5 += r1;       /* d+i, b+g */ /* r5 not free */
    470                 r1 >>= 16;
    471                 r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
    472                 r1 += r2;       /* f+g, d+e */
    473                 r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
    474                 r0 >>= 16;
    475                 r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
    476                 r0 += r3;       /* e+h, c+f */
    477                 r5 += 0x100010; /* 16,16 */
    478                 r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
    479                 r5 >>= 5;
    480                 r13 |= r5;      /* check clipping */
    481                 r5 &= 0xFF00FF; /* mask */
    482 
    483                 r4 |= (r5 << 8);    /* pack them together */
    484                 *p_cur++ = r4;
    485                 r1 = r3;
    486                 r0 = r2;
    487             }
    488             p_cur += curr_offset; /* move to the next line */
    489             p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
    490 
    491             if (r13&0xFF000700) /* need clipping */
    492             {
    493                 /* move back to the beginning of the line */
    494                 p_ref -= (ref_offset + blkwidth);   /* input */
    495                 p_cur -= (outpitch >> 2);
    496 
    497                 tmp = p_ref + blkwidth;
    498                 for (; p_ref < tmp;)
    499                 {
    500 
    501                     r0 = *p_ref++;
    502                     r1 = *p_ref++;
    503                     r2 = *p_ref++;
    504                     r3 = *p_ref++;
    505                     r4 = *p_ref++;
    506                     /* first pixel */
    507                     r5 = *p_ref++;
    508                     result = (r0 + r5);
    509                     r0 = (r1 + r4);
    510                     result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
    511                     r0 = (r2 + r3);
    512                     result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
    513                     result = (result + 16) >> 5;
    514                     CLIP_RESULT(result)
    515                     pkres  = result;
    516                     /* second pixel */
    517                     r0 = *p_ref++;
    518                     result = (r1 + r0);
    519                     r1 = (r2 + r5);
    520                     result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
    521                     r1 = (r3 + r4);
    522                     result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
    523                     result = (result + 16) >> 5;
    524                     CLIP_RESULT(result)
    525                     pkres  |= (result << 8);
    526                     /* third pixel */
    527                     r1 = *p_ref++;
    528                     result = (r2 + r1);
    529                     r2 = (r3 + r0);
    530                     result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
    531                     r2 = (r4 + r5);
    532                     result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
    533                     result = (result + 16) >> 5;
    534                     CLIP_RESULT(result)
    535                     pkres  |= (result << 16);
    536                     /* fourth pixel */
    537                     r2 = *p_ref++;
    538                     result = (r3 + r2);
    539                     r3 = (r4 + r1);
    540                     result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
    541                     r3 = (r5 + r0);
    542                     result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
    543                     result = (result + 16) >> 5;
    544                     CLIP_RESULT(result)
    545                     pkres  |= (result << 24);
    546                     *p_cur++ = pkres;   /* write 4 pixels */
    547                     p_ref -= 5;
    548                 }
    549                 p_cur += curr_offset; /* move to the next line */
    550                 p_ref += ref_offset;
    551             }
    552         }
    553     }
    554 
    555     return ;
    556 }
    557 
    558 void eHorzInterp2MC(int *in, int inpitch, uint8 *out, int outpitch,
    559                     int blkwidth, int blkheight, int dx)
    560 {
    561     int *p_ref, *tmp;
    562     uint32 *p_cur;
    563     uint32 pkres;
    564     int result, result2, curr_offset, ref_offset;
    565     int j, r0, r1, r2, r3, r4, r5;
    566 
    567     p_cur = (uint32*)out; /* assume it's word aligned */
    568     curr_offset = (outpitch - blkwidth) >> 2;
    569     p_ref = in;
    570     ref_offset = inpitch - blkwidth;
    571 
    572     if (dx&1)
    573     {
    574         dx = ((dx >> 1) ? -3 : -4); /* use in 3/4 pel */
    575 
    576         for (j = blkheight; j > 0 ; j--)
    577         {
    578             tmp = p_ref + blkwidth;
    579             for (; p_ref < tmp;)
    580             {
    581 
    582                 r0 = p_ref[-2];
    583                 r1 = p_ref[-1];
    584                 r2 = *p_ref++;
    585                 r3 = *p_ref++;
    586                 r4 = *p_ref++;
    587                 /* first pixel */
    588                 r5 = *p_ref++;
    589                 result = (r0 + r5);
    590                 r0 = (r1 + r4);
    591                 result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
    592                 r0 = (r2 + r3);
    593                 result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
    594                 result = (result + 512) >> 10;
    595                 CLIP_RESULT(result)
    596                 result2 = ((p_ref[dx] + 16) >> 5);
    597                 CLIP_RESULT(result2)
    598                 /* 3/4 pel,  no need to clip */
    599                 result = (result + result2 + 1);
    600                 pkres = (result >> 1);
    601                 /* second pixel */
    602                 r0 = *p_ref++;
    603                 result = (r1 + r0);
    604                 r1 = (r2 + r5);
    605                 result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
    606                 r1 = (r3 + r4);
    607                 result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
    608                 result = (result + 512) >> 10;
    609                 CLIP_RESULT(result)
    610                 result2 = ((p_ref[dx] + 16) >> 5);
    611                 CLIP_RESULT(result2)
    612                 /* 3/4 pel,  no need to clip */
    613                 result = (result + result2 + 1);
    614                 result = (result >> 1);
    615                 pkres  |= (result << 8);
    616                 /* third pixel */
    617                 r1 = *p_ref++;
    618                 result = (r2 + r1);
    619                 r2 = (r3 + r0);
    620                 result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
    621                 r2 = (r4 + r5);
    622                 result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
    623                 result = (result + 512) >> 10;
    624                 CLIP_RESULT(result)
    625                 result2 = ((p_ref[dx] + 16) >> 5);
    626                 CLIP_RESULT(result2)
    627                 /* 3/4 pel,  no need to clip */
    628                 result = (result + result2 + 1);
    629                 result = (result >> 1);
    630                 pkres  |= (result << 16);
    631                 /* fourth pixel */
    632                 r2 = *p_ref++;
    633                 result = (r3 + r2);
    634                 r3 = (r4 + r1);
    635                 result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
    636                 r3 = (r5 + r0);
    637                 result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
    638                 result = (result + 512) >> 10;
    639                 CLIP_RESULT(result)
    640                 result2 = ((p_ref[dx] + 16) >> 5);
    641                 CLIP_RESULT(result2)
    642                 /* 3/4 pel,  no need to clip */
    643                 result = (result + result2 + 1);
    644                 result = (result >> 1);
    645                 pkres  |= (result << 24);
    646                 *p_cur++ = pkres; /* write 4 pixels */
    647                 p_ref -= 3;  /* offset back to the middle of filter */
    648             }
    649             p_cur += curr_offset;  /* move to the next line */
    650             p_ref += ref_offset;    /* move to the next line */
    651         }
    652     }
    653     else
    654     {
    655         for (j = blkheight; j > 0 ; j--)
    656         {
    657             tmp = p_ref + blkwidth;
    658             for (; p_ref < tmp;)
    659             {
    660 
    661                 r0 = p_ref[-2];
    662                 r1 = p_ref[-1];
    663                 r2 = *p_ref++;
    664                 r3 = *p_ref++;
    665                 r4 = *p_ref++;
    666                 /* first pixel */
    667                 r5 = *p_ref++;
    668                 result = (r0 + r5);
    669                 r0 = (r1 + r4);
    670                 result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
    671                 r0 = (r2 + r3);
    672                 result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
    673                 result = (result + 512) >> 10;
    674                 CLIP_RESULT(result)
    675                 pkres  = result;
    676                 /* second pixel */
    677                 r0 = *p_ref++;
    678                 result = (r1 + r0);
    679                 r1 = (r2 + r5);
    680                 result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
    681                 r1 = (r3 + r4);
    682                 result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
    683                 result = (result + 512) >> 10;
    684                 CLIP_RESULT(result)
    685                 pkres  |= (result << 8);
    686                 /* third pixel */
    687                 r1 = *p_ref++;
    688                 result = (r2 + r1);
    689                 r2 = (r3 + r0);
    690                 result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
    691                 r2 = (r4 + r5);
    692                 result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
    693                 result = (result + 512) >> 10;
    694                 CLIP_RESULT(result)
    695                 pkres  |= (result << 16);
    696                 /* fourth pixel */
    697                 r2 = *p_ref++;
    698                 result = (r3 + r2);
    699                 r3 = (r4 + r1);
    700                 result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
    701                 r3 = (r5 + r0);
    702                 result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
    703                 result = (result + 512) >> 10;
    704                 CLIP_RESULT(result)
    705                 pkres  |= (result << 24);
    706                 *p_cur++ = pkres; /* write 4 pixels */
    707                 p_ref -= 3;  /* offset back to the middle of filter */
    708             }
    709             p_cur += curr_offset;  /* move to the next line */
    710             p_ref += ref_offset;    /* move to the next line */
    711         }
    712     }
    713 
    714     return ;
    715 }
    716 
    717 void eHorzInterp3MC(uint8 *in, int inpitch, int *out, int outpitch,
    718                     int blkwidth, int blkheight)
    719 {
    720     uint8 *p_ref, *tmp;
    721     int   *p_cur;
    722     int result, curr_offset, ref_offset;
    723     int j, r0, r1, r2, r3, r4, r5;
    724 
    725     p_cur = out;
    726     curr_offset = (outpitch - blkwidth);
    727     p_ref = in;
    728     ref_offset = inpitch - blkwidth;
    729 
    730     for (j = blkheight; j > 0 ; j--)
    731     {
    732         tmp = p_ref + blkwidth;
    733         for (; p_ref < tmp;)
    734         {
    735 
    736             r0 = p_ref[-2];
    737             r1 = p_ref[-1];
    738             r2 = *p_ref++;
    739             r3 = *p_ref++;
    740             r4 = *p_ref++;
    741             /* first pixel */
    742             r5 = *p_ref++;
    743             result = (r0 + r5);
    744             r0 = (r1 + r4);
    745             result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
    746             r0 = (r2 + r3);
    747             result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
    748             *p_cur++ = result;
    749             /* second pixel */
    750             r0 = *p_ref++;
    751             result = (r1 + r0);
    752             r1 = (r2 + r5);
    753             result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
    754             r1 = (r3 + r4);
    755             result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
    756             *p_cur++ = result;
    757             /* third pixel */
    758             r1 = *p_ref++;
    759             result = (r2 + r1);
    760             r2 = (r3 + r0);
    761             result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
    762             r2 = (r4 + r5);
    763             result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
    764             *p_cur++ = result;
    765             /* fourth pixel */
    766             r2 = *p_ref++;
    767             result = (r3 + r2);
    768             r3 = (r4 + r1);
    769             result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
    770             r3 = (r5 + r0);
    771             result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
    772             *p_cur++ = result;
    773             p_ref -= 3; /* move back to the middle of the filter */
    774         }
    775         p_cur += curr_offset; /* move to the next line */
    776         p_ref += ref_offset;
    777     }
    778 
    779     return ;
    780 }
    781 void eVertInterp1MC(uint8 *in, int inpitch, uint8 *out, int outpitch,
    782                     int blkwidth, int blkheight, int dy)
    783 {
    784     uint8 *p_cur, *p_ref, *tmp;
    785     int result, curr_offset, ref_offset;
    786     int j, i;
    787     int32 r0, r1, r2, r3, r4, r5, r6, r7, r8, r13;
    788     uint8  tmp_in[24][24];
    789 
    790     /* not word-aligned */
    791     if (((intptr_t)in)&0x3)
    792     {
    793         eCreateAlign(in, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5);
    794         in = &tmp_in[2][0];
    795         inpitch = 24;
    796     }
    797     p_cur = out;
    798     curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
    799     ref_offset = blkheight * inpitch; /* for limit */
    800 
    801     curr_offset += 3;
    802 
    803     if (dy&1)
    804     {
    805         dy = (dy >> 1) ? 0 : -inpitch;
    806 
    807         for (j = 0; j < blkwidth; j += 4, in += 4)
    808         {
    809             r13 = 0;
    810             p_ref = in;
    811             p_cur -= outpitch;  /* compensate for the first offset */
    812             tmp = p_ref + ref_offset; /* limit */
    813             while (p_ref < tmp)  /* the loop un-rolled  */
    814             {
    815                 r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
    816                 p_ref += inpitch;
    817                 r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
    818                 r0 &= 0xFF00FF;
    819 
    820                 r1 = *((uint32*)(p_ref + (inpitch << 1)));  /* r1, r7, ref[3] */
    821                 r7 = (r1 >> 8) & 0xFF00FF;
    822                 r1 &= 0xFF00FF;
    823 
    824                 r0 += r1;
    825                 r6 += r7;
    826 
    827                 r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
    828                 r8 = (r2 >> 8) & 0xFF00FF;
    829                 r2 &= 0xFF00FF;
    830 
    831                 r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
    832                 r7 = (r1 >> 8) & 0xFF00FF;
    833                 r1 &= 0xFF00FF;
    834                 r1 += r2;
    835 
    836                 r7 += r8;
    837 
    838                 r0 += 20 * r1;
    839                 r6 += 20 * r7;
    840                 r0 += 0x100010;
    841                 r6 += 0x100010;
    842 
    843                 r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
    844                 r8 = (r2 >> 8) & 0xFF00FF;
    845                 r2 &= 0xFF00FF;
    846 
    847                 r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
    848                 r7 = (r1 >> 8) & 0xFF00FF;
    849                 r1 &= 0xFF00FF;
    850                 r1 += r2;
    851 
    852                 r7 += r8;
    853 
    854                 r0 -= 5 * r1;
    855                 r6 -= 5 * r7;
    856 
    857                 r0 >>= 5;
    858                 r6 >>= 5;
    859                 /* clip */
    860                 r13 |= r6;
    861                 r13 |= r0;
    862                 //CLIPPACK(r6,result)
    863 
    864                 r1 = *((uint32*)(p_ref + dy));
    865                 r2 = (r1 >> 8) & 0xFF00FF;
    866                 r1 &= 0xFF00FF;
    867                 r0 += r1;
    868                 r6 += r2;
    869                 r0 += 0x10001;
    870                 r6 += 0x10001;
    871                 r0 = (r0 >> 1) & 0xFF00FF;
    872                 r6 = (r6 >> 1) & 0xFF00FF;
    873 
    874                 r0 |= (r6 << 8);  /* pack it back */
    875                 *((uint32*)(p_cur += outpitch)) = r0;
    876             }
    877             p_cur += curr_offset; /* offset to the next pixel */
    878             if (r13 & 0xFF000700) /* this column need clipping */
    879             {
    880                 p_cur -= 4;
    881                 for (i = 0; i < 4; i++)
    882                 {
    883                     p_ref = in + i;
    884                     p_cur -= outpitch;  /* compensate for the first offset */
    885 
    886                     tmp = p_ref + ref_offset; /* limit */
    887                     while (p_ref < tmp)
    888                     {                           /* loop un-rolled */
    889                         r0 = *(p_ref - (inpitch << 1));
    890                         r1 = *(p_ref - inpitch);
    891                         r2 = *p_ref;
    892                         r3 = *(p_ref += inpitch);  /* modify pointer before loading */
    893                         r4 = *(p_ref += inpitch);
    894                         /* first pixel */
    895                         r5 = *(p_ref += inpitch);
    896                         result = (r0 + r5);
    897                         r0 = (r1 + r4);
    898                         result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
    899                         r0 = (r2 + r3);
    900                         result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
    901                         result = (result + 16) >> 5;
    902                         CLIP_RESULT(result)
    903                         /* 3/4 pel,  no need to clip */
    904                         result = (result + p_ref[dy-(inpitch<<1)] + 1);
    905                         result = (result >> 1);
    906                         *(p_cur += outpitch) = result;
    907                         /* second pixel */
    908                         r0 = *(p_ref += inpitch);
    909                         result = (r1 + r0);
    910                         r1 = (r2 + r5);
    911                         result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
    912                         r1 = (r3 + r4);
    913                         result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
    914                         result = (result + 16) >> 5;
    915                         CLIP_RESULT(result)
    916                         /* 3/4 pel,  no need to clip */
    917                         result = (result + p_ref[dy-(inpitch<<1)] + 1);
    918                         result = (result >> 1);
    919                         *(p_cur += outpitch) = result;
    920                         /* third pixel */
    921                         r1 = *(p_ref += inpitch);
    922                         result = (r2 + r1);
    923                         r2 = (r3 + r0);
    924                         result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
    925                         r2 = (r4 + r5);
    926                         result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
    927                         result = (result + 16) >> 5;
    928                         CLIP_RESULT(result)
    929                         /* 3/4 pel,  no need to clip */
    930                         result = (result + p_ref[dy-(inpitch<<1)] + 1);
    931                         result = (result >> 1);
    932                         *(p_cur += outpitch) = result;
    933                         /* fourth pixel */
    934                         r2 = *(p_ref += inpitch);
    935                         result = (r3 + r2);
    936                         r3 = (r4 + r1);
    937                         result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
    938                         r3 = (r5 + r0);
    939                         result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
    940                         result = (result + 16) >> 5;
    941                         CLIP_RESULT(result)
    942                         /* 3/4 pel,  no need to clip */
    943                         result = (result + p_ref[dy-(inpitch<<1)] + 1);
    944                         result = (result >> 1);
    945                         *(p_cur += outpitch) = result;
    946                         p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
    947                     }
    948                     p_cur += (curr_offset - 3);
    949                 }
    950             }
    951         }
    952     }
    953     else
    954     {
    955         for (j = 0; j < blkwidth; j += 4, in += 4)
    956         {
    957             r13 = 0;
    958             p_ref = in;
    959             p_cur -= outpitch;  /* compensate for the first offset */
    960             tmp = p_ref + ref_offset; /* limit */
    961             while (p_ref < tmp)  /* the loop un-rolled  */
    962             {
    963                 r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
    964                 p_ref += inpitch;
    965                 r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
    966                 r0 &= 0xFF00FF;
    967 
    968                 r1 = *((uint32*)(p_ref + (inpitch << 1)));  /* r1, r7, ref[3] */
    969                 r7 = (r1 >> 8) & 0xFF00FF;
    970                 r1 &= 0xFF00FF;
    971 
    972                 r0 += r1;
    973                 r6 += r7;
    974 
    975                 r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
    976                 r8 = (r2 >> 8) & 0xFF00FF;
    977                 r2 &= 0xFF00FF;
    978 
    979                 r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
    980                 r7 = (r1 >> 8) & 0xFF00FF;
    981                 r1 &= 0xFF00FF;
    982                 r1 += r2;
    983 
    984                 r7 += r8;
    985 
    986                 r0 += 20 * r1;
    987                 r6 += 20 * r7;
    988                 r0 += 0x100010;
    989                 r6 += 0x100010;
    990 
    991                 r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
    992                 r8 = (r2 >> 8) & 0xFF00FF;
    993                 r2 &= 0xFF00FF;
    994 
    995                 r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
    996                 r7 = (r1 >> 8) & 0xFF00FF;
    997                 r1 &= 0xFF00FF;
    998                 r1 += r2;
    999 
   1000                 r7 += r8;
   1001 
   1002                 r0 -= 5 * r1;
   1003                 r6 -= 5 * r7;
   1004 
   1005                 r0 >>= 5;
   1006                 r6 >>= 5;
   1007                 /* clip */
   1008                 r13 |= r6;
   1009                 r13 |= r0;
   1010                 //CLIPPACK(r6,result)
   1011                 r0 &= 0xFF00FF;
   1012                 r6 &= 0xFF00FF;
   1013                 r0 |= (r6 << 8);  /* pack it back */
   1014                 *((uint32*)(p_cur += outpitch)) = r0;
   1015             }
   1016             p_cur += curr_offset; /* offset to the next pixel */
   1017             if (r13 & 0xFF000700) /* this column need clipping */
   1018             {
   1019                 p_cur -= 4;
   1020                 for (i = 0; i < 4; i++)
   1021                 {
   1022                     p_ref = in + i;
   1023                     p_cur -= outpitch;  /* compensate for the first offset */
   1024                     tmp = p_ref + ref_offset; /* limit */
   1025                     while (p_ref < tmp)
   1026                     {                           /* loop un-rolled */
   1027                         r0 = *(p_ref - (inpitch << 1));
   1028                         r1 = *(p_ref - inpitch);
   1029                         r2 = *p_ref;
   1030                         r3 = *(p_ref += inpitch);  /* modify pointer before loading */
   1031                         r4 = *(p_ref += inpitch);
   1032                         /* first pixel */
   1033                         r5 = *(p_ref += inpitch);
   1034                         result = (r0 + r5);
   1035                         r0 = (r1 + r4);
   1036                         result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
   1037                         r0 = (r2 + r3);
   1038                         result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
   1039                         result = (result + 16) >> 5;
   1040                         CLIP_RESULT(result)
   1041                         *(p_cur += outpitch) = result;
   1042                         /* second pixel */
   1043                         r0 = *(p_ref += inpitch);
   1044                         result = (r1 + r0);
   1045                         r1 = (r2 + r5);
   1046                         result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
   1047                         r1 = (r3 + r4);
   1048                         result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
   1049                         result = (result + 16) >> 5;
   1050                         CLIP_RESULT(result)
   1051                         *(p_cur += outpitch) = result;
   1052                         /* third pixel */
   1053                         r1 = *(p_ref += inpitch);
   1054                         result = (r2 + r1);
   1055                         r2 = (r3 + r0);
   1056                         result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
   1057                         r2 = (r4 + r5);
   1058                         result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
   1059                         result = (result + 16) >> 5;
   1060                         CLIP_RESULT(result)
   1061                         *(p_cur += outpitch) = result;
   1062                         /* fourth pixel */
   1063                         r2 = *(p_ref += inpitch);
   1064                         result = (r3 + r2);
   1065                         r3 = (r4 + r1);
   1066                         result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
   1067                         r3 = (r5 + r0);
   1068                         result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
   1069                         result = (result + 16) >> 5;
   1070                         CLIP_RESULT(result)
   1071                         *(p_cur += outpitch) = result;
   1072                         p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
   1073                     }
   1074                     p_cur += (curr_offset - 3);
   1075                 }
   1076             }
   1077         }
   1078     }
   1079 
   1080     return ;
   1081 }
   1082 
   1083 void eVertInterp2MC(uint8 *in, int inpitch, int *out, int outpitch,
   1084                     int blkwidth, int blkheight)
   1085 {
   1086     int *p_cur;
   1087     uint8 *p_ref, *tmp;
   1088     int result, curr_offset, ref_offset;
   1089     int j, r0, r1, r2, r3, r4, r5;
   1090 
   1091     p_cur = out;
   1092     curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
   1093     ref_offset = blkheight * inpitch; /* for limit */
   1094 
   1095     for (j = 0; j < blkwidth; j++)
   1096     {
   1097         p_cur -= outpitch; /* compensate for the first offset */
   1098         p_ref = in++;
   1099 
   1100         tmp = p_ref + ref_offset; /* limit */
   1101         while (p_ref < tmp)
   1102         {                           /* loop un-rolled */
   1103             r0 = *(p_ref - (inpitch << 1));
   1104             r1 = *(p_ref - inpitch);
   1105             r2 = *p_ref;
   1106             r3 = *(p_ref += inpitch);  /* modify pointer before loading */
   1107             r4 = *(p_ref += inpitch);
   1108             /* first pixel */
   1109             r5 = *(p_ref += inpitch);
   1110             result = (r0 + r5);
   1111             r0 = (r1 + r4);
   1112             result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
   1113             r0 = (r2 + r3);
   1114             result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
   1115             *(p_cur += outpitch) = result;
   1116             /* second pixel */
   1117             r0 = *(p_ref += inpitch);
   1118             result = (r1 + r0);
   1119             r1 = (r2 + r5);
   1120             result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
   1121             r1 = (r3 + r4);
   1122             result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
   1123             *(p_cur += outpitch) = result;
   1124             /* third pixel */
   1125             r1 = *(p_ref += inpitch);
   1126             result = (r2 + r1);
   1127             r2 = (r3 + r0);
   1128             result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
   1129             r2 = (r4 + r5);
   1130             result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
   1131             *(p_cur += outpitch) = result;
   1132             /* fourth pixel */
   1133             r2 = *(p_ref += inpitch);
   1134             result = (r3 + r2);
   1135             r3 = (r4 + r1);
   1136             result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
   1137             r3 = (r5 + r0);
   1138             result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
   1139             *(p_cur += outpitch) = result;
   1140             p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
   1141         }
   1142         p_cur += curr_offset;
   1143     }
   1144 
   1145     return ;
   1146 }
   1147 
   1148 void eVertInterp3MC(int *in, int inpitch, uint8 *out, int outpitch,
   1149                     int blkwidth, int blkheight, int dy)
   1150 {
   1151     uint8 *p_cur;
   1152     int *p_ref, *tmp;
   1153     int result, result2, curr_offset, ref_offset;
   1154     int j, r0, r1, r2, r3, r4, r5;
   1155 
   1156     p_cur = out;
   1157     curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically back up and one pixel to right */
   1158     ref_offset = blkheight * inpitch; /* for limit */
   1159 
   1160     if (dy&1)
   1161     {
   1162         dy = (dy >> 1) ? -(inpitch << 1) : -(inpitch << 1) - inpitch;
   1163 
   1164         for (j = 0; j < blkwidth; j++)
   1165         {
   1166             p_cur -= outpitch; /* compensate for the first offset */
   1167             p_ref = in++;
   1168 
   1169             tmp = p_ref + ref_offset; /* limit */
   1170             while (p_ref < tmp)
   1171             {                           /* loop un-rolled */
   1172                 r0 = *(p_ref - (inpitch << 1));
   1173                 r1 = *(p_ref - inpitch);
   1174                 r2 = *p_ref;
   1175                 r3 = *(p_ref += inpitch);  /* modify pointer before loading */
   1176                 r4 = *(p_ref += inpitch);
   1177                 /* first pixel */
   1178                 r5 = *(p_ref += inpitch);
   1179                 result = (r0 + r5);
   1180                 r0 = (r1 + r4);
   1181                 result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
   1182                 r0 = (r2 + r3);
   1183                 result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
   1184                 result = (result + 512) >> 10;
   1185                 CLIP_RESULT(result)
   1186                 result2 = ((p_ref[dy] + 16) >> 5);
   1187                 CLIP_RESULT(result2)
   1188                 /* 3/4 pel,  no need to clip */
   1189                 result = (result + result2 + 1);
   1190                 result = (result >> 1);
   1191                 *(p_cur += outpitch) = result;
   1192                 /* second pixel */
   1193                 r0 = *(p_ref += inpitch);
   1194                 result = (r1 + r0);
   1195                 r1 = (r2 + r5);
   1196                 result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
   1197                 r1 = (r3 + r4);
   1198                 result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
   1199                 result = (result + 512) >> 10;
   1200                 CLIP_RESULT(result)
   1201                 result2 = ((p_ref[dy] + 16) >> 5);
   1202                 CLIP_RESULT(result2)
   1203                 /* 3/4 pel,  no need to clip */
   1204                 result = (result + result2 + 1);
   1205                 result = (result >> 1);
   1206                 *(p_cur += outpitch) = result;
   1207                 /* third pixel */
   1208                 r1 = *(p_ref += inpitch);
   1209                 result = (r2 + r1);
   1210                 r2 = (r3 + r0);
   1211                 result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
   1212                 r2 = (r4 + r5);
   1213                 result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
   1214                 result = (result + 512) >> 10;
   1215                 CLIP_RESULT(result)
   1216                 result2 = ((p_ref[dy] + 16) >> 5);
   1217                 CLIP_RESULT(result2)
   1218                 /* 3/4 pel,  no need to clip */
   1219                 result = (result + result2 + 1);
   1220                 result = (result >> 1);
   1221                 *(p_cur += outpitch) = result;
   1222                 /* fourth pixel */
   1223                 r2 = *(p_ref += inpitch);
   1224                 result = (r3 + r2);
   1225                 r3 = (r4 + r1);
   1226                 result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
   1227                 r3 = (r5 + r0);
   1228                 result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
   1229                 result = (result + 512) >> 10;
   1230                 CLIP_RESULT(result)
   1231                 result2 = ((p_ref[dy] + 16) >> 5);
   1232                 CLIP_RESULT(result2)
   1233                 /* 3/4 pel,  no need to clip */
   1234                 result = (result + result2 + 1);
   1235                 result = (result >> 1);
   1236                 *(p_cur += outpitch) = result;
   1237                 p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
   1238             }
   1239             p_cur += curr_offset;
   1240         }
   1241     }
   1242     else
   1243     {
   1244         for (j = 0; j < blkwidth; j++)
   1245         {
   1246             p_cur -= outpitch; /* compensate for the first offset */
   1247             p_ref = in++;
   1248 
   1249             tmp = p_ref + ref_offset; /* limit */
   1250             while (p_ref < tmp)
   1251             {                           /* loop un-rolled */
   1252                 r0 = *(p_ref - (inpitch << 1));
   1253                 r1 = *(p_ref - inpitch);
   1254                 r2 = *p_ref;
   1255                 r3 = *(p_ref += inpitch);  /* modify pointer before loading */
   1256                 r4 = *(p_ref += inpitch);
   1257                 /* first pixel */
   1258                 r5 = *(p_ref += inpitch);
   1259                 result = (r0 + r5);
   1260                 r0 = (r1 + r4);
   1261                 result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
   1262                 r0 = (r2 + r3);
   1263                 result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
   1264                 result = (result + 512) >> 10;
   1265                 CLIP_RESULT(result)
   1266                 *(p_cur += outpitch) = result;
   1267                 /* second pixel */
   1268                 r0 = *(p_ref += inpitch);
   1269                 result = (r1 + r0);
   1270                 r1 = (r2 + r5);
   1271                 result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
   1272                 r1 = (r3 + r4);
   1273                 result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
   1274                 result = (result + 512) >> 10;
   1275                 CLIP_RESULT(result)
   1276                 *(p_cur += outpitch) = result;
   1277                 /* third pixel */
   1278                 r1 = *(p_ref += inpitch);
   1279                 result = (r2 + r1);
   1280                 r2 = (r3 + r0);
   1281                 result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
   1282                 r2 = (r4 + r5);
   1283                 result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
   1284                 result = (result + 512) >> 10;
   1285                 CLIP_RESULT(result)
   1286                 *(p_cur += outpitch) = result;
   1287                 /* fourth pixel */
   1288                 r2 = *(p_ref += inpitch);
   1289                 result = (r3 + r2);
   1290                 r3 = (r4 + r1);
   1291                 result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
   1292                 r3 = (r5 + r0);
   1293                 result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
   1294                 result = (result + 512) >> 10;
   1295                 CLIP_RESULT(result)
   1296                 *(p_cur += outpitch) = result;
   1297                 p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
   1298             }
   1299             p_cur += curr_offset;
   1300         }
   1301     }
   1302 
   1303     return ;
   1304 }
   1305 
   1306 void eDiagonalInterpMC(uint8 *in1, uint8 *in2, int inpitch,
   1307                        uint8 *out, int outpitch,
   1308                        int blkwidth, int blkheight)
   1309 {
   1310     int j, i;
   1311     int result;
   1312     uint8 *p_cur, *p_ref, *p_tmp8, *tmp;
   1313     int curr_offset, ref_offset;
   1314     uint8 tmp_res[24][24], tmp_in[24][24];
   1315     uint32 *p_tmp;
   1316     uint32 pkres, tmp_result;
   1317     int32 r0, r1, r2, r3, r4, r5;
   1318     int32 r6, r7, r8, r9, r10, r13;
   1319 
   1320     ref_offset = inpitch - blkwidth;
   1321     p_ref = in1 - 2;
   1322     /* perform horizontal interpolation */
   1323     /* not word-aligned */
   1324     /* It is faster to read 1 byte at time to avoid calling CreateAlign */
   1325     /*  if(((uint32)p_ref)&0x3)
   1326         {
   1327             CreateAlign(p_ref,inpitch,0,&tmp_in[0][0],blkwidth+8,blkheight);
   1328             p_ref = &tmp_in[0][0];
   1329             ref_offset = 24-blkwidth;
   1330         }*/
   1331 
   1332     p_tmp = (uint32*) & (tmp_res[0][0]);
   1333     for (j = blkheight; j > 0; j--)
   1334     {
   1335         r13 = 0;
   1336         tmp = p_ref + blkwidth;
   1337 
   1338         //r0 = *((uint32*)p_ref);   /* d,c,b,a */
   1339         //r1 = (r0>>8)&0xFF00FF;    /* 0,d,0,b */
   1340         //r0 &= 0xFF00FF;           /* 0,c,0,a */
   1341         /* It is faster to read 1 byte at a time */
   1342         r0 = p_ref[0];
   1343         r1 = p_ref[2];
   1344         r0 |= (r1 << 16);           /* 0,c,0,a */
   1345         r1 = p_ref[1];
   1346         r2 = p_ref[3];
   1347         r1 |= (r2 << 16);           /* 0,d,0,b */
   1348 
   1349         while (p_ref < tmp)
   1350         {
   1351             //r2 = *((uint32*)(p_ref+=4));/* h,g,f,e */
   1352             //r3 = (r2>>8)&0xFF00FF;  /* 0,h,0,f */
   1353             //r2 &= 0xFF00FF;           /* 0,g,0,e */
   1354             /* It is faster to read 1 byte at a time */
   1355             r2 = *(p_ref += 4);
   1356             r3 = p_ref[2];
   1357             r2 |= (r3 << 16);           /* 0,g,0,e */
   1358             r3 = p_ref[1];
   1359             r4 = p_ref[3];
   1360             r3 |= (r4 << 16);           /* 0,h,0,f */
   1361 
   1362             r4 = r0 + r3;       /* c+h, a+f */
   1363             r5 = r0 + r1;   /* c+d, a+b */
   1364             r6 = r2 + r3;   /* g+h, e+f */
   1365             r5 >>= 16;
   1366             r5 |= (r6 << 16);   /* e+f, c+d */
   1367             r4 += r5 * 20;      /* c+20*e+20*f+h, a+20*c+20*d+f */
   1368             r4 += 0x100010; /* +16, +16 */
   1369             r5 = r1 + r2;       /* d+g, b+e */
   1370             r4 -= r5 * 5;       /* c-5*d+20*e+20*f-5*g+h, a-5*b+20*c+20*d-5*e+f */
   1371             r4 >>= 5;
   1372             r13 |= r4;      /* check clipping */
   1373             r4 &= 0xFF00FF; /* mask */
   1374 
   1375             r5 = p_ref[4];  /* i */
   1376             r6 = (r5 << 16);
   1377             r5 = r6 | (r2 >> 16);/* 0,i,0,g */
   1378             r5 += r1;       /* d+i, b+g */ /* r5 not free */
   1379             r1 >>= 16;
   1380             r1 |= (r3 << 16); /* 0,f,0,d */ /* r1 has changed */
   1381             r1 += r2;       /* f+g, d+e */
   1382             r5 += 20 * r1;  /* d+20f+20g+i, b+20d+20e+g */
   1383             r0 >>= 16;
   1384             r0 |= (r2 << 16); /* 0,e,0,c */ /* r0 has changed */
   1385             r0 += r3;       /* e+h, c+f */
   1386             r5 += 0x100010; /* 16,16 */
   1387             r5 -= r0 * 5;       /* d-5e+20f+20g-5h+i, b-5c+20d+20e-5f+g */
   1388             r5 >>= 5;
   1389             r13 |= r5;      /* check clipping */
   1390             r5 &= 0xFF00FF; /* mask */
   1391 
   1392             r4 |= (r5 << 8);    /* pack them together */
   1393             *p_tmp++ = r4;
   1394             r1 = r3;
   1395             r0 = r2;
   1396         }
   1397         p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */
   1398         p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
   1399 
   1400         if (r13&0xFF000700) /* need clipping */
   1401         {
   1402             /* move back to the beginning of the line */
   1403             p_ref -= (ref_offset + blkwidth);   /* input */
   1404             p_tmp -= 6; /* intermediate output */
   1405             tmp = p_ref + blkwidth;
   1406             while (p_ref < tmp)
   1407             {
   1408                 r0 = *p_ref++;
   1409                 r1 = *p_ref++;
   1410                 r2 = *p_ref++;
   1411                 r3 = *p_ref++;
   1412                 r4 = *p_ref++;
   1413                 /* first pixel */
   1414                 r5 = *p_ref++;
   1415                 result = (r0 + r5);
   1416                 r0 = (r1 + r4);
   1417                 result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
   1418                 r0 = (r2 + r3);
   1419                 result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
   1420                 result = (result + 16) >> 5;
   1421                 CLIP_RESULT(result)
   1422                 pkres = result;
   1423                 /* second pixel */
   1424                 r0 = *p_ref++;
   1425                 result = (r1 + r0);
   1426                 r1 = (r2 + r5);
   1427                 result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
   1428                 r1 = (r3 + r4);
   1429                 result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
   1430                 result = (result + 16) >> 5;
   1431                 CLIP_RESULT(result)
   1432                 pkres |= (result << 8);
   1433                 /* third pixel */
   1434                 r1 = *p_ref++;
   1435                 result = (r2 + r1);
   1436                 r2 = (r3 + r0);
   1437                 result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
   1438                 r2 = (r4 + r5);
   1439                 result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
   1440                 result = (result + 16) >> 5;
   1441                 CLIP_RESULT(result)
   1442                 pkres |= (result << 16);
   1443                 /* fourth pixel */
   1444                 r2 = *p_ref++;
   1445                 result = (r3 + r2);
   1446                 r3 = (r4 + r1);
   1447                 result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
   1448                 r3 = (r5 + r0);
   1449                 result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
   1450                 result = (result + 16) >> 5;
   1451                 CLIP_RESULT(result)
   1452                 pkres |= (result << 24);
   1453 
   1454                 *p_tmp++ = pkres; /* write 4 pixel */
   1455                 p_ref -= 5;
   1456             }
   1457             p_tmp += ((24 - blkwidth) >> 2); /* move to the next line */
   1458             p_ref += ref_offset;  /*    ref_offset = inpitch-blkwidth; */
   1459         }
   1460     }
   1461 
   1462     /*  perform vertical interpolation */
   1463     /* not word-aligned */
   1464     if (((intptr_t)in2)&0x3)
   1465     {
   1466         eCreateAlign(in2, inpitch, -2, &tmp_in[0][0], blkwidth, blkheight + 5);
   1467         in2 = &tmp_in[2][0];
   1468         inpitch = 24;
   1469     }
   1470 
   1471     p_cur = out;
   1472     curr_offset = 1 - outpitch * (blkheight - 1); /* offset vertically up and one pixel right */
   1473     pkres = blkheight * inpitch; /* reuse it for limit */
   1474 
   1475     curr_offset += 3;
   1476 
   1477     for (j = 0; j < blkwidth; j += 4, in2 += 4)
   1478     {
   1479         r13 = 0;
   1480         p_ref = in2;
   1481         p_tmp8 = &(tmp_res[0][j]); /* intermediate result */
   1482         p_tmp8 -= 24;  /* compensate for the first offset */
   1483         p_cur -= outpitch;  /* compensate for the first offset */
   1484         tmp = p_ref + pkres; /* limit */
   1485         while (p_ref < tmp)  /* the loop un-rolled  */
   1486         {
   1487             /* Read 1 byte at a time is too slow, too many read and pack ops, need to call CreateAlign */
   1488             /*p_ref8 = p_ref-(inpitch<<1);          r0 = p_ref8[0];         r1 = p_ref8[2];
   1489             r0 |= (r1<<16);         r6 = p_ref8[1];         r1 = p_ref8[3];
   1490             r6 |= (r1<<16);         p_ref+=inpitch; */
   1491             r0 = *((uint32*)(p_ref - (inpitch << 1))); /* load 4 bytes */
   1492             p_ref += inpitch;
   1493             r6 = (r0 >> 8) & 0xFF00FF; /* second and fourth byte */
   1494             r0 &= 0xFF00FF;
   1495 
   1496             /*p_ref8 = p_ref+(inpitch<<1);
   1497             r1 = p_ref8[0];         r7 = p_ref8[2];         r1 |= (r7<<16);
   1498             r7 = p_ref8[1];         r2 = p_ref8[3];         r7 |= (r2<<16);*/
   1499             r1 = *((uint32*)(p_ref + (inpitch << 1)));  /* r1, r7, ref[3] */
   1500             r7 = (r1 >> 8) & 0xFF00FF;
   1501             r1 &= 0xFF00FF;
   1502 
   1503             r0 += r1;
   1504             r6 += r7;
   1505 
   1506             /*r2 = p_ref[0];            r8 = p_ref[2];          r2 |= (r8<<16);
   1507             r8 = p_ref[1];          r1 = p_ref[3];          r8 |= (r1<<16);*/
   1508             r2 = *((uint32*)p_ref); /* r2, r8, ref[1] */
   1509             r8 = (r2 >> 8) & 0xFF00FF;
   1510             r2 &= 0xFF00FF;
   1511 
   1512             /*p_ref8 = p_ref-inpitch;           r1 = p_ref8[0];         r7 = p_ref8[2];
   1513             r1 |= (r7<<16);         r1 += r2;           r7 = p_ref8[1];
   1514             r2 = p_ref8[3];         r7 |= (r2<<16);*/
   1515             r1 = *((uint32*)(p_ref - inpitch)); /* r1, r7, ref[0] */
   1516             r7 = (r1 >> 8) & 0xFF00FF;
   1517             r1 &= 0xFF00FF;
   1518             r1 += r2;
   1519 
   1520             r7 += r8;
   1521 
   1522             r0 += 20 * r1;
   1523             r6 += 20 * r7;
   1524             r0 += 0x100010;
   1525             r6 += 0x100010;
   1526 
   1527             /*p_ref8 = p_ref-(inpitch<<1);          r2 = p_ref8[0];         r8 = p_ref8[2];
   1528             r2 |= (r8<<16);         r8 = p_ref8[1];         r1 = p_ref8[3];         r8 |= (r1<<16);*/
   1529             r2 = *((uint32*)(p_ref - (inpitch << 1))); /* r2, r8, ref[-1] */
   1530             r8 = (r2 >> 8) & 0xFF00FF;
   1531             r2 &= 0xFF00FF;
   1532 
   1533             /*p_ref8 = p_ref+inpitch;           r1 = p_ref8[0];         r7 = p_ref8[2];
   1534             r1 |= (r7<<16);         r1 += r2;           r7 = p_ref8[1];
   1535             r2 = p_ref8[3];         r7 |= (r2<<16);*/
   1536             r1 = *((uint32*)(p_ref + inpitch)); /* r1, r7, ref[2] */
   1537             r7 = (r1 >> 8) & 0xFF00FF;
   1538             r1 &= 0xFF00FF;
   1539             r1 += r2;
   1540 
   1541             r7 += r8;
   1542 
   1543             r0 -= 5 * r1;
   1544             r6 -= 5 * r7;
   1545 
   1546             r0 >>= 5;
   1547             r6 >>= 5;
   1548             /* clip */
   1549             r13 |= r6;
   1550             r13 |= r0;
   1551             //CLIPPACK(r6,result)
   1552             /* add with horizontal results */
   1553             r10 = *((uint32*)(p_tmp8 += 24));
   1554             r9 = (r10 >> 8) & 0xFF00FF;
   1555             r10 &= 0xFF00FF;
   1556 
   1557             r0 += r10;
   1558             r0 += 0x10001;
   1559             r0 = (r0 >> 1) & 0xFF00FF;   /* mask to 8 bytes */
   1560 
   1561             r6 += r9;
   1562             r6 += 0x10001;
   1563             r6 = (r6 >> 1) & 0xFF00FF;   /* mask to 8 bytes */
   1564 
   1565             r0 |= (r6 << 8);  /* pack it back */
   1566             *((uint32*)(p_cur += outpitch)) = r0;
   1567         }
   1568         p_cur += curr_offset; /* offset to the next pixel */
   1569         if (r13 & 0xFF000700) /* this column need clipping */
   1570         {
   1571             p_cur -= 4;
   1572             for (i = 0; i < 4; i++)
   1573             {
   1574                 p_ref = in2 + i;
   1575                 p_tmp8 = &(tmp_res[0][j+i]); /* intermediate result */
   1576                 p_tmp8 -= 24;  /* compensate for the first offset */
   1577                 p_cur -= outpitch;  /* compensate for the first offset */
   1578                 tmp = p_ref + pkres; /* limit */
   1579                 while (p_ref < tmp)  /* the loop un-rolled  */
   1580                 {
   1581                     r0 = *(p_ref - (inpitch << 1));
   1582                     r1 = *(p_ref - inpitch);
   1583                     r2 = *p_ref;
   1584                     r3 = *(p_ref += inpitch);  /* modify pointer before loading */
   1585                     r4 = *(p_ref += inpitch);
   1586                     /* first pixel */
   1587                     r5 = *(p_ref += inpitch);
   1588                     result = (r0 + r5);
   1589                     r0 = (r1 + r4);
   1590                     result -= (r0 * 5);//result -= r0;  result -= (r0<<2);
   1591                     r0 = (r2 + r3);
   1592                     result += (r0 * 20);//result += (r0<<4);    result += (r0<<2);
   1593                     result = (result + 16) >> 5;
   1594                     CLIP_RESULT(result)
   1595                     tmp_result = *(p_tmp8 += 24);  /* modify pointer before loading */
   1596                     result = (result + tmp_result + 1);  /* no clip */
   1597                     result = (result >> 1);
   1598                     *(p_cur += outpitch) = result;
   1599                     /* second pixel */
   1600                     r0 = *(p_ref += inpitch);
   1601                     result = (r1 + r0);
   1602                     r1 = (r2 + r5);
   1603                     result -= (r1 * 5);//result -= r1;  result -= (r1<<2);
   1604                     r1 = (r3 + r4);
   1605                     result += (r1 * 20);//result += (r1<<4);    result += (r1<<2);
   1606                     result = (result + 16) >> 5;
   1607                     CLIP_RESULT(result)
   1608                     tmp_result = *(p_tmp8 += 24);  /* intermediate result */
   1609                     result = (result + tmp_result + 1);  /* no clip */
   1610                     result = (result >> 1);
   1611                     *(p_cur += outpitch) = result;
   1612                     /* third pixel */
   1613                     r1 = *(p_ref += inpitch);
   1614                     result = (r2 + r1);
   1615                     r2 = (r3 + r0);
   1616                     result -= (r2 * 5);//result -= r2;  result -= (r2<<2);
   1617                     r2 = (r4 + r5);
   1618                     result += (r2 * 20);//result += (r2<<4);    result += (r2<<2);
   1619                     result = (result + 16) >> 5;
   1620                     CLIP_RESULT(result)
   1621                     tmp_result = *(p_tmp8 += 24);  /* intermediate result */
   1622                     result = (result + tmp_result + 1);  /* no clip */
   1623                     result = (result >> 1);
   1624                     *(p_cur += outpitch) = result;
   1625                     /* fourth pixel */
   1626                     r2 = *(p_ref += inpitch);
   1627                     result = (r3 + r2);
   1628                     r3 = (r4 + r1);
   1629                     result -= (r3 * 5);//result -= r3;  result -= (r3<<2);
   1630                     r3 = (r5 + r0);
   1631                     result += (r3 * 20);//result += (r3<<4);    result += (r3<<2);
   1632                     result = (result + 16) >> 5;
   1633                     CLIP_RESULT(result)
   1634                     tmp_result = *(p_tmp8 += 24);  /* intermediate result */
   1635                     result = (result + tmp_result + 1);  /* no clip */
   1636                     result = (result >> 1);
   1637                     *(p_cur += outpitch) = result;
   1638                     p_ref -= (inpitch << 1);  /* move back to center of the filter of the next one */
   1639                 }
   1640                 p_cur += (curr_offset - 3);
   1641             }
   1642         }
   1643     }
   1644 
   1645     return ;
   1646 }
   1647 
   1648 /* position G */
   1649 void eFullPelMC(uint8 *in, int inpitch, uint8 *out, int outpitch,
   1650                 int blkwidth, int blkheight)
   1651 {
   1652     int i, j;
   1653     int offset_in = inpitch - blkwidth;
   1654     int offset_out = outpitch - blkwidth;
   1655     uint32 temp;
   1656     uint8 byte;
   1657 
   1658     if (((intptr_t)in)&3)
   1659     {
   1660         for (j = blkheight; j > 0; j--)
   1661         {
   1662             for (i = blkwidth; i > 0; i -= 4)
   1663             {
   1664                 temp = *in++;
   1665                 byte = *in++;
   1666                 temp |= (byte << 8);
   1667                 byte = *in++;
   1668                 temp |= (byte << 16);
   1669                 byte = *in++;
   1670                 temp |= (byte << 24);
   1671 
   1672                 *((uint32*)out) = temp; /* write 4 bytes */
   1673                 out += 4;
   1674             }
   1675             out += offset_out;
   1676             in += offset_in;
   1677         }
   1678     }
   1679     else
   1680     {
   1681         for (j = blkheight; j > 0; j--)
   1682         {
   1683             for (i = blkwidth; i > 0; i -= 4)
   1684             {
   1685                 temp = *((uint32*)in);
   1686                 *((uint32*)out) = temp;
   1687                 in += 4;
   1688                 out += 4;
   1689             }
   1690             out += offset_out;
   1691             in += offset_in;
   1692         }
   1693     }
   1694     return ;
   1695 }
   1696 
   1697 void ePadChroma(uint8 *ref, int picwidth, int picheight, int picpitch, int x_pos, int y_pos)
   1698 {
   1699     int pad_height;
   1700     int pad_width;
   1701     uint8 *start;
   1702     uint32 word1, word2, word3;
   1703     int offset, j;
   1704 
   1705 
   1706     pad_height = 8 + ((y_pos & 7) ? 1 : 0);
   1707     pad_width = 8 + ((x_pos & 7) ? 1 : 0);
   1708 
   1709     y_pos >>= 3;
   1710     x_pos >>= 3;
   1711     // pad vertical first
   1712     if (y_pos < 0) // need to pad up
   1713     {
   1714         if (x_pos < -8) start = ref - 8;
   1715         else if (x_pos + pad_width > picwidth + 7) start = ref + picwidth + 7 - pad_width;
   1716         else start = ref + x_pos;
   1717 
   1718         /* word-align start */
   1719         offset = (intptr_t)start & 0x3;
   1720         if (offset) start -= offset;
   1721 
   1722         word1 = *((uint32*)start);
   1723         word2 = *((uint32*)(start + 4));
   1724         word3 = *((uint32*)(start + 8));
   1725 
   1726         /* pad up N rows */
   1727         j = -y_pos;
   1728         if (j > 8) j = 8;
   1729         while (j--)
   1730         {
   1731             *((uint32*)(start -= picpitch)) = word1;
   1732             *((uint32*)(start + 4)) = word2;
   1733             *((uint32*)(start + 8)) = word3;
   1734         }
   1735 
   1736     }
   1737     else if (y_pos + pad_height >= picheight) /* pad down */
   1738     {
   1739         if (x_pos < -8) start = ref + picpitch * (picheight - 1) - 8;
   1740         else if (x_pos + pad_width > picwidth + 7) start = ref + picpitch * (picheight - 1) +
   1741                     picwidth + 7 - pad_width;
   1742         else    start = ref + picpitch * (picheight - 1) + x_pos;
   1743 
   1744         /* word-align start */
   1745         offset = (intptr_t)start & 0x3;
   1746         if (offset) start -= offset;
   1747 
   1748         word1 = *((uint32*)start);
   1749         word2 = *((uint32*)(start + 4));
   1750         word3 = *((uint32*)(start + 8));
   1751 
   1752         /* pad down N rows */
   1753         j = y_pos + pad_height - picheight;
   1754         if (j > 8) j = 8;
   1755         while (j--)
   1756         {
   1757             *((uint32*)(start += picpitch)) = word1;
   1758             *((uint32*)(start + 4)) = word2;
   1759             *((uint32*)(start + 8)) = word3;
   1760         }
   1761     }
   1762 
   1763     /* now pad horizontal */
   1764     if (x_pos < 0) // pad left
   1765     {
   1766         if (y_pos < -8) start = ref - (picpitch << 3);
   1767         else if (y_pos + pad_height > picheight + 7) start = ref + (picheight + 7 - pad_height) * picpitch;
   1768         else start = ref + y_pos * picpitch;
   1769 
   1770         // now pad left 8 pixels for pad_height rows */
   1771         j = pad_height;
   1772         start -= picpitch;
   1773         while (j--)
   1774         {
   1775             word1 = *(start += picpitch);
   1776             word1 |= (word1 << 8);
   1777             word1 |= (word1 << 16);
   1778             *((uint32*)(start - 8)) = word1;
   1779             *((uint32*)(start - 4)) = word1;
   1780         }
   1781     }
   1782     else if (x_pos + pad_width >= picwidth) /* pad right */
   1783     {
   1784         if (y_pos < -8) start = ref - (picpitch << 3) + picwidth - 1;
   1785         else if (y_pos + pad_height > picheight + 7) start = ref + (picheight + 7 - pad_height) * picpitch + picwidth - 1;
   1786         else start = ref + y_pos * picpitch + picwidth - 1;
   1787 
   1788         // now pad right 8 pixels for pad_height rows */
   1789         j = pad_height;
   1790         start -= picpitch;
   1791         while (j--)
   1792         {
   1793             word1 = *(start += picpitch);
   1794             word1 |= (word1 << 8);
   1795             word1 |= (word1 << 16);
   1796             *((uint32*)(start + 1)) = word1;
   1797             *((uint32*)(start + 5)) = word1;
   1798         }
   1799     }
   1800 
   1801     return ;
   1802 }
   1803 
   1804 
   1805 void eChromaMotionComp(uint8 *ref, int picwidth, int picheight,
   1806                        int x_pos, int y_pos,
   1807                        uint8 *pred, int picpitch,
   1808                        int blkwidth, int blkheight)
   1809 {
   1810     int dx, dy;
   1811     int offset_dx, offset_dy;
   1812     int index;
   1813 
   1814     ePadChroma(ref, picwidth, picheight, picpitch, x_pos, y_pos);
   1815 
   1816     dx = x_pos & 7;
   1817     dy = y_pos & 7;
   1818     offset_dx = (dx + 7) >> 3;
   1819     offset_dy = (dy + 7) >> 3;
   1820     x_pos = x_pos >> 3;  /* round it to full-pel resolution */
   1821     y_pos = y_pos >> 3;
   1822 
   1823     ref += y_pos * picpitch + x_pos;
   1824 
   1825     index = offset_dx + (offset_dy << 1) + ((blkwidth << 1) & 0x7);
   1826 
   1827     (*(eChromaMC_SIMD[index]))(ref, picpitch , dx, dy, pred, picpitch, blkwidth, blkheight);
   1828     return ;
   1829 }
   1830 
   1831 
   1832 /* SIMD routines, unroll the loops in vertical direction, decreasing loops (things to be done) */
   1833 void eChromaDiagonalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
   1834                             uint8 *pOut, int predPitch, int blkwidth, int blkheight)
   1835 {
   1836     int32 r0, r1, r2, r3, result0, result1;
   1837     uint8 temp[288];
   1838     uint8 *ref, *out;
   1839     int i, j;
   1840     int dx_8 = 8 - dx;
   1841     int dy_8 = 8 - dy;
   1842 
   1843     /* horizontal first */
   1844     out = temp;
   1845     for (i = 0; i < blkheight + 1; i++)
   1846     {
   1847         ref = pRef;
   1848         r0 = ref[0];
   1849         for (j = 0; j < blkwidth; j += 4)
   1850         {
   1851             r0 |= (ref[2] << 16);
   1852             result0 = dx_8 * r0;
   1853 
   1854             r1 = ref[1] | (ref[3] << 16);
   1855             result0 += dx * r1;
   1856             *(int32 *)out = result0;
   1857 
   1858             result0 = dx_8 * r1;
   1859 
   1860             r2 = ref[4];
   1861             r0 = r0 >> 16;
   1862             r1 = r0 | (r2 << 16);
   1863             result0 += dx * r1;
   1864             *(int32 *)(out + 16) = result0;
   1865 
   1866             ref += 4;
   1867             out += 4;
   1868             r0 = r2;
   1869         }
   1870         pRef += srcPitch;
   1871         out += (32 - blkwidth);
   1872     }
   1873 
   1874 //  pRef -= srcPitch*(blkheight+1);
   1875     ref = temp;
   1876 
   1877     for (j = 0; j < blkwidth; j += 4)
   1878     {
   1879         r0 = *(int32 *)ref;
   1880         r1 = *(int32 *)(ref + 16);
   1881         ref += 32;
   1882         out = pOut;
   1883         for (i = 0; i < (blkheight >> 1); i++)
   1884         {
   1885             result0 = dy_8 * r0 + 0x00200020;
   1886             r2 = *(int32 *)ref;
   1887             result0 += dy * r2;
   1888             result0 >>= 6;
   1889             result0 &= 0x00FF00FF;
   1890             r0 = r2;
   1891 
   1892             result1 = dy_8 * r1 + 0x00200020;
   1893             r3 = *(int32 *)(ref + 16);
   1894             result1 += dy * r3;
   1895             result1 >>= 6;
   1896             result1 &= 0x00FF00FF;
   1897             r1 = r3;
   1898             *(int32 *)out = result0 | (result1 << 8);
   1899             out += predPitch;
   1900             ref += 32;
   1901 
   1902             result0 = dy_8 * r0 + 0x00200020;
   1903             r2 = *(int32 *)ref;
   1904             result0 += dy * r2;
   1905             result0 >>= 6;
   1906             result0 &= 0x00FF00FF;
   1907             r0 = r2;
   1908 
   1909             result1 = dy_8 * r1 + 0x00200020;
   1910             r3 = *(int32 *)(ref + 16);
   1911             result1 += dy * r3;
   1912             result1 >>= 6;
   1913             result1 &= 0x00FF00FF;
   1914             r1 = r3;
   1915             *(int32 *)out = result0 | (result1 << 8);
   1916             out += predPitch;
   1917             ref += 32;
   1918         }
   1919         pOut += 4;
   1920         ref = temp + 4; /* since it can only iterate twice max */
   1921     }
   1922     return;
   1923 }
   1924 
   1925 void eChromaHorizontalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
   1926                               uint8 *pOut, int predPitch, int blkwidth, int blkheight)
   1927 {
   1928     (void)(dy);
   1929 
   1930     int32 r0, r1, r2, result0, result1;
   1931     uint8 *ref, *out;
   1932     int i, j;
   1933     int dx_8 = 8 - dx;
   1934 
   1935     /* horizontal first */
   1936     for (i = 0; i < blkheight; i++)
   1937     {
   1938         ref = pRef;
   1939         out = pOut;
   1940 
   1941         r0 = ref[0];
   1942         for (j = 0; j < blkwidth; j += 4)
   1943         {
   1944             r0 |= (ref[2] << 16);
   1945             result0 = dx_8 * r0 + 0x00040004;
   1946 
   1947             r1 = ref[1] | (ref[3] << 16);
   1948             result0 += dx * r1;
   1949             result0 >>= 3;
   1950             result0 &= 0x00FF00FF;
   1951 
   1952             result1 = dx_8 * r1 + 0x00040004;
   1953 
   1954             r2 = ref[4];
   1955             r0 = r0 >> 16;
   1956             r1 = r0 | (r2 << 16);
   1957             result1 += dx * r1;
   1958             result1 >>= 3;
   1959             result1 &= 0x00FF00FF;
   1960 
   1961             *(int32 *)out = result0 | (result1 << 8);
   1962 
   1963             ref += 4;
   1964             out += 4;
   1965             r0 = r2;
   1966         }
   1967 
   1968         pRef += srcPitch;
   1969         pOut += predPitch;
   1970     }
   1971     return;
   1972 }
   1973 
   1974 void eChromaVerticalMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
   1975                             uint8 *pOut, int predPitch, int blkwidth, int blkheight)
   1976 {
   1977     (void)(dx);
   1978 
   1979     int32 r0, r1, r2, r3, result0, result1;
   1980     int i, j;
   1981     uint8 *ref, *out;
   1982     int dy_8 = 8 - dy;
   1983     /* vertical first */
   1984     for (i = 0; i < blkwidth; i += 4)
   1985     {
   1986         ref = pRef;
   1987         out = pOut;
   1988 
   1989         r0 = ref[0] | (ref[2] << 16);
   1990         r1 = ref[1] | (ref[3] << 16);
   1991         ref += srcPitch;
   1992         for (j = 0; j < blkheight; j++)
   1993         {
   1994             result0 = dy_8 * r0 + 0x00040004;
   1995             r2 = ref[0] | (ref[2] << 16);
   1996             result0 += dy * r2;
   1997             result0 >>= 3;
   1998             result0 &= 0x00FF00FF;
   1999             r0 = r2;
   2000 
   2001             result1 = dy_8 * r1 + 0x00040004;
   2002             r3 = ref[1] | (ref[3] << 16);
   2003             result1 += dy * r3;
   2004             result1 >>= 3;
   2005             result1 &= 0x00FF00FF;
   2006             r1 = r3;
   2007             *(int32 *)out = result0 | (result1 << 8);
   2008             ref += srcPitch;
   2009             out += predPitch;
   2010         }
   2011         pOut += 4;
   2012         pRef += 4;
   2013     }
   2014     return;
   2015 }
   2016 
   2017 void eChromaDiagonalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
   2018                              uint8 *pOut,  int predPitch, int blkwidth, int blkheight)
   2019 {
   2020     (void)(blkwidth);
   2021 
   2022     int32 r0, r1, temp0, temp1, result;
   2023     int32 temp[9];
   2024     int32 *out;
   2025     int i, r_temp;
   2026     int dy_8 = 8 - dy;
   2027 
   2028     /* horizontal first */
   2029     out = temp;
   2030     for (i = 0; i < blkheight + 1; i++)
   2031     {
   2032         r_temp = pRef[1];
   2033         temp0 = (pRef[0] << 3) + dx * (r_temp - pRef[0]);
   2034         temp1 = (r_temp << 3) + dx * (pRef[2] - r_temp);
   2035         r0 = temp0 | (temp1 << 16);
   2036         *out++ = r0;
   2037         pRef += srcPitch;
   2038     }
   2039 
   2040     pRef -= srcPitch * (blkheight + 1);
   2041 
   2042     out = temp;
   2043 
   2044     r0 = *out++;
   2045 
   2046     for (i = 0; i < blkheight; i++)
   2047     {
   2048         result = dy_8 * r0 + 0x00200020;
   2049         r1 = *out++;
   2050         result += dy * r1;
   2051         result >>= 6;
   2052         result &= 0x00FF00FF;
   2053         *(int16 *)pOut = (result >> 8) | (result & 0xFF);
   2054         r0 = r1;
   2055         pOut += predPitch;
   2056     }
   2057     return;
   2058 }
   2059 
   2060 void eChromaHorizontalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
   2061                                uint8 *pOut, int predPitch, int blkwidth, int blkheight)
   2062 {
   2063     (void)(dy);
   2064     (void)(blkwidth);
   2065 
   2066     int i, temp, temp0, temp1;
   2067 
   2068     /* horizontal first */
   2069     for (i = 0; i < blkheight; i++)
   2070     {
   2071         temp = pRef[1];
   2072         temp0 = ((pRef[0] << 3) + dx * (temp - pRef[0]) + 4) >> 3;
   2073         temp1 = ((temp << 3) + dx * (pRef[2] - temp) + 4) >> 3;
   2074 
   2075         *(int16 *)pOut = temp0 | (temp1 << 8);
   2076         pRef += srcPitch;
   2077         pOut += predPitch;
   2078 
   2079     }
   2080     return;
   2081 }
   2082 void eChromaVerticalMC2_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
   2083                              uint8 *pOut, int predPitch, int blkwidth, int blkheight)
   2084 {
   2085     (void)(dx);
   2086     (void)(blkwidth);
   2087 
   2088     int32 r0, r1, result;
   2089     int i;
   2090     int dy_8 = 8 - dy;
   2091     r0 = pRef[0] | (pRef[1] << 16);
   2092     pRef += srcPitch;
   2093     for (i = 0; i < blkheight; i++)
   2094     {
   2095         result = dy_8 * r0 + 0x00040004;
   2096         r1 = pRef[0] | (pRef[1] << 16);
   2097         result += dy * r1;
   2098         result >>= 3;
   2099         result &= 0x00FF00FF;
   2100         *(int16 *)pOut = (result >> 8) | (result & 0xFF);
   2101         r0 = r1;
   2102         pRef += srcPitch;
   2103         pOut += predPitch;
   2104     }
   2105     return;
   2106 }
   2107 
   2108 void eChromaFullMC_SIMD(uint8 *pRef, int srcPitch, int dx, int dy,
   2109                         uint8 *pOut, int predPitch, int blkwidth, int blkheight)
   2110 {
   2111     (void)(dx);
   2112     (void)(dy);
   2113 
   2114     int i, j;
   2115     int offset_in = srcPitch - blkwidth;
   2116     int offset_out = predPitch - blkwidth;
   2117     uint16 temp;
   2118     uint8 byte;
   2119 
   2120     if (((intptr_t)pRef)&1)
   2121     {
   2122         for (j = blkheight; j > 0; j--)
   2123         {
   2124             for (i = blkwidth; i > 0; i -= 2)
   2125             {
   2126                 temp = *pRef++;
   2127                 byte = *pRef++;
   2128                 temp |= (byte << 8);
   2129                 *((uint16*)pOut) = temp; /* write 2 bytes */
   2130                 pOut += 2;
   2131             }
   2132             pOut += offset_out;
   2133             pRef += offset_in;
   2134         }
   2135     }
   2136     else
   2137     {
   2138         for (j = blkheight; j > 0; j--)
   2139         {
   2140             for (i = blkwidth; i > 0; i -= 2)
   2141             {
   2142                 temp = *((uint16*)pRef);
   2143                 *((uint16*)pOut) = temp;
   2144                 pRef += 2;
   2145                 pOut += 2;
   2146             }
   2147             pOut += offset_out;
   2148             pRef += offset_in;
   2149         }
   2150     }
   2151     return ;
   2152 }
   2153