Home | History | Annotate | Download | only in src
      1 /* ------------------------------------------------------------------
      2  * Copyright (C) 1998-2009 PacketVideo
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
     13  * express or implied.
     14  * See the License for the specific language governing permissions
     15  * and limitations under the License.
     16  * -------------------------------------------------------------------
     17  */
     18 #include "mp4def.h"
     19 #include "mp4lib_int.h"
     20 #include "mp4enc_lib.h"
     21 #include "dct.h"
     22 #include "m4venc_oscl.h"
     23 
     24 /* ======================================================================== */
     25 /*  Function : CodeMB_H263( )                                               */
     26 /*  Date     : 8/15/2001                                                    */
     27 /*  Purpose  : Perform residue calc (only zero MV), DCT, H263 Quant/Dequant,*/
     28 /*              IDCT and motion compensation.Modified from FastCodeMB()     */
     29 /*  Input    :                                                              */
     30 /*      video       Video encoder data structure                            */
     31 /*      function    Approximate DCT function, scaling and threshold         */
     32 /*      ncoefblck   Array for last nonzero coeff for speedup in VlcEncode   */
     33 /*      QP      Combined offset from the origin to the current          */
     34 /*                  macroblock  and QP  for current MB.                     */
     35 /*    Output     :                                                          */
     36 /*      video->outputMB     Quantized DCT coefficients.                     */
     37 /*      currVop->yChan,uChan,vChan  Reconstructed pixels                    */
     38 /*                                                                          */
     39 /*  Return   :   PV_STATUS                                                  */
     40 /*  Modified :                                                              */
     41 /*           2/26/01
     42             -modified threshold based on correlation coeff 0.75 only for mode H.263
     43             -ncoefblck[] as input,  to keep position of last non-zero coeff*/
     44 /*           8/10/01
     45             -modified threshold based on correlation coeff 0.5
     46             -used column threshold to speedup column DCT.
     47             -used bitmap zigzag to speedup RunLevel().                      */
     48 /* ======================================================================== */
     49 
     50 PV_STATUS CodeMB_H263(VideoEncData *video, approxDCT *function, Int QP, Int ncoefblck[])
     51 {
     52     Int sad, k, CBP, mbnum = video->mbnum;
     53     Short *output, *dataBlock;
     54     UChar Mode = video->headerInfo.Mode[mbnum];
     55     UChar *bitmapcol, *bitmaprow = video->bitmaprow;
     56     UInt  *bitmapzz ;
     57     UChar shortHeader = video->vol[video->currLayer]->shortVideoHeader;
     58     Int dc_scaler = 8;
     59     Int intra = (Mode == MODE_INTRA || Mode == MODE_INTRA_Q);
     60     struct QPstruct QuantParam;
     61     Int dctMode, DctTh1;
     62     Int ColTh;
     63     Int(*BlockQuantDequantH263)(Short *, Short *, struct QPstruct *,
     64                                 UChar[], UChar *, UInt *, Int, Int, Int, UChar);
     65     Int(*BlockQuantDequantH263DC)(Short *, Short *, struct QPstruct *,
     66                                   UChar *, UInt *, Int, UChar);
     67     void (*BlockDCT1x1)(Short *, UChar *, UChar *, Int);
     68     void (*BlockDCT2x2)(Short *, UChar *, UChar *, Int);
     69     void (*BlockDCT4x4)(Short *, UChar *, UChar *, Int);
     70     void (*BlockDCT8x8)(Short *, UChar *, UChar *, Int);
     71 
     72     /* motion comp. related var. */
     73     Vop *currVop = video->currVop;
     74     VideoEncFrameIO *inputFrame = video->input;
     75     Int ind_x = video->outputMB->mb_x;
     76     Int ind_y = video->outputMB->mb_y;
     77     Int lx = currVop->pitch;
     78     Int width = currVop->width;
     79     UChar *rec, *input, *pred;
     80     Int offset = QP >> 5;  /* QP is combined offset and QP */
     81     Int offsetc = (offset >> 2) + (ind_x << 2); /* offset for chrom */
     82     /*****************************/
     83 
     84     OSCL_UNUSED_ARG(function);
     85 
     86     output = video->outputMB->block[0];
     87     CBP = 0;
     88     QP = QP & 0x1F;
     89 //  M4VENC_MEMSET(output,0,(sizeof(Short)<<6)*6); /* reset quantized coeff. to zero , 7/24/01*/
     90 
     91     QuantParam.QPx2 = QP << 1;
     92     QuantParam.QP = QP;
     93     QuantParam.QPdiv2 = QP >> 1;
     94     QuantParam.QPx2plus = QuantParam.QPx2 + QuantParam.QPdiv2;
     95     QuantParam.Addition = QP - 1 + (QP & 0x1);
     96 
     97     if (intra)
     98     {
     99         BlockDCT1x1 = &Block1x1DCTIntra;
    100         BlockDCT2x2 = &Block2x2DCT_AANIntra;
    101         BlockDCT4x4 = &Block4x4DCT_AANIntra;
    102         BlockDCT8x8 = &BlockDCT_AANIntra;
    103         BlockQuantDequantH263 = &BlockQuantDequantH263Intra;
    104         BlockQuantDequantH263DC = &BlockQuantDequantH263DCIntra;
    105         if (shortHeader)
    106         {
    107             dc_scaler = 8;
    108         }
    109         else
    110         {
    111             dc_scaler = cal_dc_scalerENC(QP, 1); /* luminance blocks */
    112         }
    113         DctTh1 = (Int)(dc_scaler * 3);//*1.829
    114         ColTh = ColThIntra[QP];
    115     }
    116     else
    117     {
    118         BlockDCT1x1 = &Block1x1DCTwSub;
    119         BlockDCT2x2 = &Block2x2DCT_AANwSub;
    120         BlockDCT4x4 = &Block4x4DCT_AANwSub;
    121         BlockDCT8x8 = &BlockDCT_AANwSub;
    122 
    123         BlockQuantDequantH263 = &BlockQuantDequantH263Inter;
    124         BlockQuantDequantH263DC = &BlockQuantDequantH263DCInter;
    125         ColTh = ColThInter[QP];
    126         DctTh1 = (Int)(16 * QP);  //9*QP;
    127     }
    128 
    129     rec = currVop->yChan + offset;
    130     input = inputFrame->yChan + offset;
    131     if (lx != width) input -= (ind_y << 9);  /* non-padded offset */
    132 
    133     dataBlock = video->dataBlock;
    134     pred = video->predictedMB;
    135 
    136     for (k = 0; k < 6; k++)
    137     {
    138         CBP <<= 1;
    139         bitmapcol = video->bitmapcol[k];
    140         bitmapzz = video->bitmapzz[k];  /*  7/30/01 */
    141         if (k < 4)
    142         {
    143             sad = video->mot[mbnum][k+1].sad;
    144             if (k&1)
    145             {
    146                 rec += 8;
    147                 input += 8;
    148             }
    149             else if (k == 2)
    150             {
    151                 dctMode = ((width << 3) - 8);
    152                 input += dctMode;
    153                 dctMode = ((lx << 3) - 8);
    154                 rec += dctMode;
    155             }
    156         }
    157         else
    158         {
    159             if (k == 4)
    160             {
    161                 rec = currVop->uChan + offsetc;
    162                 input = inputFrame->uChan + offsetc;
    163                 if (lx != width) input -= (ind_y << 7);
    164                 lx >>= 1;
    165                 width >>= 1;
    166                 if (intra)
    167                 {
    168                     sad = getBlockSum(input, width);
    169                     if (shortHeader)
    170                         dc_scaler = 8;
    171                     else
    172                     {
    173                         dc_scaler = cal_dc_scalerENC(QP, 2); /* chrominance blocks */
    174                     }
    175                     DctTh1 = (Int)(dc_scaler * 3);//*1.829
    176                 }
    177                 else
    178                     sad = Sad8x8(input, pred, width);
    179             }
    180             else
    181             {
    182                 rec = currVop->vChan + offsetc;
    183                 input = inputFrame->vChan + offsetc;
    184                 if (lx != width) input -= (ind_y << 7);
    185                 if (intra)
    186                 {
    187                     sad = getBlockSum(input, width);
    188                 }
    189                 else
    190                     sad = Sad8x8(input, pred, width);
    191             }
    192         }
    193 
    194         if (sad < DctTh1 && !(shortHeader && intra)) /* all-zero */
    195         {                       /* For shortHeader intra block, DC value cannot be zero */
    196             dctMode = 0;
    197             CBP |= 0;
    198             ncoefblck[k] = 0;
    199         }
    200         else if (sad < 18*QP/*(QP<<4)*/) /* DC-only */
    201         {
    202             dctMode = 1;
    203             BlockDCT1x1(dataBlock, input, pred, width);
    204 
    205             CBP |= (*BlockQuantDequantH263DC)(dataBlock, output, &QuantParam,
    206                                               bitmaprow + k, bitmapzz, dc_scaler, shortHeader);
    207             ncoefblck[k] = 1;
    208         }
    209         else
    210         {
    211 
    212             dataBlock[64] = ColTh;
    213 
    214             if (sad < 22*QP/*(QP<<4)+(QP<<1)*/)  /* 2x2 DCT */
    215             {
    216                 dctMode = 2;
    217                 BlockDCT2x2(dataBlock, input, pred, width);
    218                 ncoefblck[k] = 6;
    219             }
    220             else if (sad < (QP << 5)) /* 4x4 DCT */
    221             {
    222                 dctMode = 4;
    223                 BlockDCT4x4(dataBlock, input, pred, width);
    224                 ncoefblck[k] = 26;
    225             }
    226             else /* Full-DCT */
    227             {
    228                 dctMode = 8;
    229                 BlockDCT8x8(dataBlock, input, pred, width);
    230                 ncoefblck[k] = 64;
    231             }
    232 
    233             CBP |= (*BlockQuantDequantH263)(dataBlock, output, &QuantParam,
    234                                             bitmapcol, bitmaprow + k, bitmapzz, dctMode, k, dc_scaler, shortHeader);
    235         }
    236         BlockIDCTMotionComp(dataBlock, bitmapcol, bitmaprow[k], dctMode, rec, pred, (lx << 1) | intra);
    237         output += 64;
    238         if (!(k&1))
    239         {
    240             pred += 8;
    241         }
    242         else
    243         {
    244             pred += 120;
    245         }
    246     }
    247 
    248     video->headerInfo.CBP[mbnum] = CBP; /*  5/18/2001 */
    249     return PV_SUCCESS;
    250 }
    251 
    252 #ifndef NO_MPEG_QUANT
    253 /* ======================================================================== */
    254 /*  Function : CodeMB_MPEG( )                                               */
    255 /*  Date     : 8/15/2001                                                    */
    256 /*  Purpose  : Perform residue calc (only zero MV), DCT, MPEG Quant/Dequant,*/
    257 /*              IDCT and motion compensation.Modified from FastCodeMB()     */
    258 /*  Input    :                                                              */
    259 /*      video       Video encoder data structure                            */
    260 /*      function    Approximate DCT function, scaling and threshold         */
    261 /*      ncoefblck   Array for last nonzero coeff for speedup in VlcEncode   */
    262 /*      QP      Combined offset from the origin to the current          */
    263 /*                  macroblock  and QP  for current MB.                     */
    264 /*    Output     :                                                          */
    265 /*      video->outputMB     Quantized DCT coefficients.                     */
    266 /*      currVop->yChan,uChan,vChan  Reconstructed pixels                    */
    267 /*                                                                          */
    268 /*  Return   :   PV_STATUS                                                  */
    269 /*  Modified :                                                              */
    270 /*           2/26/01
    271             -modified threshold based on correlation coeff 0.75 only for mode H.263
    272             -ncoefblck[] as input, keep position of last non-zero coeff*/
    273 /*           8/10/01
    274             -modified threshold based on correlation coeff 0.5
    275             -used column threshold to speedup column DCT.
    276             -used bitmap zigzag to speedup RunLevel().                      */
    277 /* ======================================================================== */
    278 
    279 PV_STATUS CodeMB_MPEG(VideoEncData *video, approxDCT *function, Int QP, Int ncoefblck[])
    280 {
    281     Int sad, k, CBP, mbnum = video->mbnum;
    282     Short *output, *dataBlock;
    283     UChar Mode = video->headerInfo.Mode[mbnum];
    284     UChar *bitmapcol, *bitmaprow = video->bitmaprow;
    285     UInt  *bitmapzz ;
    286     Int dc_scaler = 8;
    287     Vol *currVol = video->vol[video->currLayer];
    288     Int intra = (Mode == MODE_INTRA || Mode == MODE_INTRA_Q);
    289     Int *qmat;
    290     Int dctMode, DctTh1, DctTh2, DctTh3, DctTh4;
    291     Int ColTh;
    292 
    293     Int(*BlockQuantDequantMPEG)(Short *, Short *, Int, Int *,
    294                                 UChar [], UChar *, UInt *, Int,  Int, Int);
    295     Int(*BlockQuantDequantMPEGDC)(Short *, Short *, Int, Int *,
    296                                   UChar [], UChar *, UInt *, Int);
    297 
    298     void (*BlockDCT1x1)(Short *, UChar *, UChar *, Int);
    299     void (*BlockDCT2x2)(Short *, UChar *, UChar *, Int);
    300     void (*BlockDCT4x4)(Short *, UChar *, UChar *, Int);
    301     void (*BlockDCT8x8)(Short *, UChar *, UChar *, Int);
    302 
    303     /* motion comp. related var. */
    304     Vop *currVop = video->currVop;
    305     VideoEncFrameIO *inputFrame = video->input;
    306     Int ind_x = video->outputMB->mb_x;
    307     Int ind_y = video->outputMB->mb_y;
    308     Int lx = currVop->pitch;
    309     Int width = currVop->width;
    310     UChar *rec, *input, *pred;
    311     Int offset = QP >> 5;
    312     Int offsetc = (offset >> 2) + (ind_x << 2); /* offset for chrom */
    313     /*****************************/
    314 
    315     OSCL_UNUSED_ARG(function);
    316 
    317     output = video->outputMB->block[0];
    318     CBP = 0;
    319     QP = QP & 0x1F;
    320 //  M4VENC_MEMSET(output,0,(sizeof(Short)<<6)*6); /* reset quantized coeff. to zero ,  7/24/01*/
    321 
    322     if (intra)
    323     {
    324         BlockDCT1x1 = &Block1x1DCTIntra;
    325         BlockDCT2x2 = &Block2x2DCT_AANIntra;
    326         BlockDCT4x4 = &Block4x4DCT_AANIntra;
    327         BlockDCT8x8 = &BlockDCT_AANIntra;
    328 
    329         BlockQuantDequantMPEG = &BlockQuantDequantMPEGIntra;
    330         BlockQuantDequantMPEGDC = &BlockQuantDequantMPEGDCIntra;
    331         dc_scaler = cal_dc_scalerENC(QP, 1); /* luminance blocks */
    332         qmat = currVol->iqmat;
    333         DctTh1 = (Int)(3 * dc_scaler);//2*dc_scaler);
    334         DctTh2 = (Int)((1.25 * QP - 1) * qmat[1] * 0.45);//0.567);//0.567);
    335         DctTh3 = (Int)((1.25 * QP - 1) * qmat[2] * 0.55);//1.162); /*  8/2/2001 */
    336         DctTh4 = (Int)((1.25 * QP - 1) * qmat[32] * 0.8);//1.7583);//0.7942);
    337         ColTh = ColThIntra[QP];
    338     }
    339     else
    340     {
    341         BlockDCT1x1 = &Block1x1DCTwSub;
    342         BlockDCT2x2 = &Block2x2DCT_AANwSub;
    343         BlockDCT4x4 = &Block4x4DCT_AANwSub;
    344         BlockDCT8x8 = &BlockDCT_AANwSub;
    345 
    346         BlockQuantDequantMPEG = &BlockQuantDequantMPEGInter;
    347         BlockQuantDequantMPEGDC = &BlockQuantDequantMPEGDCInter;
    348         qmat = currVol->niqmat;
    349         DctTh1 = (Int)(((QP << 1) - 0.5) * qmat[0] * 0.4);//0.2286);//0.3062);
    350         DctTh2 = (Int)(((QP << 1) - 0.5) * qmat[1] * 0.45);//0.567);//0.4);
    351         DctTh3 = (Int)(((QP << 1) - 0.5) * qmat[2] * 0.55);//1.162); /*  8/2/2001 */
    352         DctTh4 = (Int)(((QP << 1) - 0.5) * qmat[32] * 0.8);//1.7583);//0.7942);
    353         ColTh = ColThInter[QP];
    354     }// get qmat, DctTh1, DctTh2, DctTh3
    355 
    356     rec = currVop->yChan + offset;
    357     input = inputFrame->yChan + offset;
    358     if (lx != width) input -= (ind_y << 9);  /* non-padded offset */
    359 
    360     dataBlock = video->dataBlock;
    361     pred = video->predictedMB;
    362 
    363     for (k = 0; k < 6; k++)
    364     {
    365         CBP <<= 1;
    366         bitmapcol = video->bitmapcol[k];
    367         bitmapzz = video->bitmapzz[k];  /*  8/2/01 */
    368         if (k < 4)
    369         {//Y block
    370             sad = video->mot[mbnum][k+1].sad;
    371             if (k&1)
    372             {
    373                 rec += 8;
    374                 input += 8;
    375             }
    376             else if (k == 2)
    377             {
    378                 dctMode = ((width << 3) - 8);
    379                 input += dctMode;
    380                 dctMode = ((lx << 3) - 8);
    381                 rec += dctMode;
    382             }
    383         }
    384         else
    385         {// U, V block
    386             if (k == 4)
    387             {
    388                 rec = currVop->uChan + offsetc;
    389                 input = inputFrame->uChan + offsetc;
    390                 if (lx != width) input -= (ind_y << 7);
    391                 lx >>= 1;
    392                 width >>= 1;
    393                 if (intra)
    394                 {
    395                     dc_scaler = cal_dc_scalerENC(QP, 2); /* luminance blocks */
    396                     DctTh1 = dc_scaler * 3;
    397                     sad = getBlockSum(input, width);
    398                 }
    399                 else
    400                     sad = Sad8x8(input, pred, width);
    401             }
    402             else
    403             {
    404                 rec = currVop->vChan + offsetc;
    405                 input = inputFrame->vChan + offsetc;
    406                 if (lx != width) input -= (ind_y << 7);
    407                 if (intra)
    408                     sad = getBlockSum(input, width);
    409                 else
    410                     sad = Sad8x8(input, pred, width);
    411             }
    412         }
    413 
    414         if (sad < DctTh1) /* all-zero */
    415         {
    416             dctMode = 0;
    417             CBP |= 0;
    418             ncoefblck[k] = 0;
    419         }
    420         else if (sad < DctTh2) /* DC-only */
    421         {
    422             dctMode = 1;
    423             BlockDCT1x1(dataBlock, input, pred, width);
    424 
    425             CBP |= (*BlockQuantDequantMPEGDC)(dataBlock, output, QP, qmat,
    426                                               bitmapcol, bitmaprow + k, bitmapzz, dc_scaler);
    427             ncoefblck[k] = 1;
    428         }
    429         else
    430         {
    431             dataBlock[64] = ColTh;
    432 
    433             if (sad < DctTh3) /* 2x2-DCT */
    434             {
    435                 dctMode = 2;
    436                 BlockDCT2x2(dataBlock, input, pred, width);
    437                 ncoefblck[k] = 6;
    438             }
    439             else if (sad < DctTh4) /* 4x4 DCT */
    440             {
    441                 dctMode = 4;
    442                 BlockDCT4x4(dataBlock, input, pred, width);
    443                 ncoefblck[k] = 26;
    444             }
    445             else /* full-DCT */
    446             {
    447                 dctMode = 8;
    448                 BlockDCT8x8(dataBlock, input, pred, width);
    449                 ncoefblck[k] = 64;
    450             }
    451 
    452             CBP |= (*BlockQuantDequantMPEG)(dataBlock, output, QP, qmat,
    453                                             bitmapcol, bitmaprow + k, bitmapzz, dctMode, k, dc_scaler); //
    454         }
    455         dctMode = 8; /* for mismatch handle */
    456         BlockIDCTMotionComp(dataBlock, bitmapcol, bitmaprow[k], dctMode, rec, pred, (lx << 1) | (intra));
    457 
    458         output += 64;
    459         if (!(k&1))
    460         {
    461             pred += 8;
    462         }
    463         else
    464         {
    465             pred += 120;
    466         }
    467     }
    468 
    469     video->headerInfo.CBP[mbnum] = CBP; /*  5/18/2001 */
    470     return PV_SUCCESS;
    471 }
    472 
    473 #endif
    474 
    475 /* ======================================================================== */
    476 /*  Function : getBlockSAV( )                                               */
    477 /*  Date     : 8/10/2000                                                    */
    478 /*  Purpose  : Get SAV for one block                                        */
    479 /*  In/out   : block[64] contain one block data                             */
    480 /*  Return   :                                                              */
    481 /*  Modified :                                                              */
    482 /* ======================================================================== */
    483 /* can be written in MMX or SSE,  2/22/2001 */
    484 Int getBlockSAV(Short block[])
    485 {
    486     Int i, val, sav = 0;
    487 
    488     i = 8;
    489     while (i--)
    490     {
    491         val = *block++;
    492         if (val > 0)    sav += val;
    493         else        sav -= val;
    494         val = *block++;
    495         if (val > 0)    sav += val;
    496         else        sav -= val;
    497         val = *block++;
    498         if (val > 0)    sav += val;
    499         else        sav -= val;
    500         val = *block++;
    501         if (val > 0)    sav += val;
    502         else        sav -= val;
    503         val = *block++;
    504         if (val > 0)    sav += val;
    505         else        sav -= val;
    506         val = *block++;
    507         if (val > 0)    sav += val;
    508         else        sav -= val;
    509         val = *block++;
    510         if (val > 0)    sav += val;
    511         else        sav -= val;
    512         val = *block++;
    513         if (val > 0)    sav += val;
    514         else        sav -= val;
    515     }
    516 
    517     return sav;
    518 
    519 }
    520 
    521 /* ======================================================================== */
    522 /*  Function : Sad8x8( )                                                    */
    523 /*  Date     : 8/10/2000                                                    */
    524 /*  Purpose  : Find SAD between prev block and current block                */
    525 /*  In/out   : Previous and current frame block pointers, and frame width   */
    526 /*  Return   :                                                              */
    527 /*  Modified :                                                              */
    528 /*      8/15/01,  - do 4 pixel at a time    assuming 32 bit register        */
    529 /* ======================================================================== */
    530 Int Sad8x8(UChar *cur, UChar *prev, Int width)
    531 {
    532     UChar *end = cur + (width << 3);
    533     Int sad = 0;
    534     Int *curInt = (Int*) cur;
    535     Int *prevInt = (Int*) prev;
    536     Int cur1, cur2, prev1, prev2;
    537     UInt mask, sgn_msk = 0x80808080;
    538     Int  sum2 = 0, sum4 = 0;
    539     Int  tmp;
    540     do
    541     {
    542         mask    = ~(0xFF00);
    543         cur1    = curInt[1];        /* load cur[4..7] */
    544         cur2    = curInt[0];
    545         curInt += (width >> 2);     /* load cur[0..3] and +=lx */
    546         prev1   = prevInt[1];
    547         prev2   = prevInt[0];
    548         prevInt += 4;
    549 
    550         tmp     = prev2 ^ cur2;
    551         cur2    = prev2 - cur2;
    552         tmp     = tmp ^ cur2;       /* (^)^(-) last bit is one if carry */
    553         tmp     = sgn_msk & ((UInt)tmp >> 1); /* check the sign of each byte */
    554         if (cur2 < 0)   tmp = tmp | 0x80000000; /* corcurt sign of first byte */
    555         tmp     = (tmp << 8) - tmp;     /* carry borrowed bytes are marked with 0x1FE */
    556         cur2    = cur2 + (tmp >> 7);     /* negative bytes is added with 0xFF, -1 */
    557         cur2    = cur2 ^(tmp >> 7); /* take absolute by inverting bits (EOR) */
    558 
    559         tmp     = prev1 ^ cur1;
    560         cur1    = prev1 - cur1;
    561         tmp     = tmp ^ cur1;       /* (^)^(-) last bit is one if carry */
    562         tmp     = sgn_msk & ((UInt)tmp >> 1); /* check the sign of each byte */
    563         if (cur1 < 0)   tmp = tmp | 0x80000000; /* corcurt sign of first byte */
    564         tmp     = (tmp << 8) - tmp;     /* carry borrowed bytes are marked with 0x1FE */
    565         cur1    = cur1 + (tmp >> 7);     /* negative bytes is added with 0xFF, -1 */
    566         cur1    = cur1 ^(tmp >> 7); /* take absolute by inverting bits (EOR) */
    567 
    568         sum4    = sum4 + cur1;
    569         cur1    = cur1 & (mask << 8);   /* mask first and third bytes */
    570         sum2    = sum2 + ((UInt)cur1 >> 8);
    571         sum4    = sum4 + cur2;
    572         cur2    = cur2 & (mask << 8);   /* mask first and third bytes */
    573         sum2    = sum2 + ((UInt)cur2 >> 8);
    574     }
    575     while ((UInt)curInt < (UInt)end);
    576 
    577     cur1 = sum4 - (sum2 << 8);  /* get even-sum */
    578     cur1 = cur1 + sum2;         /* add 16 bit even-sum and odd-sum*/
    579     cur1 = cur1 + (cur1 << 16); /* add upper and lower 16 bit sum */
    580     sad  = ((UInt)cur1 >> 16);  /* take upper 16 bit */
    581     return sad;
    582 }
    583 
    584 /* ======================================================================== */
    585 /*  Function : getBlockSum( )                                               */
    586 /*  Date     : 8/10/2000                                                    */
    587 /*  Purpose  : Find summation of value within a block.                      */
    588 /*  In/out   : Pointer to current block in a frame and frame width          */
    589 /*  Return   :                                                              */
    590 /*  Modified :                                                              */
    591 /*          8/15/01,  - SIMD 4 pixels at a time                         */
    592 /* ======================================================================== */
    593 
    594 Int getBlockSum(UChar *cur, Int width)
    595 {
    596     Int sad = 0, sum4 = 0, sum2 = 0;
    597     UChar *end = cur + (width << 3);
    598     Int *curInt = (Int*)cur;
    599     UInt mask   = ~(0xFF00);
    600     Int load1, load2;
    601 
    602     do
    603     {
    604         load1 = curInt[1];
    605         load2 = curInt[0];
    606         curInt += (width >> 2);
    607         sum4 += load1;
    608         load1 = load1 & (mask << 8); /* even bytes */
    609         sum2 += ((UInt)load1 >> 8); /* sum even bytes, 16 bit */
    610         sum4 += load2;
    611         load2 = load2 & (mask << 8); /* even bytes */
    612         sum2 += ((UInt)load2 >> 8); /* sum even bytes, 16 bit */
    613     }
    614     while ((UInt)curInt < (UInt)end);
    615     load1 = sum4 - (sum2 << 8);     /* get even-sum */
    616     load1 = load1 + sum2;           /* add 16 bit even-sum and odd-sum*/
    617     load1 = load1 + (load1 << 16);  /* add upper and lower 16 bit sum */
    618     sad  = ((UInt)load1 >> 16); /* take upper 16 bit */
    619 
    620     return sad;
    621 }
    622 
    623