Home | History | Annotate | Download | only in src
      1 /* ------------------------------------------------------------------
      2  * Copyright (C) 1998-2009 PacketVideo
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
     13  * express or implied.
     14  * See the License for the specific language governing permissions
     15  * and limitations under the License.
     16  * -------------------------------------------------------------------
     17  */
     18 /*
     19 
     20 ------------------------------------------------------------------------------
     21  REVISION HISTORY
     22  Who:   Date: July/2001
     23  Description:   1. Optimized BlockIDCT bitmap checking.
     24                 2. Rearranged functions.
     25                 3. Do column IDCT first, then row IDCT.
     26                 4. Combine motion comp and IDCT, require
     27                    two sets of row IDCTs one for INTRA
     28                    and one for INTER.
     29                 5. Add AAN IDCT
     30 
     31  Who:   Date: 8/16/01
                1. Increase the input precision to 8 bits, i.e. change RDCTBITS
                   to 11. All in-line assembly had to be commented out since 16-bit
                   multiplication doesn't work. Tried to use a different precision with
                   32-bit multiplication, but that work was not finished. It turns out that
                   without in-line assembly the performance doesn't change much (only 1%).
     37  Who:   Date: 9/04/05
     38                 1. Replace AAN IDCT with Chen's IDCT to accommodate 16 bit data type.
     39 
     40 */
     41 #include "mp4def.h"
     42 #include "mp4enc_lib.h"
     43 #include "mp4lib_int.h"
     44 #include "dct.h"
     45 
     46 #define ADD_CLIP    { \
     47             tmp = *rec + tmp; \
     48         if((UInt)tmp > mask) tmp = mask&(~(tmp>>31)); \
     49         *rec++ = tmp;   \
     50         }
     51 
     52 #define INTRA_CLIP  { \
     53         if((UInt)tmp > mask) tmp = mask&(~(tmp>>31)); \
     54         *rec++ = tmp;   \
     55         }
     56 
     57 
     58 #define CLIP_RESULT(x)      if((UInt)x > 0xFF){x = 0xFF & (~(x>>31));}
     59 #define ADD_AND_CLIP1(x)    x += (pred_word&0xFF); CLIP_RESULT(x);
     60 #define ADD_AND_CLIP2(x)    x += ((pred_word>>8)&0xFF); CLIP_RESULT(x);
     61 #define ADD_AND_CLIP3(x)    x += ((pred_word>>16)&0xFF); CLIP_RESULT(x);
     62 #define ADD_AND_CLIP4(x)    x += ((pred_word>>24)&0xFF); CLIP_RESULT(x);
     63 
     64 
     65 void idct_col0(Short *blk)
     66 {
     67     OSCL_UNUSED_ARG(blk);
     68 
     69     return;
     70 }
     71 
     72 void idct_col1(Short *blk)
     73 {
     74     blk[0] = blk[8] = blk[16] = blk[24] = blk[32] = blk[40] = blk[48] = blk[56] =
     75                                               blk[0] << 3;
     76     return ;
     77 }
     78 
     79 void idct_col2(Short *blk)
     80 {
     81     int32 x0, x1, x3, x5, x7;//, x8;
     82 
     83     x1 = blk[8];
     84     x0 = ((int32)blk[0] << 11) + 128;
     85     /* both upper and lower*/
     86 
     87     x7 = W7 * x1;
     88     x1 = W1 * x1;
     89 
     90     x3 = x7;
     91     x5 = (181 * (x1 - x7) + 128) >> 8;
     92     x7 = (181 * (x1 + x7) + 128) >> 8;
     93 
     94     blk[0] = (x0 + x1) >> 8;
     95     blk[8] = (x0 + x7) >> 8;
     96     blk[16] = (x0 + x5) >> 8;
     97     blk[24] = (x0 + x3) >> 8;
     98     blk[56] = (x0 - x1) >> 8;
     99     blk[48] = (x0 - x7) >> 8;
    100     blk[40] = (x0 - x5) >> 8;
    101     blk[32] = (x0 - x3) >> 8;
    102     return ;
    103 }
    104 
    105 void idct_col3(Short *blk)
    106 {
    107     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    108 
    109     x2 = blk[16];
    110     x1 = blk[8];
    111     x0 = ((int32)blk[0] << 11) + 128;
    112 
    113     x4 = x0;
    114     x6 = W6 * x2;
    115     x2 = W2 * x2;
    116     x8 = x0 - x2;
    117     x0 += x2;
    118     x2 = x8;
    119     x8 = x4 - x6;
    120     x4 += x6;
    121     x6 = x8;
    122 
    123     x7 = W7 * x1;
    124     x1 = W1 * x1;
    125     x3 = x7;
    126     x5 = (181 * (x1 - x7) + 128) >> 8;
    127     x7 = (181 * (x1 + x7) + 128) >> 8;
    128 
    129     blk[0] = (x0 + x1) >> 8;
    130     blk[8] = (x4 + x7) >> 8;
    131     blk[16] = (x6 + x5) >> 8;
    132     blk[24] = (x2 + x3) >> 8;
    133     blk[56] = (x0 - x1) >> 8;
    134     blk[48] = (x4 - x7) >> 8;
    135     blk[40] = (x6 - x5) >> 8;
    136     blk[32] = (x2 - x3) >> 8;
    137     return ;
    138 }
    139 
    140 void idct_col4(Short *blk)
    141 {
    142     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    143     x2 = blk[16];
    144     x1 = blk[8];
    145     x3 = blk[24];
    146     x0 = ((int32)blk[0] << 11) + 128;
    147 
    148     x4 = x0;
    149     x6 = W6 * x2;
    150     x2 = W2 * x2;
    151     x8 = x0 - x2;
    152     x0 += x2;
    153     x2 = x8;
    154     x8 = x4 - x6;
    155     x4 += x6;
    156     x6 = x8;
    157 
    158     x7 = W7 * x1;
    159     x1 = W1 * x1;
    160     x5 = W3 * x3;
    161     x3 = -W5 * x3;
    162     x8 = x1 - x5;
    163     x1 += x5;
    164     x5 = x8;
    165     x8 = x7 - x3;
    166     x3 += x7;
    167     x7 = (181 * (x5 + x8) + 128) >> 8;
    168     x5 = (181 * (x5 - x8) + 128) >> 8;
    169 
    170 
    171     blk[0] = (x0 + x1) >> 8;
    172     blk[8] = (x4 + x7) >> 8;
    173     blk[16] = (x6 + x5) >> 8;
    174     blk[24] = (x2 + x3) >> 8;
    175     blk[56] = (x0 - x1) >> 8;
    176     blk[48] = (x4 - x7) >> 8;
    177     blk[40] = (x6 - x5) >> 8;
    178     blk[32] = (x2 - x3) >> 8;
    179     return ;
    180 }
    181 
    182 #ifndef SMALL_DCT
    183 void idct_col0x40(Short *blk)
    184 {
    185     int32 x1, x3, x5, x7;//, x8;
    186 
    187     x1 = blk[8];
    188     /* both upper and lower*/
    189 
    190     x7 = W7 * x1;
    191     x1 = W1 * x1;
    192 
    193     x3 = x7;
    194     x5 = (181 * (x1 - x7) + 128) >> 8;
    195     x7 = (181 * (x1 + x7) + 128) >> 8;
    196 
    197     blk[0] = (128 + x1) >> 8;
    198     blk[8] = (128 + x7) >> 8;
    199     blk[16] = (128 + x5) >> 8;
    200     blk[24] = (128 + x3) >> 8;
    201     blk[56] = (128 - x1) >> 8;
    202     blk[48] = (128 - x7) >> 8;
    203     blk[40] = (128 - x5) >> 8;
    204     blk[32] = (128 - x3) >> 8;
    205 
    206     return ;
    207 }
    208 
    209 void idct_col0x20(Short *blk)
    210 {
    211     int32 x0, x2, x4, x6;
    212 
    213     x2 = blk[16];
    214     x6 = W6 * x2;
    215     x2 = W2 * x2;
    216     x0 = 128 + x2;
    217     x2 = 128 - x2;
    218     x4 = 128 + x6;
    219     x6 = 128 - x6;
    220 
    221     blk[0] = (x0) >> 8;
    222     blk[56] = (x0) >> 8;
    223     blk[8] = (x4) >> 8;
    224     blk[48] = (x4) >> 8;
    225     blk[16] = (x6) >> 8;
    226     blk[40] = (x6) >> 8;
    227     blk[24] = (x2) >> 8;
    228     blk[32] = (x2) >> 8;
    229 
    230     return ;
    231 }
    232 
    233 void idct_col0x10(Short *blk)
    234 {
    235     int32 x1, x3, x5,  x7;
    236 
    237     x3 = blk[24];
    238     x1 = W3 * x3;
    239     x3 = W5 * x3;
    240 
    241     x7 = (181 * (x3 - x1) + 128) >> 8;
    242     x5 = (-181 * (x1 + x3) + 128) >> 8;
    243 
    244 
    245     blk[0] = (128 + x1) >> 8;
    246     blk[8] = (128 + x7) >> 8;
    247     blk[16] = (128 + x5) >> 8;
    248     blk[24] = (128 - x3) >> 8;
    249     blk[56] = (128 - x1) >> 8;
    250     blk[48] = (128 - x7) >> 8;
    251     blk[40] = (128 - x5) >> 8;
    252     blk[32] = (128 + x3) >> 8;
    253 
    254     return ;
    255 }
    256 
    257 #endif /* SMALL_DCT */
    258 
    259 void idct_col(Short *blk)
    260 {
    261     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    262 
    263     x1 = (int32)blk[32] << 11;
    264     x2 = blk[48];
    265     x3 = blk[16];
    266     x4 = blk[8];
    267     x5 = blk[56];
    268     x6 = blk[40];
    269     x7 = blk[24];
    270     x0 = ((int32)blk[0] << 11) + 128;
    271 
    272     /* first stage */
    273     x8 = W7 * (x4 + x5);
    274     x4 = x8 + (W1 - W7) * x4;
    275     x5 = x8 - (W1 + W7) * x5;
    276     x8 = W3 * (x6 + x7);
    277     x6 = x8 - (W3 - W5) * x6;
    278     x7 = x8 - (W3 + W5) * x7;
    279 
    280     /* second stage */
    281     x8 = x0 + x1;
    282     x0 -= x1;
    283     x1 = W6 * (x3 + x2);
    284     x2 = x1 - (W2 + W6) * x2;
    285     x3 = x1 + (W2 - W6) * x3;
    286     x1 = x4 + x6;
    287     x4 -= x6;
    288     x6 = x5 + x7;
    289     x5 -= x7;
    290 
    291     /* third stage */
    292     x7 = x8 + x3;
    293     x8 -= x3;
    294     x3 = x0 + x2;
    295     x0 -= x2;
    296     x2 = (181 * (x4 + x5) + 128) >> 8;
    297     x4 = (181 * (x4 - x5) + 128) >> 8;
    298 
    299     /* fourth stage */
    300     blk[0]    = (x7 + x1) >> 8;
    301     blk[8] = (x3 + x2) >> 8;
    302     blk[16] = (x0 + x4) >> 8;
    303     blk[24] = (x8 + x6) >> 8;
    304     blk[32] = (x8 - x6) >> 8;
    305     blk[40] = (x0 - x4) >> 8;
    306     blk[48] = (x3 - x2) >> 8;
    307     blk[56] = (x7 - x1) >> 8;
    308 
    309     return ;
    310 }
    311 
    312 /* This function should not be called at all ****/
    313 void idct_row0Inter(Short *srce, UChar *rec, Int lx)
    314 {
    315     OSCL_UNUSED_ARG(srce);
    316 
    317     OSCL_UNUSED_ARG(rec);
    318 
    319     OSCL_UNUSED_ARG(lx);
    320 
    321     return;
    322 }
    323 
    324 void idct_row1Inter(Short *blk, UChar *rec, Int lx)
    325 {
    326     int tmp;
    327     int i = 8;
    328     uint32 pred_word, dst_word;
    329     int res, res2;
    330 
    331     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
    332     rec -= lx;
    333     blk -= 8;
    334 
    335     while (i--)
    336     {
    337         tmp = (*(blk += 8) + 32) >> 6;
    338         *blk = 0;
    339 
    340         pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
    341         res = tmp + (pred_word & 0xFF);
    342         CLIP_RESULT(res);
    343         res2 = tmp + ((pred_word >> 8) & 0xFF);
    344         CLIP_RESULT(res2);
    345         dst_word = (res2 << 8) | res;
    346         res = tmp + ((pred_word >> 16) & 0xFF);
    347         CLIP_RESULT(res);
    348         dst_word |= (res << 16);
    349         res = tmp + ((pred_word >> 24) & 0xFF);
    350         CLIP_RESULT(res);
    351         dst_word |= (res << 24);
    352         *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
    353 
    354         pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
    355         res = tmp + (pred_word & 0xFF);
    356         CLIP_RESULT(res);
    357         res2 = tmp + ((pred_word >> 8) & 0xFF);
    358         CLIP_RESULT(res2);
    359         dst_word = (res2 << 8) | res;
    360         res = tmp + ((pred_word >> 16) & 0xFF);
    361         CLIP_RESULT(res);
    362         dst_word |= (res << 16);
    363         res = tmp + ((pred_word >> 24) & 0xFF);
    364         CLIP_RESULT(res);
    365         dst_word |= (res << 24);
    366         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
    367     }
    368     return;
    369 }
    370 
    371 void idct_row2Inter(Short *blk, UChar *rec, Int lx)
    372 {
    373     int32 x0, x1, x2, x4, x5;
    374     int i = 8;
    375     uint32 pred_word, dst_word;
    376     int res, res2;
    377 
    378     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
    379     rec -= lx;
    380     blk -= 8;
    381 
    382     while (i--)
    383     {
    384         /* shortcut */
    385         x4 = blk[9];
    386         blk[9] = 0;
    387         x0 = ((*(blk += 8)) << 8) + 8192;
    388         *blk = 0;  /* for proper rounding in the fourth stage */
    389 
    390         /* first stage */
    391         x5 = (W7 * x4 + 4) >> 3;
    392         x4 = (W1 * x4 + 4) >> 3;
    393 
    394         /* third stage */
    395         x2 = (181 * (x4 + x5) + 128) >> 8;
    396         x1 = (181 * (x4 - x5) + 128) >> 8;
    397 
    398         /* fourth stage */
    399         pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
    400         res = (x0 + x4) >> 14;
    401         ADD_AND_CLIP1(res);
    402         res2 = (x0 + x2) >> 14;
    403         ADD_AND_CLIP2(res2);
    404         dst_word = (res2 << 8) | res;
    405         res = (x0 + x1) >> 14;
    406         ADD_AND_CLIP3(res);
    407         dst_word |= (res << 16);
    408         res = (x0 + x5) >> 14;
    409         ADD_AND_CLIP4(res);
    410         dst_word |= (res << 24);
    411         *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
    412 
    413         pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
    414         res = (x0 - x5) >> 14;
    415         ADD_AND_CLIP1(res);
    416         res2 = (x0 - x1) >> 14;
    417         ADD_AND_CLIP2(res2);
    418         dst_word = (res2 << 8) | res;
    419         res = (x0 - x2) >> 14;
    420         ADD_AND_CLIP3(res);
    421         dst_word |= (res << 16);
    422         res = (x0 - x4) >> 14;
    423         ADD_AND_CLIP4(res);
    424         dst_word |= (res << 24);
    425         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
    426     }
    427     return ;
    428 }
    429 
    430 void idct_row3Inter(Short *blk, UChar *rec, Int lx)
    431 {
    432     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    433     int i = 8;
    434     uint32 pred_word, dst_word;
    435     int res, res2;
    436 
    437     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
    438     rec -= lx;
    439     blk -= 8;
    440 
    441     while (i--)
    442     {
    443         x2 = blk[10];
    444         blk[10] = 0;
    445         x1 = blk[9];
    446         blk[9] = 0;
    447         x0 = ((*(blk += 8)) << 8) + 8192;
    448         *blk = 0;  /* for proper rounding in the fourth stage */
    449         /* both upper and lower*/
    450         /* both x2orx6 and x0orx4 */
    451 
    452         x4 = x0;
    453         x6 = (W6 * x2 + 4) >> 3;
    454         x2 = (W2 * x2 + 4) >> 3;
    455         x8 = x0 - x2;
    456         x0 += x2;
    457         x2 = x8;
    458         x8 = x4 - x6;
    459         x4 += x6;
    460         x6 = x8;
    461 
    462         x7 = (W7 * x1 + 4) >> 3;
    463         x1 = (W1 * x1 + 4) >> 3;
    464         x3 = x7;
    465         x5 = (181 * (x1 - x7) + 128) >> 8;
    466         x7 = (181 * (x1 + x7) + 128) >> 8;
    467 
    468         pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
    469         res = (x0 + x1) >> 14;
    470         ADD_AND_CLIP1(res);
    471         res2 = (x4 + x7) >> 14;
    472         ADD_AND_CLIP2(res2);
    473         dst_word = (res2 << 8) | res;
    474         res = (x6 + x5) >> 14;
    475         ADD_AND_CLIP3(res);
    476         dst_word |= (res << 16);
    477         res = (x2 + x3) >> 14;
    478         ADD_AND_CLIP4(res);
    479         dst_word |= (res << 24);
    480         *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
    481 
    482         pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
    483         res = (x2 - x3) >> 14;
    484         ADD_AND_CLIP1(res);
    485         res2 = (x6 - x5) >> 14;
    486         ADD_AND_CLIP2(res2);
    487         dst_word = (res2 << 8) | res;
    488         res = (x4 - x7) >> 14;
    489         ADD_AND_CLIP3(res);
    490         dst_word |= (res << 16);
    491         res = (x0 - x1) >> 14;
    492         ADD_AND_CLIP4(res);
    493         dst_word |= (res << 24);
    494         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
    495     }
    496 
    497     return ;
    498 }
    499 
    500 void idct_row4Inter(Short *blk, UChar *rec, Int lx)
    501 {
    502     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    503     int i = 8;
    504     uint32 pred_word, dst_word;
    505     int res, res2;
    506 
    507     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
    508     rec -= lx;
    509     blk -= 8;
    510 
    511     while (i--)
    512     {
    513         x2 = blk[10];
    514         blk[10] = 0;
    515         x1 = blk[9];
    516         blk[9] = 0;
    517         x3 = blk[11];
    518         blk[11] = 0;
    519         x0 = ((*(blk += 8)) << 8) + 8192;
    520         *blk = 0;   /* for proper rounding in the fourth stage */
    521 
    522         x4 = x0;
    523         x6 = (W6 * x2 + 4) >> 3;
    524         x2 = (W2 * x2 + 4) >> 3;
    525         x8 = x0 - x2;
    526         x0 += x2;
    527         x2 = x8;
    528         x8 = x4 - x6;
    529         x4 += x6;
    530         x6 = x8;
    531 
    532         x7 = (W7 * x1 + 4) >> 3;
    533         x1 = (W1 * x1 + 4) >> 3;
    534         x5 = (W3 * x3 + 4) >> 3;
    535         x3 = (- W5 * x3 + 4) >> 3;
    536         x8 = x1 - x5;
    537         x1 += x5;
    538         x5 = x8;
    539         x8 = x7 - x3;
    540         x3 += x7;
    541         x7 = (181 * (x5 + x8) + 128) >> 8;
    542         x5 = (181 * (x5 - x8) + 128) >> 8;
    543 
    544         pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
    545         res = (x0 + x1) >> 14;
    546         ADD_AND_CLIP1(res);
    547         res2 = (x4 + x7) >> 14;
    548         ADD_AND_CLIP2(res2);
    549         dst_word = (res2 << 8) | res;
    550         res = (x6 + x5) >> 14;
    551         ADD_AND_CLIP3(res);
    552         dst_word |= (res << 16);
    553         res = (x2 + x3) >> 14;
    554         ADD_AND_CLIP4(res);
    555         dst_word |= (res << 24);
    556         *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
    557 
    558         pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
    559         res = (x2 - x3) >> 14;
    560         ADD_AND_CLIP1(res);
    561         res2 = (x6 - x5) >> 14;
    562         ADD_AND_CLIP2(res2);
    563         dst_word = (res2 << 8) | res;
    564         res = (x4 - x7) >> 14;
    565         ADD_AND_CLIP3(res);
    566         dst_word |= (res << 16);
    567         res = (x0 - x1) >> 14;
    568         ADD_AND_CLIP4(res);
    569         dst_word |= (res << 24);
    570         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
    571     }
    572     return ;
    573 }
    574 
    575 #ifndef SMALL_DCT
    576 void idct_row0x40Inter(Short *blk, UChar *rec, Int lx)
    577 {
    578     int32 x1, x2, x4, x5;
    579     int i = 8;
    580     uint32 pred_word, dst_word;
    581     int res, res2;
    582 
    583     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
    584     rec -= lx;
    585 
    586     while (i--)
    587     {
    588         /* shortcut */
    589         x4 = blk[1];
    590         blk[1] = 0;
    591         blk += 8;  /* for proper rounding in the fourth stage */
    592 
    593         /* first stage */
    594         x5 = (W7 * x4 + 4) >> 3;
    595         x4 = (W1 * x4 + 4) >> 3;
    596 
    597         /* third stage */
    598         x2 = (181 * (x4 + x5) + 128) >> 8;
    599         x1 = (181 * (x4 - x5) + 128) >> 8;
    600 
    601         /* fourth stage */
    602         pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
    603         res = (8192 + x4) >> 14;
    604         ADD_AND_CLIP1(res);
    605         res2 = (8192 + x2) >> 14;
    606         ADD_AND_CLIP2(res2);
    607         dst_word = (res2 << 8) | res;
    608         res = (8192 + x1) >> 14;
    609         ADD_AND_CLIP3(res);
    610         dst_word |= (res << 16);
    611         res = (8192 + x5) >> 14;
    612         ADD_AND_CLIP4(res);
    613         dst_word |= (res << 24);
    614         *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
    615 
    616         pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
    617         res = (8192 - x5) >> 14;
    618         ADD_AND_CLIP1(res);
    619         res2 = (8192 - x1) >> 14;
    620         ADD_AND_CLIP2(res2);
    621         dst_word = (res2 << 8) | res;
    622         res = (8192 - x2) >> 14;
    623         ADD_AND_CLIP3(res);
    624         dst_word |= (res << 16);
    625         res = (8192 - x4) >> 14;
    626         ADD_AND_CLIP4(res);
    627         dst_word |= (res << 24);
    628         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
    629     }
    630     return ;
    631 }
    632 
    633 void idct_row0x20Inter(Short *blk, UChar *rec, Int lx)
    634 {
    635     int32 x0, x2, x4, x6;
    636     int i = 8;
    637     uint32 pred_word, dst_word;
    638     int res, res2;
    639 
    640     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
    641     rec -= lx;
    642 
    643     while (i--)
    644     {
    645         x2 = blk[2];
    646         blk[2] = 0;
    647         blk += 8; /* for proper rounding in the fourth stage */
    648         /* both upper and lower*/
    649         /* both x2orx6 and x0orx4 */
    650         x6 = (W6 * x2 + 4) >> 3;
    651         x2 = (W2 * x2 + 4) >> 3;
    652         x0 = 8192 + x2;
    653         x2 = 8192 - x2;
    654         x4 = 8192 + x6;
    655         x6 = 8192 - x6;
    656 
    657         pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
    658         res = (x0) >> 14;
    659         ADD_AND_CLIP1(res);
    660         res2 = (x4) >> 14;
    661         ADD_AND_CLIP2(res2);
    662         dst_word = (res2 << 8) | res;
    663         res = (x6) >> 14;
    664         ADD_AND_CLIP3(res);
    665         dst_word |= (res << 16);
    666         res = (x2) >> 14;
    667         ADD_AND_CLIP4(res);
    668         dst_word |= (res << 24);
    669         *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
    670 
    671         pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
    672         res = (x2) >> 14;
    673         ADD_AND_CLIP1(res);
    674         res2 = (x6) >> 14;
    675         ADD_AND_CLIP2(res2);
    676         dst_word = (res2 << 8) | res;
    677         res = (x4) >> 14;
    678         ADD_AND_CLIP3(res);
    679         dst_word |= (res << 16);
    680         res = (x0) >> 14;
    681         ADD_AND_CLIP4(res);
    682         dst_word |= (res << 24);
    683         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
    684     }
    685 
    686     return ;
    687 }
    688 
    689 void idct_row0x10Inter(Short *blk, UChar *rec, Int lx)
    690 {
    691     int32 x1, x3, x5, x7;
    692     int i = 8;
    693     uint32 pred_word, dst_word;
    694     int res, res2;
    695 
    696     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
    697     rec -= lx;
    698 
    699     while (i--)
    700     {
    701         x3 = blk[3];
    702         blk[3] = 0;
    703         blk += 8;
    704 
    705         x1 = (W3 * x3 + 4) >> 3;
    706         x3 = (-W5 * x3 + 4) >> 3;
    707 
    708         x7 = (-181 * (x3 + x1) + 128) >> 8;
    709         x5 = (181 * (x3 - x1) + 128) >> 8;
    710 
    711         pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
    712         res = (8192 + x1) >> 14;
    713         ADD_AND_CLIP1(res);
    714         res2 = (8192 + x7) >> 14;
    715         ADD_AND_CLIP2(res2);
    716         dst_word = (res2 << 8) | res;
    717         res = (8192 + x5) >> 14;
    718         ADD_AND_CLIP3(res);
    719         dst_word |= (res << 16);
    720         res = (8192 + x3) >> 14;
    721         ADD_AND_CLIP4(res);
    722         dst_word |= (res << 24);
    723         *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
    724 
    725         pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
    726         res = (8192 - x3) >> 14;
    727         ADD_AND_CLIP1(res);
    728         res2 = (8192 - x5) >> 14;
    729         ADD_AND_CLIP2(res2);
    730         dst_word = (res2 << 8) | res;
    731         res = (8192 - x7) >> 14;
    732         ADD_AND_CLIP3(res);
    733         dst_word |= (res << 16);
    734         res = (8192 - x1) >> 14;
    735         ADD_AND_CLIP4(res);
    736         dst_word |= (res << 24);
    737         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
    738     }
    739     return ;
    740 }
    741 
    742 #endif /* SMALL_DCT */
    743 
    744 void idct_rowInter(Short *blk, UChar *rec, Int lx)
    745 {
    746     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    747     int i = 8;
    748     uint32 pred_word, dst_word;
    749     int res, res2;
    750 
    751     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
    752     rec -= lx;
    753     blk -= 8;
    754 
    755     while (i--)
    756     {
    757         x1 = (int32)blk[12] << 8;
    758         blk[12] = 0;
    759         x2 = blk[14];
    760         blk[14] = 0;
    761         x3 = blk[10];
    762         blk[10] = 0;
    763         x4 = blk[9];
    764         blk[9] = 0;
    765         x5 = blk[15];
    766         blk[15] = 0;
    767         x6 = blk[13];
    768         blk[13] = 0;
    769         x7 = blk[11];
    770         blk[11] = 0;
    771         x0 = ((*(blk += 8)) << 8) + 8192;
    772         *blk = 0;   /* for proper rounding in the fourth stage */
    773 
    774         /* first stage */
    775         x8 = W7 * (x4 + x5) + 4;
    776         x4 = (x8 + (W1 - W7) * x4) >> 3;
    777         x5 = (x8 - (W1 + W7) * x5) >> 3;
    778         x8 = W3 * (x6 + x7) + 4;
    779         x6 = (x8 - (W3 - W5) * x6) >> 3;
    780         x7 = (x8 - (W3 + W5) * x7) >> 3;
    781 
    782         /* second stage */
    783         x8 = x0 + x1;
    784         x0 -= x1;
    785         x1 = W6 * (x3 + x2) + 4;
    786         x2 = (x1 - (W2 + W6) * x2) >> 3;
    787         x3 = (x1 + (W2 - W6) * x3) >> 3;
    788         x1 = x4 + x6;
    789         x4 -= x6;
    790         x6 = x5 + x7;
    791         x5 -= x7;
    792 
    793         /* third stage */
    794         x7 = x8 + x3;
    795         x8 -= x3;
    796         x3 = x0 + x2;
    797         x0 -= x2;
    798         x2 = (181 * (x4 + x5) + 128) >> 8;
    799         x4 = (181 * (x4 - x5) + 128) >> 8;
    800 
    801         /* fourth stage */
    802         pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
    803 
    804         res = (x7 + x1) >> 14;
    805         ADD_AND_CLIP1(res);
    806         res2 = (x3 + x2) >> 14;
    807         ADD_AND_CLIP2(res2);
    808         dst_word = (res2 << 8) | res;
    809         res = (x0 + x4) >> 14;
    810         ADD_AND_CLIP3(res);
    811         dst_word |= (res << 16);
    812         res = (x8 + x6) >> 14;
    813         ADD_AND_CLIP4(res);
    814         dst_word |= (res << 24);
    815         *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
    816 
    817         pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
    818 
    819         res = (x8 - x6) >> 14;
    820         ADD_AND_CLIP1(res);
    821         res2 = (x0 - x4) >> 14;
    822         ADD_AND_CLIP2(res2);
    823         dst_word = (res2 << 8) | res;
    824         res = (x3 - x2) >> 14;
    825         ADD_AND_CLIP3(res);
    826         dst_word |= (res << 16);
    827         res = (x7 - x1) >> 14;
    828         ADD_AND_CLIP4(res);
    829         dst_word |= (res << 24);
    830         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
    831     }
    832     return;
    833 }
    834 
    835 void idct_row0Intra(Short *srce, UChar *rec, Int lx)
    836 {
    837     OSCL_UNUSED_ARG(srce);
    838 
    839     OSCL_UNUSED_ARG(rec);
    840 
    841     OSCL_UNUSED_ARG(lx);
    842 
    843     return;
    844 }
    845 
    846 void idct_row1Intra(Short *blk, UChar *rec, Int lx)
    847 {
    848     int32 tmp;
    849     int i = 8;
    850 
    851     rec -= lx;
    852     blk -= 8;
    853     while (i--)
    854     {
    855         tmp = ((*(blk += 8) + 32) >> 6);
    856         *blk = 0;
    857         CLIP_RESULT(tmp)
    858 
    859         tmp |= (tmp << 8);
    860         tmp |= (tmp << 16);
    861         *((uint32*)(rec += lx)) = tmp;
    862         *((uint32*)(rec + 4)) = tmp;
    863     }
    864     return;
    865 }
    866 
    867 void idct_row2Intra(Short *blk, UChar *rec, Int lx)
    868 {
    869     int32 x0, x1, x2, x4, x5;
    870     int res, res2;
    871     uint32 dst_word;
    872     int i = 8;
    873 
    874     rec -= lx;
    875     blk -= 8;
    876     while (i--)
    877     {
    878         /* shortcut */
    879         x4 = blk[9];
    880         blk[9] = 0;
    881         x0 = ((*(blk += 8)) << 8) + 8192;
    882         *blk = 0;   /* for proper rounding in the fourth stage */
    883 
    884         /* first stage */
    885         x5 = (W7 * x4 + 4) >> 3;
    886         x4 = (W1 * x4 + 4) >> 3;
    887 
    888         /* third stage */
    889         x2 = (181 * (x4 + x5) + 128) >> 8;
    890         x1 = (181 * (x4 - x5) + 128) >> 8;
    891 
    892         /* fourth stage */
    893         res = ((x0 + x4) >> 14);
    894         CLIP_RESULT(res)
    895         res2 = ((x0 + x2) >> 14);
    896         CLIP_RESULT(res2)
    897         dst_word = (res2 << 8) | res;
    898         res = ((x0 + x1) >> 14);
    899         CLIP_RESULT(res)
    900         dst_word |= (res << 16);
    901         res = ((x0 + x5) >> 14);
    902         CLIP_RESULT(res)
    903         dst_word |= (res << 24);
    904         *((uint32*)(rec += lx)) = dst_word;
    905 
    906         res = ((x0 - x5) >> 14);
    907         CLIP_RESULT(res)
    908         res2 = ((x0 - x1) >> 14);
    909         CLIP_RESULT(res2)
    910         dst_word = (res2 << 8) | res;
    911         res = ((x0 - x2) >> 14);
    912         CLIP_RESULT(res)
    913         dst_word |= (res << 16);
    914         res = ((x0 - x4) >> 14);
    915         CLIP_RESULT(res)
    916         dst_word |= (res << 24);
    917         *((uint32*)(rec + 4)) = dst_word;
    918     }
    919     return ;
    920 }
    921 
    922 void idct_row3Intra(Short *blk, UChar *rec, Int lx)
    923 {
            /*
             * Intra row pass that reads only coefficients 0, 1 and 2 of each of
             * the 8 rows in blk (8 Shorts per row); coefficients 3..7 are never
             * read. Every coefficient that is read is reset to 0 in blk.
             * The 8 results of a row are clipped with CLIP_RESULT, packed four
             * per 32-bit word (first result in bits 0-7) and stored at rec and
             * rec + 4; rec advances by lx bytes per row.
             * NOTE(review): the uint32 stores assume the destination stays
             * 4-byte aligned, and the bit packing matches memory order only on
             * a little-endian target -- confirm for the platform.
             * NOTE(review): W1..W7 and CLIP_RESULT are defined outside this
             * chunk; CLIP_RESULT presumably clamps its argument to 0..255.
             */
    924     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    925     int res, res2;
    926     uint32 dst_word;
    927     int i = 8;
    928 
            /* step back one row so the loop can pre-increment both pointers */
    929     rec -= lx;
    930     blk -= 8;
    931     while (i--)
    932     {
                /* blk + 8 is the row being processed: blk[9], blk[10] are its
                   coefficients 1 and 2 */
    933         x2 = blk[10];
    934         blk[10] = 0;
    935         x1 = blk[9];
    936         blk[9] = 0;
                /* coefficient 0 scaled by 256; 8192 (= 1 << 13) is the rounding
                   offset for the final >> 14 */
    937         x0 = ((*(blk += 8)) << 8) + 8192;
    938         *blk = 0;/* for proper rounding in the fourth stage */
    939         /* both upper and lower*/
    940         /* both x2orx6 and x0orx4 */
    941 
                /* even part: afterwards x0/x2 = DC +/- W2 term,
                   x4/x6 = DC +/- W6 term */
    942         x4 = x0;
    943         x6 = (W6 * x2 + 4) >> 3;
    944         x2 = (W2 * x2 + 4) >> 3;
    945         x8 = x0 - x2;
    946         x0 += x2;
    947         x2 = x8;
    948         x8 = x4 - x6;
    949         x4 += x6;
    950         x6 = x8;
    951 
                /* odd part from coefficient 1 only; 181/256 ~= 1/sqrt(2) */
    952         x7 = (W7 * x1 + 4) >> 3;
    953         x1 = (W1 * x1 + 4) >> 3;
    954         x3 = x7;
    955         x5 = (181 * (x1 - x7) + 128) >> 8;
    956         x7 = (181 * (x1 + x7) + 128) >> 8;
    957 
                /* first word: results 0..3 (even + odd) */
    958         res = ((x0 + x1) >> 14);
    959         CLIP_RESULT(res)
    960         res2 = ((x4 + x7) >> 14);
    961         CLIP_RESULT(res2)
    962         dst_word = (res2 << 8) | res;
    963         res = ((x6 + x5) >> 14);
    964         CLIP_RESULT(res)
    965         dst_word |= (res << 16);
    966         res = ((x2 + x3) >> 14);
    967         CLIP_RESULT(res)
                /* NOTE(review): res is a signed int; res << 24 reaches the sign
                   bit when res >= 128 */
    968         dst_word |= (res << 24);
    969         *((uint32*)(rec += lx)) = dst_word;
    970 
                /* second word: results 4..7 (even - odd, mirrored order) */
    971         res = ((x2 - x3) >> 14);
    972         CLIP_RESULT(res)
    973         res2 = ((x6 - x5) >> 14);
    974         CLIP_RESULT(res2)
    975         dst_word = (res2 << 8) | res;
    976         res = ((x4 - x7) >> 14);
    977         CLIP_RESULT(res)
    978         dst_word |= (res << 16);
    979         res = ((x0 - x1) >> 14);
    980         CLIP_RESULT(res)
    981         dst_word |= (res << 24);
    982         *((uint32*)(rec + 4)) = dst_word;
    983 
    984     }
    985     return ;
    986 }
    987 
    988 void idct_row4Intra(Short *blk, UChar *rec, Int lx)
    989 {
            /*
             * Intra row pass that reads only coefficients 0..3 of each of the
             * 8 rows in blk (8 Shorts per row); coefficients 4..7 are never
             * read. Every coefficient that is read is reset to 0 in blk.
             * The 8 results of a row are clipped with CLIP_RESULT, packed four
             * per 32-bit word (first result in bits 0-7) and stored at rec and
             * rec + 4; rec advances by lx bytes per row.
             * NOTE(review): the uint32 stores assume a 4-byte aligned,
             * little-endian destination -- confirm for the platform.
             * NOTE(review): W1..W7 and CLIP_RESULT are defined outside this
             * chunk; CLIP_RESULT presumably clamps its argument to 0..255.
             */
    990     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    991     int res, res2;
    992     uint32 dst_word;
    993     int i = 8;
    994 
            /* step back one row so the loop can pre-increment both pointers */
    995     rec -= lx;
    996     blk -= 8;
    997     while (i--)
    998     {
                /* blk + 8 is the row being processed: blk[9..11] are its
                   coefficients 1..3 */
    999         x2 = blk[10];
   1000         blk[10] = 0;
   1001         x1 = blk[9];
   1002         blk[9] = 0;
   1003         x3 = blk[11];
   1004         blk[11] = 0;
                /* coefficient 0 scaled by 256; 8192 (= 1 << 13) is the rounding
                   offset for the final >> 14 */
   1005         x0 = ((*(blk += 8)) << 8) + 8192;
   1006         *blk = 0; /* for proper rounding in the fourth stage */
   1007 
                /* even part: afterwards x0/x2 = DC +/- W2 term,
                   x4/x6 = DC +/- W6 term */
   1008         x4 = x0;
   1009         x6 = (W6 * x2 + 4) >> 3;
   1010         x2 = (W2 * x2 + 4) >> 3;
   1011         x8 = x0 - x2;
   1012         x0 += x2;
   1013         x2 = x8;
   1014         x8 = x4 - x6;
   1015         x4 += x6;
   1016         x6 = x8;
   1017 
                /* odd part from coefficients 1 and 3; 181/256 ~= 1/sqrt(2) */
   1018         x7 = (W7 * x1 + 4) >> 3;
   1019         x1 = (W1 * x1 + 4) >> 3;
   1020         x5 = (W3 * x3 + 4) >> 3;
   1021         x3 = (- W5 * x3 + 4) >> 3;
   1022         x8 = x1 - x5;
   1023         x1 += x5;
   1024         x5 = x8;
   1025         x8 = x7 - x3;
   1026         x3 += x7;
   1027         x7 = (181 * (x5 + x8) + 128) >> 8;
   1028         x5 = (181 * (x5 - x8) + 128) >> 8;
   1029 
                /* first word: results 0..3 (even + odd) */
   1030         res = ((x0 + x1) >> 14);
   1031         CLIP_RESULT(res)
   1032         res2 = ((x4 + x7) >> 14);
   1033         CLIP_RESULT(res2)
   1034         dst_word = (res2 << 8) | res;
   1035         res = ((x6 + x5) >> 14);
   1036         CLIP_RESULT(res)
   1037         dst_word |= (res << 16);
   1038         res = ((x2 + x3) >> 14);
   1039         CLIP_RESULT(res)
                /* NOTE(review): res is a signed int; res << 24 reaches the sign
                   bit when res >= 128 */
   1040         dst_word |= (res << 24);
   1041         *((uint32*)(rec += lx)) = dst_word;
   1042 
                /* second word: results 4..7 (even - odd, mirrored order) */
   1043         res = ((x2 - x3) >> 14);
   1044         CLIP_RESULT(res)
   1045         res2 = ((x6 - x5) >> 14);
   1046         CLIP_RESULT(res2)
   1047         dst_word = (res2 << 8) | res;
   1048         res = ((x4 - x7) >> 14);
   1049         CLIP_RESULT(res)
   1050         dst_word |= (res << 16);
   1051         res = ((x0 - x1) >> 14);
   1052         CLIP_RESULT(res)
   1053         dst_word |= (res << 24);
   1054         *((uint32*)(rec + 4)) = dst_word;
   1055     }
   1056 
   1057     return ;
   1058 }
   1059 
   1060 #ifndef SMALL_DCT
   1061 void idct_row0x40Intra(Short *blk, UChar *rec, Int lx)
   1062 {
            /*
             * Intra row pass that reads only coefficient 1 of each of the
             * 8 rows in blk (8 Shorts per row) and resets it to 0; no other
             * coefficient is read, so the constant 8192 (= 1 << 13, the
             * rounding offset for the final >> 14) stands in for the DC term.
             * The 8 results of a row are clipped with CLIP_RESULT, packed four
             * per 32-bit word (first result in bits 0-7) and stored at rec and
             * rec + 4; rec advances by lx bytes per row.
             * NOTE(review): the uint32 stores assume a 4-byte aligned,
             * little-endian destination -- confirm for the platform.
             * NOTE(review): W1, W7 and CLIP_RESULT are defined outside this
             * chunk; CLIP_RESULT presumably clamps its argument to 0..255.
             */
   1063     int32  x1, x2, x4, x5;
   1064     int res, res2;
   1065     uint32 dst_word;
   1066     int i = 8;
   1067 
            /* step back one row so the loop can pre-increment rec */
   1068     rec -= lx;
   1069 
   1070     while (i--)
   1071     {
   1072         /* shortcut */
   1073         x4 = blk[1];
   1074         blk[1] = 0;
   1075         blk += 8;
   1076 
   1077         /* first stage */
   1078         x5 = (W7 * x4 + 4) >> 3;
   1079         x4 = (W1 * x4 + 4) >> 3;
   1080 
   1081         /* third stage */
                /* 181/256 ~= 1/sqrt(2) */
   1082         x2 = (181 * (x4 + x5) + 128) >> 8;
   1083         x1 = (181 * (x4 - x5) + 128) >> 8;
   1084 
   1085         /* fourth stage */
                /* first word: results 0..3 */
   1086         res = ((8192 + x4) >> 14);
   1087         CLIP_RESULT(res)
   1088         res2 = ((8192 + x2) >> 14);
   1089         CLIP_RESULT(res2)
   1090         dst_word = (res2 << 8) | res;
   1091         res = ((8192 + x1) >> 14);
   1092         CLIP_RESULT(res)
   1093         dst_word |= (res << 16);
   1094         res = ((8192 + x5) >> 14);
   1095         CLIP_RESULT(res)
                /* NOTE(review): res is a signed int; res << 24 reaches the sign
                   bit when res >= 128 */
   1096         dst_word |= (res << 24);
   1097         *((uint32*)(rec += lx)) = dst_word;
   1098 
                /* second word: results 4..7 (negated terms, mirrored order) */
   1099         res = ((8192 - x5) >> 14);
   1100         CLIP_RESULT(res)
   1101         res2 = ((8192 - x1) >> 14);
   1102         CLIP_RESULT(res2)
   1103         dst_word = (res2 << 8) | res;
   1104         res = ((8192 - x2) >> 14);
   1105         CLIP_RESULT(res)
   1106         dst_word |= (res << 16);
   1107         res = ((8192 - x4) >> 14);
   1108         CLIP_RESULT(res)
   1109         dst_word |= (res << 24);
   1110         *((uint32*)(rec + 4)) = dst_word;
   1111 
   1112     }
   1113     return ;
   1114 }
   1115 
   1116 void idct_row0x20Intra(Short *blk, UChar *rec, Int lx)
   1117 {
            /*
             * Intra row pass that reads only coefficient 2 of each of the
             * 8 rows in blk (8 Shorts per row) and resets it to 0; no other
             * coefficient is read, so the constant 8192 (= 1 << 13, the
             * rounding offset for the final >> 14) stands in for the DC term.
             * With no odd terms the second word repeats the four values of
             * the first word in reverse order.
             * Results are clipped with CLIP_RESULT, packed four per 32-bit
             * word (first result in bits 0-7) and stored at rec and rec + 4;
             * rec advances by lx bytes per row.
             * NOTE(review): the uint32 stores assume a 4-byte aligned,
             * little-endian destination -- confirm for the platform.
             * NOTE(review): W2, W6 and CLIP_RESULT are defined outside this
             * chunk; CLIP_RESULT presumably clamps its argument to 0..255.
             */
   1118     int32 x0, x2, x4, x6;
   1119     int res, res2;
   1120     uint32 dst_word;
   1121     int i = 8;
   1122 
            /* step back one row so the loop can pre-increment rec */
   1123     rec -= lx;
   1124     while (i--)
   1125     {
   1126         x2 = blk[2];
   1127         blk[2] = 0;
   1128         blk += 8;
   1129 
   1130         /* both upper and lower*/
   1131         /* both x2orx6 and x0orx4 */
   1132         x6 = (W6 * x2 + 4) >> 3;
   1133         x2 = (W2 * x2 + 4) >> 3;
                /* x0/x2 = 8192 +/- W2 term, x4/x6 = 8192 +/- W6 term */
   1134         x0 = 8192 + x2;
   1135         x2 = 8192 - x2;
   1136         x4 = 8192 + x6;
   1137         x6 = 8192 - x6;
   1138 
                /* first word: x0, x4, x6, x2 */
   1139         res = ((x0) >> 14);
   1140         CLIP_RESULT(res)
   1141         res2 = ((x4) >> 14);
   1142         CLIP_RESULT(res2)
   1143         dst_word = (res2 << 8) | res;
   1144         res = ((x6) >> 14);
   1145         CLIP_RESULT(res)
   1146         dst_word |= (res << 16);
   1147         res = ((x2) >> 14);
   1148         CLIP_RESULT(res)
                /* NOTE(review): res is a signed int; res << 24 reaches the sign
                   bit when res >= 128 */
   1149         dst_word |= (res << 24);
   1150         *((uint32*)(rec += lx)) = dst_word;
   1151 
                /* second word: x2, x6, x4, x0 */
   1152         res = ((x2) >> 14);
   1153         CLIP_RESULT(res)
   1154         res2 = ((x6) >> 14);
   1155         CLIP_RESULT(res2)
   1156         dst_word = (res2 << 8) | res;
   1157         res = ((x4) >> 14);
   1158         CLIP_RESULT(res)
   1159         dst_word |= (res << 16);
   1160         res = ((x0) >> 14);
   1161         CLIP_RESULT(res)
   1162         dst_word |= (res << 24);
   1163         *((uint32*)(rec + 4)) = dst_word;
   1164 
   1165     }
   1166     return ;
   1167 }
   1168 
   1169 void idct_row0x10Intra(Short *blk, UChar *rec, Int lx)
   1170 {
            /*
             * Intra row pass that reads only coefficient 3 of each of the
             * 8 rows in blk (8 Shorts per row) and resets it to 0; no other
             * coefficient is read, so the constant 8192 (= 1 << 13, the
             * rounding offset for the final >> 14) stands in for the DC term.
             * Results are clipped with CLIP_RESULT, packed four per 32-bit
             * word (first result in bits 0-7) and stored at rec and rec + 4;
             * rec advances by lx bytes per row.
             * NOTE(review): the uint32 stores assume a 4-byte aligned,
             * little-endian destination -- confirm for the platform.
             * NOTE(review): W3, W5 and CLIP_RESULT are defined outside this
             * chunk; CLIP_RESULT presumably clamps its argument to 0..255.
             */
   1171     int32 x1, x3, x5, x7;
   1172     int res, res2;
   1173     uint32 dst_word;
   1174     int i = 8;
   1175 
            /* step back one row so the loop can pre-increment rec */
   1176     rec -= lx;
   1177     while (i--)
   1178     {
   1179         x3 = blk[3];
   1180         blk[3] = 0 ;
   1181         blk += 8;
   1182 
                /* x3 holds the +W5 term here, hence "8192 - x3" in the first
                   word and "8192 + x3" in the second */
   1183         x1 = (W3 * x3 + 4) >> 3;
   1184         x3 = (W5 * x3 + 4) >> 3;
   1185 
                /* 181/256 ~= 1/sqrt(2) */
   1186         x7 = (181 * (x3 - x1) + 128) >> 8;
   1187         x5 = (-181 * (x1 + x3) + 128) >> 8;
   1188 
                /* first word: results 0..3 */
   1189         res = ((8192 + x1) >> 14);
   1190         CLIP_RESULT(res)
   1191         res2 = ((8192 + x7) >> 14);
   1192         CLIP_RESULT(res2)
   1193         dst_word = (res2 << 8) | res;
   1194         res = ((8192 + x5) >> 14);
   1195         CLIP_RESULT(res)
   1196         dst_word |= (res << 16);
   1197         res = ((8192 - x3) >> 14);
   1198         CLIP_RESULT(res)
                /* NOTE(review): res is a signed int; res << 24 reaches the sign
                   bit when res >= 128 */
   1199         dst_word |= (res << 24);
   1200         *((uint32*)(rec += lx)) = dst_word;
   1201 
                /* second word: results 4..7 (negated terms, mirrored order) */
   1202         res = ((8192 + x3) >> 14);
   1203         CLIP_RESULT(res)
   1204         res2 = ((8192 - x5) >> 14);
   1205         CLIP_RESULT(res2)
   1206         dst_word = (res2 << 8) | res;
   1207         res = ((8192 - x7) >> 14);
   1208         CLIP_RESULT(res)
   1209         dst_word |= (res << 16);
   1210         res = ((8192 - x1) >> 14);
   1211         CLIP_RESULT(res)
   1212         dst_word |= (res << 24);
   1213         *((uint32*)(rec + 4)) = dst_word;
   1214 
   1215     }
   1216 
   1217     return ;
   1218 }
   1219 
   1220 #endif /* SMALL_DCT */
   1221 void idct_rowIntra(Short *blk, UChar *rec, Int lx)
   1222 {
            /*
             * General intra row pass: reads all 8 coefficients of each of the
             * 8 rows in blk (8 Shorts per row) and resets each one to 0.
             * The 8 results of a row are clipped with CLIP_RESULT, packed four
             * per 32-bit word (first result in bits 0-7) and stored at rec and
             * rec + 4; rec advances by lx bytes per row.
             * NOTE(review): the uint32 stores assume a 4-byte aligned,
             * little-endian destination -- confirm for the platform.
             * NOTE(review): W1..W7 and CLIP_RESULT are defined outside this
             * chunk; CLIP_RESULT presumably clamps its argument to 0..255.
             */
   1223     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
   1224     int i = 8;
   1225     int res, res2;
   1226     uint32 dst_word;
   1227 
            /* step back one row so the loop can pre-increment both pointers */
   1228     blk -= 8;
   1229     rec -= lx;
   1230 
   1231     while (i--)
   1232     {
                /* blk + 8 is the row being processed, so blk[8 + k] is its
                   coefficient k: x1 <- c4 (scaled by 256), x2 <- c6, x3 <- c2,
                   x4 <- c1, x5 <- c7, x6 <- c5, x7 <- c3 */
   1233         x1 = (int32)blk[12] << 8;
   1234         blk[12] = 0;
   1235         x2 = blk[14];
   1236         blk[14] = 0;
   1237         x3 = blk[10];
   1238         blk[10] = 0;
   1239         x4 = blk[9];
   1240         blk[9] = 0;
   1241         x5 = blk[15];
   1242         blk[15] = 0;
   1243         x6 = blk[13];
   1244         blk[13] = 0;
   1245         x7 = blk[11];
   1246         blk[11] = 0;
                /* coefficient 0 scaled by 256; 8192 (= 1 << 13) is the rounding
                   offset for the final >> 14 */
   1247         x0 = ((*(blk += 8)) << 8) + 8192;
   1248         *blk = 0;  /* for proper rounding in the fourth stage */
   1249 
   1250         /* first stage */
                /* odd coefficients; x8 carries the shared product plus the
                   rounding offset 4 for the >> 3 */
   1251         x8 = W7 * (x4 + x5) + 4;
   1252         x4 = (x8 + (W1 - W7) * x4) >> 3;
   1253         x5 = (x8 - (W1 + W7) * x5) >> 3;
   1254         x8 = W3 * (x6 + x7) + 4;
   1255         x6 = (x8 - (W3 - W5) * x6) >> 3;
   1256         x7 = (x8 - (W3 + W5) * x7) >> 3;
   1257 
   1258         /* second stage */
                /* even coefficients plus the first odd-part butterflies */
   1259         x8 = x0 + x1;
   1260         x0 -= x1;
   1261         x1 = W6 * (x3 + x2) + 4;
   1262         x2 = (x1 - (W2 + W6) * x2) >> 3;
   1263         x3 = (x1 + (W2 - W6) * x3) >> 3;
   1264         x1 = x4 + x6;
   1265         x4 -= x6;
   1266         x6 = x5 + x7;
   1267         x5 -= x7;
   1268 
   1269         /* third stage */
                /* 181/256 ~= 1/sqrt(2) */
   1270         x7 = x8 + x3;
   1271         x8 -= x3;
   1272         x3 = x0 + x2;
   1273         x0 -= x2;
   1274         x2 = (181 * (x4 + x5) + 128) >> 8;
   1275         x4 = (181 * (x4 - x5) + 128) >> 8;
   1276 
   1277         /* fourth stage */
                /* first word: results 0..3 (even + odd) */
   1278         res = ((x7 + x1) >> 14);
   1279         CLIP_RESULT(res)
   1280         res2 = ((x3 + x2) >> 14);
   1281         CLIP_RESULT(res2)
   1282         dst_word = res | (res2 << 8);
   1283         res = ((x0 + x4) >> 14);
   1284         CLIP_RESULT(res)
   1285         dst_word |= (res << 16);
   1286         res = ((x8 + x6) >> 14);
   1287         CLIP_RESULT(res)
                /* NOTE(review): res is a signed int; res << 24 reaches the sign
                   bit when res >= 128 */
   1288         dst_word |= (res << 24);
   1289         *((uint32*)(rec += lx)) = dst_word;
   1290 
                /* second word: results 4..7 (even - odd, mirrored order) */
   1291         res = ((x8 - x6) >> 14);
   1292         CLIP_RESULT(res)
   1293         res2 = ((x0 - x4) >> 14);
   1294         CLIP_RESULT(res2)
   1295         dst_word = res | (res2 << 8);
   1296         res = ((x3 - x2) >> 14);
   1297         CLIP_RESULT(res)
   1298         dst_word |= (res << 16);
   1299         res = ((x7 - x1) >> 14);
   1300         CLIP_RESULT(res)
   1301         dst_word |= (res << 24);
   1302         *((uint32*)(rec + 4)) = dst_word;
   1303     }
   1304     return;
   1305 }
   1306 
   1307 
   1308 /* This function should not be called at all ****/
   1309 void idct_row0zmv(Short *srce, UChar *rec, UChar *pred, Int lx)
   1310 {
            /*
             * Intentionally empty: performs no computation and writes nothing.
             * The arguments are only passed to OSCL_UNUSED_ARG (defined outside
             * this chunk; presumably silences unused-parameter warnings).
             */
   1311     OSCL_UNUSED_ARG(srce);
   1312     OSCL_UNUSED_ARG(rec);
   1313     OSCL_UNUSED_ARG(pred);
   1314     OSCL_UNUSED_ARG(lx);
   1315 
   1316     return;
   1317 }
   1318 
   1319 void idct_row1zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
   1320 {
            /*
             * Row pass that reads only coefficient 0 of each of the 8 rows in
             * blk (8 Shorts per row) and resets it to 0. The rounded value
             * (coef + 32) >> 6 is added to each of the 8 prediction bytes of
             * the row, each sum is clipped with CLIP_RESULT, and the results
             * are packed four per 32-bit word (first result in bits 0-7) and
             * stored at rec and rec + 4.
             * pred advances by 16 bytes per row, rec by lx bytes per row.
             * NOTE(review): the uint32 loads/stores assume 4-byte aligned,
             * little-endian buffers -- confirm for the platform.
             * NOTE(review): CLIP_RESULT is defined outside this chunk;
             * presumably it clamps its argument to 0..255.
             */
   1321     int tmp;
   1322     int i = 8;
   1323     uint32 pred_word, dst_word;
   1324     int res, res2;
   1325 
   1326     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
   1327     pred -= 16;
   1328     rec -= lx;
   1329     blk -= 8;
   1330 
   1331     while (i--)
   1332     {
                /* same residual value for all 8 pixels of the row */
   1333         tmp = (*(blk += 8) + 32) >> 6;
   1334         *blk = 0;
   1335 
   1336         pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
   1337         res = tmp + (pred_word & 0xFF);
   1338         CLIP_RESULT(res);
   1339         res2 = tmp + ((pred_word >> 8) & 0xFF);
   1340         CLIP_RESULT(res2);
   1341         dst_word = (res2 << 8) | res;
   1342         res = tmp + ((pred_word >> 16) & 0xFF);
   1343         CLIP_RESULT(res);
   1344         dst_word |= (res << 16);
   1345         res = tmp + ((pred_word >> 24) & 0xFF);
   1346         CLIP_RESULT(res);
                /* NOTE(review): res is a signed int; res << 24 reaches the sign
                   bit when res >= 128 */
   1347         dst_word |= (res << 24);
   1348         *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
   1349 
   1350         pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
   1351         res = tmp + (pred_word & 0xFF);
   1352         CLIP_RESULT(res);
   1353         res2 = tmp + ((pred_word >> 8) & 0xFF);
   1354         CLIP_RESULT(res2);
   1355         dst_word = (res2 << 8) | res;
   1356         res = tmp + ((pred_word >> 16) & 0xFF);
   1357         CLIP_RESULT(res);
   1358         dst_word |= (res << 16);
   1359         res = tmp + ((pred_word >> 24) & 0xFF);
   1360         CLIP_RESULT(res);
   1361         dst_word |= (res << 24);
   1362         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
   1363     }
   1364     return;
   1365 }
   1366 
   1367 void idct_row2zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
   1368 {
            /*
             * Row pass that reads only coefficients 0 and 1 of each of the
             * 8 rows in blk (8 Shorts per row) and resets them to 0.
             * For each row the 8 residuals are combined with the prediction
             * word read from pred through ADD_AND_CLIP1..4, packed four per
             * 32-bit word (first result in bits 0-7) and stored at rec and
             * rec + 4. pred advances by 16 bytes per row, rec by lx bytes.
             * NOTE(review): ADD_AND_CLIP1..4, W1 and W7 are defined outside
             * this chunk; the macros presumably add byte 0..3 of the local
             * pred_word to their argument and clamp to 0..255 -- confirm.
             * NOTE(review): the uint32 loads/stores assume 4-byte aligned,
             * little-endian buffers -- confirm for the platform.
             */
   1369     int32 x0, x1, x2, x4, x5;
   1370     int i = 8;
   1371     uint32 pred_word, dst_word;
   1372     int res, res2;
   1373 
   1374     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
   1375     rec -= lx;
   1376     pred -= 16;
   1377     blk -= 8;
   1378 
   1379     while (i--)
   1380     {
   1381         /* shortcut */
                /* blk + 8 is the row being processed: blk[9] is its coefficient 1 */
   1382         x4 = blk[9];
   1383         blk[9] = 0;
                /* coefficient 0 scaled by 256; 8192 (= 1 << 13) is the rounding
                   offset for the final >> 14 */
   1384         x0 = ((*(blk += 8)) << 8) + 8192;
   1385         *blk = 0;  /* for proper rounding in the fourth stage */
   1386 
   1387         /* first stage */
   1388         x5 = (W7 * x4 + 4) >> 3;
   1389         x4 = (W1 * x4 + 4) >> 3;
   1390 
   1391         /* third stage */
                /* 181/256 ~= 1/sqrt(2) */
   1392         x2 = (181 * (x4 + x5) + 128) >> 8;
   1393         x1 = (181 * (x4 - x5) + 128) >> 8;
   1394 
   1395         /* fourth stage */
   1396         pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
   1397         res = (x0 + x4) >> 14;
   1398         ADD_AND_CLIP1(res);
   1399         res2 = (x0 + x2) >> 14;
   1400         ADD_AND_CLIP2(res2);
   1401         dst_word = (res2 << 8) | res;
   1402         res = (x0 + x1) >> 14;
   1403         ADD_AND_CLIP3(res);
   1404         dst_word |= (res << 16);
   1405         res = (x0 + x5) >> 14;
   1406         ADD_AND_CLIP4(res);
   1407         dst_word |= (res << 24);
   1408         *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
   1409 
   1410         pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
   1411         res = (x0 - x5) >> 14;
   1412         ADD_AND_CLIP1(res);
   1413         res2 = (x0 - x1) >> 14;
   1414         ADD_AND_CLIP2(res2);
   1415         dst_word = (res2 << 8) | res;
   1416         res = (x0 - x2) >> 14;
   1417         ADD_AND_CLIP3(res);
   1418         dst_word |= (res << 16);
   1419         res = (x0 - x4) >> 14;
   1420         ADD_AND_CLIP4(res);
   1421         dst_word |= (res << 24);
   1422         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
   1423     }
   1424     return ;
   1425 }
   1426 
   1427 void idct_row3zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
   1428 {
            /*
             * Row pass that reads only coefficients 0, 1 and 2 of each of the
             * 8 rows in blk (8 Shorts per row) and resets them to 0.
             * For each row the 8 residuals are combined with the prediction
             * word read from pred through ADD_AND_CLIP1..4, packed four per
             * 32-bit word (first result in bits 0-7) and stored at rec and
             * rec + 4. pred advances by 16 bytes per row, rec by lx bytes.
             * NOTE(review): ADD_AND_CLIP1..4 and W1..W7 are defined outside
             * this chunk; the macros presumably add byte 0..3 of the local
             * pred_word to their argument and clamp to 0..255 -- confirm.
             * NOTE(review): the uint32 loads/stores assume 4-byte aligned,
             * little-endian buffers -- confirm for the platform.
             */
   1429     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
   1430     int i = 8;
   1431     uint32 pred_word, dst_word;
   1432     int res, res2;
   1433 
   1434     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
   1435     rec -= lx;
   1436     pred -= 16;
   1437     blk -= 8;
   1438 
   1439     while (i--)
   1440     {
                /* blk + 8 is the row being processed: blk[9], blk[10] are its
                   coefficients 1 and 2 */
   1441         x2 = blk[10];
   1442         blk[10] = 0;
   1443         x1 = blk[9];
   1444         blk[9] = 0;
                /* coefficient 0 scaled by 256; 8192 (= 1 << 13) is the rounding
                   offset for the final >> 14 */
   1445         x0 = ((*(blk += 8)) << 8) + 8192;
   1446         *blk = 0;  /* for proper rounding in the fourth stage */
   1447         /* both upper and lower*/
   1448         /* both x2orx6 and x0orx4 */
   1449 
                /* even part: afterwards x0/x2 = DC +/- W2 term,
                   x4/x6 = DC +/- W6 term */
   1450         x4 = x0;
   1451         x6 = (W6 * x2 + 4) >> 3;
   1452         x2 = (W2 * x2 + 4) >> 3;
   1453         x8 = x0 - x2;
   1454         x0 += x2;
   1455         x2 = x8;
   1456         x8 = x4 - x6;
   1457         x4 += x6;
   1458         x6 = x8;
   1459 
                /* odd part from coefficient 1 only; 181/256 ~= 1/sqrt(2) */
   1460         x7 = (W7 * x1 + 4) >> 3;
   1461         x1 = (W1 * x1 + 4) >> 3;
   1462         x3 = x7;
   1463         x5 = (181 * (x1 - x7) + 128) >> 8;
   1464         x7 = (181 * (x1 + x7) + 128) >> 8;
   1465 
   1466         pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
   1467         res = (x0 + x1) >> 14;
   1468         ADD_AND_CLIP1(res);
   1469         res2 = (x4 + x7) >> 14;
   1470         ADD_AND_CLIP2(res2);
   1471         dst_word = (res2 << 8) | res;
   1472         res = (x6 + x5) >> 14;
   1473         ADD_AND_CLIP3(res);
   1474         dst_word |= (res << 16);
   1475         res = (x2 + x3) >> 14;
   1476         ADD_AND_CLIP4(res);
   1477         dst_word |= (res << 24);
   1478         *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
   1479 
   1480         pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
   1481         res = (x2 - x3) >> 14;
   1482         ADD_AND_CLIP1(res);
   1483         res2 = (x6 - x5) >> 14;
   1484         ADD_AND_CLIP2(res2);
   1485         dst_word = (res2 << 8) | res;
   1486         res = (x4 - x7) >> 14;
   1487         ADD_AND_CLIP3(res);
   1488         dst_word |= (res << 16);
   1489         res = (x0 - x1) >> 14;
   1490         ADD_AND_CLIP4(res);
   1491         dst_word |= (res << 24);
   1492         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
   1493     }
   1494 
   1495     return ;
   1496 }
   1497 
   1498 void idct_row4zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
   1499 {
            /*
             * Row pass that reads only coefficients 0..3 of each of the
             * 8 rows in blk (8 Shorts per row) and resets them to 0.
             * For each row the 8 residuals are combined with the prediction
             * word read from pred through ADD_AND_CLIP1..4, packed four per
             * 32-bit word (first result in bits 0-7) and stored at rec and
             * rec + 4. pred advances by 16 bytes per row, rec by lx bytes.
             * NOTE(review): ADD_AND_CLIP1..4 and W1..W7 are defined outside
             * this chunk; the macros presumably add byte 0..3 of the local
             * pred_word to their argument and clamp to 0..255 -- confirm.
             * NOTE(review): the uint32 loads/stores assume 4-byte aligned,
             * little-endian buffers -- confirm for the platform.
             */
   1500     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
   1501     int i = 8;
   1502     uint32 pred_word, dst_word;
   1503     int res, res2;
   1504 
   1505     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
   1506     rec -= lx;
   1507     pred -= 16;
   1508     blk -= 8;
   1509 
   1510     while (i--)
   1511     {
                /* blk + 8 is the row being processed: blk[9..11] are its
                   coefficients 1..3 */
   1512         x2 = blk[10];
   1513         blk[10] = 0;
   1514         x1 = blk[9];
   1515         blk[9] = 0;
   1516         x3 = blk[11];
   1517         blk[11] = 0;
                /* coefficient 0 scaled by 256; 8192 (= 1 << 13) is the rounding
                   offset for the final >> 14 */
   1518         x0 = ((*(blk += 8)) << 8) + 8192;
   1519         *blk = 0;   /* for proper rounding in the fourth stage */
   1520 
                /* even part: afterwards x0/x2 = DC +/- W2 term,
                   x4/x6 = DC +/- W6 term */
   1521         x4 = x0;
   1522         x6 = (W6 * x2 + 4) >> 3;
   1523         x2 = (W2 * x2 + 4) >> 3;
   1524         x8 = x0 - x2;
   1525         x0 += x2;
   1526         x2 = x8;
   1527         x8 = x4 - x6;
   1528         x4 += x6;
   1529         x6 = x8;
   1530 
                /* odd part from coefficients 1 and 3; 181/256 ~= 1/sqrt(2) */
   1531         x7 = (W7 * x1 + 4) >> 3;
   1532         x1 = (W1 * x1 + 4) >> 3;
   1533         x5 = (W3 * x3 + 4) >> 3;
   1534         x3 = (- W5 * x3 + 4) >> 3;
   1535         x8 = x1 - x5;
   1536         x1 += x5;
   1537         x5 = x8;
   1538         x8 = x7 - x3;
   1539         x3 += x7;
   1540         x7 = (181 * (x5 + x8) + 128) >> 8;
   1541         x5 = (181 * (x5 - x8) + 128) >> 8;
   1542 
   1543         pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
   1544         res = (x0 + x1) >> 14;
   1545         ADD_AND_CLIP1(res);
   1546         res2 = (x4 + x7) >> 14;
   1547         ADD_AND_CLIP2(res2);
   1548         dst_word = (res2 << 8) | res;
   1549         res = (x6 + x5) >> 14;
   1550         ADD_AND_CLIP3(res);
   1551         dst_word |= (res << 16);
   1552         res = (x2 + x3) >> 14;
   1553         ADD_AND_CLIP4(res);
   1554         dst_word |= (res << 24);
   1555         *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
   1556 
   1557         pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
   1558         res = (x2 - x3) >> 14;
   1559         ADD_AND_CLIP1(res);
   1560         res2 = (x6 - x5) >> 14;
   1561         ADD_AND_CLIP2(res2);
   1562         dst_word = (res2 << 8) | res;
   1563         res = (x4 - x7) >> 14;
   1564         ADD_AND_CLIP3(res);
   1565         dst_word |= (res << 16);
   1566         res = (x0 - x1) >> 14;
   1567         ADD_AND_CLIP4(res);
   1568         dst_word |= (res << 24);
   1569         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
   1570     }
   1571     return ;
   1572 }
   1573 
   1574 #ifndef SMALL_DCT
   1575 void idct_row0x40zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
   1576 {
            /*
             * Row pass that reads only coefficient 1 of each of the 8 rows in
             * blk (8 Shorts per row) and resets it to 0; no other coefficient
             * is read, so the constant 8192 (= 1 << 13, the rounding offset
             * for the final >> 14) stands in for the DC term.
             * For each row the 8 residuals are combined with the prediction
             * word read from pred through ADD_AND_CLIP1..4, packed four per
             * 32-bit word (first result in bits 0-7) and stored at rec and
             * rec + 4. pred advances by 16 bytes per row, rec by lx bytes.
             * NOTE(review): ADD_AND_CLIP1..4, W1 and W7 are defined outside
             * this chunk; the macros presumably add byte 0..3 of the local
             * pred_word to their argument and clamp to 0..255 -- confirm.
             * NOTE(review): the uint32 loads/stores assume 4-byte aligned,
             * little-endian buffers -- confirm for the platform.
             */
   1577     int32 x1, x2, x4, x5;
   1578     int i = 8;
   1579     uint32 pred_word, dst_word;
   1580     int res, res2;
   1581 
   1582     /* preset the offset, such that we can take advantage of pre-offset addressing mode   */
   1583     rec -= lx;
   1584     pred -= 16;
   1585 
   1586     while (i--)
   1587     {
   1588         /* shortcut */
   1589         x4 = blk[1];
   1590         blk[1] = 0;
   1591         blk += 8;  /* advance to the next row of coefficients */
   1592 
   1593         /* first stage */
   1594         x5 = (W7 * x4 + 4) >> 3;
   1595         x4 = (W1 * x4 + 4) >> 3;
   1596 
   1597         /* third stage */
                /* 181/256 ~= 1/sqrt(2) */
   1598         x2 = (181 * (x4 + x5) + 128) >> 8;
   1599         x1 = (181 * (x4 - x5) + 128) >> 8;
   1600 
   1601         /* fourth stage */
   1602         pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
   1603         res = (8192 + x4) >> 14;
   1604         ADD_AND_CLIP1(res);
   1605         res2 = (8192 + x2) >> 14;
   1606         ADD_AND_CLIP2(res2);
   1607         dst_word = (res2 << 8) | res;
   1608         res = (8192 + x1) >> 14;
   1609         ADD_AND_CLIP3(res);
   1610         dst_word |= (res << 16);
   1611         res = (8192 + x5) >> 14;
   1612         ADD_AND_CLIP4(res);
   1613         dst_word |= (res << 24);
   1614         *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
   1615 
   1616         pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
   1617         res = (8192 - x5) >> 14;
   1618         ADD_AND_CLIP1(res);
   1619         res2 = (8192 - x1) >> 14;
   1620         ADD_AND_CLIP2(res2);
   1621         dst_word = (res2 << 8) | res;
   1622         res = (8192 - x2) >> 14;
   1623         ADD_AND_CLIP3(res);
   1624         dst_word |= (res << 16);
   1625         res = (8192 - x4) >> 14;
   1626         ADD_AND_CLIP4(res);
   1627         dst_word |= (res << 24);
   1628         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
   1629     }
   1630     return ;
   1631 }
   1632 
   1633 void idct_row0x20zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
   1634 {
            /*
             * Row pass that reads only coefficient 2 of each of the 8 rows in
             * blk (8 Shorts per row) and resets it to 0; no other coefficient
             * is read, so the constant 8192 (= 1 << 13, the rounding offset
             * for the final >> 14) stands in for the DC term.
             * With no odd terms the second word uses the same four residuals
             * as the first word, in reverse order.
             * Residuals are combined with the prediction word read from pred
             * through ADD_AND_CLIP1..4, packed four per 32-bit word (first
             * result in bits 0-7) and stored at rec and rec + 4.
             * pred advances by 16 bytes per row, rec by lx bytes.
             * NOTE(review): ADD_AND_CLIP1..4, W2 and W6 are defined outside
             * this chunk; the macros presumably add byte 0..3 of the local
             * pred_word to their argument and clamp to 0..255 -- confirm.
             * NOTE(review): the uint32 loads/stores assume 4-byte aligned,
             * little-endian buffers -- confirm for the platform.
             */
   1635     int32 x0, x2, x4, x6;
   1636     int i = 8;
   1637     uint32 pred_word, dst_word;
   1638     int res, res2;
   1639 
   1640     /* preset the offset, such that we can take advantage of pre-offset addressing mode   */
   1641     rec -= lx;
   1642     pred -= 16;
   1643 
   1644     while (i--)
   1645     {
   1646         x2 = blk[2];
   1647         blk[2] = 0;
   1648         blk += 8; /* advance to the next row of coefficients */
   1649         /* both upper and lower*/
   1650         /* both x2orx6 and x0orx4 */
   1651         x6 = (W6 * x2 + 4) >> 3;
   1652         x2 = (W2 * x2 + 4) >> 3;
                /* x0/x2 = 8192 +/- W2 term, x4/x6 = 8192 +/- W6 term */
   1653         x0 = 8192 + x2;
   1654         x2 = 8192 - x2;
   1655         x4 = 8192 + x6;
   1656         x6 = 8192 - x6;
   1657 
   1658         pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
   1659         res = (x0) >> 14;
   1660         ADD_AND_CLIP1(res);
   1661         res2 = (x4) >> 14;
   1662         ADD_AND_CLIP2(res2);
   1663         dst_word = (res2 << 8) | res;
   1664         res = (x6) >> 14;
   1665         ADD_AND_CLIP3(res);
   1666         dst_word |= (res << 16);
   1667         res = (x2) >> 14;
   1668         ADD_AND_CLIP4(res);
   1669         dst_word |= (res << 24);
   1670         *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
   1671 
   1672         pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
   1673         res = (x2) >> 14;
   1674         ADD_AND_CLIP1(res);
   1675         res2 = (x6) >> 14;
   1676         ADD_AND_CLIP2(res2);
   1677         dst_word = (res2 << 8) | res;
   1678         res = (x4) >> 14;
   1679         ADD_AND_CLIP3(res);
   1680         dst_word |= (res << 16);
   1681         res = (x0) >> 14;
   1682         ADD_AND_CLIP4(res);
   1683         dst_word |= (res << 24);
   1684         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
   1685     }
   1686 
   1687     return ;
   1688 }
   1689 
   1690 void idct_row0x10zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
   1691 {
            /*
             * Row pass that reads only coefficient 3 of each of the 8 rows in
             * blk (8 Shorts per row) and resets it to 0; no other coefficient
             * is read, so the constant 8192 (= 1 << 13, the rounding offset
             * for the final >> 14) stands in for the DC term.
             * Residuals are combined with the prediction word read from pred
             * through ADD_AND_CLIP1..4, packed four per 32-bit word (first
             * result in bits 0-7) and stored at rec and rec + 4.
             * pred advances by 16 bytes per row, rec by lx bytes.
             * NOTE(review): ADD_AND_CLIP1..4, W3 and W5 are defined outside
             * this chunk; the macros presumably add byte 0..3 of the local
             * pred_word to their argument and clamp to 0..255 -- confirm.
             * NOTE(review): the uint32 loads/stores assume 4-byte aligned,
             * little-endian buffers -- confirm for the platform.
             */
   1692     int32 x1, x3, x5, x7;
   1693     int i = 8;
   1694     uint32 pred_word, dst_word;
   1695     int res, res2;
   1696 
   1697     /* preset the offset, such that we can take advantage of pre-offset addressing mode   */
   1698     rec -= lx;
   1699     pred -= 16;
   1700 
   1701     while (i--)
   1702     {
   1703         x3 = blk[3];
   1704         blk[3] = 0;
   1705         blk += 8;
   1706 
                /* x3 holds the -W5 term here, hence "8192 + x3" in the first
                   word and "8192 - x3" in the second */
   1707         x1 = (W3 * x3 + 4) >> 3;
   1708         x3 = (-W5 * x3 + 4) >> 3;
   1709 
                /* 181/256 ~= 1/sqrt(2) */
   1710         x7 = (-181 * (x3 + x1) + 128) >> 8;
   1711         x5 = (181 * (x3 - x1) + 128) >> 8;
   1712 
   1713         pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
   1714         res = (8192 + x1) >> 14;
   1715         ADD_AND_CLIP1(res);
   1716         res2 = (8192 + x7) >> 14;
   1717         ADD_AND_CLIP2(res2);
   1718         dst_word = (res2 << 8) | res;
   1719         res = (8192 + x5) >> 14;
   1720         ADD_AND_CLIP3(res);
   1721         dst_word |= (res << 16);
   1722         res = (8192 + x3) >> 14;
   1723         ADD_AND_CLIP4(res);
   1724         dst_word |= (res << 24);
   1725         *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
   1726 
   1727         pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
   1728         res = (8192 - x3) >> 14;
   1729         ADD_AND_CLIP1(res);
   1730         res2 = (8192 - x5) >> 14;
   1731         ADD_AND_CLIP2(res2);
   1732         dst_word = (res2 << 8) | res;
   1733         res = (8192 - x7) >> 14;
   1734         ADD_AND_CLIP3(res);
   1735         dst_word |= (res << 16);
   1736         res = (8192 - x1) >> 14;
   1737         ADD_AND_CLIP4(res);
   1738         dst_word |= (res << 24);
   1739         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
   1740     }
   1741     return ;
   1742 }
   1743 
   1744 #endif /* SMALL_DCT */
   1745 
   1746 void idct_rowzmv(Short *blk, UChar *rec, UChar *pred, Int lx)
   1747 {
            /*
             * General row pass with prediction: reads all 8 coefficients of
             * each of the 8 rows in blk (8 Shorts per row) and resets each one
             * to 0. For each row the 8 residuals are combined with the
             * prediction word read from pred through ADD_AND_CLIP1..4, packed
             * four per 32-bit word (first result in bits 0-7) and stored at
             * rec and rec + 4. pred advances by 16 bytes per row, rec by lx.
             * NOTE(review): ADD_AND_CLIP1..4 and W1..W7 are defined outside
             * this chunk; the macros presumably add byte 0..3 of the local
             * pred_word to their argument and clamp to 0..255 -- confirm.
             * NOTE(review): the uint32 loads/stores assume 4-byte aligned,
             * little-endian buffers -- confirm for the platform.
             */
   1748     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
   1749     int i = 8;
   1750     uint32 pred_word, dst_word;
   1751     int res, res2;
   1752 
   1753     /* preset the offset, such that we can take advantage of pre-offset addressing mode   */
   1754     rec -= lx;
   1755     pred -= 16;
   1756     blk -= 8;
   1757 
   1758     while (i--)
   1759     {
                /* blk + 8 is the row being processed, so blk[8 + k] is its
                   coefficient k: x1 <- c4 (scaled by 256), x2 <- c6, x3 <- c2,
                   x4 <- c1, x5 <- c7, x6 <- c5, x7 <- c3 */
   1760         x1 = (int32)blk[12] << 8;
   1761         blk[12] = 0;
   1762         x2 = blk[14];
   1763         blk[14] = 0;
   1764         x3 = blk[10];
   1765         blk[10] = 0;
   1766         x4 = blk[9];
   1767         blk[9] = 0;
   1768         x5 = blk[15];
   1769         blk[15] = 0;
   1770         x6 = blk[13];
   1771         blk[13] = 0;
   1772         x7 = blk[11];
   1773         blk[11] = 0;
                /* coefficient 0 scaled by 256; 8192 (= 1 << 13) is the rounding
                   offset for the final >> 14 */
   1774         x0 = ((*(blk += 8)) << 8) + 8192;
   1775         *blk = 0;   /* for proper rounding in the fourth stage */
   1776 
   1777         /* first stage */
                /* odd coefficients; x8 carries the shared product plus the
                   rounding offset 4 for the >> 3 */
   1778         x8 = W7 * (x4 + x5) + 4;
   1779         x4 = (x8 + (W1 - W7) * x4) >> 3;
   1780         x5 = (x8 - (W1 + W7) * x5) >> 3;
   1781         x8 = W3 * (x6 + x7) + 4;
   1782         x6 = (x8 - (W3 - W5) * x6) >> 3;
   1783         x7 = (x8 - (W3 + W5) * x7) >> 3;
   1784 
   1785         /* second stage */
                /* even coefficients plus the first odd-part butterflies */
   1786         x8 = x0 + x1;
   1787         x0 -= x1;
   1788         x1 = W6 * (x3 + x2) + 4;
   1789         x2 = (x1 - (W2 + W6) * x2) >> 3;
   1790         x3 = (x1 + (W2 - W6) * x3) >> 3;
   1791         x1 = x4 + x6;
   1792         x4 -= x6;
   1793         x6 = x5 + x7;
   1794         x5 -= x7;
   1795 
   1796         /* third stage */
                /* 181/256 ~= 1/sqrt(2) */
   1797         x7 = x8 + x3;
   1798         x8 -= x3;
   1799         x3 = x0 + x2;
   1800         x0 -= x2;
   1801         x2 = (181 * (x4 + x5) + 128) >> 8;
   1802         x4 = (181 * (x4 - x5) + 128) >> 8;
   1803 
   1804         /* fourth stage */
   1805         pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
   1806 
   1807         res = (x7 + x1) >> 14;
   1808         ADD_AND_CLIP1(res);
   1809         res2 = (x3 + x2) >> 14;
   1810         ADD_AND_CLIP2(res2);
   1811         dst_word = (res2 << 8) | res;
   1812         res = (x0 + x4) >> 14;
   1813         ADD_AND_CLIP3(res);
   1814         dst_word |= (res << 16);
   1815         res = (x8 + x6) >> 14;
   1816         ADD_AND_CLIP4(res);
   1817         dst_word |= (res << 24);
   1818         *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
   1819 
   1820         pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
   1821 
   1822         res = (x8 - x6) >> 14;
   1823         ADD_AND_CLIP1(res);
   1824         res2 = (x0 - x4) >> 14;
   1825         ADD_AND_CLIP2(res2);
   1826         dst_word = (res2 << 8) | res;
   1827         res = (x3 - x2) >> 14;
   1828         ADD_AND_CLIP3(res);
   1829         dst_word |= (res << 16);
   1830         res = (x7 - x1) >> 14;
   1831         ADD_AND_CLIP4(res);
   1832         dst_word |= (res << 24);
   1833         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
   1834     }
   1835     return;
   1836 }
   1837 
   1838 /*----------------------------------------------------------------------------
   1839 ;  End Function: idctcol
   1840 ----------------------------------------------------------------------------*/
   1841 /* ======================================================================== */
   1842 /*  Function : BlockIDCTMotionComp                                              */
   1843 /*  Date     : 10/16/2000                                                   */
   1844 /*  Purpose  : fast IDCT routine                                    */
   1845 /*  In/out   :                                                              */
   1846 /*      Short *block    dequantized coefficients (16-bit)
   1847         UChar *rec      output pixels; UChar *pred  prediction (row stride 16)
   1848         Int lx_intra    (output row stride << 1) | intra flag               */
   1849 /*  Modified :   7/31/01, add checking for all-zero and DC-only block.  */
   1850 /*              do 8 columns at a time                                      */
   1851 /*               8/2/01, do column first then row-IDCT.                 */
   1852 /*               8/2/01, remove clipping (included in motion comp).     */
   1853 /*               8/7/01, combine with motion comp.                      */
   1854 /*               8/8/01, use AAN IDCT                                       */
   1855 /*               9/4/05, use Chen's IDCT and 16 bit block                   */
   1856 /* ======================================================================== */
   1857 void BlockIDCTMotionComp(Short *block, UChar *bitmapcol, UChar bitmaprow,
   1858                          Int dctMode, UChar *rec, UChar *pred, Int lx_intra)
   1859 {
   1860     Int i;
   1861     Int tmp, tmp2;
   1862     ULong tmp4;
   1863     Int bmap;
   1864     Short *ptr = block;
   1865     UChar *endcol;
   1866     UInt mask = 0xFF;
   1867     Int lx = lx_intra >> 1;
   1868     Int intra = (lx_intra & 1);
   1869 
   1870     /*  all-zero block */
   1871     if (dctMode == 0 || bitmaprow == 0)
   1872     {
   1873         if (intra)
   1874         {
   1875             *((ULong*)rec) = *((ULong*)(rec + 4)) = 0;
   1876             *((ULong*)(rec += lx)) = 0;
   1877             *((ULong*)(rec + 4)) = 0;
   1878             *((ULong*)(rec += lx)) = 0;
   1879             *((ULong*)(rec + 4)) = 0;
   1880             *((ULong*)(rec += lx)) = 0;
   1881             *((ULong*)(rec + 4)) = 0;
   1882             *((ULong*)(rec += lx)) = 0;
   1883             *((ULong*)(rec + 4)) = 0;
   1884             *((ULong*)(rec += lx)) = 0;
   1885             *((ULong*)(rec + 4)) = 0;
   1886             *((ULong*)(rec += lx)) = 0;
   1887             *((ULong*)(rec + 4)) = 0;
   1888             *((ULong*)(rec += lx)) = 0;
   1889             *((ULong*)(rec + 4)) = 0;
   1890             return ;
   1891         }
   1892         else /* copy from previous frame */
   1893         {
   1894             *((ULong*)rec) = *((ULong*)pred);
   1895             *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
   1896             *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
   1897             *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
   1898             *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
   1899             *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
   1900             *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
   1901             *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
   1902             *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
   1903             *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
   1904             *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
   1905             *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
   1906             *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
   1907             *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
   1908             *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
   1909             *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
   1910             return ;
   1911         }
   1912     }
   1913 
   1914     /* Test for DC only block */
   1915     if (dctMode == 1 || (bitmaprow == 0x80 && bitmapcol[0] == 0x80))
   1916     {
   1917         i = ((block[0] << 3) + 32) >> 6;
   1918         block[0] = 0;
   1919         if (intra)
   1920         {
   1921             if ((UInt)i > mask) i = mask & (~(i >> 31));
   1922 
   1923             tmp = i | (i << 8);
   1924             tmp |= (tmp << 16);
   1925 
   1926             *((ULong*)rec) = *((ULong*)(rec + 4)) = tmp;
   1927             *((ULong*)(rec += lx)) = tmp;
   1928             *((ULong*)(rec + 4)) = tmp;
   1929             *((ULong*)(rec += lx)) = tmp;
   1930             *((ULong*)(rec + 4)) = tmp;
   1931             *((ULong*)(rec += lx)) = tmp;
   1932             *((ULong*)(rec + 4)) = tmp;
   1933             *((ULong*)(rec += lx)) = tmp;
   1934             *((ULong*)(rec + 4)) = tmp;
   1935             *((ULong*)(rec += lx)) = tmp;
   1936             *((ULong*)(rec + 4)) = tmp;
   1937             *((ULong*)(rec += lx)) = tmp;
   1938             *((ULong*)(rec + 4)) = tmp;
   1939             *((ULong*)(rec += lx)) = tmp;
   1940             *((ULong*)(rec + 4)) = tmp;
   1941 
   1942             return ;
   1943         }
   1944         else
   1945         {
   1946             endcol = rec + (lx << 3);
   1947             do
   1948             {
   1949                 tmp4 = *((ULong*)pred);
   1950                 tmp2 = tmp4 & 0xFF;
   1951                 tmp2 += i;
   1952                 if ((UInt)tmp2 > mask) tmp2 = mask & (~(tmp2 >> 31));
   1953                 tmp = (tmp4 >> 8) & 0xFF;
   1954                 tmp += i;
   1955                 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
   1956                 tmp2 |= (tmp << 8);
   1957                 tmp = (tmp4 >> 16) & 0xFF;
   1958                 tmp += i;
   1959                 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
   1960                 tmp2 |= (tmp << 16);
   1961                 tmp = (tmp4 >> 24) & 0xFF;
   1962                 tmp += i;
   1963                 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
   1964                 tmp2 |= (tmp << 24);
   1965                 *((ULong*)rec) = tmp2;
   1966 
   1967                 tmp4 = *((ULong*)(pred + 4));
   1968                 tmp2 = tmp4 & 0xFF;
   1969                 tmp2 += i;
   1970                 if ((UInt)tmp2 > mask) tmp2 = mask & (~(tmp2 >> 31));
   1971                 tmp = (tmp4 >> 8) & 0xFF;
   1972                 tmp += i;
   1973                 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
   1974                 tmp2 |= (tmp << 8);
   1975                 tmp = (tmp4 >> 16) & 0xFF;
   1976                 tmp += i;
   1977                 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
   1978                 tmp2 |= (tmp << 16);
   1979                 tmp = (tmp4 >> 24) & 0xFF;
   1980                 tmp += i;
   1981                 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
   1982                 tmp2 |= (tmp << 24);
   1983                 *((ULong*)(rec + 4)) = tmp2;
   1984 
   1985                 rec += lx;
   1986                 pred += 16;
   1987             }
   1988             while (rec < endcol);
   1989             return ;
   1990         }
   1991     }
   1992 
   1993     for (i = 0; i < dctMode; i++)
   1994     {
   1995         bmap = (Int)bitmapcol[i];
   1996         if (bmap)
   1997         {
   1998             if ((bmap&0xf) == 0)
   1999                 (*(idctcolVCA[bmap>>4]))(ptr);
   2000             else
   2001                 idct_col(ptr);
   2002         }
   2003         ptr++;
   2004     }
   2005 
   2006     if ((bitmaprow&0xf) == 0)
   2007     {
   2008         if (intra)
   2009             (*(idctrowVCAIntra[(Int)(bitmaprow>>4)]))(block, rec, lx);
   2010         else
   2011             (*(idctrowVCAzmv[(Int)(bitmaprow>>4)]))(block, rec, pred, lx);
   2012     }
   2013     else
   2014     {
   2015         if (intra)
   2016             idct_rowIntra(block, rec, lx);
   2017         else
   2018             idct_rowzmv(block, rec, pred, lx);
   2019     }
   2020 }
   2021