Home | History | Annotate | Download | only in src
      1 /* ------------------------------------------------------------------
      2  * Copyright (C) 1998-2009 PacketVideo
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
     13  * express or implied.
     14  * See the License for the specific language governing permissions
     15  * and limitations under the License.
     16  * -------------------------------------------------------------------
     17  */
     18 /*
     19 
     20 ------------------------------------------------------------------------------
     21  REVISION HISTORY
     22  Who:   Date: July/2001
     23  Description:   1. Optimized BlockIDCT bitmap checking.
     24                 2. Rearranged functions.
     25                 3. Do column IDCT first, then row IDCT.
     26                 4. Combine motion comp and IDCT, require
     27                    two sets of row IDCTs one for INTRA
     28                    and one for INTER.
     29                 5. Add AAN IDCT
     30 
     31  Who:   Date: 8/16/01
     32                 1. Increase the input precision to 8 bits, i.e. change RDCTBITS
     33                    to 11, have to comment out all in-line assembly since 16 bit
     34                     multiplication doesn't work. Try to use different precision with
     35                     32 bit mult. but hasn't finished. Turns out that without in-line
     36                     assembly the performance doesn't change much (only 1%).
     37  Who:   Date: 9/04/05
     38                 1. Replace AAN IDCT with Chen's IDCT to accommodate 16 bit data type.
     39 
     40 */
     41 #include "oscl_base_macros.h" // for OSCL_UNUSED_ARG
     42 #include "mp4def.h"
     43 #include "mp4enc_lib.h"
     44 #include "mp4lib_int.h"
     45 #include "dct.h"
     46 
     47 #define ADD_CLIP    { \
     48             tmp = *rec + tmp; \
     49         if((UInt)tmp > mask) tmp = mask&(~(tmp>>31)); \
     50         *rec++ = tmp;   \
     51         }
     52 
     53 #define INTRA_CLIP  { \
     54         if((UInt)tmp > mask) tmp = mask&(~(tmp>>31)); \
     55         *rec++ = tmp;   \
     56         }
     57 
     58 
     59 #define CLIP_RESULT(x)      if((UInt)x > 0xFF){x = 0xFF & (~(x>>31));}
     60 #define ADD_AND_CLIP1(x)    x += (pred_word&0xFF); CLIP_RESULT(x);
     61 #define ADD_AND_CLIP2(x)    x += ((pred_word>>8)&0xFF); CLIP_RESULT(x);
     62 #define ADD_AND_CLIP3(x)    x += ((pred_word>>16)&0xFF); CLIP_RESULT(x);
     63 #define ADD_AND_CLIP4(x)    x += ((pred_word>>24)&0xFF); CLIP_RESULT(x);
     64 
     65 
     66 void idct_col0(Short *blk)
     67 {
     68     OSCL_UNUSED_ARG(blk);
     69 
     70     return;
     71 }
     72 
     73 void idct_col1(Short *blk)
     74 {
     75     blk[0] = blk[8] = blk[16] = blk[24] = blk[32] = blk[40] = blk[48] = blk[56] =
     76                                               blk[0] << 3;
     77     return ;
     78 }
     79 
     80 void idct_col2(Short *blk)
     81 {
     82     int32 x0, x1, x3, x5, x7;//, x8;
     83 
     84     x1 = blk[8];
     85     x0 = ((int32)blk[0] << 11) + 128;
     86     /* both upper and lower*/
     87 
     88     x7 = W7 * x1;
     89     x1 = W1 * x1;
     90 
     91     x3 = x7;
     92     x5 = (181 * (x1 - x7) + 128) >> 8;
     93     x7 = (181 * (x1 + x7) + 128) >> 8;
     94 
     95     blk[0] = (x0 + x1) >> 8;
     96     blk[8] = (x0 + x7) >> 8;
     97     blk[16] = (x0 + x5) >> 8;
     98     blk[24] = (x0 + x3) >> 8;
     99     blk[56] = (x0 - x1) >> 8;
    100     blk[48] = (x0 - x7) >> 8;
    101     blk[40] = (x0 - x5) >> 8;
    102     blk[32] = (x0 - x3) >> 8;
    103     return ;
    104 }
    105 
    106 void idct_col3(Short *blk)
    107 {
    108     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    109 
    110     x2 = blk[16];
    111     x1 = blk[8];
    112     x0 = ((int32)blk[0] << 11) + 128;
    113 
    114     x4 = x0;
    115     x6 = W6 * x2;
    116     x2 = W2 * x2;
    117     x8 = x0 - x2;
    118     x0 += x2;
    119     x2 = x8;
    120     x8 = x4 - x6;
    121     x4 += x6;
    122     x6 = x8;
    123 
    124     x7 = W7 * x1;
    125     x1 = W1 * x1;
    126     x3 = x7;
    127     x5 = (181 * (x1 - x7) + 128) >> 8;
    128     x7 = (181 * (x1 + x7) + 128) >> 8;
    129 
    130     blk[0] = (x0 + x1) >> 8;
    131     blk[8] = (x4 + x7) >> 8;
    132     blk[16] = (x6 + x5) >> 8;
    133     blk[24] = (x2 + x3) >> 8;
    134     blk[56] = (x0 - x1) >> 8;
    135     blk[48] = (x4 - x7) >> 8;
    136     blk[40] = (x6 - x5) >> 8;
    137     blk[32] = (x2 - x3) >> 8;
    138     return ;
    139 }
    140 
    141 void idct_col4(Short *blk)
    142 {
    143     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    144     x2 = blk[16];
    145     x1 = blk[8];
    146     x3 = blk[24];
    147     x0 = ((int32)blk[0] << 11) + 128;
    148 
    149     x4 = x0;
    150     x6 = W6 * x2;
    151     x2 = W2 * x2;
    152     x8 = x0 - x2;
    153     x0 += x2;
    154     x2 = x8;
    155     x8 = x4 - x6;
    156     x4 += x6;
    157     x6 = x8;
    158 
    159     x7 = W7 * x1;
    160     x1 = W1 * x1;
    161     x5 = W3 * x3;
    162     x3 = -W5 * x3;
    163     x8 = x1 - x5;
    164     x1 += x5;
    165     x5 = x8;
    166     x8 = x7 - x3;
    167     x3 += x7;
    168     x7 = (181 * (x5 + x8) + 128) >> 8;
    169     x5 = (181 * (x5 - x8) + 128) >> 8;
    170 
    171 
    172     blk[0] = (x0 + x1) >> 8;
    173     blk[8] = (x4 + x7) >> 8;
    174     blk[16] = (x6 + x5) >> 8;
    175     blk[24] = (x2 + x3) >> 8;
    176     blk[56] = (x0 - x1) >> 8;
    177     blk[48] = (x4 - x7) >> 8;
    178     blk[40] = (x6 - x5) >> 8;
    179     blk[32] = (x2 - x3) >> 8;
    180     return ;
    181 }
    182 
    183 #ifndef SMALL_DCT
    184 void idct_col0x40(Short *blk)
    185 {
    186     int32 x1, x3, x5, x7;//, x8;
    187 
    188     x1 = blk[8];
    189     /* both upper and lower*/
    190 
    191     x7 = W7 * x1;
    192     x1 = W1 * x1;
    193 
    194     x3 = x7;
    195     x5 = (181 * (x1 - x7) + 128) >> 8;
    196     x7 = (181 * (x1 + x7) + 128) >> 8;
    197 
    198     blk[0] = (128 + x1) >> 8;
    199     blk[8] = (128 + x7) >> 8;
    200     blk[16] = (128 + x5) >> 8;
    201     blk[24] = (128 + x3) >> 8;
    202     blk[56] = (128 - x1) >> 8;
    203     blk[48] = (128 - x7) >> 8;
    204     blk[40] = (128 - x5) >> 8;
    205     blk[32] = (128 - x3) >> 8;
    206 
    207     return ;
    208 }
    209 
    210 void idct_col0x20(Short *blk)
    211 {
    212     int32 x0, x2, x4, x6;
    213 
    214     x2 = blk[16];
    215     x6 = W6 * x2;
    216     x2 = W2 * x2;
    217     x0 = 128 + x2;
    218     x2 = 128 - x2;
    219     x4 = 128 + x6;
    220     x6 = 128 - x6;
    221 
    222     blk[0] = (x0) >> 8;
    223     blk[56] = (x0) >> 8;
    224     blk[8] = (x4) >> 8;
    225     blk[48] = (x4) >> 8;
    226     blk[16] = (x6) >> 8;
    227     blk[40] = (x6) >> 8;
    228     blk[24] = (x2) >> 8;
    229     blk[32] = (x2) >> 8;
    230 
    231     return ;
    232 }
    233 
    234 void idct_col0x10(Short *blk)
    235 {
    236     int32 x1, x3, x5,  x7;
    237 
    238     x3 = blk[24];
    239     x1 = W3 * x3;
    240     x3 = W5 * x3;
    241 
    242     x7 = (181 * (x3 - x1) + 128) >> 8;
    243     x5 = (-181 * (x1 + x3) + 128) >> 8;
    244 
    245 
    246     blk[0] = (128 + x1) >> 8;
    247     blk[8] = (128 + x7) >> 8;
    248     blk[16] = (128 + x5) >> 8;
    249     blk[24] = (128 - x3) >> 8;
    250     blk[56] = (128 - x1) >> 8;
    251     blk[48] = (128 - x7) >> 8;
    252     blk[40] = (128 - x5) >> 8;
    253     blk[32] = (128 + x3) >> 8;
    254 
    255     return ;
    256 }
    257 
    258 #endif /* SMALL_DCT */
    259 
    260 void idct_col(Short *blk)
    261 {
    262     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    263 
    264     x1 = (int32)blk[32] << 11;
    265     x2 = blk[48];
    266     x3 = blk[16];
    267     x4 = blk[8];
    268     x5 = blk[56];
    269     x6 = blk[40];
    270     x7 = blk[24];
    271     x0 = ((int32)blk[0] << 11) + 128;
    272 
    273     /* first stage */
    274     x8 = W7 * (x4 + x5);
    275     x4 = x8 + (W1 - W7) * x4;
    276     x5 = x8 - (W1 + W7) * x5;
    277     x8 = W3 * (x6 + x7);
    278     x6 = x8 - (W3 - W5) * x6;
    279     x7 = x8 - (W3 + W5) * x7;
    280 
    281     /* second stage */
    282     x8 = x0 + x1;
    283     x0 -= x1;
    284     x1 = W6 * (x3 + x2);
    285     x2 = x1 - (W2 + W6) * x2;
    286     x3 = x1 + (W2 - W6) * x3;
    287     x1 = x4 + x6;
    288     x4 -= x6;
    289     x6 = x5 + x7;
    290     x5 -= x7;
    291 
    292     /* third stage */
    293     x7 = x8 + x3;
    294     x8 -= x3;
    295     x3 = x0 + x2;
    296     x0 -= x2;
    297     x2 = (181 * (x4 + x5) + 128) >> 8;
    298     x4 = (181 * (x4 - x5) + 128) >> 8;
    299 
    300     /* fourth stage */
    301     blk[0]    = (x7 + x1) >> 8;
    302     blk[8] = (x3 + x2) >> 8;
    303     blk[16] = (x0 + x4) >> 8;
    304     blk[24] = (x8 + x6) >> 8;
    305     blk[32] = (x8 - x6) >> 8;
    306     blk[40] = (x0 - x4) >> 8;
    307     blk[48] = (x3 - x2) >> 8;
    308     blk[56] = (x7 - x1) >> 8;
    309 
    310     return ;
    311 }
    312 
    313 /* This function should not be called at all ****/
    314 void idct_row0Inter(Short *srce, UChar *rec, Int lx)
    315 {
    316     OSCL_UNUSED_ARG(srce);
    317 
    318     OSCL_UNUSED_ARG(rec);
    319 
    320     OSCL_UNUSED_ARG(lx);
    321 
    322     return;
    323 }
    324 
    325 void idct_row1Inter(Short *blk, UChar *rec, Int lx)
    326 {
    327     int tmp;
    328     int i = 8;
    329     uint32 pred_word, dst_word;
    330     int res, res2;
    331 
    332     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
    333     rec -= lx;
    334     blk -= 8;
    335 
    336     while (i--)
    337     {
    338         tmp = (*(blk += 8) + 32) >> 6;
    339         *blk = 0;
    340 
    341         pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
    342         res = tmp + (pred_word & 0xFF);
    343         CLIP_RESULT(res);
    344         res2 = tmp + ((pred_word >> 8) & 0xFF);
    345         CLIP_RESULT(res2);
    346         dst_word = (res2 << 8) | res;
    347         res = tmp + ((pred_word >> 16) & 0xFF);
    348         CLIP_RESULT(res);
    349         dst_word |= (res << 16);
    350         res = tmp + ((pred_word >> 24) & 0xFF);
    351         CLIP_RESULT(res);
    352         dst_word |= (res << 24);
    353         *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
    354 
    355         pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
    356         res = tmp + (pred_word & 0xFF);
    357         CLIP_RESULT(res);
    358         res2 = tmp + ((pred_word >> 8) & 0xFF);
    359         CLIP_RESULT(res2);
    360         dst_word = (res2 << 8) | res;
    361         res = tmp + ((pred_word >> 16) & 0xFF);
    362         CLIP_RESULT(res);
    363         dst_word |= (res << 16);
    364         res = tmp + ((pred_word >> 24) & 0xFF);
    365         CLIP_RESULT(res);
    366         dst_word |= (res << 24);
    367         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
    368     }
    369     return;
    370 }
    371 
    372 void idct_row2Inter(Short *blk, UChar *rec, Int lx)
    373 {
    374     int32 x0, x1, x2, x4, x5;
    375     int i = 8;
    376     uint32 pred_word, dst_word;
    377     int res, res2;
    378 
    379     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
    380     rec -= lx;
    381     blk -= 8;
    382 
    383     while (i--)
    384     {
    385         /* shortcut */
    386         x4 = blk[9];
    387         blk[9] = 0;
    388         x0 = ((*(blk += 8)) << 8) + 8192;
    389         *blk = 0;  /* for proper rounding in the fourth stage */
    390 
    391         /* first stage */
    392         x5 = (W7 * x4 + 4) >> 3;
    393         x4 = (W1 * x4 + 4) >> 3;
    394 
    395         /* third stage */
    396         x2 = (181 * (x4 + x5) + 128) >> 8;
    397         x1 = (181 * (x4 - x5) + 128) >> 8;
    398 
    399         /* fourth stage */
    400         pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
    401         res = (x0 + x4) >> 14;
    402         ADD_AND_CLIP1(res);
    403         res2 = (x0 + x2) >> 14;
    404         ADD_AND_CLIP2(res2);
    405         dst_word = (res2 << 8) | res;
    406         res = (x0 + x1) >> 14;
    407         ADD_AND_CLIP3(res);
    408         dst_word |= (res << 16);
    409         res = (x0 + x5) >> 14;
    410         ADD_AND_CLIP4(res);
    411         dst_word |= (res << 24);
    412         *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
    413 
    414         pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
    415         res = (x0 - x5) >> 14;
    416         ADD_AND_CLIP1(res);
    417         res2 = (x0 - x1) >> 14;
    418         ADD_AND_CLIP2(res2);
    419         dst_word = (res2 << 8) | res;
    420         res = (x0 - x2) >> 14;
    421         ADD_AND_CLIP3(res);
    422         dst_word |= (res << 16);
    423         res = (x0 - x4) >> 14;
    424         ADD_AND_CLIP4(res);
    425         dst_word |= (res << 24);
    426         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
    427     }
    428     return ;
    429 }
    430 
    431 void idct_row3Inter(Short *blk, UChar *rec, Int lx)
    432 {
    433     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    434     int i = 8;
    435     uint32 pred_word, dst_word;
    436     int res, res2;
    437 
    438     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
    439     rec -= lx;
    440     blk -= 8;
    441 
    442     while (i--)
    443     {
    444         x2 = blk[10];
    445         blk[10] = 0;
    446         x1 = blk[9];
    447         blk[9] = 0;
    448         x0 = ((*(blk += 8)) << 8) + 8192;
    449         *blk = 0;  /* for proper rounding in the fourth stage */
    450         /* both upper and lower*/
    451         /* both x2orx6 and x0orx4 */
    452 
    453         x4 = x0;
    454         x6 = (W6 * x2 + 4) >> 3;
    455         x2 = (W2 * x2 + 4) >> 3;
    456         x8 = x0 - x2;
    457         x0 += x2;
    458         x2 = x8;
    459         x8 = x4 - x6;
    460         x4 += x6;
    461         x6 = x8;
    462 
    463         x7 = (W7 * x1 + 4) >> 3;
    464         x1 = (W1 * x1 + 4) >> 3;
    465         x3 = x7;
    466         x5 = (181 * (x1 - x7) + 128) >> 8;
    467         x7 = (181 * (x1 + x7) + 128) >> 8;
    468 
    469         pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
    470         res = (x0 + x1) >> 14;
    471         ADD_AND_CLIP1(res);
    472         res2 = (x4 + x7) >> 14;
    473         ADD_AND_CLIP2(res2);
    474         dst_word = (res2 << 8) | res;
    475         res = (x6 + x5) >> 14;
    476         ADD_AND_CLIP3(res);
    477         dst_word |= (res << 16);
    478         res = (x2 + x3) >> 14;
    479         ADD_AND_CLIP4(res);
    480         dst_word |= (res << 24);
    481         *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
    482 
    483         pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
    484         res = (x2 - x3) >> 14;
    485         ADD_AND_CLIP1(res);
    486         res2 = (x6 - x5) >> 14;
    487         ADD_AND_CLIP2(res2);
    488         dst_word = (res2 << 8) | res;
    489         res = (x4 - x7) >> 14;
    490         ADD_AND_CLIP3(res);
    491         dst_word |= (res << 16);
    492         res = (x0 - x1) >> 14;
    493         ADD_AND_CLIP4(res);
    494         dst_word |= (res << 24);
    495         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
    496     }
    497 
    498     return ;
    499 }
    500 
    501 void idct_row4Inter(Short *blk, UChar *rec, Int lx)
    502 {
    503     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    504     int i = 8;
    505     uint32 pred_word, dst_word;
    506     int res, res2;
    507 
    508     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
    509     rec -= lx;
    510     blk -= 8;
    511 
    512     while (i--)
    513     {
    514         x2 = blk[10];
    515         blk[10] = 0;
    516         x1 = blk[9];
    517         blk[9] = 0;
    518         x3 = blk[11];
    519         blk[11] = 0;
    520         x0 = ((*(blk += 8)) << 8) + 8192;
    521         *blk = 0;   /* for proper rounding in the fourth stage */
    522 
    523         x4 = x0;
    524         x6 = (W6 * x2 + 4) >> 3;
    525         x2 = (W2 * x2 + 4) >> 3;
    526         x8 = x0 - x2;
    527         x0 += x2;
    528         x2 = x8;
    529         x8 = x4 - x6;
    530         x4 += x6;
    531         x6 = x8;
    532 
    533         x7 = (W7 * x1 + 4) >> 3;
    534         x1 = (W1 * x1 + 4) >> 3;
    535         x5 = (W3 * x3 + 4) >> 3;
    536         x3 = (- W5 * x3 + 4) >> 3;
    537         x8 = x1 - x5;
    538         x1 += x5;
    539         x5 = x8;
    540         x8 = x7 - x3;
    541         x3 += x7;
    542         x7 = (181 * (x5 + x8) + 128) >> 8;
    543         x5 = (181 * (x5 - x8) + 128) >> 8;
    544 
    545         pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
    546         res = (x0 + x1) >> 14;
    547         ADD_AND_CLIP1(res);
    548         res2 = (x4 + x7) >> 14;
    549         ADD_AND_CLIP2(res2);
    550         dst_word = (res2 << 8) | res;
    551         res = (x6 + x5) >> 14;
    552         ADD_AND_CLIP3(res);
    553         dst_word |= (res << 16);
    554         res = (x2 + x3) >> 14;
    555         ADD_AND_CLIP4(res);
    556         dst_word |= (res << 24);
    557         *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
    558 
    559         pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
    560         res = (x2 - x3) >> 14;
    561         ADD_AND_CLIP1(res);
    562         res2 = (x6 - x5) >> 14;
    563         ADD_AND_CLIP2(res2);
    564         dst_word = (res2 << 8) | res;
    565         res = (x4 - x7) >> 14;
    566         ADD_AND_CLIP3(res);
    567         dst_word |= (res << 16);
    568         res = (x0 - x1) >> 14;
    569         ADD_AND_CLIP4(res);
    570         dst_word |= (res << 24);
    571         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
    572     }
    573     return ;
    574 }
    575 
    576 #ifndef SMALL_DCT
    577 void idct_row0x40Inter(Short *blk, UChar *rec, Int lx)
    578 {
    579     int32 x1, x2, x4, x5;
    580     int i = 8;
    581     uint32 pred_word, dst_word;
    582     int res, res2;
    583 
    584     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
    585     rec -= lx;
    586 
    587     while (i--)
    588     {
    589         /* shortcut */
    590         x4 = blk[1];
    591         blk[1] = 0;
    592         blk += 8;  /* for proper rounding in the fourth stage */
    593 
    594         /* first stage */
    595         x5 = (W7 * x4 + 4) >> 3;
    596         x4 = (W1 * x4 + 4) >> 3;
    597 
    598         /* third stage */
    599         x2 = (181 * (x4 + x5) + 128) >> 8;
    600         x1 = (181 * (x4 - x5) + 128) >> 8;
    601 
    602         /* fourth stage */
    603         pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
    604         res = (8192 + x4) >> 14;
    605         ADD_AND_CLIP1(res);
    606         res2 = (8192 + x2) >> 14;
    607         ADD_AND_CLIP2(res2);
    608         dst_word = (res2 << 8) | res;
    609         res = (8192 + x1) >> 14;
    610         ADD_AND_CLIP3(res);
    611         dst_word |= (res << 16);
    612         res = (8192 + x5) >> 14;
    613         ADD_AND_CLIP4(res);
    614         dst_word |= (res << 24);
    615         *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
    616 
    617         pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
    618         res = (8192 - x5) >> 14;
    619         ADD_AND_CLIP1(res);
    620         res2 = (8192 - x1) >> 14;
    621         ADD_AND_CLIP2(res2);
    622         dst_word = (res2 << 8) | res;
    623         res = (8192 - x2) >> 14;
    624         ADD_AND_CLIP3(res);
    625         dst_word |= (res << 16);
    626         res = (8192 - x4) >> 14;
    627         ADD_AND_CLIP4(res);
    628         dst_word |= (res << 24);
    629         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
    630     }
    631     return ;
    632 }
    633 
    634 void idct_row0x20Inter(Short *blk, UChar *rec, Int lx)
    635 {
    636     int32 x0, x2, x4, x6;
    637     int i = 8;
    638     uint32 pred_word, dst_word;
    639     int res, res2;
    640 
    641     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
    642     rec -= lx;
    643 
    644     while (i--)
    645     {
    646         x2 = blk[2];
    647         blk[2] = 0;
    648         blk += 8; /* for proper rounding in the fourth stage */
    649         /* both upper and lower*/
    650         /* both x2orx6 and x0orx4 */
    651         x6 = (W6 * x2 + 4) >> 3;
    652         x2 = (W2 * x2 + 4) >> 3;
    653         x0 = 8192 + x2;
    654         x2 = 8192 - x2;
    655         x4 = 8192 + x6;
    656         x6 = 8192 - x6;
    657 
    658         pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
    659         res = (x0) >> 14;
    660         ADD_AND_CLIP1(res);
    661         res2 = (x4) >> 14;
    662         ADD_AND_CLIP2(res2);
    663         dst_word = (res2 << 8) | res;
    664         res = (x6) >> 14;
    665         ADD_AND_CLIP3(res);
    666         dst_word |= (res << 16);
    667         res = (x2) >> 14;
    668         ADD_AND_CLIP4(res);
    669         dst_word |= (res << 24);
    670         *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
    671 
    672         pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
    673         res = (x2) >> 14;
    674         ADD_AND_CLIP1(res);
    675         res2 = (x6) >> 14;
    676         ADD_AND_CLIP2(res2);
    677         dst_word = (res2 << 8) | res;
    678         res = (x4) >> 14;
    679         ADD_AND_CLIP3(res);
    680         dst_word |= (res << 16);
    681         res = (x0) >> 14;
    682         ADD_AND_CLIP4(res);
    683         dst_word |= (res << 24);
    684         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
    685     }
    686 
    687     return ;
    688 }
    689 
    690 void idct_row0x10Inter(Short *blk, UChar *rec, Int lx)
    691 {
    692     int32 x1, x3, x5, x7;
    693     int i = 8;
    694     uint32 pred_word, dst_word;
    695     int res, res2;
    696 
    697     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
    698     rec -= lx;
    699 
    700     while (i--)
    701     {
    702         x3 = blk[3];
    703         blk[3] = 0;
    704         blk += 8;
    705 
    706         x1 = (W3 * x3 + 4) >> 3;
    707         x3 = (-W5 * x3 + 4) >> 3;
    708 
    709         x7 = (-181 * (x3 + x1) + 128) >> 8;
    710         x5 = (181 * (x3 - x1) + 128) >> 8;
    711 
    712         pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
    713         res = (8192 + x1) >> 14;
    714         ADD_AND_CLIP1(res);
    715         res2 = (8192 + x7) >> 14;
    716         ADD_AND_CLIP2(res2);
    717         dst_word = (res2 << 8) | res;
    718         res = (8192 + x5) >> 14;
    719         ADD_AND_CLIP3(res);
    720         dst_word |= (res << 16);
    721         res = (8192 + x3) >> 14;
    722         ADD_AND_CLIP4(res);
    723         dst_word |= (res << 24);
    724         *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
    725 
    726         pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
    727         res = (8192 - x3) >> 14;
    728         ADD_AND_CLIP1(res);
    729         res2 = (8192 - x5) >> 14;
    730         ADD_AND_CLIP2(res2);
    731         dst_word = (res2 << 8) | res;
    732         res = (8192 - x7) >> 14;
    733         ADD_AND_CLIP3(res);
    734         dst_word |= (res << 16);
    735         res = (8192 - x1) >> 14;
    736         ADD_AND_CLIP4(res);
    737         dst_word |= (res << 24);
    738         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
    739     }
    740     return ;
    741 }
    742 
    743 #endif /* SMALL_DCT */
    744 
    745 void idct_rowInter(Short *blk, UChar *rec, Int lx)
    746 {
    747     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    748     int i = 8;
    749     uint32 pred_word, dst_word;
    750     int res, res2;
    751 
    752     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
    753     rec -= lx;
    754     blk -= 8;
    755 
    756     while (i--)
    757     {
    758         x1 = (int32)blk[12] << 8;
    759         blk[12] = 0;
    760         x2 = blk[14];
    761         blk[14] = 0;
    762         x3 = blk[10];
    763         blk[10] = 0;
    764         x4 = blk[9];
    765         blk[9] = 0;
    766         x5 = blk[15];
    767         blk[15] = 0;
    768         x6 = blk[13];
    769         blk[13] = 0;
    770         x7 = blk[11];
    771         blk[11] = 0;
    772         x0 = ((*(blk += 8)) << 8) + 8192;
    773         *blk = 0;   /* for proper rounding in the fourth stage */
    774 
    775         /* first stage */
    776         x8 = W7 * (x4 + x5) + 4;
    777         x4 = (x8 + (W1 - W7) * x4) >> 3;
    778         x5 = (x8 - (W1 + W7) * x5) >> 3;
    779         x8 = W3 * (x6 + x7) + 4;
    780         x6 = (x8 - (W3 - W5) * x6) >> 3;
    781         x7 = (x8 - (W3 + W5) * x7) >> 3;
    782 
    783         /* second stage */
    784         x8 = x0 + x1;
    785         x0 -= x1;
    786         x1 = W6 * (x3 + x2) + 4;
    787         x2 = (x1 - (W2 + W6) * x2) >> 3;
    788         x3 = (x1 + (W2 - W6) * x3) >> 3;
    789         x1 = x4 + x6;
    790         x4 -= x6;
    791         x6 = x5 + x7;
    792         x5 -= x7;
    793 
    794         /* third stage */
    795         x7 = x8 + x3;
    796         x8 -= x3;
    797         x3 = x0 + x2;
    798         x0 -= x2;
    799         x2 = (181 * (x4 + x5) + 128) >> 8;
    800         x4 = (181 * (x4 - x5) + 128) >> 8;
    801 
    802         /* fourth stage */
    803         pred_word = *((uint32*)(rec += lx)); /* read 4 bytes from pred */
    804 
    805         res = (x7 + x1) >> 14;
    806         ADD_AND_CLIP1(res);
    807         res2 = (x3 + x2) >> 14;
    808         ADD_AND_CLIP2(res2);
    809         dst_word = (res2 << 8) | res;
    810         res = (x0 + x4) >> 14;
    811         ADD_AND_CLIP3(res);
    812         dst_word |= (res << 16);
    813         res = (x8 + x6) >> 14;
    814         ADD_AND_CLIP4(res);
    815         dst_word |= (res << 24);
    816         *((uint32*)rec) = dst_word; /* save 4 bytes to dst */
    817 
    818         pred_word = *((uint32*)(rec + 4)); /* read 4 bytes from pred */
    819 
    820         res = (x8 - x6) >> 14;
    821         ADD_AND_CLIP1(res);
    822         res2 = (x0 - x4) >> 14;
    823         ADD_AND_CLIP2(res2);
    824         dst_word = (res2 << 8) | res;
    825         res = (x3 - x2) >> 14;
    826         ADD_AND_CLIP3(res);
    827         dst_word |= (res << 16);
    828         res = (x7 - x1) >> 14;
    829         ADD_AND_CLIP4(res);
    830         dst_word |= (res << 24);
    831         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
    832     }
    833     return;
    834 }
    835 
    836 void idct_row0Intra(Short *srce, UChar *rec, Int lx)
    837 {
    838     OSCL_UNUSED_ARG(srce);
    839 
    840     OSCL_UNUSED_ARG(rec);
    841 
    842     OSCL_UNUSED_ARG(lx);
    843 
    844     return;
    845 }
    846 
    847 void idct_row1Intra(Short *blk, UChar *rec, Int lx)
    848 {
    849     int32 tmp;
    850     int i = 8;
    851 
    852     rec -= lx;
    853     blk -= 8;
    854     while (i--)
    855     {
    856         tmp = ((*(blk += 8) + 32) >> 6);
    857         *blk = 0;
    858         CLIP_RESULT(tmp)
    859 
    860         tmp |= (tmp << 8);
    861         tmp |= (tmp << 16);
    862         *((uint32*)(rec += lx)) = tmp;
    863         *((uint32*)(rec + 4)) = tmp;
    864     }
    865     return;
    866 }
    867 
    868 void idct_row2Intra(Short *blk, UChar *rec, Int lx)
    869 {
    870     int32 x0, x1, x2, x4, x5;
    871     int res, res2;
    872     uint32 dst_word;
    873     int i = 8;
    874 
    875     rec -= lx;
    876     blk -= 8;
    877     while (i--)
    878     {
    879         /* shortcut */
    880         x4 = blk[9];
    881         blk[9] = 0;
    882         x0 = ((*(blk += 8)) << 8) + 8192;
    883         *blk = 0;   /* for proper rounding in the fourth stage */
    884 
    885         /* first stage */
    886         x5 = (W7 * x4 + 4) >> 3;
    887         x4 = (W1 * x4 + 4) >> 3;
    888 
    889         /* third stage */
    890         x2 = (181 * (x4 + x5) + 128) >> 8;
    891         x1 = (181 * (x4 - x5) + 128) >> 8;
    892 
    893         /* fourth stage */
    894         res = ((x0 + x4) >> 14);
    895         CLIP_RESULT(res)
    896         res2 = ((x0 + x2) >> 14);
    897         CLIP_RESULT(res2)
    898         dst_word = (res2 << 8) | res;
    899         res = ((x0 + x1) >> 14);
    900         CLIP_RESULT(res)
    901         dst_word |= (res << 16);
    902         res = ((x0 + x5) >> 14);
    903         CLIP_RESULT(res)
    904         dst_word |= (res << 24);
    905         *((uint32*)(rec += lx)) = dst_word;
    906 
    907         res = ((x0 - x5) >> 14);
    908         CLIP_RESULT(res)
    909         res2 = ((x0 - x1) >> 14);
    910         CLIP_RESULT(res2)
    911         dst_word = (res2 << 8) | res;
    912         res = ((x0 - x2) >> 14);
    913         CLIP_RESULT(res)
    914         dst_word |= (res << 16);
    915         res = ((x0 - x4) >> 14);
    916         CLIP_RESULT(res)
    917         dst_word |= (res << 24);
    918         *((uint32*)(rec + 4)) = dst_word;
    919     }
    920     return ;
    921 }
    922 
    923 void idct_row3Intra(Short *blk, UChar *rec, Int lx)
    924 {
    925     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    926     int res, res2;
    927     uint32 dst_word;
    928     int i = 8;
    929 
    930     rec -= lx;
    931     blk -= 8;
    932     while (i--)
    933     {
    934         x2 = blk[10];
    935         blk[10] = 0;
    936         x1 = blk[9];
    937         blk[9] = 0;
    938         x0 = ((*(blk += 8)) << 8) + 8192;
    939         *blk = 0;/* for proper rounding in the fourth stage */
    940         /* both upper and lower*/
    941         /* both x2orx6 and x0orx4 */
    942 
    943         x4 = x0;
    944         x6 = (W6 * x2 + 4) >> 3;
    945         x2 = (W2 * x2 + 4) >> 3;
    946         x8 = x0 - x2;
    947         x0 += x2;
    948         x2 = x8;
    949         x8 = x4 - x6;
    950         x4 += x6;
    951         x6 = x8;
    952 
    953         x7 = (W7 * x1 + 4) >> 3;
    954         x1 = (W1 * x1 + 4) >> 3;
    955         x3 = x7;
    956         x5 = (181 * (x1 - x7) + 128) >> 8;
    957         x7 = (181 * (x1 + x7) + 128) >> 8;
    958 
    959         res = ((x0 + x1) >> 14);
    960         CLIP_RESULT(res)
    961         res2 = ((x4 + x7) >> 14);
    962         CLIP_RESULT(res2)
    963         dst_word = (res2 << 8) | res;
    964         res = ((x6 + x5) >> 14);
    965         CLIP_RESULT(res)
    966         dst_word |= (res << 16);
    967         res = ((x2 + x3) >> 14);
    968         CLIP_RESULT(res)
    969         dst_word |= (res << 24);
    970         *((uint32*)(rec += lx)) = dst_word;
    971 
    972         res = ((x2 - x3) >> 14);
    973         CLIP_RESULT(res)
    974         res2 = ((x6 - x5) >> 14);
    975         CLIP_RESULT(res2)
    976         dst_word = (res2 << 8) | res;
    977         res = ((x4 - x7) >> 14);
    978         CLIP_RESULT(res)
    979         dst_word |= (res << 16);
    980         res = ((x0 - x1) >> 14);
    981         CLIP_RESULT(res)
    982         dst_word |= (res << 24);
    983         *((uint32*)(rec + 4)) = dst_word;
    984 
    985     }
    986     return ;
    987 }
    988 
    989 void idct_row4Intra(Short *blk, UChar *rec, Int lx)
    990 {
    991     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
    992     int res, res2;
    993     uint32 dst_word;
    994     int i = 8;
    995 
    996     rec -= lx;
    997     blk -= 8;
    998     while (i--)
    999     {
   1000         x2 = blk[10];
   1001         blk[10] = 0;
   1002         x1 = blk[9];
   1003         blk[9] = 0;
   1004         x3 = blk[11];
   1005         blk[11] = 0;
   1006         x0 = ((*(blk += 8)) << 8) + 8192;
   1007         *blk = 0; /* for proper rounding in the fourth stage */
   1008 
   1009         x4 = x0;
   1010         x6 = (W6 * x2 + 4) >> 3;
   1011         x2 = (W2 * x2 + 4) >> 3;
   1012         x8 = x0 - x2;
   1013         x0 += x2;
   1014         x2 = x8;
   1015         x8 = x4 - x6;
   1016         x4 += x6;
   1017         x6 = x8;
   1018 
   1019         x7 = (W7 * x1 + 4) >> 3;
   1020         x1 = (W1 * x1 + 4) >> 3;
   1021         x5 = (W3 * x3 + 4) >> 3;
   1022         x3 = (- W5 * x3 + 4) >> 3;
   1023         x8 = x1 - x5;
   1024         x1 += x5;
   1025         x5 = x8;
   1026         x8 = x7 - x3;
   1027         x3 += x7;
   1028         x7 = (181 * (x5 + x8) + 128) >> 8;
   1029         x5 = (181 * (x5 - x8) + 128) >> 8;
   1030 
   1031         res = ((x0 + x1) >> 14);
   1032         CLIP_RESULT(res)
   1033         res2 = ((x4 + x7) >> 14);
   1034         CLIP_RESULT(res2)
   1035         dst_word = (res2 << 8) | res;
   1036         res = ((x6 + x5) >> 14);
   1037         CLIP_RESULT(res)
   1038         dst_word |= (res << 16);
   1039         res = ((x2 + x3) >> 14);
   1040         CLIP_RESULT(res)
   1041         dst_word |= (res << 24);
   1042         *((uint32*)(rec += lx)) = dst_word;
   1043 
   1044         res = ((x2 - x3) >> 14);
   1045         CLIP_RESULT(res)
   1046         res2 = ((x6 - x5) >> 14);
   1047         CLIP_RESULT(res2)
   1048         dst_word = (res2 << 8) | res;
   1049         res = ((x4 - x7) >> 14);
   1050         CLIP_RESULT(res)
   1051         dst_word |= (res << 16);
   1052         res = ((x0 - x1) >> 14);
   1053         CLIP_RESULT(res)
   1054         dst_word |= (res << 24);
   1055         *((uint32*)(rec + 4)) = dst_word;
   1056     }
   1057 
   1058     return ;
   1059 }
   1060 
   1061 #ifndef SMALL_DCT
   1062 void idct_row0x40Intra(Short *blk, UChar *rec, Int lx)
   1063 {
   1064     int32  x1, x2, x4, x5;
   1065     int res, res2;
   1066     uint32 dst_word;
   1067     int i = 8;
   1068 
   1069     rec -= lx;
   1070 
   1071     while (i--)
   1072     {
   1073         /* shortcut */
   1074         x4 = blk[1];
   1075         blk[1] = 0;
   1076         blk += 8;
   1077 
   1078         /* first stage */
   1079         x5 = (W7 * x4 + 4) >> 3;
   1080         x4 = (W1 * x4 + 4) >> 3;
   1081 
   1082         /* third stage */
   1083         x2 = (181 * (x4 + x5) + 128) >> 8;
   1084         x1 = (181 * (x4 - x5) + 128) >> 8;
   1085 
   1086         /* fourth stage */
   1087         res = ((8192 + x4) >> 14);
   1088         CLIP_RESULT(res)
   1089         res2 = ((8192 + x2) >> 14);
   1090         CLIP_RESULT(res2)
   1091         dst_word = (res2 << 8) | res;
   1092         res = ((8192 + x1) >> 14);
   1093         CLIP_RESULT(res)
   1094         dst_word |= (res << 16);
   1095         res = ((8192 + x5) >> 14);
   1096         CLIP_RESULT(res)
   1097         dst_word |= (res << 24);
   1098         *((uint32*)(rec += lx)) = dst_word;
   1099 
   1100         res = ((8192 - x5) >> 14);
   1101         CLIP_RESULT(res)
   1102         res2 = ((8192 - x1) >> 14);
   1103         CLIP_RESULT(res2)
   1104         dst_word = (res2 << 8) | res;
   1105         res = ((8192 - x2) >> 14);
   1106         CLIP_RESULT(res)
   1107         dst_word |= (res << 16);
   1108         res = ((8192 - x4) >> 14);
   1109         CLIP_RESULT(res)
   1110         dst_word |= (res << 24);
   1111         *((uint32*)(rec + 4)) = dst_word;
   1112 
   1113     }
   1114     return ;
   1115 }
   1116 
   1117 void idct_row0x20Intra(Short *blk, UChar *rec, Int lx)
   1118 {
   1119     int32 x0, x2, x4, x6;
   1120     int res, res2;
   1121     uint32 dst_word;
   1122     int i = 8;
   1123 
   1124     rec -= lx;
   1125     while (i--)
   1126     {
   1127         x2 = blk[2];
   1128         blk[2] = 0;
   1129         blk += 8;
   1130 
   1131         /* both upper and lower*/
   1132         /* both x2orx6 and x0orx4 */
   1133         x6 = (W6 * x2 + 4) >> 3;
   1134         x2 = (W2 * x2 + 4) >> 3;
   1135         x0 = 8192 + x2;
   1136         x2 = 8192 - x2;
   1137         x4 = 8192 + x6;
   1138         x6 = 8192 - x6;
   1139 
   1140         res = ((x0) >> 14);
   1141         CLIP_RESULT(res)
   1142         res2 = ((x4) >> 14);
   1143         CLIP_RESULT(res2)
   1144         dst_word = (res2 << 8) | res;
   1145         res = ((x6) >> 14);
   1146         CLIP_RESULT(res)
   1147         dst_word |= (res << 16);
   1148         res = ((x2) >> 14);
   1149         CLIP_RESULT(res)
   1150         dst_word |= (res << 24);
   1151         *((uint32*)(rec += lx)) = dst_word;
   1152 
   1153         res = ((x2) >> 14);
   1154         CLIP_RESULT(res)
   1155         res2 = ((x6) >> 14);
   1156         CLIP_RESULT(res2)
   1157         dst_word = (res2 << 8) | res;
   1158         res = ((x4) >> 14);
   1159         CLIP_RESULT(res)
   1160         dst_word |= (res << 16);
   1161         res = ((x0) >> 14);
   1162         CLIP_RESULT(res)
   1163         dst_word |= (res << 24);
   1164         *((uint32*)(rec + 4)) = dst_word;
   1165 
   1166     }
   1167     return ;
   1168 }
   1169 
   1170 void idct_row0x10Intra(Short *blk, UChar *rec, Int lx)
   1171 {
   1172     int32 x1, x3, x5, x7;
   1173     int res, res2;
   1174     uint32 dst_word;
   1175     int i = 8;
   1176 
   1177     rec -= lx;
   1178     while (i--)
   1179     {
   1180         x3 = blk[3];
   1181         blk[3] = 0 ;
   1182         blk += 8;
   1183 
   1184         x1 = (W3 * x3 + 4) >> 3;
   1185         x3 = (W5 * x3 + 4) >> 3;
   1186 
   1187         x7 = (181 * (x3 - x1) + 128) >> 8;
   1188         x5 = (-181 * (x1 + x3) + 128) >> 8;
   1189 
   1190         res = ((8192 + x1) >> 14);
   1191         CLIP_RESULT(res)
   1192         res2 = ((8192 + x7) >> 14);
   1193         CLIP_RESULT(res2)
   1194         dst_word = (res2 << 8) | res;
   1195         res = ((8192 + x5) >> 14);
   1196         CLIP_RESULT(res)
   1197         dst_word |= (res << 16);
   1198         res = ((8192 - x3) >> 14);
   1199         CLIP_RESULT(res)
   1200         dst_word |= (res << 24);
   1201         *((uint32*)(rec += lx)) = dst_word;
   1202 
   1203         res = ((8192 + x3) >> 14);
   1204         CLIP_RESULT(res)
   1205         res2 = ((8192 - x5) >> 14);
   1206         CLIP_RESULT(res2)
   1207         dst_word = (res2 << 8) | res;
   1208         res = ((8192 - x7) >> 14);
   1209         CLIP_RESULT(res)
   1210         dst_word |= (res << 16);
   1211         res = ((8192 - x1) >> 14);
   1212         CLIP_RESULT(res)
   1213         dst_word |= (res << 24);
   1214         *((uint32*)(rec + 4)) = dst_word;
   1215 
   1216     }
   1217 
   1218     return ;
   1219 }
   1220 
   1221 #endif /* SMALL_DCT */
   1222 void idct_rowIntra(Short *blk, UChar *rec, Int lx)
   1223 {
   1224     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
   1225     int i = 8;
   1226     int res, res2;
   1227     uint32 dst_word;
   1228 
   1229     blk -= 8;
   1230     rec -= lx;
   1231 
   1232     while (i--)
   1233     {
   1234         x1 = (int32)blk[12] << 8;
   1235         blk[12] = 0;
   1236         x2 = blk[14];
   1237         blk[14] = 0;
   1238         x3 = blk[10];
   1239         blk[10] = 0;
   1240         x4 = blk[9];
   1241         blk[9] = 0;
   1242         x5 = blk[15];
   1243         blk[15] = 0;
   1244         x6 = blk[13];
   1245         blk[13] = 0;
   1246         x7 = blk[11];
   1247         blk[11] = 0;
   1248         x0 = ((*(blk += 8)) << 8) + 8192;
   1249         *blk = 0;  /* for proper rounding in the fourth stage */
   1250 
   1251         /* first stage */
   1252         x8 = W7 * (x4 + x5) + 4;
   1253         x4 = (x8 + (W1 - W7) * x4) >> 3;
   1254         x5 = (x8 - (W1 + W7) * x5) >> 3;
   1255         x8 = W3 * (x6 + x7) + 4;
   1256         x6 = (x8 - (W3 - W5) * x6) >> 3;
   1257         x7 = (x8 - (W3 + W5) * x7) >> 3;
   1258 
   1259         /* second stage */
   1260         x8 = x0 + x1;
   1261         x0 -= x1;
   1262         x1 = W6 * (x3 + x2) + 4;
   1263         x2 = (x1 - (W2 + W6) * x2) >> 3;
   1264         x3 = (x1 + (W2 - W6) * x3) >> 3;
   1265         x1 = x4 + x6;
   1266         x4 -= x6;
   1267         x6 = x5 + x7;
   1268         x5 -= x7;
   1269 
   1270         /* third stage */
   1271         x7 = x8 + x3;
   1272         x8 -= x3;
   1273         x3 = x0 + x2;
   1274         x0 -= x2;
   1275         x2 = (181 * (x4 + x5) + 128) >> 8;
   1276         x4 = (181 * (x4 - x5) + 128) >> 8;
   1277 
   1278         /* fourth stage */
   1279         res = ((x7 + x1) >> 14);
   1280         CLIP_RESULT(res)
   1281         res2 = ((x3 + x2) >> 14);
   1282         CLIP_RESULT(res2)
   1283         dst_word = res | (res2 << 8);
   1284         res = ((x0 + x4) >> 14);
   1285         CLIP_RESULT(res)
   1286         dst_word |= (res << 16);
   1287         res = ((x8 + x6) >> 14);
   1288         CLIP_RESULT(res)
   1289         dst_word |= (res << 24);
   1290         *((uint32*)(rec += lx)) = dst_word;
   1291 
   1292         res = ((x8 - x6) >> 14);
   1293         CLIP_RESULT(res)
   1294         res2 = ((x0 - x4) >> 14);
   1295         CLIP_RESULT(res2)
   1296         dst_word = res | (res2 << 8);
   1297         res = ((x3 - x2) >> 14);
   1298         CLIP_RESULT(res)
   1299         dst_word |= (res << 16);
   1300         res = ((x7 - x1) >> 14);
   1301         CLIP_RESULT(res)
   1302         dst_word |= (res << 24);
   1303         *((uint32*)(rec + 4)) = dst_word;
   1304     }
   1305     return;
   1306 }
   1307 
   1308 
   1309 /* This function should not be called at all ****/
   1310 void idct_row0zmv(Short *srce, UChar *rec, UChar *pred, Int lx)
   1311 {
   1312     OSCL_UNUSED_ARG(srce);
   1313     OSCL_UNUSED_ARG(rec);
   1314     OSCL_UNUSED_ARG(pred);
   1315     OSCL_UNUSED_ARG(lx);
   1316 
   1317     return;
   1318 }
   1319 
   1320 void idct_row1zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
   1321 {
   1322     int tmp;
   1323     int i = 8;
   1324     uint32 pred_word, dst_word;
   1325     int res, res2;
   1326 
   1327     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
   1328     pred -= 16;
   1329     rec -= lx;
   1330     blk -= 8;
   1331 
   1332     while (i--)
   1333     {
   1334         tmp = (*(blk += 8) + 32) >> 6;
   1335         *blk = 0;
   1336 
   1337         pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
   1338         res = tmp + (pred_word & 0xFF);
   1339         CLIP_RESULT(res);
   1340         res2 = tmp + ((pred_word >> 8) & 0xFF);
   1341         CLIP_RESULT(res2);
   1342         dst_word = (res2 << 8) | res;
   1343         res = tmp + ((pred_word >> 16) & 0xFF);
   1344         CLIP_RESULT(res);
   1345         dst_word |= (res << 16);
   1346         res = tmp + ((pred_word >> 24) & 0xFF);
   1347         CLIP_RESULT(res);
   1348         dst_word |= (res << 24);
   1349         *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
   1350 
   1351         pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
   1352         res = tmp + (pred_word & 0xFF);
   1353         CLIP_RESULT(res);
   1354         res2 = tmp + ((pred_word >> 8) & 0xFF);
   1355         CLIP_RESULT(res2);
   1356         dst_word = (res2 << 8) | res;
   1357         res = tmp + ((pred_word >> 16) & 0xFF);
   1358         CLIP_RESULT(res);
   1359         dst_word |= (res << 16);
   1360         res = tmp + ((pred_word >> 24) & 0xFF);
   1361         CLIP_RESULT(res);
   1362         dst_word |= (res << 24);
   1363         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
   1364     }
   1365     return;
   1366 }
   1367 
   1368 void idct_row2zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
   1369 {
   1370     int32 x0, x1, x2, x4, x5;
   1371     int i = 8;
   1372     uint32 pred_word, dst_word;
   1373     int res, res2;
   1374 
   1375     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
   1376     rec -= lx;
   1377     pred -= 16;
   1378     blk -= 8;
   1379 
   1380     while (i--)
   1381     {
   1382         /* shortcut */
   1383         x4 = blk[9];
   1384         blk[9] = 0;
   1385         x0 = ((*(blk += 8)) << 8) + 8192;
   1386         *blk = 0;  /* for proper rounding in the fourth stage */
   1387 
   1388         /* first stage */
   1389         x5 = (W7 * x4 + 4) >> 3;
   1390         x4 = (W1 * x4 + 4) >> 3;
   1391 
   1392         /* third stage */
   1393         x2 = (181 * (x4 + x5) + 128) >> 8;
   1394         x1 = (181 * (x4 - x5) + 128) >> 8;
   1395 
   1396         /* fourth stage */
   1397         pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
   1398         res = (x0 + x4) >> 14;
   1399         ADD_AND_CLIP1(res);
   1400         res2 = (x0 + x2) >> 14;
   1401         ADD_AND_CLIP2(res2);
   1402         dst_word = (res2 << 8) | res;
   1403         res = (x0 + x1) >> 14;
   1404         ADD_AND_CLIP3(res);
   1405         dst_word |= (res << 16);
   1406         res = (x0 + x5) >> 14;
   1407         ADD_AND_CLIP4(res);
   1408         dst_word |= (res << 24);
   1409         *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
   1410 
   1411         pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
   1412         res = (x0 - x5) >> 14;
   1413         ADD_AND_CLIP1(res);
   1414         res2 = (x0 - x1) >> 14;
   1415         ADD_AND_CLIP2(res2);
   1416         dst_word = (res2 << 8) | res;
   1417         res = (x0 - x2) >> 14;
   1418         ADD_AND_CLIP3(res);
   1419         dst_word |= (res << 16);
   1420         res = (x0 - x4) >> 14;
   1421         ADD_AND_CLIP4(res);
   1422         dst_word |= (res << 24);
   1423         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
   1424     }
   1425     return ;
   1426 }
   1427 
   1428 void idct_row3zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
   1429 {
   1430     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
   1431     int i = 8;
   1432     uint32 pred_word, dst_word;
   1433     int res, res2;
   1434 
   1435     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
   1436     rec -= lx;
   1437     pred -= 16;
   1438     blk -= 8;
   1439 
   1440     while (i--)
   1441     {
   1442         x2 = blk[10];
   1443         blk[10] = 0;
   1444         x1 = blk[9];
   1445         blk[9] = 0;
   1446         x0 = ((*(blk += 8)) << 8) + 8192;
   1447         *blk = 0;  /* for proper rounding in the fourth stage */
   1448         /* both upper and lower*/
   1449         /* both x2orx6 and x0orx4 */
   1450 
   1451         x4 = x0;
   1452         x6 = (W6 * x2 + 4) >> 3;
   1453         x2 = (W2 * x2 + 4) >> 3;
   1454         x8 = x0 - x2;
   1455         x0 += x2;
   1456         x2 = x8;
   1457         x8 = x4 - x6;
   1458         x4 += x6;
   1459         x6 = x8;
   1460 
   1461         x7 = (W7 * x1 + 4) >> 3;
   1462         x1 = (W1 * x1 + 4) >> 3;
   1463         x3 = x7;
   1464         x5 = (181 * (x1 - x7) + 128) >> 8;
   1465         x7 = (181 * (x1 + x7) + 128) >> 8;
   1466 
   1467         pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
   1468         res = (x0 + x1) >> 14;
   1469         ADD_AND_CLIP1(res);
   1470         res2 = (x4 + x7) >> 14;
   1471         ADD_AND_CLIP2(res2);
   1472         dst_word = (res2 << 8) | res;
   1473         res = (x6 + x5) >> 14;
   1474         ADD_AND_CLIP3(res);
   1475         dst_word |= (res << 16);
   1476         res = (x2 + x3) >> 14;
   1477         ADD_AND_CLIP4(res);
   1478         dst_word |= (res << 24);
   1479         *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
   1480 
   1481         pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
   1482         res = (x2 - x3) >> 14;
   1483         ADD_AND_CLIP1(res);
   1484         res2 = (x6 - x5) >> 14;
   1485         ADD_AND_CLIP2(res2);
   1486         dst_word = (res2 << 8) | res;
   1487         res = (x4 - x7) >> 14;
   1488         ADD_AND_CLIP3(res);
   1489         dst_word |= (res << 16);
   1490         res = (x0 - x1) >> 14;
   1491         ADD_AND_CLIP4(res);
   1492         dst_word |= (res << 24);
   1493         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
   1494     }
   1495 
   1496     return ;
   1497 }
   1498 
   1499 void idct_row4zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
   1500 {
   1501     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
   1502     int i = 8;
   1503     uint32 pred_word, dst_word;
   1504     int res, res2;
   1505 
   1506     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
   1507     rec -= lx;
   1508     pred -= 16;
   1509     blk -= 8;
   1510 
   1511     while (i--)
   1512     {
   1513         x2 = blk[10];
   1514         blk[10] = 0;
   1515         x1 = blk[9];
   1516         blk[9] = 0;
   1517         x3 = blk[11];
   1518         blk[11] = 0;
   1519         x0 = ((*(blk += 8)) << 8) + 8192;
   1520         *blk = 0;   /* for proper rounding in the fourth stage */
   1521 
   1522         x4 = x0;
   1523         x6 = (W6 * x2 + 4) >> 3;
   1524         x2 = (W2 * x2 + 4) >> 3;
   1525         x8 = x0 - x2;
   1526         x0 += x2;
   1527         x2 = x8;
   1528         x8 = x4 - x6;
   1529         x4 += x6;
   1530         x6 = x8;
   1531 
   1532         x7 = (W7 * x1 + 4) >> 3;
   1533         x1 = (W1 * x1 + 4) >> 3;
   1534         x5 = (W3 * x3 + 4) >> 3;
   1535         x3 = (- W5 * x3 + 4) >> 3;
   1536         x8 = x1 - x5;
   1537         x1 += x5;
   1538         x5 = x8;
   1539         x8 = x7 - x3;
   1540         x3 += x7;
   1541         x7 = (181 * (x5 + x8) + 128) >> 8;
   1542         x5 = (181 * (x5 - x8) + 128) >> 8;
   1543 
   1544         pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
   1545         res = (x0 + x1) >> 14;
   1546         ADD_AND_CLIP1(res);
   1547         res2 = (x4 + x7) >> 14;
   1548         ADD_AND_CLIP2(res2);
   1549         dst_word = (res2 << 8) | res;
   1550         res = (x6 + x5) >> 14;
   1551         ADD_AND_CLIP3(res);
   1552         dst_word |= (res << 16);
   1553         res = (x2 + x3) >> 14;
   1554         ADD_AND_CLIP4(res);
   1555         dst_word |= (res << 24);
   1556         *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
   1557 
   1558         pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
   1559         res = (x2 - x3) >> 14;
   1560         ADD_AND_CLIP1(res);
   1561         res2 = (x6 - x5) >> 14;
   1562         ADD_AND_CLIP2(res2);
   1563         dst_word = (res2 << 8) | res;
   1564         res = (x4 - x7) >> 14;
   1565         ADD_AND_CLIP3(res);
   1566         dst_word |= (res << 16);
   1567         res = (x0 - x1) >> 14;
   1568         ADD_AND_CLIP4(res);
   1569         dst_word |= (res << 24);
   1570         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
   1571     }
   1572     return ;
   1573 }
   1574 
   1575 #ifndef SMALL_DCT
   1576 void idct_row0x40zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
   1577 {
   1578     int32 x1, x2, x4, x5;
   1579     int i = 8;
   1580     uint32 pred_word, dst_word;
   1581     int res, res2;
   1582 
   1583     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
   1584     rec -= lx;
   1585     pred -= 16;
   1586 
   1587     while (i--)
   1588     {
   1589         /* shortcut */
   1590         x4 = blk[1];
   1591         blk[1] = 0;
   1592         blk += 8;  /* for proper rounding in the fourth stage */
   1593 
   1594         /* first stage */
   1595         x5 = (W7 * x4 + 4) >> 3;
   1596         x4 = (W1 * x4 + 4) >> 3;
   1597 
   1598         /* third stage */
   1599         x2 = (181 * (x4 + x5) + 128) >> 8;
   1600         x1 = (181 * (x4 - x5) + 128) >> 8;
   1601 
   1602         /* fourth stage */
   1603         pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
   1604         res = (8192 + x4) >> 14;
   1605         ADD_AND_CLIP1(res);
   1606         res2 = (8192 + x2) >> 14;
   1607         ADD_AND_CLIP2(res2);
   1608         dst_word = (res2 << 8) | res;
   1609         res = (8192 + x1) >> 14;
   1610         ADD_AND_CLIP3(res);
   1611         dst_word |= (res << 16);
   1612         res = (8192 + x5) >> 14;
   1613         ADD_AND_CLIP4(res);
   1614         dst_word |= (res << 24);
   1615         *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
   1616 
   1617         pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
   1618         res = (8192 - x5) >> 14;
   1619         ADD_AND_CLIP1(res);
   1620         res2 = (8192 - x1) >> 14;
   1621         ADD_AND_CLIP2(res2);
   1622         dst_word = (res2 << 8) | res;
   1623         res = (8192 - x2) >> 14;
   1624         ADD_AND_CLIP3(res);
   1625         dst_word |= (res << 16);
   1626         res = (8192 - x4) >> 14;
   1627         ADD_AND_CLIP4(res);
   1628         dst_word |= (res << 24);
   1629         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
   1630     }
   1631     return ;
   1632 }
   1633 
   1634 void idct_row0x20zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
   1635 {
   1636     int32 x0, x2, x4, x6;
   1637     int i = 8;
   1638     uint32 pred_word, dst_word;
   1639     int res, res2;
   1640 
   1641     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
   1642     rec -= lx;
   1643     pred -= 16;
   1644 
   1645     while (i--)
   1646     {
   1647         x2 = blk[2];
   1648         blk[2] = 0;
   1649         blk += 8; /* for proper rounding in the fourth stage */
   1650         /* both upper and lower*/
   1651         /* both x2orx6 and x0orx4 */
   1652         x6 = (W6 * x2 + 4) >> 3;
   1653         x2 = (W2 * x2 + 4) >> 3;
   1654         x0 = 8192 + x2;
   1655         x2 = 8192 - x2;
   1656         x4 = 8192 + x6;
   1657         x6 = 8192 - x6;
   1658 
   1659         pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
   1660         res = (x0) >> 14;
   1661         ADD_AND_CLIP1(res);
   1662         res2 = (x4) >> 14;
   1663         ADD_AND_CLIP2(res2);
   1664         dst_word = (res2 << 8) | res;
   1665         res = (x6) >> 14;
   1666         ADD_AND_CLIP3(res);
   1667         dst_word |= (res << 16);
   1668         res = (x2) >> 14;
   1669         ADD_AND_CLIP4(res);
   1670         dst_word |= (res << 24);
   1671         *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
   1672 
   1673         pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
   1674         res = (x2) >> 14;
   1675         ADD_AND_CLIP1(res);
   1676         res2 = (x6) >> 14;
   1677         ADD_AND_CLIP2(res2);
   1678         dst_word = (res2 << 8) | res;
   1679         res = (x4) >> 14;
   1680         ADD_AND_CLIP3(res);
   1681         dst_word |= (res << 16);
   1682         res = (x0) >> 14;
   1683         ADD_AND_CLIP4(res);
   1684         dst_word |= (res << 24);
   1685         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
   1686     }
   1687 
   1688     return ;
   1689 }
   1690 
   1691 void idct_row0x10zmv(Short *blk, UChar *rec, UChar *pred, Int lx)
   1692 {
   1693     int32 x1, x3, x5, x7;
   1694     int i = 8;
   1695     uint32 pred_word, dst_word;
   1696     int res, res2;
   1697 
   1698     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
   1699     rec -= lx;
   1700     pred -= 16;
   1701 
   1702     while (i--)
   1703     {
   1704         x3 = blk[3];
   1705         blk[3] = 0;
   1706         blk += 8;
   1707 
   1708         x1 = (W3 * x3 + 4) >> 3;
   1709         x3 = (-W5 * x3 + 4) >> 3;
   1710 
   1711         x7 = (-181 * (x3 + x1) + 128) >> 8;
   1712         x5 = (181 * (x3 - x1) + 128) >> 8;
   1713 
   1714         pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
   1715         res = (8192 + x1) >> 14;
   1716         ADD_AND_CLIP1(res);
   1717         res2 = (8192 + x7) >> 14;
   1718         ADD_AND_CLIP2(res2);
   1719         dst_word = (res2 << 8) | res;
   1720         res = (8192 + x5) >> 14;
   1721         ADD_AND_CLIP3(res);
   1722         dst_word |= (res << 16);
   1723         res = (8192 + x3) >> 14;
   1724         ADD_AND_CLIP4(res);
   1725         dst_word |= (res << 24);
   1726         *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
   1727 
   1728         pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
   1729         res = (8192 - x3) >> 14;
   1730         ADD_AND_CLIP1(res);
   1731         res2 = (8192 - x5) >> 14;
   1732         ADD_AND_CLIP2(res2);
   1733         dst_word = (res2 << 8) | res;
   1734         res = (8192 - x7) >> 14;
   1735         ADD_AND_CLIP3(res);
   1736         dst_word |= (res << 16);
   1737         res = (8192 - x1) >> 14;
   1738         ADD_AND_CLIP4(res);
   1739         dst_word |= (res << 24);
   1740         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
   1741     }
   1742     return ;
   1743 }
   1744 
   1745 #endif /* SMALL_DCT */
   1746 
   1747 void idct_rowzmv(Short *blk, UChar *rec, UChar *pred, Int lx)
   1748 {
   1749     int32 x0, x1, x2, x3, x4, x5, x6, x7, x8;
   1750     int i = 8;
   1751     uint32 pred_word, dst_word;
   1752     int res, res2;
   1753 
   1754     /* preset the offset, such that we can take advantage pre-offset addressing mode   */
   1755     rec -= lx;
   1756     pred -= 16;
   1757     blk -= 8;
   1758 
   1759     while (i--)
   1760     {
   1761         x1 = (int32)blk[12] << 8;
   1762         blk[12] = 0;
   1763         x2 = blk[14];
   1764         blk[14] = 0;
   1765         x3 = blk[10];
   1766         blk[10] = 0;
   1767         x4 = blk[9];
   1768         blk[9] = 0;
   1769         x5 = blk[15];
   1770         blk[15] = 0;
   1771         x6 = blk[13];
   1772         blk[13] = 0;
   1773         x7 = blk[11];
   1774         blk[11] = 0;
   1775         x0 = ((*(blk += 8)) << 8) + 8192;
   1776         *blk = 0;   /* for proper rounding in the fourth stage */
   1777 
   1778         /* first stage */
   1779         x8 = W7 * (x4 + x5) + 4;
   1780         x4 = (x8 + (W1 - W7) * x4) >> 3;
   1781         x5 = (x8 - (W1 + W7) * x5) >> 3;
   1782         x8 = W3 * (x6 + x7) + 4;
   1783         x6 = (x8 - (W3 - W5) * x6) >> 3;
   1784         x7 = (x8 - (W3 + W5) * x7) >> 3;
   1785 
   1786         /* second stage */
   1787         x8 = x0 + x1;
   1788         x0 -= x1;
   1789         x1 = W6 * (x3 + x2) + 4;
   1790         x2 = (x1 - (W2 + W6) * x2) >> 3;
   1791         x3 = (x1 + (W2 - W6) * x3) >> 3;
   1792         x1 = x4 + x6;
   1793         x4 -= x6;
   1794         x6 = x5 + x7;
   1795         x5 -= x7;
   1796 
   1797         /* third stage */
   1798         x7 = x8 + x3;
   1799         x8 -= x3;
   1800         x3 = x0 + x2;
   1801         x0 -= x2;
   1802         x2 = (181 * (x4 + x5) + 128) >> 8;
   1803         x4 = (181 * (x4 - x5) + 128) >> 8;
   1804 
   1805         /* fourth stage */
   1806         pred_word = *((uint32*)(pred += 16)); /* read 4 bytes from pred */
   1807 
   1808         res = (x7 + x1) >> 14;
   1809         ADD_AND_CLIP1(res);
   1810         res2 = (x3 + x2) >> 14;
   1811         ADD_AND_CLIP2(res2);
   1812         dst_word = (res2 << 8) | res;
   1813         res = (x0 + x4) >> 14;
   1814         ADD_AND_CLIP3(res);
   1815         dst_word |= (res << 16);
   1816         res = (x8 + x6) >> 14;
   1817         ADD_AND_CLIP4(res);
   1818         dst_word |= (res << 24);
   1819         *((uint32*)(rec += lx)) = dst_word; /* save 4 bytes to dst */
   1820 
   1821         pred_word = *((uint32*)(pred + 4)); /* read 4 bytes from pred */
   1822 
   1823         res = (x8 - x6) >> 14;
   1824         ADD_AND_CLIP1(res);
   1825         res2 = (x0 - x4) >> 14;
   1826         ADD_AND_CLIP2(res2);
   1827         dst_word = (res2 << 8) | res;
   1828         res = (x3 - x2) >> 14;
   1829         ADD_AND_CLIP3(res);
   1830         dst_word |= (res << 16);
   1831         res = (x7 - x1) >> 14;
   1832         ADD_AND_CLIP4(res);
   1833         dst_word |= (res << 24);
   1834         *((uint32*)(rec + 4)) = dst_word; /* save 4 bytes to dst */
   1835     }
   1836     return;
   1837 }
   1838 
   1839 /*----------------------------------------------------------------------------
   1840 ;  End Function: idctcol
   1841 ----------------------------------------------------------------------------*/
   1842 /* ======================================================================== */
   1843 /*  Function : BlockIDCTMotionComp                                              */
   1844 /*  Date     : 10/16/2000                                                   */
   1845 /*  Purpose  : fast IDCT routine                                    */
   1846 /*  In/out   :                                                              */
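/*      Short *block      dequantized coefficients of one block
        UChar *bitmapcol  column bitmap of the block (see callers)
        UChar bitmaprow   row bitmap; 0 means the block is all zero
        Int dctMode       DCT mode; 0 means the block is all zero
        UChar *rec        reconstructed output, rows (lx_intra >> 1) bytes apart
        UChar *pred       prediction, rows 16 bytes apart
        Int lx_intra      bit 0 = intra flag, remaining bits = stride of rec  */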
   1850 /*  Modified :   7/31/01, add checking for all-zero and DC-only block.  */
   1851 /*              do 8 columns at a time                                      */
   1852 /*               8/2/01, do column first then row-IDCT.                 */
   1853 /*               8/2/01, remove clipping (included in motion comp).     */
   1854 /*               8/7/01, combine with motion comp.                      */
   1855 /*               8/8/01, use AAN IDCT                                       */
   1856 /*               9/4/05, use Chen's IDCT and 16 bit block                   */
   1857 /* ======================================================================== */
   1858 void BlockIDCTMotionComp(Short *block, UChar *bitmapcol, UChar bitmaprow,
   1859                          Int dctMode, UChar *rec, UChar *pred, Int lx_intra)
   1860 {
   1861     Int i;
   1862     Int tmp, tmp2;
   1863     ULong tmp4;
   1864     Int bmap;
   1865     Short *ptr = block;
   1866     UChar *endcol;
   1867     UInt mask = 0xFF;
   1868     Int lx = lx_intra >> 1;
   1869     Int intra = (lx_intra & 1);
   1870 
   1871     /*  all-zero block */
   1872     if (dctMode == 0 || bitmaprow == 0)
   1873     {
   1874         if (intra)
   1875         {
   1876             *((ULong*)rec) = *((ULong*)(rec + 4)) = 0;
   1877             *((ULong*)(rec += lx)) = 0;
   1878             *((ULong*)(rec + 4)) = 0;
   1879             *((ULong*)(rec += lx)) = 0;
   1880             *((ULong*)(rec + 4)) = 0;
   1881             *((ULong*)(rec += lx)) = 0;
   1882             *((ULong*)(rec + 4)) = 0;
   1883             *((ULong*)(rec += lx)) = 0;
   1884             *((ULong*)(rec + 4)) = 0;
   1885             *((ULong*)(rec += lx)) = 0;
   1886             *((ULong*)(rec + 4)) = 0;
   1887             *((ULong*)(rec += lx)) = 0;
   1888             *((ULong*)(rec + 4)) = 0;
   1889             *((ULong*)(rec += lx)) = 0;
   1890             *((ULong*)(rec + 4)) = 0;
   1891             return ;
   1892         }
   1893         else /* copy from previous frame */
   1894         {
   1895             *((ULong*)rec) = *((ULong*)pred);
   1896             *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
   1897             *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
   1898             *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
   1899             *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
   1900             *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
   1901             *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
   1902             *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
   1903             *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
   1904             *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
   1905             *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
   1906             *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
   1907             *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
   1908             *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
   1909             *((ULong*)(rec += lx)) = *((ULong*)(pred += 16));
   1910             *((ULong*)(rec + 4)) = *((ULong*)(pred + 4));
   1911             return ;
   1912         }
   1913     }
   1914 
   1915     /* Test for DC only block */
   1916     if (dctMode == 1 || (bitmaprow == 0x80 && bitmapcol[0] == 0x80))
   1917     {
   1918         i = ((block[0] << 3) + 32) >> 6;
   1919         block[0] = 0;
   1920         if (intra)
   1921         {
   1922             if ((UInt)i > mask) i = mask & (~(i >> 31));
   1923 
   1924             tmp = i | (i << 8);
   1925             tmp |= (tmp << 16);
   1926 
   1927             *((ULong*)rec) = *((ULong*)(rec + 4)) = tmp;
   1928             *((ULong*)(rec += lx)) = tmp;
   1929             *((ULong*)(rec + 4)) = tmp;
   1930             *((ULong*)(rec += lx)) = tmp;
   1931             *((ULong*)(rec + 4)) = tmp;
   1932             *((ULong*)(rec += lx)) = tmp;
   1933             *((ULong*)(rec + 4)) = tmp;
   1934             *((ULong*)(rec += lx)) = tmp;
   1935             *((ULong*)(rec + 4)) = tmp;
   1936             *((ULong*)(rec += lx)) = tmp;
   1937             *((ULong*)(rec + 4)) = tmp;
   1938             *((ULong*)(rec += lx)) = tmp;
   1939             *((ULong*)(rec + 4)) = tmp;
   1940             *((ULong*)(rec += lx)) = tmp;
   1941             *((ULong*)(rec + 4)) = tmp;
   1942 
   1943             return ;
   1944         }
   1945         else
   1946         {
   1947             endcol = rec + (lx << 3);
   1948             do
   1949             {
   1950                 tmp4 = *((ULong*)pred);
   1951                 tmp2 = tmp4 & 0xFF;
   1952                 tmp2 += i;
   1953                 if ((UInt)tmp2 > mask) tmp2 = mask & (~(tmp2 >> 31));
   1954                 tmp = (tmp4 >> 8) & 0xFF;
   1955                 tmp += i;
   1956                 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
   1957                 tmp2 |= (tmp << 8);
   1958                 tmp = (tmp4 >> 16) & 0xFF;
   1959                 tmp += i;
   1960                 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
   1961                 tmp2 |= (tmp << 16);
   1962                 tmp = (tmp4 >> 24) & 0xFF;
   1963                 tmp += i;
   1964                 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
   1965                 tmp2 |= (tmp << 24);
   1966                 *((ULong*)rec) = tmp2;
   1967 
   1968                 tmp4 = *((ULong*)(pred + 4));
   1969                 tmp2 = tmp4 & 0xFF;
   1970                 tmp2 += i;
   1971                 if ((UInt)tmp2 > mask) tmp2 = mask & (~(tmp2 >> 31));
   1972                 tmp = (tmp4 >> 8) & 0xFF;
   1973                 tmp += i;
   1974                 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
   1975                 tmp2 |= (tmp << 8);
   1976                 tmp = (tmp4 >> 16) & 0xFF;
   1977                 tmp += i;
   1978                 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
   1979                 tmp2 |= (tmp << 16);
   1980                 tmp = (tmp4 >> 24) & 0xFF;
   1981                 tmp += i;
   1982                 if ((UInt)tmp > mask) tmp = mask & (~(tmp >> 31));
   1983                 tmp2 |= (tmp << 24);
   1984                 *((ULong*)(rec + 4)) = tmp2;
   1985 
   1986                 rec += lx;
   1987                 pred += 16;
   1988             }
   1989             while (rec < endcol);
   1990             return ;
   1991         }
   1992     }
   1993 
   1994     for (i = 0; i < dctMode; i++)
   1995     {
   1996         bmap = (Int)bitmapcol[i];
   1997         if (bmap)
   1998         {
   1999             if ((bmap&0xf) == 0)
   2000                 (*(idctcolVCA[bmap>>4]))(ptr);
   2001             else
   2002                 idct_col(ptr);
   2003         }
   2004         ptr++;
   2005     }
   2006 
   2007     if ((bitmaprow&0xf) == 0)
   2008     {
   2009         if (intra)
   2010             (*(idctrowVCAIntra[(Int)(bitmaprow>>4)]))(block, rec, lx);
   2011         else
   2012             (*(idctrowVCAzmv[(Int)(bitmaprow>>4)]))(block, rec, pred, lx);
   2013     }
   2014     else
   2015     {
   2016         if (intra)
   2017             idct_rowIntra(block, rec, lx);
   2018         else
   2019             idct_rowzmv(block, rec, pred, lx);
   2020     }
   2021 }
   2022