Home | History | Annotate | Download | only in src
      1 /* ------------------------------------------------------------------
      2  * Copyright (C) 1998-2009 PacketVideo
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
     13  * express or implied.
     14  * See the License for the specific language governing permissions
     15  * and limitations under the License.
     16  * -------------------------------------------------------------------
     17  */
     18 #include "mp4enc_lib.h"
     19 #include "mp4lib_int.h"
     20 #include "dct_inline.h"
     21 
     22 #define FDCT_SHIFT 10
     23 
     24 #ifdef __cplusplus
     25 extern "C"
     26 {
     27 #endif
     28 
     29     /**************************************************************************/
     30     /*  Function:   BlockDCT_AANwSub
     31         Date:       7/31/01
     32         Input:      out   - coefficient buffer; out[64] is read as the column
                                    threshold (ColTh) before any coefficient is stored
                            cur   - current pixels, 8 read per row; the pointer is
                                    advanced by width after each row
                            pred  - prediction pixels, 8 read per row; the pointer is
                                    advanced by 16 after each row
                            width - row step applied to cur
     33         Output:     out[64] ==> next block
                            (every store made here lands in out[64] .. out[127])
     34         Purpose:    Do subtraction for zero MV first
                            Forms 2 * (cur - pred) for an 8x8 block, runs the row
                            pass over the 8 rows, then the column pass over the 8
                            columns. A column whose sum_abs() result is below ColTh
                            is not transformed; its first entry is set to 0x7fff
                            instead.
     35         Modified:
     36     **************************************************************************/
     37 
     38     Void BlockDCT_AANwSub(Short *out, UChar *cur, UChar *pred, Int width)
     39     {
     40         Short *dst;
     41         Int k0, k1, k2, k3, k4, k5, k6, k7;
     42         Int round;
     43         Int k12 = 0x022A02D4;   /* 0x022A = 554 (high half), 0x02D4 = 724 (low half) */
     44         Int k14 = 0x0188053A;   /* 0x0188 = 392 (high half), 0x053A = 1338 (low half) */
     45         Int abs_sum;
     46         Int mask;
     47         Int tmp, tmp2;
     48         Int ColTh;
     49 
                /* The work area is the 64 Shorts starting at out[64]. Its first entry
                   holds the column threshold, which is read here before the first row
                   store overwrites it. */
     50         dst = out + 64 ;
     51         ColTh = *dst;
     52         out += 128;     /* one past the work area: end marker for the row loop */
     53         round = 1 << (FDCT_SHIFT - 1);  /* half of 1 << FDCT_SHIFT, passed to the mla helpers */
     54 
     55         do  /* fdct_nextrow */
     56         {
     57             /* assuming the block is word-aligned */
                    /* NOTE(review): cur and pred are read through Int*; the lowest byte
                       of each word is used as the leftmost pixel, so this presumably
                       expects a little-endian target with a 4-byte Int - confirm. */
                    /* kN = 2 * (cur[N] - pred[N]): mask 0x1FE keeps one pixel already
                       shifted left by one, and the prediction byte is doubled to match. */
     58             mask = 0x1FE;
     59             tmp = *((Int*) cur);    /* contains 4 pixels */
     60             tmp2 = *((Int*) pred); /* prediction 4 pixels */
     61             k0 = tmp2 & 0xFF;
     62             k1 = mask & (tmp << 1);
     63             k0 = k1 - (k0 << 1);
     64             k1 = (tmp2 >> 8) & 0xFF;
     65             k2 = mask & (tmp >> 7);
     66             k1 = k2 - (k1 << 1);
     67             k2 = (tmp2 >> 16) & 0xFF;
     68             k3 = mask & (tmp >> 15);
     69             k2 = k3 - (k2 << 1);
     70             k3 = (tmp2 >> 24) & 0xFF;
     71             k4 = mask & (tmp >> 23);
     72             k3 = k4 - (k3 << 1);
     73             tmp = *((Int*)(cur + 4));   /* another 4 pixels */
     74             tmp2 = *((Int*)(pred + 4));
     75             k4 = tmp2 & 0xFF;
     76             k5 = mask & (tmp << 1);
     77             k4 = k5 - (k4 << 1);
     78             k5 = (tmp2 >> 8) & 0xFF;
     79             k6 = mask & (tmp >> 7);
     80             k5 = k6 - (k5 << 1);
     81             k6 = (tmp2 >> 16) & 0xFF;
     82             k7 = mask & (tmp >> 15);
     83             k6 = k7 - (k6 << 1);
     84             k7 = (tmp2 >> 24) & 0xFF;
     85             tmp = mask & (tmp >> 23);
     86             k7 = tmp - (k7 << 1);
     87             cur += width;   /* next row of the current block */
     88             pred += 16;     /* next row of the prediction; its step is fixed at 16 */
     89 
     90             /* fdct_1 */
                    /* Butterflies: each pair becomes (sum, difference); the difference
                       is rebuilt as sum - 2 * second operand. */
     91             k0 = k0 + k7;
     92             k7 = k0 - (k7 << 1);
     93             k1 = k1 + k6;
     94             k6 = k1 - (k6 << 1);
     95             k2 = k2 + k5;
     96             k5 = k2 - (k5 << 1);
     97             k3 = k3 + k4;
     98             k4 = k3 - (k4 << 1);
     99 
    100             k0 = k0 + k3;
    101             k3 = k0 - (k3 << 1);
    102             k1 = k1 + k2;
    103             k2 = k1 - (k2 << 1);
    104 
    105             k0 = k0 + k1;
    106             k1 = k0 - (k1 << 1);
    107             /**********/
    108             dst[0] = k0;
    109             dst[4] = k1; /* col. 4 */
    110             /* fdct_2 */
    111             k4 = k4 + k5;
    112             k5 = k5 + k6;
    113             k6 = k6 + k7;
    114             k2 = k2 + k3;
    115             /* MUL2C k2,k5,724,FDCT_SHIFT */
    116             /* k0, k1 become scratch */
    117             /* assume FAST MULTIPLY */
                    /* mla724 is not defined in this file (presumably dct_inline.h);
                       how it uses the packed k12 word should be checked there. */
    118             k1 = mla724(k12, k5, round);
    119             k0 = mla724(k12, k2, round);
    120 
    121             k5 = k1 >> FDCT_SHIFT;
    122             k2 = k0 >> FDCT_SHIFT;
    123             /*****************/
    124             k2 = k2 + k3;
    125             k3 = (k3 << 1) - k2;
    126             /********/
    127             dst[2] = k2;        /* col. 2 */
    128             k3 <<= 1;       /* scale up col. 6 */
    129             dst[6] = k3; /* col. 6 */
    130             /* fdct_3 */
    131             /* ROTATE k4,k6,392,946, FDCT_SHIFT */
    132             /* assume FAST MULTIPLY */
    133             /* k0, k1 are output */
    134             k0 = k4 - k6;
    135 
                    /* The result of mla392 (which already carries round) is passed as
                       the last argument of both following helper calls. */
    136             k1 = mla392(k0, k14, round);
    137             k0 = mla554(k4, k12, k1);
    138             k1 = mla1338(k6, k14, k1);
    139 
    140             k4 = k0 >> FDCT_SHIFT;
    141             k6 = k1 >> FDCT_SHIFT;
    142             /***********************/
    143             k5 = k5 + k7;
    144             k7 = (k7 << 1) - k5;
    145             k4 = k4 + k7;
    146             k7 = (k7 << 1) - k4;
    147             k5 = k5 + k6;
    148             k4 <<= 1;       /* scale up col.5 */
    149             k6 = k5 - (k6 << 1);
    150             /********/
    151             dst[5] = k4;    /* col. 5 */
    152             k6 <<= 2;       /* scale up col. 7 */
    153             dst[1] = k5;    /* col. 1 */
    154             dst[7] = k6;    /* col. 7 */
    155             dst[3] = k7;    /* col. 3 */
    156             dst += 8;       /* next row of the work area */
    157         }
    158         while (dst < out);
    159 
                /* Rewind to the first row of the work area; dst becomes the end marker
                   for the column loop (one past column 7). */
    160         out -= 64;
    161         dst = out + 8;
    162 
    163         /*  Vertical Block Loop  */
    164         do  /* Vertical 8xDCT loop */
    165         {
                    /* Gather one column; consecutive rows are 8 entries apart. */
    166             k0 = out[0];
    167             k1 = out[8];
    168             k2 = out[16];
    169             k3 = out[24];
    170             k4 = out[32];
    171             k5 = out[40];
    172             k6 = out[48];
    173             k7 = out[56];
    174             /* deadzone thresholding for column */
    175 
                    /* sum_abs is defined elsewhere; presumably the sum of the absolute
                       values of its eight arguments - confirm. */
    176             abs_sum = sum_abs(k0, k1, k2, k3, k4, k5, k6, k7);
    177 
    178             if (abs_sum < ColTh)
    179             {
                        /* Column skipped: only its first entry is replaced by the
                           0x7fff tag, the other seven keep their row-pass values.
                           continue jumps to the loop test at the bottom. */
    180                 out[0] = 0x7fff;
    181                 out++;
    182                 continue;
    183             }
    184 
    185             /* fdct_1 */
    186             k0 = k0 + k7;
    187             k7 = k0 - (k7 << 1);
    188             k1 = k1 + k6;
    189             k6 = k1 - (k6 << 1);
    190             k2 = k2 + k5;
    191             k5 = k2 - (k5 << 1);
    192             k3 = k3 + k4;
    193             k4 = k3 - (k4 << 1);
    194 
    195             k0 = k0 + k3;
    196             k3 = k0 - (k3 << 1);
    197             k1 = k1 + k2;
    198             k2 = k1 - (k2 << 1);
    199 
    200             k0 = k0 + k1;
    201             k1 = k0 - (k1 << 1);
    202             /**********/
    203             out[32] = k1; /* row 4 */
    204             out[0] = k0; /* row 0 */
    205             /* fdct_2 */
    206             k4 = k4 + k5;
    207             k5 = k5 + k6;
    208             k6 = k6 + k7;
    209             k2 = k2 + k3;
    210             /* MUL2C k2,k5,724,FDCT_SHIFT */
    211             /* k0, k1 become scratch */
    212             /* assume FAST MULTIPLY */
    213             k1 = mla724(k12, k5, round);
    214             k0 = mla724(k12, k2, round);
    215 
    216             k5 = k1 >> FDCT_SHIFT;
    217             k2 = k0 >> FDCT_SHIFT;
    218             /*****************/
    219             k2 = k2 + k3;
    220             k3 = (k3 << 1) - k2;
    221             k3 <<= 1;       /* scale up row 6 */
    222             /********/
    223             out[48] = k3;   /* row 6 */
    224             out[16] = k2;   /* row 2 */
    225             /* fdct_3 */
    226             /* ROTATE k4,k6,392,946, FDCT_SHIFT */
    227             /* assume FAST MULTIPLY */
    228             /* k0, k1 are output */
    229             k0 = k4 - k6;
    230 
    231             k1 = mla392(k0, k14, round);
    232             k0 = mla554(k4, k12, k1);
    233             k1 = mla1338(k6, k14, k1);
    234 
    235             k4 = k0 >> FDCT_SHIFT;
    236             k6 = k1 >> FDCT_SHIFT;
    237             /***********************/
    238             k5 = k5 + k7;
    239             k7 = (k7 << 1) - k5;
    240             k4 = k4 + k7;
    241             k7 = (k7 << 1) - k4;
    242             k5 = k5 + k6;
    243             k4 <<= 1;       /* scale up row 5 */
    244             k6 = k5 - (k6 << 1);
    245             /********/
    246             out[24] = k7 ;    /* row 3 */
    247             k6 <<= 2;       /* scale up row 7 */
    248             out[56] = k6 ;   /* row 7 */
    249             out[8] = k5 ;    /* row 1 */
    250             out[40] = k4 ;   /* row 5 */
    251             out++;          /* next column */
    252         }
    253         while ((uintptr_t)out < (uintptr_t)dst) ;
    254 
    255         return ;
    256     }
    257 
    258     /**************************************************************************/
    259     /*  Function:   Block4x4DCT_AANwSub
    260         Date:       7/31/01
    261         Input:      out   - coefficient buffer; out[64] is read as the column
                                    threshold (ColTh) before any coefficient is stored
                            cur   - current pixels, 8 read per row; the pointer is
                                    advanced by width after each row
                            pred  - prediction pixels, 8 read per row; the pointer is
                                    advanced by 16 after each row
                            width - row step applied to cur
    262         Output:     out[64] ==> next block
                            (every store made here lands inside out[64] .. out[127])
    263         Purpose:    Do subtraction for zero MV first before 4x4 DCT
                            Same input handling as the full 8x8 version, but the row
                            pass stores only columns 0..3 of each row and the column
                            pass processes only columns 0..3 and stores only rows
                            0..3. Entries outside that range are not given final
                            values by this function.
    264         Modified:
    265     **************************************************************************/
    266 
    267     Void Block4x4DCT_AANwSub(Short *out, UChar *cur, UChar *pred, Int width)
    268     {
    269         Short *dst;
    270         Int k0, k1, k2, k3, k4, k5, k6, k7;
    271         Int round;
    272         Int k12 = 0x022A02D4;   /* 0x022A = 554 (high half), 0x02D4 = 724 (low half) */
    273         Int k14 = 0x0188053A;   /* 0x0188 = 392 (high half), 0x053A = 1338 (low half) */
    274         Int mask;
    275         Int tmp, tmp2;
    276         Int abs_sum;
    277         Int ColTh;
    278 
                /* The work area is the 64 Shorts starting at out[64]. Its first entry
                   holds the column threshold, which is read here before the first row
                   store overwrites it. */
    279         dst = out + 64 ;
    280         ColTh = *dst;
    281         out += 128;     /* one past the work area: end marker for the row loop */
    282         round = 1 << (FDCT_SHIFT - 1);  /* half of 1 << FDCT_SHIFT, passed to the mla helpers */
    283 
    284         do  /* fdct_nextrow */
    285         {
    286             /* assuming the block is word-aligned */
                    /* NOTE(review): cur and pred are read through Int*; the lowest byte
                       of each word is used as the leftmost pixel, so this presumably
                       expects a little-endian target with a 4-byte Int - confirm. */
                    /* kN = 2 * (cur[N] - pred[N]): mask 0x1FE keeps one pixel already
                       shifted left by one, and the prediction byte is doubled to match. */
    287             mask = 0x1FE;
    288             tmp = *((Int*) cur);    /* contains 4 pixels */
    289             tmp2 = *((Int*) pred); /* prediction 4 pixels */
    290             k0 = tmp2 & 0xFF;
    291             k1 = mask & (tmp << 1);
    292             k0 = k1 - (k0 << 1);
    293             k1 = (tmp2 >> 8) & 0xFF;
    294             k2 = mask & (tmp >> 7);
    295             k1 = k2 - (k1 << 1);
    296             k2 = (tmp2 >> 16) & 0xFF;
    297             k3 = mask & (tmp >> 15);
    298             k2 = k3 - (k2 << 1);
    299             k3 = (tmp2 >> 24) & 0xFF;
    300             k4 = mask & (tmp >> 23);
    301             k3 = k4 - (k3 << 1);
    302             tmp = *((Int*)(cur + 4));   /* another 4 pixels */
    303             tmp2 = *((Int*)(pred + 4));
    304             k4 = tmp2 & 0xFF;
    305             k5 = mask & (tmp << 1);
    306             k4 = k5 - (k4 << 1);
    307             k5 = (tmp2 >> 8) & 0xFF;
    308             k6 = mask & (tmp >> 7);
    309             k5 = k6 - (k5 << 1);
    310             k6 = (tmp2 >> 16) & 0xFF;
    311             k7 = mask & (tmp >> 15);
    312             k6 = k7 - (k6 << 1);
    313             k7 = (tmp2 >> 24) & 0xFF;
    314             tmp = mask & (tmp >> 23);
    315             k7 = tmp - (k7 << 1);
    316             cur += width;   /* next row of the current block */
    317             pred += 16;     /* next row of the prediction; its step is fixed at 16 */
    318 
    319             /* fdct_1 */
                    /* Butterflies: each pair becomes (sum, difference); the difference
                       is rebuilt as sum - 2 * second operand. */
    320             k0 = k0 + k7;
    321             k7 = k0 - (k7 << 1);
    322             k1 = k1 + k6;
    323             k6 = k1 - (k6 << 1);
    324             k2 = k2 + k5;
    325             k5 = k2 - (k5 << 1);
    326             k3 = k3 + k4;
    327             k4 = k3 - (k4 << 1);
    328 
    329             k0 = k0 + k3;
    330             k3 = k0 - (k3 << 1);
    331             k1 = k1 + k2;
    332             k2 = k1 - (k2 << 1);
    333 
    334             k0 = k0 + k1;
    335             /**********/
    336             dst[0] = k0;        /* col. 0; col. 4 is not computed in this variant */
    337             /* fdct_2 */
    338             k4 = k4 + k5;
    339             k5 = k5 + k6;
    340             k6 = k6 + k7;
    341             k2 = k2 + k3;
    342             /* MUL2C k2,k5,724,FDCT_SHIFT */
    343             /* k0, k1 become scratch */
    344             /* assume FAST MULTIPLY */
                    /* mla724 is not defined in this file (presumably dct_inline.h);
                       how it uses the packed k12 word should be checked there. */
    345             k1 = mla724(k12, k5, round);
    346             k0 = mla724(k12, k2, round);
    347 
    348             k5 = k1 >> FDCT_SHIFT;
    349             k2 = k0 >> FDCT_SHIFT;
    350             /*****************/
    351             k2 = k2 + k3;
    352             /********/
    353             dst[2] = k2;        /* col. 2 */
    354             /* fdct_3 */
    355             /* ROTATE k4,k6,392,946, FDCT_SHIFT */
    356             /* assume FAST MULTIPLY */
    357             /* k0, k1 are output */
    358             k0 = k4 - k6;
    359 
                    /* The result of mla392 (which already carries round) is passed as
                       the last argument of both following helper calls. */
    360             k1 = mla392(k0, k14, round);
    361             k0 = mla554(k4, k12, k1);
    362             k1 = mla1338(k6, k14, k1);
    363 
    364             k4 = k0 >> FDCT_SHIFT;
    365             k6 = k1 >> FDCT_SHIFT;
    366             /***********************/
    367             k5 = k5 + k7;
    368             k7 = (k7 << 1) - k5;
    369             k7 = k7 - k4;
    370             k5 = k5 + k6;
    371             /********/
    372             dst[1] = k5;        /* col. 1 */
    373             dst[3] = k7;        /* col. 3 */
    374             dst += 8;           /* next row of the work area */
    375         }
    376         while (dst < out);
    377 
                /* Rewind to the first row of the work area; dst becomes the end marker
                   for the column loop (one past column 3). */
    378         out -= 64;
    379         dst = out + 4;
    380 
    381         /*  Vertical Block Loop  */
    382         do  /* Vertical 8xDCT loop */
    383         {
                    /* Gather one column; consecutive rows are 8 entries apart. */
    384             k0 = out[0];
    385             k1 = out[8];
    386             k2 = out[16];
    387             k3 = out[24];
    388             k4 = out[32];
    389             k5 = out[40];
    390             k6 = out[48];
    391             k7 = out[56];
    392 
                    /* sum_abs is defined elsewhere; presumably the sum of the absolute
                       values of its eight arguments - confirm. */
    393             abs_sum = sum_abs(k0, k1, k2, k3, k4, k5, k6, k7);
    394 
    395             if (abs_sum < ColTh)
    396             {
                        /* Column skipped: only its first entry is replaced by the
                           0x7fff tag. continue jumps to the loop test at the bottom. */
    397                 out[0] = 0x7fff;
    398                 out++;
    399                 continue;
    400             }
    401             /* fdct_1 */
    402             k0 = k0 + k7;
    403             k7 = k0 - (k7 << 1);
    404             k1 = k1 + k6;
    405             k6 = k1 - (k6 << 1);
    406             k2 = k2 + k5;
    407             k5 = k2 - (k5 << 1);
    408             k3 = k3 + k4;
    409             k4 = k3 - (k4 << 1);
    410 
    411             k0 = k0 + k3;
    412             k3 = k0 - (k3 << 1);
    413             k1 = k1 + k2;
    414             k2 = k1 - (k2 << 1);
    415 
    416             k0 = k0 + k1;
    417             /**********/
    418             out[0] = k0;   /* row 0 */
    419             /* fdct_2 */
    420             k4 = k4 + k5;
    421             k5 = k5 + k6;
    422             k6 = k6 + k7;
    423             k2 = k2 + k3;
    424             /* MUL2C k2,k5,724,FDCT_SHIFT */
    425             /* k0, k1 become scratch */
    426             /* assume FAST MULTIPLY */
    427             k1 = mla724(k12, k5, round);
    428             k0 = mla724(k12, k2, round);
    429 
    430             k5 = k1 >> FDCT_SHIFT;
    431             k2 = k0 >> FDCT_SHIFT;
    432             /*****************/
    433             k2 = k2 + k3;
    434             /********/
    435             out[16] = k2;           /* row 2 */
    436             /* fdct_3 */
    437             /* ROTATE k4,k6,392,946, FDCT_SHIFT */
    438             /* assume FAST MULTIPLY */
    439             /* k0, k1 are output */
    440             k0 = k4 - k6;
    441 
    442             k1 = mla392(k0, k14, round);
    443             k0 = mla554(k4, k12, k1);
    444             k1 = mla1338(k6, k14, k1);
    445 
    446             k4 = k0 >> FDCT_SHIFT;
    447             k6 = k1 >> FDCT_SHIFT;
    448             /***********************/
    449             k5 = k5 + k7;
    450             k7 = (k7 << 1) - k5;
    451             k7 = k7 - k4 ;
    452             k5 = k5 + k6;
    453             /********/
    454             out[24] = k7 ;      /* row 3 */
    455             out[8] = k5 ;       /* row 1 */
    456             out++;              /* next column */
    457         }
    458         while ((uintptr_t)out < (uintptr_t)dst) ;
    459 
    460         return ;
    461     }
    462 
    463     /**************************************************************************/
    464     /*  Function:   Block2x2DCT_AANwSub
    465         Date:       7/31/01
    466         Input:      out   - coefficient buffer; out[64] is read as the column
                                    threshold (ColTh) before any coefficient is stored
                            cur   - current pixels, 8 read per row; the pointer is
                                    advanced by width after each row
                            pred  - prediction pixels, 8 read per row; the pointer is
                                    advanced by 16 after each row
                            width - row step applied to cur
    467         Output:     out[64] ==> next block
                            (every store made here lands inside out[64] .. out[127])
    468         Purpose:    Do subtraction for zero MV first before 2x2 DCT
                            Same input handling as the full 8x8 version, but the row
                            pass stores only columns 0..1 of each row and the column
                            pass processes only columns 0..1 and stores only rows
                            0..1. Entries outside that range are not given final
                            values by this function.
    469         Modified:
    470     **************************************************************************/
    471 
    472 
    473     Void Block2x2DCT_AANwSub(Short *out, UChar *cur, UChar *pred, Int width)
    474     {
    475         Short *dst;
    476         Int k0, k1, k2, k3, k4, k5, k6, k7;
    477         Int round;
    478         Int k12 = 0x022A02D4;   /* 0x022A = 554 (high half), 0x02D4 = 724 (low half) */
    479         Int k14 = 0x018803B2;   /* 0x0188 = 392 (high half), 0x03B2 = 946 (low half); differs from the 8x8/4x4 versions */
    480         Int mask;
    481         Int tmp, tmp2;
    482         Int abs_sum;
    483         Int ColTh;
    484 
                /* The work area is the 64 Shorts starting at out[64]. Its first entry
                   holds the column threshold, which is read here before the first row
                   store overwrites it. */
    485         dst = out + 64 ;
    486         ColTh = *dst;
    487         out += 128;     /* one past the work area: end marker for the row loop */
    488         round = 1 << (FDCT_SHIFT - 1);  /* half of 1 << FDCT_SHIFT, passed to the mla helpers */
    489 
    490         do  /* fdct_nextrow */
    491         {
    492             /* assuming the block is word-aligned */
                    /* NOTE(review): cur and pred are read through Int*; the lowest byte
                       of each word is used as the leftmost pixel, so this presumably
                       expects a little-endian target with a 4-byte Int - confirm. */
                    /* kN = 2 * (cur[N] - pred[N]): mask 0x1FE keeps one pixel already
                       shifted left by one, and the prediction byte is doubled to match. */
    493             mask = 0x1FE;
    494             tmp = *((Int*) cur);    /* contains 4 pixels */
    495             tmp2 = *((Int*) pred); /* prediction 4 pixels */
    496             k0 = tmp2 & 0xFF;
    497             k1 = mask & (tmp << 1);
    498             k0 = k1 - (k0 << 1);
    499             k1 = (tmp2 >> 8) & 0xFF;
    500             k2 = mask & (tmp >> 7);
    501             k1 = k2 - (k1 << 1);
    502             k2 = (tmp2 >> 16) & 0xFF;
    503             k3 = mask & (tmp >> 15);
    504             k2 = k3 - (k2 << 1);
    505             k3 = (tmp2 >> 24) & 0xFF;
    506             k4 = mask & (tmp >> 23);
    507             k3 = k4 - (k3 << 1);
    508             tmp = *((Int*)(cur + 4));   /* another 4 pixels */
    509             tmp2 = *((Int*)(pred + 4));
    510             k4 = tmp2 & 0xFF;
    511             k5 = mask & (tmp << 1);
    512             k4 = k5 - (k4 << 1);
    513             k5 = (tmp2 >> 8) & 0xFF;
    514             k6 = mask & (tmp >> 7);
    515             k5 = k6 - (k5 << 1);
    516             k6 = (tmp2 >> 16) & 0xFF;
    517             k7 = mask & (tmp >> 15);
    518             k6 = k7 - (k6 << 1);
    519             k7 = (tmp2 >> 24) & 0xFF;
    520             tmp = mask & (tmp >> 23);
    521             k7 = tmp - (k7 << 1);
    522             cur += width;   /* next row of the current block */
    523             pred += 16;     /* next row of the prediction; its step is fixed at 16 */
    524 
    525             /* fdct_1 */
                    /* Butterflies: each pair becomes (sum, difference); the difference
                       is rebuilt as sum - 2 * second operand. */
    526             k0 = k0 + k7;
    527             k7 = k0 - (k7 << 1);
    528             k1 = k1 + k6;
    529             k6 = k1 - (k6 << 1);
    530             k2 = k2 + k5;
    531             k5 = k2 - (k5 << 1);
    532             k3 = k3 + k4;
    533             k4 = k3 - (k4 << 1);
    534 
    535             k0 = k0 + k3;
    536             k3 = k0 - (k3 << 1);
    537             k1 = k1 + k2;
    538             k2 = k1 - (k2 << 1);
    539 
    540             k0 = k0 + k1;
    541             /**********/
    542             dst[0] = k0;        /* col. 0 */
    543             /* fdct_2 */
    544             k4 = k4 + k5;
    545             k5 = k5 + k6;
    546             k6 = k6 + k7;
    547             /* MUL2C k2,k5,724,FDCT_SHIFT */
    548             /* k0, k1 become scratch */
    549             /* assume FAST MULTIPLY */
                    /* Only k5 goes through mla724 in this variant. The helper is not
                       defined in this file (presumably dct_inline.h). */
    550             k1 = mla724(k12, k5, round);
    551 
    552             k5 = k1 >> FDCT_SHIFT;
    553             /*****************/
    554             /********/
    555             /* fdct_3 */
    556             /* ROTATE k4,k6,392,946, FDCT_SHIFT */
    557             /* assume FAST MULTIPLY */
    558             /* k0, k1 are output */
                    /* Here mla392 takes k4 directly and its result is passed as the
                       last argument of mla946; only k6 is derived from the pair. */
    559             k1 = mla392(k4, k14, round);
    560             k1 = mla946(k6, k14, k1);
    561 
    562             k6 = k1 >> FDCT_SHIFT;
    563             /***********************/
    564             k5 = k5 + k7;
    565             k5 = k5 + k6;
    566             /********/
    567             dst[1] = k5;        /* col. 1 */
    568             dst += 8;           /* next row of the work area */
    569         }
    570         while (dst < out);
                /* Rewind to the first row of the work area; dst becomes the end marker
                   for the column loop (one past column 1). */
    571         out -= 64;
    572         dst = out + 2;
    573         /*  Vertical Block Loop  */
    574         do  /* Vertical 8xDCT loop */
    575         {
                    /* Gather one column; consecutive rows are 8 entries apart. */
    576             k0 = out[0];
    577             k1 = out[8];
    578             k2 = out[16];
    579             k3 = out[24];
    580             k4 = out[32];
    581             k5 = out[40];
    582             k6 = out[48];
    583             k7 = out[56];
    584 
                    /* sum_abs is defined elsewhere; presumably the sum of the absolute
                       values of its eight arguments - confirm. */
    585             abs_sum = sum_abs(k0, k1, k2, k3, k4, k5, k6, k7);
    586 
    587             if (abs_sum < ColTh)
    588             {
                        /* Column skipped: only its first entry is replaced by the
                           0x7fff tag. continue jumps to the loop test at the bottom. */
    589                 out[0] = 0x7fff;
    590                 out++;
    591                 continue;
    592             }
    593             /* fdct_1 */
    594             k0 = k0 + k7;
    595             k7 = k0 - (k7 << 1);
    596             k1 = k1 + k6;
    597             k6 = k1 - (k6 << 1);
    598             k2 = k2 + k5;
    599             k5 = k2 - (k5 << 1);
    600             k3 = k3 + k4;
    601             k4 = k3 - (k4 << 1);
    602 
    603             k0 = k0 + k3;
    604             k3 = k0 - (k3 << 1);
    605             k1 = k1 + k2;
    606             k2 = k1 - (k2 << 1);
    607 
    608             k0 = k0 + k1;
    609             /**********/
    610             out[0] = k0;        /* row 0 */
    611             /* fdct_2 */
    612             k4 = k4 + k5;
    613             k5 = k5 + k6;
    614             k6 = k6 + k7;
    615             /* MUL2C k2,k5,724,FDCT_SHIFT */
    616             /* k0, k1 become scratch */
    617             /* assume FAST MULTIPLY */
    618             k1 = mla724(k12, k5, round);
    619 
    620             k5 = k1 >> FDCT_SHIFT;
    621             /*****************/
    622             /********/
    623             /* fdct_3 */
    624             /* ROTATE k4,k6,392,946, FDCT_SHIFT */
    625             /* assume FAST MULTIPLY */
    626             /* k0, k1 are output */
    627             k1 = mla392(k4, k14, round);
    628             k1 = mla946(k6, k14, k1);
    629 
    630             k6 = k1 >> FDCT_SHIFT;
    631             /***********************/
    632             k5 = k5 + k7;
    633             k5 = k5 + k6;
    634             /********/
    635             out[8] = k5 ;       /* row 1 */
    636             out++;              /* next column */
    637         }
    638         while ((uintptr_t)out < (uintptr_t)dst) ;
    639 
    640         return ;
    641     }
    642 
    643     /**************************************************************************/
    644     /*  Function:   BlockDCT_AANIntra
    645         Date:       8/9/01
    646         Input:      rec
                            out    - coefficient buffer; out[64] is read as the column
                                     threshold (ColTh) before any coefficient is stored
                            cur    - source pixels, 8 read per row; the pointer is
                                     advanced by width after each row
                            dummy2 - unused (kept so the signature matches the
                                     prediction-subtracting versions)
                            width  - row step applied to cur
    647         Output:     out[64] ==> next block
                            (every store made here lands in out[64] .. out[127])
    648         Purpose:    Input directly from rec frame.
                            No prediction is subtracted: each pixel is taken at twice
                            its value, then the row pass runs over the 8 rows and the
                            column pass over the 8 columns. A column whose sum_abs()
                            result is below ColTh is not transformed; its first entry
                            is set to 0x7fff instead.
    649         Modified:
    650     **************************************************************************/
    651 
    652     Void BlockDCT_AANIntra(Short *out, UChar *cur, UChar *dummy2, Int width)
    653     {
    654         Short *dst;
    655         Int k0, k1, k2, k3, k4, k5, k6, k7;
    656         Int round;
    657         Int k12 = 0x022A02D4;   /* 0x022A = 554 (high half), 0x02D4 = 724 (low half) */
    658         Int k14 = 0x0188053A;   /* 0x0188 = 392 (high half), 0x053A = 1338 (low half) */
    659         Int abs_sum;
    660         Int mask;
    661         Int *curInt, tmp;
    662         Int ColTh;
    663 
    664         OSCL_UNUSED_ARG(dummy2);
    665 
                /* The work area is the 64 Shorts starting at out[64]. Its first entry
                   holds the column threshold, which is read here before the first row
                   store overwrites it. */
    666         dst = out + 64 ;
    667         ColTh = *dst;
    668         out += 128;     /* one past the work area: end marker for the row loop */
    669         round = 1 << (FDCT_SHIFT - 1);  /* half of 1 << FDCT_SHIFT, passed to the mla helpers */
    670 
    671         do  /* fdct_nextrow */
    672         {
                    /* NOTE(review): cur is read through Int*; the lowest byte of each
                       word is used as the leftmost pixel, so this presumably expects
                       an Int-aligned cur on a little-endian target with a 4-byte
                       Int - confirm. */
                    /* kN = 2 * pixel N: mask 0x1FE keeps one pixel already shifted
                       left by one. */
    673             mask = 0x1FE;
    674             curInt = (Int*) cur;
    675             tmp = curInt[0];    /* contains 4 pixels */
    676             k0 = mask & (tmp << 1);
    677             k1 = mask & (tmp >> 7);
    678             k2 = mask & (tmp >> 15);
    679             k3 = mask & (tmp >> 23);
    680             tmp = curInt[1];    /* another 4 pixels */
    681             k4 =  mask & (tmp << 1);
    682             k5 =  mask & (tmp >> 7);
    683             k6 =  mask & (tmp >> 15);
    684             k7 =  mask & (tmp >> 23);
    685             cur += width;   /* next source row */
    686             /* fdct_1 */
                    /* Butterflies: each pair becomes (sum, difference); the difference
                       is rebuilt as sum - 2 * second operand. */
    687             k0 = k0 + k7;
    688             k7 = k0 - (k7 << 1);
    689             k1 = k1 + k6;
    690             k6 = k1 - (k6 << 1);
    691             k2 = k2 + k5;
    692             k5 = k2 - (k5 << 1);
    693             k3 = k3 + k4;
    694             k4 = k3 - (k4 << 1);
    695 
    696             k0 = k0 + k3;
    697             k3 = k0 - (k3 << 1);
    698             k1 = k1 + k2;
    699             k2 = k1 - (k2 << 1);
    700 
    701             k0 = k0 + k1;
    702             k1 = k0 - (k1 << 1);
    703             /**********/
    704             dst[0] = k0;
    705             dst[4] = k1; /* col. 4 */
    706             /* fdct_2 */
    707             k4 = k4 + k5;
    708             k5 = k5 + k6;
    709             k6 = k6 + k7;
    710             k2 = k2 + k3;
    711             /* MUL2C k2,k5,724,FDCT_SHIFT */
    712             /* k0, k1 become scratch */
    713             /* assume FAST MULTIPLY */
                    /* mla724 is not defined in this file (presumably dct_inline.h);
                       how it uses the packed k12 word should be checked there. */
    714             k1 = mla724(k12, k5, round);
    715             k0 = mla724(k12, k2, round);
    716 
    717             k5 = k1 >> FDCT_SHIFT;
    718             k2 = k0 >> FDCT_SHIFT;
    719             /*****************/
    720             k2 = k2 + k3;
    721             k3 = (k3 << 1) - k2;
    722             /********/
    723             dst[2] = k2;        /* col. 2 */
    724             k3 <<= 1;       /* scale up col. 6 */
    725             dst[6] = k3; /* col. 6 */
    726             /* fdct_3 */
    727             /* ROTATE k4,k6,392,946, FDCT_SHIFT */
    728             /* assume FAST MULTIPLY */
    729             /* k0, k1 are output */
    730             k0 = k4 - k6;
    731 
                    /* The result of mla392 (which already carries round) is passed as
                       the last argument of both following helper calls. */
    732             k1 = mla392(k0, k14, round);
    733             k0 = mla554(k4, k12, k1);
    734             k1 = mla1338(k6, k14, k1);
    735 
    736             k4 = k0 >> FDCT_SHIFT;
    737             k6 = k1 >> FDCT_SHIFT;
    738             /***********************/
    739             k5 = k5 + k7;
    740             k7 = (k7 << 1) - k5;
    741             k4 = k4 + k7;
    742             k7 = (k7 << 1) - k4;
    743             k5 = k5 + k6;
    744             k4 <<= 1;       /* scale up col.5 */
    745             k6 = k5 - (k6 << 1);
    746             /********/
    747             dst[5] = k4;    /* col. 5 */
    748             k6 <<= 2;       /* scale up col. 7 */
    749             dst[1] = k5;    /* col. 1 */
    750             dst[7] = k6;    /* col. 7 */
    751             dst[3] = k7;    /* col. 3 */
    752             dst += 8;       /* next row of the work area */
    753         }
    754         while (dst < out);
    755 
                /* Rewind to the first row of the work area; dst becomes the end marker
                   for the column loop (one past column 7). */
    756         out -= 64;
    757         dst = out + 8;
    758 
    759         /*  Vertical Block Loop  */
    760         do  /* Vertical 8xDCT loop */
    761         {
                    /* Gather one column; consecutive rows are 8 entries apart. */
    762             k0 = out[0];
    763             k1 = out[8];
    764             k2 = out[16];
    765             k3 = out[24];
    766             k4 = out[32];
    767             k5 = out[40];
    768             k6 = out[48];
    769             k7 = out[56];
    770             /* deadzone thresholding for column */
    771 
                    /* sum_abs is defined elsewhere; presumably the sum of the absolute
                       values of its eight arguments - confirm. */
    772             abs_sum = sum_abs(k0, k1, k2, k3, k4, k5, k6, k7);
    773 
    774             if (abs_sum < ColTh)
    775             {
                        /* Column skipped: only its first entry is replaced by the
                           0x7fff tag, the other seven keep their row-pass values.
                           continue jumps to the loop test at the bottom. */
    776                 out[0] = 0x7fff;
    777                 out++;
    778                 continue;
    779             }
    780 
    781             /* fdct_1 */
    782             k0 = k0 + k7;
    783             k7 = k0 - (k7 << 1);
    784             k1 = k1 + k6;
    785             k6 = k1 - (k6 << 1);
    786             k2 = k2 + k5;
    787             k5 = k2 - (k5 << 1);
    788             k3 = k3 + k4;
    789             k4 = k3 - (k4 << 1);
    790 
    791             k0 = k0 + k3;
    792             k3 = k0 - (k3 << 1);
    793             k1 = k1 + k2;
    794             k2 = k1 - (k2 << 1);
    795 
    796             k0 = k0 + k1;
    797             k1 = k0 - (k1 << 1);
    798             /**********/
    799             out[32] = k1; /* row 4 */
    800             out[0] = k0; /* row 0 */
    801             /* fdct_2 */
    802             k4 = k4 + k5;
    803             k5 = k5 + k6;
    804             k6 = k6 + k7;
    805             k2 = k2 + k3;
    806             /* MUL2C k2,k5,724,FDCT_SHIFT */
    807             /* k0, k1 become scratch */
    808             /* assume FAST MULTIPLY */
    809             k1 = mla724(k12, k5, round);
    810             k0 = mla724(k12, k2, round);
    811 
    812             k5 = k1 >> FDCT_SHIFT;
    813             k2 = k0 >> FDCT_SHIFT;
    814             /*****************/
    815             k2 = k2 + k3;
    816             k3 = (k3 << 1) - k2;
    817             k3 <<= 1;       /* scale up row 6 */
    818             /********/
    819             out[48] = k3;   /* row 6 */
    820             out[16] = k2;   /* row 2 */
    821             /* fdct_3 */
    822             /* ROTATE k4,k6,392,946, FDCT_SHIFT */
    823             /* assume FAST MULTIPLY */
    824             /* k0, k1 are output */
    825             k0 = k4 - k6;
    826 
    827             k1 = mla392(k0, k14, round);
    828             k0 = mla554(k4, k12, k1);
    829             k1 = mla1338(k6, k14, k1);
    830 
    831             k4 = k0 >> FDCT_SHIFT;
    832             k6 = k1 >> FDCT_SHIFT;
    833             /***********************/
    834             k5 = k5 + k7;
    835             k7 = (k7 << 1) - k5;
    836             k4 = k4 + k7;
    837             k7 = (k7 << 1) - k4;
    838             k5 = k5 + k6;
    839             k4 <<= 1;       /* scale up row 5 */
    840             k6 = k5 - (k6 << 1);
    841             /********/
    842             out[24] = k7 ;    /* row 3 */
    843             k6 <<= 2;       /* scale up row 7 */
    844             out[56] = k6 ;   /* row 7 */
    845             out[8] = k5 ;    /* row 1 */
    846             out[40] = k4 ;   /* row 5 */
    847             out++;          /* next column */
    848         }
    849         while ((uintptr_t)out < (uintptr_t)dst) ;
    850 
    851         return ;
    852     }
    853 
    854     /**************************************************************************/
    855     /*  Function:   Block4x4DCT_AANIntra
    856         Date:       8/9/01
    857         Input:      prev
    858         Output:     out[64] ==> next block
    859         Purpose:    Input directly from prev frame. output 2x2 DCT
    860         Modified:
    861     **************************************************************************/
    862 
    863     Void Block4x4DCT_AANIntra(Short *out, UChar *cur, UChar *dummy2, Int width)
    864     {
    865         Short *dst;
    866         Int k0, k1, k2, k3, k4, k5, k6, k7;
    867         Int round;
    868         Int k12 = 0x022A02D4;
    869         Int k14 = 0x0188053A;
    870         Int mask;
    871         Int *curInt, tmp;
    872         Int abs_sum;
    873         Int ColTh;
    874 
    875         OSCL_UNUSED_ARG(dummy2);
    876 
    877         dst = out + 64 ;
    878         ColTh = *dst;
    879         out += 128;
    880         round = 1 << (FDCT_SHIFT - 1);
    881 
    882         do  /* fdct_nextrow */
    883         {
    884             mask = 0x1FE;
    885             curInt = (Int*) cur;
    886             tmp = curInt[0];    /* contains 4 pixels */
    887             k0 = mask & (tmp << 1);
    888             k1 = mask & (tmp >> 7);
    889             k2 = mask & (tmp >> 15);
    890             k3 = mask & (tmp >> 23);
    891             tmp = curInt[1];    /* another 4 pixels */
    892             k4 =  mask & (tmp << 1);
    893             k5 =  mask & (tmp >> 7);
    894             k6 =  mask & (tmp >> 15);
    895             k7 =  mask & (tmp >> 23);
    896             cur += width;
    897             /* fdct_1 */
    898             k0 = k0 + k7;
    899             k7 = k0 - (k7 << 1);
    900             k1 = k1 + k6;
    901             k6 = k1 - (k6 << 1);
    902             k2 = k2 + k5;
    903             k5 = k2 - (k5 << 1);
    904             k3 = k3 + k4;
    905             k4 = k3 - (k4 << 1);
    906 
    907             k0 = k0 + k3;
    908             k3 = k0 - (k3 << 1);
    909             k1 = k1 + k2;
    910             k2 = k1 - (k2 << 1);
    911 
    912             k0 = k0 + k1;
    913             /**********/
    914             dst[0] = k0;
    915             /* fdct_2 */
    916             k4 = k4 + k5;
    917             k5 = k5 + k6;
    918             k6 = k6 + k7;
    919             k2 = k2 + k3;
    920             /* MUL2C k2,k5,724,FDCT_SHIFT */
    921             /* k0, k1 become scratch */
    922             /* assume FAST MULTIPLY */
    923             k1 = mla724(k12, k5, round);
    924             k0 = mla724(k12, k2, round);
    925 
    926             k5 = k1 >> FDCT_SHIFT;
    927             k2 = k0 >> FDCT_SHIFT;
    928             /*****************/
    929             k2 = k2 + k3;
    930             /********/
    931             dst[2] = k2;        /* col. 2 */
    932             /* fdct_3 */
    933             /* ROTATE k4,k6,392,946, FDCT_SHIFT */
    934             /* assume FAST MULTIPLY */
    935             /* k0, k1 are output */
    936             k0 = k4 - k6;
    937 
    938             k1 = mla392(k0, k14, round);
    939             k0 = mla554(k4, k12, k1);
    940             k1 = mla1338(k6, k14, k1);
    941 
    942             k4 = k0 >> FDCT_SHIFT;
    943             k6 = k1 >> FDCT_SHIFT;
    944             /***********************/
    945             k5 = k5 + k7;
    946             k7 = (k7 << 1) - k5;
    947             k7 = k7 - k4;
    948             k5 = k5 + k6;
    949             /********/
    950             dst[1] = k5;        /* col. 1 */
    951             dst[3] = k7;        /* col. 3 */
    952             dst += 8;
    953         }
    954         while (dst < out);
    955 
    956         out -= 64;
    957         dst = out + 4;
    958 
    959         /*  Vertical Block Loop  */
    960         do  /* Vertical 8xDCT loop */
    961         {
    962             k0 = out[0];
    963             k1 = out[8];
    964             k2 = out[16];
    965             k3 = out[24];
    966             k4 = out[32];
    967             k5 = out[40];
    968             k6 = out[48];
    969             k7 = out[56];
    970 
    971             abs_sum = sum_abs(k0, k1, k2, k3, k4, k5, k6, k7);
    972 
    973             if (abs_sum < ColTh)
    974             {
    975                 out[0] = 0x7fff;
    976                 out++;
    977                 continue;
    978             }
    979             /* fdct_1 */
    980             k0 = k0 + k7;
    981             k7 = k0 - (k7 << 1);
    982             k1 = k1 + k6;
    983             k6 = k1 - (k6 << 1);
    984             k2 = k2 + k5;
    985             k5 = k2 - (k5 << 1);
    986             k3 = k3 + k4;
    987             k4 = k3 - (k4 << 1);
    988 
    989             k0 = k0 + k3;
    990             k3 = k0 - (k3 << 1);
    991             k1 = k1 + k2;
    992             k2 = k1 - (k2 << 1);
    993 
    994             k0 = k0 + k1;
    995             /**********/
    996             out[0] = k0;   /* row 0 */
    997             /* fdct_2 */
    998             k4 = k4 + k5;
    999             k5 = k5 + k6;
   1000             k6 = k6 + k7;
   1001             k2 = k2 + k3;
   1002             /* MUL2C k2,k5,724,FDCT_SHIFT */
   1003             /* k0, k1 become scratch */
   1004             /* assume FAST MULTIPLY */
   1005             k1 = mla724(k12, k5, round);
   1006             k0 = mla724(k12, k2, round);
   1007 
   1008             k5 = k1 >> FDCT_SHIFT;
   1009             k2 = k0 >> FDCT_SHIFT;
   1010             /*****************/
   1011             k2 = k2 + k3;
   1012             /********/
   1013             out[16] = k2;           /* row 2 */
   1014             /* fdct_3 */
   1015             /* ROTATE k4,k6,392,946, FDCT_SHIFT */
   1016             /* assume FAST MULTIPLY */
   1017             /* k0, k1 are output */
   1018             k0 = k4 - k6;
   1019 
   1020             k1 = mla392(k0, k14, round);
   1021             k0 = mla554(k4, k12, k1);
   1022             k1 = mla1338(k6, k14, k1);
   1023 
   1024             k4 = k0 >> FDCT_SHIFT;
   1025             k6 = k1 >> FDCT_SHIFT;
   1026             /***********************/
   1027             k5 = k5 + k7;
   1028             k7 = (k7 << 1) - k5;
   1029             k7 = k7 - k4 ;
   1030             k5 = k5 + k6;
   1031             /********/
   1032             out[24] = k7 ;      /* row 3 */
   1033             out[8] = k5 ;       /* row 1 */
   1034             out++;
   1035         }
   1036         while ((uintptr_t)out < (uintptr_t)dst) ;
   1037 
   1038         return ;
   1039     }
   1040 
   1041     /**************************************************************************/
   1042     /*  Function:   Block2x2DCT_AANIntra
   1043         Date:       8/9/01
   1044         Input:      prev
   1045         Output:     out[64] ==> next block
   1046         Purpose:    Input directly from prev frame. output 2x2 DCT
   1047         Modified:
   1048     **************************************************************************/
   1049 
   1050     Void Block2x2DCT_AANIntra(Short *out, UChar *cur, UChar *dummy2, Int width)
   1051     {
   1052         Short *dst;
   1053         Int k0, k1, k2, k3, k4, k5, k6, k7;
   1054         Int round;
   1055         Int k12 = 0x022A02D4;
   1056         Int k14 = 0x018803B2;
   1057         Int mask;
   1058         Int *curInt, tmp;
   1059         Int abs_sum;
   1060         Int ColTh;
   1061 
   1062         OSCL_UNUSED_ARG(dummy2);
   1063 
   1064         dst = out + 64 ;
   1065         ColTh = *dst;
   1066         out += 128;
   1067         round = 1 << (FDCT_SHIFT - 1);
   1068 
   1069         do  /* fdct_nextrow */
   1070         {
   1071             mask = 0x1FE;
   1072             curInt = (Int*) cur;
   1073             tmp = curInt[0];    /* contains 4 pixels */
   1074             k0 = mask & (tmp << 1);
   1075             k1 = mask & (tmp >> 7);
   1076             k2 = mask & (tmp >> 15);
   1077             k3 = mask & (tmp >> 23);
   1078             tmp = curInt[1];    /* another 4 pixels */
   1079             k4 =  mask & (tmp << 1);
   1080             k5 =  mask & (tmp >> 7);
   1081             k6 =  mask & (tmp >> 15);
   1082             k7 =  mask & (tmp >> 23);
   1083             cur += width;
   1084 
   1085             /* fdct_1 */
   1086             k0 = k0 + k7;
   1087             k7 = k0 - (k7 << 1);
   1088             k1 = k1 + k6;
   1089             k6 = k1 - (k6 << 1);
   1090             k2 = k2 + k5;
   1091             k5 = k2 - (k5 << 1);
   1092             k3 = k3 + k4;
   1093             k4 = k3 - (k4 << 1);
   1094 
   1095             k0 = k0 + k3;
   1096             k3 = k0 - (k3 << 1);
   1097             k1 = k1 + k2;
   1098             k2 = k1 - (k2 << 1);
   1099 
   1100             k0 = k0 + k1;
   1101             /**********/
   1102             dst[0] = k0;
   1103             /* fdct_2 */
   1104             k4 = k4 + k5;
   1105             k5 = k5 + k6;
   1106             k6 = k6 + k7;
   1107             /* MUL2C k2,k5,724,FDCT_SHIFT */
   1108             /* k0, k1 become scratch */
   1109             /* assume FAST MULTIPLY */
   1110             k1 = mla724(k12, k5, round);
   1111 
   1112             k5 = k1 >> FDCT_SHIFT;
   1113             /*****************/
   1114             /********/
   1115             /* fdct_3 */
   1116             /* ROTATE k4,k6,392,946, FDCT_SHIFT */
   1117             /* assume FAST MULTIPLY */
   1118             /* k0, k1 are output */
   1119             k1 = mla392(k4, k14, round);
   1120             k1 = mla946(k6, k14, k1);
   1121 
   1122             k6 = k1 >> FDCT_SHIFT;
   1123             /***********************/
   1124             k5 = k5 + k7;
   1125             k5 = k5 + k6;
   1126             /********/
   1127             dst[1] = k5;
   1128             dst += 8;
   1129         }
   1130         while (dst < out);
   1131         out -= 64;
   1132         dst = out + 2;
   1133         /*  Vertical Block Loop  */
   1134         do  /* Vertical 8xDCT loop */
   1135         {
   1136             k0 = out[0];
   1137             k1 = out[8];
   1138             k2 = out[16];
   1139             k3 = out[24];
   1140             k4 = out[32];
   1141             k5 = out[40];
   1142             k6 = out[48];
   1143             k7 = out[56];
   1144 
   1145             abs_sum = sum_abs(k0, k1, k2, k3, k4, k5, k6, k7);
   1146 
   1147             if (abs_sum < ColTh)
   1148             {
   1149                 out[0] = 0x7fff;
   1150                 out++;
   1151                 continue;
   1152             }
   1153             /* fdct_1 */
   1154             k0 = k0 + k7;
   1155             k7 = k0 - (k7 << 1);
   1156             k1 = k1 + k6;
   1157             k6 = k1 - (k6 << 1);
   1158             k2 = k2 + k5;
   1159             k5 = k2 - (k5 << 1);
   1160             k3 = k3 + k4;
   1161             k4 = k3 - (k4 << 1);
   1162 
   1163             k0 = k0 + k3;
   1164             k3 = k0 - (k3 << 1);
   1165             k1 = k1 + k2;
   1166             k2 = k1 - (k2 << 1);
   1167 
   1168             k0 = k0 + k1;
   1169             /**********/
   1170             out[0] = k0;        /* row 0 */
   1171             /* fdct_2 */
   1172             k4 = k4 + k5;
   1173             k5 = k5 + k6;
   1174             k6 = k6 + k7;
   1175             /* MUL2C k2,k5,724,FDCT_SHIFT */
   1176             /* k0, k1 become scratch */
   1177             /* assume FAST MULTIPLY */
   1178             k1 = mla724(k12, k5, round);
   1179 
   1180             k5 = k1 >> FDCT_SHIFT;
   1181             /*****************/
   1182             /********/
   1183             /* fdct_3 */
   1184             /* ROTATE k4,k6,392,946, FDCT_SHIFT */
   1185             /* assume FAST MULTIPLY */
   1186             /* k0, k1 are output */
   1187             k1 = mla392(k4, k14, round);
   1188             k1 = mla946(k6, k14, k1);
   1189 
   1190             k6 = k1 >> FDCT_SHIFT;
   1191             /***********************/
   1192             k5 = k5 + k7;
   1193             k5 = k5 + k6;
   1194             /********/
   1195             out[8] = k5 ;       /* row 1 */
   1196             out++;
   1197         }
   1198         while ((uintptr_t)out < (uintptr_t)dst) ;
   1199 
   1200         return ;
   1201     }
   1202     /**************************************************************************/
   1203     /*  Function:   Block1x1DCTwSub
   1204         Date:       8/9/01
   1205         Input:      block
   1206         Output:     y
   1207         Purpose:    Compute DC value only
   1208         Modified:
   1209     **************************************************************************/
   1210     void Block1x1DCTwSub(Short *out, UChar *cur, UChar *pred, Int width)
   1211     {
   1212         UChar *end;
   1213         Int temp = 0;
   1214         Int offset2;
   1215 
   1216         offset2 = width - 8;
   1217         end = pred + (16 << 3);
   1218         do
   1219         {
   1220             temp += (*cur++ - *pred++);
   1221             temp += (*cur++ - *pred++);
   1222             temp += (*cur++ - *pred++);
   1223             temp += (*cur++ - *pred++);
   1224             temp += (*cur++ - *pred++);
   1225             temp += (*cur++ - *pred++);
   1226             temp += (*cur++ - *pred++);
   1227             temp += (*cur++ - *pred++);
   1228             cur += offset2;
   1229             pred += 8;
   1230         }
   1231         while (pred < end) ;
   1232 
   1233         out[1] = out[2] = out[3] = out[4] = out[5] = out[6] = out[7] = 0;
   1234         out[0] = temp >> 3;
   1235 
   1236         return ;
   1237     }
   1238 
   1239     /**************************************************************************/
   1240     /*  Function:   Block1x1DCTIntra
   1241         Date:       8/9/01
   1242         Input:      prev
   1243         Output:     out
   1244         Purpose:    Compute DC value only
   1245         Modified:
   1246     **************************************************************************/
   1247     void Block1x1DCTIntra(Short *out, UChar *cur, UChar *dummy2, Int width)
   1248     {
   1249         UChar *end;
   1250         Int temp = 0;
   1251         ULong word;
   1252 
   1253         OSCL_UNUSED_ARG(dummy2);
   1254 
   1255         end = cur + (width << 3);
   1256         do
   1257         {
   1258             word = *((ULong*)cur);
   1259             temp += (word >> 24);
   1260             temp += ((word >> 16) & 0xFF);
   1261             temp += ((word >> 8) & 0xFF);
   1262             temp += (word & 0xFF);
   1263 
   1264             word = *((ULong*)(cur + 4));
   1265             temp += (word >> 24);
   1266             temp += ((word >> 16) & 0xFF);
   1267             temp += ((word >> 8) & 0xFF);
   1268             temp += (word & 0xFF);
   1269 
   1270             cur += width;
   1271         }
   1272         while (cur < end) ;
   1273 
   1274         out[1] = out[2] = out[3] = out[4] = out[5] = out[6] = out[7] = 0;
   1275         out[0] = temp >> 3;
   1276 
   1277         return ;
   1278     }
   1279 
   1280 #ifdef __cplusplus
   1281 }
   1282 #endif
   1283 
   1284