Home | History | Annotate | Download | only in common
      1 /******************************************************************************
      2  *
      3  * Copyright (C) 2015 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at:
      8  *
      9  * http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  *****************************************************************************
     18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 */
     20 /**
     21  *******************************************************************************
     22  * @file
     23  *  ih264_resi_trans_quant.c
     24  *
     25  * @brief
     26  *  Contains function definitions for the single stage forward transform for H.264.
     27  *  The functions calculate the residue, do the forward transform (cf) and then do quantization
     28  *
     29  * @author
     30  *  Ittiam
     31  *
     32  * @par List of Functions:
     33  *  - ih264_resi_trans_quant_4x4()
     34  *  - ih264_resi_trans_quant_chroma_4x4
     35  *  - ih264_hadamard_quant_4x4
     36  *  - ih264_hadamard_quant_2x2_uv
     37  *  - ih264_resi_trans_quant_8x8
     38  *
     39  * @remarks
     40  *******************************************************************************
     41  */
     42 
     43 /*****************************************************************************/
     44 /* File Includes                                                             */
     45 /*****************************************************************************/
     46 
     47 /* System include files */
     48 #include <stddef.h>
     49 
     50 /* User include files */
     51 #include "ih264_typedefs.h"
     52 #include "ih264_defs.h"
     53 #include "ih264_size_defs.h"
     54 #include "ih264_macros.h"
     55 #include "ih264_trans_macros.h"
     56 #include "ih264_trans_data.h"
     57 #include "ih264_structs.h"
     58 #include "ih264_trans_quant_itrans_iquant.h"
     59 
     60 /**
     61  *******************************************************************************
     62  *
     63  * @brief
     64  *   This function performs forward transform and quantization on a 4*4 block
     65  *
     66  * @par Description:
     67  *   The function accepts source buffer and estimation buffer. From these, it
     68  *   computes the residue. This is residue is then transformed and quantized.
     69  *   The transform and quantization are in placed computed. They use the residue
     70  *   buffer for this.
     71  *
     72  * @param[in] pu1_src
     73  *   Pointer to source sub-block
     74  *
     75  * @param[in] pu1_pred
     76  *   Pointer to prediction sub-block
     77  *
     78  * @param[in] pi2_out
     79  *   Pointer to residual sub-block
     80  *
     81  * @param[in] src_strd
     82  *   Source stride
     83  *
     84  * @param[in] pred_strd
     85  *   Prediction stride
     86  *
     87  * @param[in] dst_strd
     88  *   Destination stride
     89  *
     90  * @param[in] u4_qbits
     91  *    QP_BITS_h264_4x4 + floor(QP/6)
     92  *
     93  * @param[in] pu2_threshold_matrix
     94  *   Pointer to Forward Quant Threshold Matrix
     95  *
     96  * @param[in] pu2_scale_matrix
     97  *   Pointer to Forward Quant Scale Matrix
     98  *
     99  * @param[in] u4_round_factor
    100  *   Quantization Round factor
    101  *
    102  * @param[out] pu1_nnz
    103  *   Total non-zero coefficients in the current sub-block
    104  *
    105  * @returns
    106  *
    107  * @remarks
    108  *   None
    109  *
    110  *******************************************************************************
    111  */
    112 void ih264_resi_trans_quant_4x4(UWORD8 *pu1_src,
    113                                 UWORD8 *pu1_pred,
    114                                 WORD16 *pi2_out,
    115                                 WORD32 src_strd,
    116                                 WORD32 pred_strd,
    117                                 const UWORD16 *pu2_scale_matrix,
    118                                 const UWORD16 *pu2_threshold_matrix,
    119                                 UWORD32 u4_qbits,
    120                                 UWORD32 u4_round_factor,
    121                                 UWORD8 *pu1_nnz,
    122                                 WORD16 *pi2_alt_dc_addr)
    123 {
    124     UWORD32 i;
    125     WORD32  x0, x1, x2, x3, x4, x5, x6, x7;
    126     WORD32  i4_value, i4_sign;
    127     UWORD32 u4_abs_value;
    128     WORD16  *pi2_out_tmp = pi2_out;
    129     UWORD32 u4_nonzero_coeff = 0;
    130 
    131     for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    132     {
    133         /* computing prediction error (residue) */
    134         x4 = pu1_src[0] - pu1_pred[0];
    135         x5 = pu1_src[1] - pu1_pred[1];
    136         x6 = pu1_src[2] - pu1_pred[2];
    137         x7 = pu1_src[3] - pu1_pred[3];
    138 
    139         /* Horizontal transform */
    140         x0 = x4 + x7;
    141         x1 = x5 + x6;
    142         x2 = x5 - x6;
    143         x3 = x4 - x7;
    144 
    145         pi2_out_tmp[0] = x0 + x1;
    146         pi2_out_tmp[1] = (x3 <<1) + x2;
    147         pi2_out_tmp[2] = x0 - x1;
    148         pi2_out_tmp[3] = x3 - (x2<<1);
    149 
    150         /* pointing to next row; */
    151         pu1_src += src_strd;
    152         pu1_pred += pred_strd;
    153         pi2_out_tmp += 4;
    154 
    155     }
    156     pi2_out_tmp = pi2_out;
    157     for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    158     {
    159 
    160         /* Vertical transform and quantization */
    161         x4 = pi2_out_tmp[0];
    162         x5 = pi2_out_tmp[4];
    163         x6 = pi2_out_tmp[8];
    164         x7 = pi2_out_tmp[12];
    165 
    166 
    167         x0 = x4 + x7;
    168         x1 = x5 + x6;
    169         x2 = x5 - x6;
    170         x3 = x4 - x7;
    171 
    172         /* quantization is done in place */
    173 
    174         i4_value = x0 + x1;
    175 
    176         if(i==0)
    177         {
    178           (*pi2_alt_dc_addr) = i4_value;
    179         }
    180 
    181         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0], pu2_scale_matrix[0], u4_round_factor, u4_qbits, u4_nonzero_coeff);
    182         pi2_out_tmp[0] = i4_value;
    183 
    184 
    185         i4_value = (x3 << 1) + x2;
    186         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[4], pu2_scale_matrix[4], u4_round_factor, u4_qbits, u4_nonzero_coeff);
    187         pi2_out_tmp[4] = i4_value;
    188 
    189 
    190         i4_value = x0 - x1;
    191         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[8], pu2_scale_matrix[8], u4_round_factor, u4_qbits, u4_nonzero_coeff);
    192         pi2_out_tmp[8] = i4_value;
    193 
    194 
    195         i4_value = x3 - (x2 << 1);
    196         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[12], pu2_scale_matrix[12], u4_round_factor, u4_qbits, u4_nonzero_coeff);
    197         pi2_out_tmp[12] = i4_value;
    198 
    199         pi2_out_tmp ++;
    200         pu2_scale_matrix++;
    201         pu2_threshold_matrix++;
    202     }
    203 
    204     /* Return total nonzero coefficients in the current sub block */
    205     *pu1_nnz =  u4_nonzero_coeff;
    206 }
    207 /**
    208  *******************************************************************************
    209  *
    210  * @brief
    211  *   This function performs forward transform and quantization on a 4*4 chroma block
    212  *   with interleaved values
    213  *
    214  * @par Description:
    215  *   The function accepts source buffer and estimation buffer. From these, it
    216  *   computes the residue. This is residue is then transformed and quantized.
    217  *   The transform and quantization are in placed computed. They use the residue
    218  *   buffer for this.
    219  *
    220  * @param[in] pu1_src
    221  *   Pointer to source sub-block
    222  *
    223  * @param[in] pu1_pred
    224  *   Pointer to prediction sub-block
    225  *
    226  * @param[in] pi2_out
    227  *   Pointer to residual sub-block
    228  *
    229  * @param[in] src_strd
    230  *   Source stride
    231  *
    232  * @param[in] pred_strd
    233  *   Prediction stride
    234  *
    235  * @param[in] dst_strd
    236  *   Destination stride
    237  *
    238  * @param[in] u4_qbits
    239  *    QP_BITS_h264_4x4 + floor(QP/6)
    240  *
    241  * @param[in] pu2_threshold_matrix
    242  *   Pointer to Forward Quant Threshold Matrix
    243  *
    244  * @param[in] pu2_scale_matrix
    245  *   Pointer to Forward Quant Scale Matrix
    246  *
    247  * @param[in] u4_round_factor
    248  *   Quantization Round factor
    249  *
    250  * @param[out] pu1_nnz
    251  *   Total non-zero coefficients in the current sub-block
    252  *
    253  * @returns
    254  *
    255  * @remarks
    256  *   None
    257  *
    258  *******************************************************************************
    259  */
    260 void ih264_resi_trans_quant_chroma_4x4(UWORD8 *pu1_src,
    261                                        UWORD8 *pu1_pred,
    262                                        WORD16 *pi2_out,
    263                                        WORD32 src_strd,
    264                                        WORD32 pred_strd,
    265                                        const UWORD16 *pu2_scale_matrix,
    266                                        const UWORD16 *pu2_threshold_matrix,
    267                                        UWORD32 u4_qbits,
    268                                        UWORD32 u4_round_factor,
    269                                        UWORD8 *pu1_nnz,
    270                                        WORD16 *pu1_dc_alt_addr)
    271 {
    272     UWORD32 i;
    273     WORD32  x0, x1, x2, x3, x4, x5, x6, x7;
    274     WORD32  i4_value, i4_sign;
    275     UWORD32 u4_abs_value;
    276     WORD16  *pi2_out_tmp = pi2_out;
    277     UWORD32 u4_nonzero_coeff = 0;
    278 
    279     for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    280     {
    281         /* computing prediction error (residue) */
    282         x4 = pu1_src[0] - pu1_pred[0];
    283         x5 = pu1_src[2] - pu1_pred[2];
    284         x6 = pu1_src[4] - pu1_pred[4];
    285         x7 = pu1_src[6] - pu1_pred[6];
    286 
    287         /* Horizontal transform */
    288         x0 = x4 + x7;
    289         x1 = x5 + x6;
    290         x2 = x5 - x6;
    291         x3 = x4 - x7;
    292 
    293         pi2_out_tmp[0] = x0 + x1;
    294         pi2_out_tmp[1] = (x3 <<1) + x2;
    295         pi2_out_tmp[2] = x0 - x1;
    296         pi2_out_tmp[3] = x3 - (x2<<1);
    297 
    298         /* pointing to next row; */
    299         pu1_src += src_strd;
    300         pu1_pred += pred_strd;
    301         pi2_out_tmp += 4;
    302 
    303     }
    304     pi2_out_tmp = pi2_out;
    305     for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    306     {
    307 
    308         /* Vertical transform and quantization */
    309         x4 = pi2_out_tmp[0];
    310         x5 = pi2_out_tmp[4];
    311         x6 = pi2_out_tmp[8];
    312         x7 = pi2_out_tmp[12];
    313 
    314 
    315         x0 = x4 + x7;
    316         x1 = x5 + x6;
    317         x2 = x5 - x6;
    318         x3 = x4 - x7;
    319 
    320         /* quantization is done in place */
    321 
    322         i4_value = x0 + x1;
    323 
    324         if(i==0)
    325         {
    326           *pu1_dc_alt_addr = i4_value;
    327         }
    328 
    329         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
    330                   pu2_scale_matrix[0], u4_round_factor, u4_qbits,
    331                   u4_nonzero_coeff);
    332         pi2_out_tmp[0] = i4_value;
    333 
    334         i4_value = (x3 << 1) + x2;
    335         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[4],
    336                   pu2_scale_matrix[4], u4_round_factor, u4_qbits,
    337                   u4_nonzero_coeff);
    338         pi2_out_tmp[4] = i4_value;
    339 
    340         i4_value = x0 - x1;
    341         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[8],
    342                   pu2_scale_matrix[8], u4_round_factor, u4_qbits,
    343                   u4_nonzero_coeff);
    344         pi2_out_tmp[8] = i4_value;
    345 
    346         i4_value = x3 - (x2 << 1);
    347         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[12],
    348                   pu2_scale_matrix[12], u4_round_factor, u4_qbits,
    349                   u4_nonzero_coeff);
    350         pi2_out_tmp[12] = i4_value;
    351 
    352         pi2_out_tmp ++;
    353         pu2_scale_matrix++;
    354         pu2_threshold_matrix++;
    355     }
    356 
    357     /* Return total nonzero coefficients in the current sub block */
    358     *pu1_nnz =  u4_nonzero_coeff;
    359 }
    360 
    361 /**
    362  *******************************************************************************
    363  *
    364  * @brief
    365  *   This function performs forward hadamard transform and quantization on a 4*4 block
    366  *
    367  * @par Description:
    368  *   The function accepts source buffer and estimation buffer. From these, it
    369  *   computes the residue. This is residue is then transformed and quantized.
    370  *   The transform and quantization are in placed computed. They use the residue
    371  *   buffer for this.
    372  *
    373  * @param[in] pu1_src
    374  *   Pointer to source sub-block
    375  *
    376  * @param[in] pu1_pred
    377  *   Pointer to prediction sub-block
    378  *
    379  * @param[in] pi2_out
    380  *   Pointer to residual sub-block
    381  *
    382  * @param[in] src_strd
    383  *   Source stride
    384  *
    385  * @param[in] pred_strd
    386  *   Prediction stride
    387  *
    388  * @param[in] dst_strd
    389  *   Destination stride
    390  *
    391  * @param[in] u4_qbits
    392  *    QP_BITS_h264_4x4 + floor(QP/6)
    393  *
    394  * @param[in] pu2_threshold_matrix
    395  *   Pointer to Forward Quant Threshold Matrix
    396  *
    397  * @param[in] pu2_scale_matrix
    398  *   Pointer to Forward Quant Scale Matrix
    399  *
    400  * @param[in] u4_round_factor
    401  *   Quantization Round factor
    402  *
    403  * @param[out] pu1_nnz
    404  *   Total non-zero coefficients in the current sub-block
    405  *
    406  * @returns
    407  *
    408  * @remarks
    409  *   None
    410  *
    411  */
    412 
    413 void ih264_hadamard_quant_4x4(WORD16 *pi2_src,
    414                               WORD16 *pi2_dst,
    415                               const UWORD16 *pu2_scale_matrix,
    416                               const UWORD16 *pu2_threshold_matrix,
    417                               UWORD32 u4_qbits,
    418                               UWORD32 u4_round_factor,
    419                               UWORD8 *pu1_nnz)
    420 {
    421   WORD32 i;
    422   WORD32 x0,x1,x2,x3,x4,x5,x6,x7,i4_value;
    423   UWORD32 u4_abs_value;
    424   WORD32 i4_sign;
    425 
    426   *pu1_nnz = 0;
    427 
    428   for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    429     {
    430         x4 = pi2_src[0];
    431         x5 = pi2_src[1];
    432         x6 = pi2_src[2];
    433         x7 = pi2_src[3];
    434 
    435         x0 = x4 + x7;
    436         x1 = x5 + x6;
    437         x2 = x5 - x6;
    438         x3 = x4 - x7;
    439 
    440         pi2_dst[0] = x0 + x1;
    441         pi2_dst[1] = x3 + x2;
    442         pi2_dst[2] = x0 - x1;
    443         pi2_dst[3] = x3 - x2;
    444 
    445         pi2_src += 4;
    446         pi2_dst += 4;
    447     }
    448 
    449     /* Vertical transform and quantization */
    450     pi2_dst -= SUB_BLK_WIDTH_4x4<<2;
    451 
    452     for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
    453     {
    454         x4 = pi2_dst[0];
    455         x5 = pi2_dst[4];
    456         x6 = pi2_dst[8];
    457         x7 = pi2_dst[12] ;
    458 
    459         x0 = x4 + x7;
    460         x1 = x5 + x6;
    461         x2 = x5 - x6;
    462         x3 = x4 - x7;
    463 
    464 
    465         i4_value = (x0 + x1) >> 1;
    466         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
    467                   pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
    468         pi2_dst[0] = i4_value;
    469 
    470         i4_value = (x3 + x2) >> 1;
    471         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
    472                   pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
    473         pi2_dst[4] = i4_value;
    474 
    475         i4_value = (x0 - x1) >> 1;
    476         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
    477                   pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
    478         pi2_dst[8] = i4_value;
    479 
    480         i4_value = (x3 - x2) >> 1;
    481         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
    482                   pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
    483         pi2_dst[12] = i4_value;
    484 
    485         pi2_dst ++;
    486     }
    487 }
    488 
    489 /**
    490  *******************************************************************************
    491  *
    492  * @brief
    493  *   This function performs forward hadamard transform and quantization on a 2*2 block
    494  *   for both U and V planes
    495  *
    496  * @par Description:
    497  *   The function accepts source buffer and estimation buffer. From these, it
    498  *   computes the residue. This is residue is then transformed and quantized.
    499  *   The transform and quantization are in placed computed. They use the residue
    500  *   buffer for this.
    501  *
    502  * @param[in] pu1_src
    503  *   Pointer to source sub-block
    504  *
    505  * @param[in] pu1_pred
    506  *   Pointer to prediction sub-block
    507  *
    508  * @param[in] pi2_out
    509  *   Pointer to residual sub-block
    510  *
    511  * @param[in] src_strd
    512  *   Source stride
    513  *
    514  * @param[in] pred_strd
    515  *   Prediction stride
    516  *
    517  * @param[in] dst_strd
    518  *   Destination stride
    519  *
    520  * @param[in] u4_qbits
    521  *    QP_BITS_h264_4x4 + floor(QP/6)
    522  *
    523  * @param[in] pu2_threshold_matrix
    524  *   Pointer to Forward Quant Threshold Matrix
    525  *
    526  * @param[in] pu2_scale_matrix
    527  *   Pointer to Forward Quant Scale Matrix
    528  *
    529  * @param[in] u4_round_factor
    530  *   Quantization Round factor
    531  *
    532  * @param[out] pu1_nnz
    533  *   Total non-zero coefficients in the current sub-block
    534  *
    535  * @returns
    536  *
    537  * @remarks
    538  *   NNZ for dc is populated at 0 and 5th position of pu1_nnz
    539  *
    540  */
    541 
    542 void ih264_hadamard_quant_2x2_uv(WORD16 *pi2_src,
    543                                  WORD16 *pi2_dst,
    544                                  const UWORD16 *pu2_scale_matrix,
    545                                  const UWORD16 *pu2_threshold_matrix,
    546                                  UWORD32 u4_qbits,
    547                                  UWORD32 u4_round_factor,
    548                                  UWORD8 *pu1_nnz)
    549 {
    550     WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
    551     WORD32 i4_value, i4_sign, plane;
    552     UWORD32 u4_abs_value;
    553 
    554     for(plane = 0; plane < 2; plane++)
    555     {
    556         pu1_nnz[plane] = 0;
    557 
    558         /* Horizontal transform */
    559         x4 = pi2_src[0];
    560         x5 = pi2_src[1];
    561         x6 = pi2_src[2];
    562         x7 = pi2_src[3];
    563 
    564         x0 = x4 + x5;
    565         x1 = x4 - x5;
    566         x2 = x6 + x7;
    567         x3 = x6 - x7;
    568 
    569         /* Vertical transform and quantization */
    570         i4_value = (x0 + x2);
    571         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
    572                   pu2_scale_matrix[0], u4_round_factor, u4_qbits,
    573                   pu1_nnz[plane]);
    574         pi2_dst[0] = i4_value;
    575 
    576         i4_value = (x0 - x2);
    577         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
    578                   pu2_scale_matrix[0], u4_round_factor, u4_qbits,
    579                   pu1_nnz[plane]);
    580         pi2_dst[2] = i4_value;
    581 
    582         i4_value = (x1 - x3);
    583         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
    584                   pu2_scale_matrix[0], u4_round_factor, u4_qbits,
    585                   pu1_nnz[plane]);
    586         pi2_dst[3] = i4_value;
    587 
    588         i4_value = (x1 + x3);
    589         FWD_QUANT(i4_value, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
    590                   pu2_scale_matrix[0], u4_round_factor, u4_qbits,
    591                   pu1_nnz[plane]);
    592         pi2_dst[1] = i4_value;
    593 
    594         pi2_dst += 4;
    595         pi2_src += 4;
    596 
    597     }
    598 }
    599 
    600 /*
    601  *******************************************************************************
    602  *
    603  * @brief
    604  *  This function performs Single stage forward transform CF8 and quantization on 8*8 blocks
    605  *  for h.264
    606  *
    607  * @par Description:
    608  *  Performs single stage 8x8 forward transform CF8 after calculating the residue
    609  *  The result is then quantized
    610  *
    611  * @param[in] pu1_src
    612  *  Input 8x8 pixels
    613  *
    614  * @param[in] pu1_pred
    615  *  Input 8x8 pixels
    616  *
    617  * @param[in] pi1_out
    618  * Output 8x8 pixels
    619  *
    620  * @param[in] u4_thresh
    621  *  Threshold under which the coeffs are not quantized
    622  *
    623  *  @param[in] u4_qp_div
    624  *  QP/6
    625  *
    626  *  @param[in] u4_qp_rem
    627  *  QP%6
    628  *
    629  * @param[in] u2_src_stride
    630  *  Source stride
    631  *
    632  * @param[in] pred_strd
    633  * stride for prediciton buffer
    634  *
    635  *  @param[in] dst_strd
    636  *  stride for destination buffer
    637  *
    638  *  @param[in] pu4_quant_mat
    639  *  Pointer to the 4x4 quantization matrix
    640  *
    641  * @returns  Void
    642  *
    643  *
    644  *******************************************************************************
    645  */
    646 void ih264_resi_trans_quant_8x8(UWORD8 *pu1_src,
    647                                 UWORD8 *pu1_pred,
    648                                 WORD16 *pi2_out,
    649                                 WORD32 src_strd,
    650                                 WORD32 pred_strd,
    651                                 const UWORD16 *pu2_scale_matrix,
    652                                 const UWORD16 *pu2_threshold_matrix,
    653                                 UWORD32 u4_qbits,
    654                                 UWORD32 u4_round_factor,
    655                                 UWORD8 *pu1_nnz,
    656                                 WORD16 *pu1_dc_alt_addr)
    657 
    658 {
    659     WORD16 *pi2_out_tmp = pi2_out;
    660     UWORD32 i;
    661     WORD32 a0, a1, a2, a3, a4, a5, a6, a7;
    662     WORD32 r0, r1, r2, r3, r4, r5, r6, r7;
    663     WORD32 i4_sign;
    664     UWORD32 u4_abs_value;
    665     UWORD32 u4_nonzero_coeff = 0;
    666 
    667     UNUSED(pu1_dc_alt_addr);
    668 
    669     /*Horizontal transform */
    670     /* we are going to use the a's and r's in a twisted way since */
    671     /*i dont want to declare more variables */
    672     for(i = 0; i < SUB_BLK_WIDTH_8x8; ++i)
    673     {
    674         r0 = pu1_src[0];
    675         r0 -= pu1_pred[0];
    676         r1 = pu1_src[1];
    677         r1 -= pu1_pred[1];
    678         r2 = pu1_src[2];r2 -= pu1_pred[2];
    679         r3 = pu1_src[3];r3 -= pu1_pred[3];
    680         r4 = pu1_src[4];r4 -= pu1_pred[4];
    681         r5 = pu1_src[5];r5 -= pu1_pred[5];
    682         r6 = pu1_src[6];r6 -= pu1_pred[6];
    683         r7 = pu1_src[7];r7 -= pu1_pred[7];
    684 
    685 
    686         a0 = r0 + r7;
    687         a1 = r1 + r6;
    688         a2 = r2 + r5;
    689         a3 = r3 + r4;
    690 
    691         a4 = a0 + a3;
    692         a5 = a1 + a2;
    693         a6 = a0 - a3;
    694         a7 = a1 - a2;
    695 
    696         pi2_out_tmp[0] = a4 + a5;
    697 
    698         pi2_out_tmp[2] = a6 + (a7>>1);
    699         pi2_out_tmp[4] = a4 - a5;
    700         pi2_out_tmp[6] = (a6>>1) - a7;
    701 
    702         a0 = r0 - r7;
    703         a1 = r1 - r6;
    704         a2 = r2 - r5;
    705         a3 = r3 - r4;
    706 
    707         a4 = a1 + a2 + ((a0>>1) + a0);
    708         a5 = a0 - a3 - ((a2>>1) + a2);
    709         a6 = a0 + a3 - ((a1>>1) + a1);
    710         a7 = a1 - a2 + ((a3>>1) + a3);
    711 
    712         pi2_out_tmp[1] = a4 + (a7>>2);
    713         pi2_out_tmp[3] = a5 + (a6>>2);
    714         pi2_out_tmp[5] = a6 - (a5>>2);
    715         pi2_out_tmp[7] = (a4>>2) - a7;
    716 
    717         pu1_src += src_strd;
    718         pu1_pred += pred_strd;
    719         pi2_out_tmp += 8;
    720     }
    721 
    722     /*vertical transform and quant */
    723 
    724     pi2_out_tmp = pi2_out;
    725 
    726     for (i = 0; i < SUB_BLK_WIDTH_8x8; ++i)
    727     {
    728 
    729         r0 = pi2_out_tmp[0];
    730         r1 = pi2_out_tmp[8];
    731         r2 = pi2_out_tmp[16];
    732         r3 = pi2_out_tmp[24];
    733         r4 = pi2_out_tmp[32];
    734         r5 = pi2_out_tmp[40];
    735         r6 = pi2_out_tmp[48];
    736         r7 = pi2_out_tmp[56];
    737 
    738         a0 = r0 + r7;
    739         a1 = r1 + r6;
    740         a2 = r2 + r5;
    741         a3 = r3 + r4;
    742 
    743         a4 = a0 + a3;
    744         a5 = a1 + a2;
    745         a6 = a0 - a3;
    746         a7 = a1 - a2;
    747 
    748         a0 = r0 - r7;
    749         a1 = r1 - r6;
    750         a2 = r2 - r5;
    751         a3 = r3 - r4;
    752 
    753         r0 = a4 + a5;
    754         r2 = a6 + (a7>>1);
    755         r4 = a4 - a5;
    756         r6 = (a6>>1) - a7;
    757 
    758         a4 = a1 + a2 + ((a0>>1) + a0);
    759         a5 = a0 - a3 - ((a2>>1) + a2);
    760         a6 = a0 + a3 - ((a1>>1) + a1);
    761         a7 = a1 - a2 + ((a3>>1) + a3);
    762 
    763         r1 = a4 + (a7>>2);
    764         r3 = a5 + (a6>>2);
    765         r5 = a6 - (a5>>2);
    766         r7 = (a4>>2) - a7;
    767 
    768         FWD_QUANT(r0, u4_abs_value, i4_sign, pu2_threshold_matrix[0],
    769                   pu2_scale_matrix[0], u4_round_factor, u4_qbits,
    770                   u4_nonzero_coeff);
    771         pi2_out_tmp[0] = r0;
    772 
    773         FWD_QUANT(r1, u4_abs_value, i4_sign, pu2_threshold_matrix[8],
    774                   pu2_scale_matrix[8], u4_round_factor, u4_qbits,
    775                   u4_nonzero_coeff);
    776         pi2_out_tmp[8] = r1;
    777 
    778         FWD_QUANT(r2, u4_abs_value, i4_sign, pu2_threshold_matrix[16],
    779                   pu2_scale_matrix[16], u4_round_factor, u4_qbits,
    780                   u4_nonzero_coeff);
    781         pi2_out_tmp[16] = r2;
    782 
    783         FWD_QUANT(r3, u4_abs_value, i4_sign, pu2_threshold_matrix[24],
    784                   pu2_scale_matrix[24], u4_round_factor, u4_qbits,
    785                   u4_nonzero_coeff);
    786         pi2_out_tmp[24] = r3;
    787 
    788         FWD_QUANT(r4, u4_abs_value, i4_sign, pu2_threshold_matrix[32],
    789                   pu2_scale_matrix[32], u4_round_factor, u4_qbits,
    790                   u4_nonzero_coeff);
    791         pi2_out_tmp[32] = r4;
    792 
    793         FWD_QUANT(r5, u4_abs_value, i4_sign, pu2_threshold_matrix[40],
    794                   pu2_scale_matrix[40], u4_round_factor, u4_qbits,
    795                   u4_nonzero_coeff);
    796         pi2_out_tmp[40] = r5;
    797 
    798         FWD_QUANT(r6, u4_abs_value, i4_sign, pu2_threshold_matrix[48],
    799                   pu2_scale_matrix[48], u4_round_factor, u4_qbits,
    800                   u4_nonzero_coeff);
    801         pi2_out_tmp[48] = r6;
    802 
    803         FWD_QUANT(r7, u4_abs_value, i4_sign, pu2_threshold_matrix[56],
    804                   pu2_scale_matrix[56], u4_round_factor, u4_qbits,
    805                   u4_nonzero_coeff);
    806         pi2_out_tmp[56] = r7;
    807 
    808         pi2_out_tmp++;
    809         pu2_scale_matrix++;
    810         pu2_threshold_matrix++;
    811     }
    812        /* Return total nonzero coefficients in the current sub block */
    813         *pu1_nnz =  u4_nonzero_coeff;
    814 }
    815