Home | History | Annotate | Download | only in encoder
      1 /******************************************************************************
      2  *
      3  * Copyright (C) 2015 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at:
      8  *
      9  * http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  *****************************************************************************
     18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 */
     20 
     21 /**
     22  *******************************************************************************
     23  * @file
     24  *  ih264e_core_coding.c
     25  *
     26  * @brief
     27  *  This file contains routines that perform luma and chroma core coding for
     28  *  intra macroblocks
     29  *
     30  * @author
     31  *  ittiam
     32  *
     33  * @par List of Functions:
     34  *  - ih264e_pack_l_mb_i16()
     35  *  - ih264e_pack_c_mb_i8()
     36  *  - ih264e_code_luma_intra_macroblock_16x16()
     37  *  - ih264e_code_luma_intra_macroblock_4x4()
     38  *  - ih264e_code_chroma_intra_macroblock_8x8()
     39  *
     40  * @remarks
     41  *  None
     42  *
     43  *******************************************************************************
     44  */
     45 
     46 /*****************************************************************************/
     47 /* File Includes                                                             */
     48 /*****************************************************************************/
     49 
     50 /* System include files */
     51 #include <stdio.h>
     52 #include <string.h>
     53 #include <assert.h>
     54 
     55 /* User include files */
     56 #include "ih264e_config.h"
     57 #include "ih264_typedefs.h"
     58 #include "ih264_platform_macros.h"
     59 #include "iv2.h"
     60 #include "ive2.h"
     61 #include "ih264_macros.h"
     62 #include "ih264_defs.h"
     63 #include "ih264e_defs.h"
     64 #include "ih264_trans_data.h"
     65 #include "ih264e_error.h"
     66 #include "ih264e_bitstream.h"
     67 #include "ime_distortion_metrics.h"
     68 #include "ime_defs.h"
     69 #include "ime_structs.h"
     70 #include "ih264_structs.h"
     71 #include "ih264_trans_quant_itrans_iquant.h"
     72 #include "ih264_inter_pred_filters.h"
     73 #include "ih264_mem_fns.h"
     74 #include "ih264_padding.h"
     75 #include "ih264_intra_pred_filters.h"
     76 #include "ih264_deblk_edge_filters.h"
     77 #include "ih264_cabac_tables.h"
     78 #include "irc_cntrl_param.h"
     79 #include "irc_frame_info_collector.h"
     80 #include "ih264e_rate_control.h"
     81 #include "ih264e_cabac_structs.h"
     82 #include "ih264e_structs.h"
     83 #include "ih264e_globals.h"
     84 #include "ih264e_core_coding.h"
     85 #include "ih264e_mc.h"
     86 
     87 
     88 /*****************************************************************************/
     89 /* Function Definitions                                                      */
     90 /*****************************************************************************/
     91 
     92 /**
     93 *******************************************************************************
     94 *
     95 * @brief
     96 *  This function performs does the DCT transform then Hadamard transform
     97 *  and quantization for a macroblock when the mb mode is intra 16x16 mode
     98 *
     99 * @par Description:
    100 *  First  cf4 is done on all 16 4x4 blocks of the 16x16 input block.
    101 *  Then hadamard transform is done on the DC coefficients
    102 *  Quantization is then performed on the 16x16 block, 4x4 wise
    103 *
    104 * @param[in] pu1_src
    105 *  Pointer to source sub-block
    106 *
    107 * @param[in] pu1_pred
    108 *  Pointer to prediction sub-block
    109 *
    110 * @param[in] pi2_out
    111 *  Pointer to residual sub-block
    112 *  The output will be in linear format
    113 *  The first 16 continuous locations will contain the values of Dc block
    114 *  After DC block and a stride 1st AC block will follow
    115 *  After one more stride next AC block will follow
    116 *  The blocks will be in raster scan order
    117 *
    118 * @param[in] src_strd
    119 *  Source stride
    120 *
    121 * @param[in] pred_strd
    122 *  Prediction stride
    123 *
    124 * @param[in] dst_strd
    125 *  Destination stride
    126 *
    127 * @param[in] pu2_scale_matrix
    128 *  The quantization matrix for 4x4 transform
    129 *
    130 * @param[in] pu2_threshold_matrix
    131 *  Threshold matrix
    132 *
    133 * @param[in] u4_qbits
    134 *  15+QP/6
    135 *
    136 * @param[in] u4_round_factor
    137 *  Round factor for quant
    138 *
    139 * @param[out] pu1_nnz
    140 *  Memory to store the non-zeros after transform
    141 *  The first byte will be the nnz of DC block
    142 *  From the next byte the AC nnzs will be stored in raster scan order
    143 *
    144 * @param u4_dc_flag
    145 *  Signals if Dc transform is to be done or not
    146 *   1 -> Dc transform will be done
    147 *   0 -> Dc transform will not be done
    148 *
    149 * @remarks
    150 *
    151 *******************************************************************************
    152 */
    153 void ih264e_luma_16x16_resi_trans_dctrans_quant(codec_t *ps_codec,
    154                                                 UWORD8 *pu1_src,
    155                                                 UWORD8 *pu1_pred,
    156                                                 WORD16 *pi2_out,
    157                                                 WORD32 src_strd,
    158                                                 WORD32 pred_strd,
    159                                                 WORD32 dst_strd,
    160                                                 const UWORD16 *pu2_scale_matrix,
    161                                                 const UWORD16 *pu2_threshold_matrix,
    162                                                 UWORD32 u4_qbits,
    163                                                 UWORD32 u4_round_factor,
    164                                                 UWORD8 *pu1_nnz,
    165                                                 UWORD32 u4_dc_flag)
    166 
    167 {
    168     WORD32 blk_cntr;
    169     WORD32 i4_offsetx, i4_offsety;
    170     UWORD8 *pu1_curr_src, *pu1_curr_pred;
    171 
    172     WORD16 *pi2_dc_str = pi2_out;
    173 
    174     /* Move to the ac addresses */
    175     pu1_nnz++;
    176     pi2_out += dst_strd;
    177 
    178     for (blk_cntr = 0; blk_cntr < NUM_LUMA4x4_BLOCKS_IN_MB; blk_cntr++)
    179     {
    180         IND2SUB_LUMA_MB(blk_cntr, i4_offsetx, i4_offsety);
    181 
    182         pu1_curr_src = pu1_src + i4_offsetx + i4_offsety * src_strd;
    183         pu1_curr_pred = pu1_pred + i4_offsetx + i4_offsety * pred_strd;
    184 
    185         ps_codec->pf_resi_trans_quant_4x4(pu1_curr_src, pu1_curr_pred,
    186                                           pi2_out + blk_cntr * dst_strd,
    187                                           src_strd, pred_strd, pu2_scale_matrix,
    188                                           pu2_threshold_matrix, u4_qbits,
    189                                           u4_round_factor, &pu1_nnz[blk_cntr],
    190                                           &pi2_dc_str[blk_cntr]);
    191 
    192     }
    193 
    194     if (!u4_dc_flag)
    195         return;
    196 
    197     /*
    198      * In case of i16x16, we need to remove the contribution of dc coeffs into
    199      * nnz of each block. We are doing that in the packing function
    200      */
    201 
    202     /* Adjust pointers to point to dc values */
    203     pi2_out -= dst_strd;
    204     pu1_nnz--;
    205 
    206     u4_qbits++;
    207     u4_round_factor <<= 1;
    208 
    209     ps_codec->pf_hadamard_quant_4x4(pi2_dc_str, pi2_out, pu2_scale_matrix,
    210                                     pu2_threshold_matrix, u4_qbits,
    211                                     u4_round_factor, &pu1_nnz[0]);
    212 }
    213 
    214 /**
    215 *******************************************************************************
    216 *
    217 * @brief
    218 *  This function performs the intra 16x16 inverse transform process for H264
    219 *  it includes inverse Dc transform, inverse quant and then inverse transform
    220 *
    221 * @par Description:
    222 *
    223 * @param[in] pi2_src
    224 *  Input data, 16x16 size
    225 *  First 16 mem locations will have the Dc coffs in rater scan order in linear fashion
    226 *  after a stride 1st AC clock will be present again in raster can order
    227 *  Then each AC block of the 16x16 block will follow in raster scan order
    228 *
    229 * @param[in] pu1_pred
    230 *  The predicted data, 16x16 size
    231 *  Block by block form
    232 *
    233 * @param[in] pu1_out
    234 *  Output 16x16
    235 *  In block by block form
    236 *
    237 * @param[in] src_strd
    238 *  Source stride
    239 *
    240 * @param[in] pred_strd
    241 *  input stride for prediction buffer
    242 *
    243 * @param[in] out_strd
    244 *  input stride for output buffer
    245 *
    246 * @param[in] pu2_iscale_mat
    247 *  Inverse quantization matrix for 4x4 transform
    248 *
    249 * @param[in] pu2_weigh_mat
    250 *  weight matrix of 4x4 transform
    251 *
    252 * @param[in] qp_div
    253 *  QP/6
    254 *
    255 * @param[in] pi4_tmp
    256 *  Input temporary buffer
    257 *  needs to be at least 20 in size
    258 *
    259 * @param[in] pu4_cntrl
    260 *  Controls the transform path
    261 *  total Last 17 bits are used
    262 *  the 16th th bit will correspond to DC block
    263 *  and 32-17 will correspond to the ac blocks in raster scan order
    264 *  bit equaling zero indicates that the entire 4x4 block is zero for DC
    265 *  For AC blocks a bit equaling zero will mean that all 15 AC coffs of the block is nonzero
    266 *
    267 * @param[in] pi4_tmp
    268 *  Input temporary buffer
    269 *  needs to be at least COFF_CNT_SUB_BLK_4x4+COFF_CNT_SUB_BLK_4x4 size
    270 *
    271 * @returns
    272 *  none
    273 *
    274 * @remarks
    275 *  The all zero case must be taken care outside
    276 *
    277 *******************************************************************************
    278 */
    279 void ih264e_luma_16x16_idctrans_iquant_itrans_recon(codec_t *ps_codec,
    280                                                     WORD16 *pi2_src,
    281                                                     UWORD8 *pu1_pred,
    282                                                     UWORD8 *pu1_out,
    283                                                     WORD32 src_strd,
    284                                                     WORD32 pred_strd,
    285                                                     WORD32 out_strd,
    286                                                     const UWORD16 *pu2_iscale_mat,
    287                                                     const UWORD16 *pu2_weigh_mat,
    288                                                     UWORD32 qp_div,
    289                                                     UWORD32 u4_cntrl,
    290                                                     UWORD32 u4_dc_trans_flag,
    291                                                     WORD32 *pi4_tmp)
    292 {
    293     /* Start index for inverse quant in a 4x4 block */
    294     WORD32 iq_start_idx = (u4_dc_trans_flag == 0) ? 0 : 1;
    295 
    296     /* Cntrl bits for 4x4 transforms
    297      * u4_blk_cntrl       : controls if a 4x4 block should be processed in ac path
    298      * u4_dc_cntrl        : controls is a 4x4 block is to be processed in dc path
    299      *                    : dc block must contain only single dc coefficient
    300      * u4_empty_blk_cntrl : control fot 4x4 block with no coeffs, ie no dc and ac
    301      *                    : ie not (ac or dc)
    302      */
    303     UWORD32 u4_blk_cntrl, u4_dc_cntrl, u4_empty_blk_cntrl;
    304 
    305     /* tmp registers for block ids */
    306     UWORD32 u4_blk_id;
    307 
    308     /* Subscrripts */
    309     WORD32 i4_offset_x, i4_offset_y;
    310 
    311     UWORD8 *pu1_cur_prd_blk, *pu1_cur_out_blk;
    312 
    313     /* Src and stride for dc coeffs */
    314     UWORD32 u4_dc_inc;
    315     WORD16 *pi2_dc_src;
    316 
    317     /*
    318      * For intra blocks we need to do inverse dc transform
    319      * In case if intra blocks, its here that we populate the dc bits in cntrl
    320      * as they cannot be populated any earlier
    321      */
    322     if (u4_dc_trans_flag)
    323     {
    324         UWORD32 cntr, u4_dc_cntrl;
    325         /* Do inv hadamard and place the results at the start of each AC block */
    326         ps_codec->pf_ihadamard_scaling_4x4(pi2_src, pi2_src, pu2_iscale_mat,
    327                                            pu2_weigh_mat, qp_div, pi4_tmp);
    328 
    329         /* Update the cntrl flag */
    330         u4_dc_cntrl = 0;
    331         for (cntr = 0; cntr < DC_COEFF_CNT_LUMA_MB; cntr++)
    332         {
    333             u4_dc_cntrl |= ((pi2_src[cntr] != 0) << (15 - cntr));
    334         }
    335         /* Mark dc bits as 1 if corresponding ac bit is 0 */
    336         u4_dc_cntrl = (~(u4_cntrl >> 16) & u4_dc_cntrl);
    337         /* Combine both ac and dc bits */
    338         u4_cntrl = (u4_cntrl & CNTRL_FLAG_AC_MASK_LUMA)
    339                         | (u4_dc_cntrl & CNTRL_FLAG_DC_MASK_LUMA);
    340     }
    341 
    342     /* Source for dc coeffs
    343      * If the block is intra, we have to read dc values from first row of src
    344      * then stride for each block is 1, other wise its src stride
    345      */
    346     pi2_dc_src = (iq_start_idx == 0) ? (pi2_src + src_strd) : pi2_src;
    347     u4_dc_inc = (iq_start_idx == 0) ? src_strd : 1;
    348 
    349     /* The AC blocks starts from 2nd row */
    350     pi2_src += src_strd;
    351 
    352     /* Get the block bits */
    353     u4_blk_cntrl = (u4_cntrl & CNTRL_FLAG_AC_MASK_LUMA);
    354     u4_dc_cntrl = (u4_cntrl & CNTRL_FLAG_DC_MASK_LUMA) << 16;
    355     u4_empty_blk_cntrl = (~(u4_dc_cntrl | u4_blk_cntrl)) & 0xFFFF0000;
    356 
    357     /* Get first block to process */
    358     DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id);
    359     while (u4_blk_id < NUM_LUMA4x4_BLOCKS_IN_MB)
    360     {
    361         /* Compute address of src blocks */
    362         WORD32 i4_src_offset = u4_dc_inc * u4_blk_id;
    363 
    364         IND2SUB_LUMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
    365 
    366         /* Compute address of out and pred blocks */
    367         pu1_cur_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
    368         pu1_cur_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
    369 
    370         /* Do inv dc transform */
    371         ps_codec->pf_iquant_itrans_recon_4x4_dc(pi2_dc_src + i4_src_offset,
    372                                                 pu1_cur_prd_blk,
    373                                                 pu1_cur_out_blk, pred_strd,
    374                                                 out_strd, pu2_iscale_mat,
    375                                                 pu2_weigh_mat, qp_div, NULL,
    376                                                 iq_start_idx,
    377                                                 pi2_dc_src + i4_src_offset);
    378         /* Get next DC block to process */
    379         DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id);
    380     }
    381 
    382     /* now process ac/mixed blocks */
    383     DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id);
    384     while (u4_blk_id < NUM_LUMA4x4_BLOCKS_IN_MB)
    385     {
    386 
    387         WORD32 i4_src_offset = src_strd * u4_blk_id;
    388 
    389         IND2SUB_LUMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
    390 
    391         pu1_cur_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
    392         pu1_cur_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
    393 
    394         ps_codec->pf_iquant_itrans_recon_4x4(pi2_src + i4_src_offset,
    395                                              pu1_cur_prd_blk, pu1_cur_out_blk,
    396                                              pred_strd, out_strd,
    397                                              pu2_iscale_mat, pu2_weigh_mat,
    398                                              qp_div, (WORD16*) pi4_tmp,
    399                                              iq_start_idx,
    400                                              pi2_dc_src + u4_blk_id);
    401 
    402         DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id);
    403     }
    404 
    405     /* Now process empty blocks */
    406     DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id);
    407     while (u4_blk_id < NUM_LUMA4x4_BLOCKS_IN_MB)
    408     {
    409         IND2SUB_LUMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
    410 
    411         pu1_cur_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
    412         pu1_cur_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
    413 
    414         ps_codec->pf_inter_pred_luma_copy(pu1_cur_prd_blk, pu1_cur_out_blk,
    415                                           pred_strd, out_strd, SIZE_4X4_BLK_HRZ,
    416                                           SIZE_4X4_BLK_VERT, 0, 0);
    417 
    418         DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id);
    419     }
    420 }
    421 
    422 /**
    423 *******************************************************************************
    424 *
    425 * @brief
    426 *  This function performs does the DCT transform then Hadamard transform
    427 *  and quantization for a chroma macroblock
    428 *
    429 * @par Description:
    430 *  First  cf4 is done on all 16 4x4 blocks of the 8x8input block
    431 *  Then hadamard transform is done on the DC coefficients
    432 *  Quantization is then performed on the 8x8 block, 4x4 wise
    433 *
    434 * @param[in] pu1_src
    435 *  Pointer to source sub-block
    436 *  The input is in interleaved format for two chroma planes
    437 *
    438 * @param[in] pu1_pred
    439 *  Pointer to prediction sub-block
    440 *  Prediction is in inter leaved format
    441 *
    442 * @param[in] pi2_out
    443 *  Pointer to residual sub-block
    444 *  The output will be in linear format
    445 *  The first 4 continuous locations will contain the values of DC block for U
    446 *  and then next 4 will contain for V.
    447 *  After DC block and a stride 1st AC block of U plane will follow
    448 *  After one more stride next AC block of V plane will follow
    449 *  The blocks will be in raster scan order
    450 *
    451 *  After all the AC blocks of U plane AC blocks of V plane will follow in exact
    452 *  same way
    453 *
    454 * @param[in] src_strd
    455 *  Source stride
    456 *
    457 * @param[in] pred_strd
    458 *  Prediction stride
    459 *
    460 * @param[in] dst_strd
    461 *  Destination stride
    462 *
    463 * @param[in] pu2_scale_matrix
    464 *  The quantization matrix for 4x4 transform
    465 *
    466 * @param[in] pu2_threshold_matrix
    467 *  Threshold matrix
    468 *
    469 * @param[in] u4_qbits
    470 *  15+QP/6
    471 *
    472 * @param[in] u4_round_factor
    473 *  Round factor for quant
    474 *
    475 * @param[out] pu1_nnz
    476 *  Memory to store the non-zeros after transform
    477 *  The first byte will be the nnz od DC block for U plane
    478 *  From the next byte the AC nnzs will be storerd in raster scan order
    479 *  The fifth byte will be nnz of Dc block of V plane
    480 *  Then Ac blocks will follow
    481 *
    482 * @param u4_dc_flag
    483 *  Signals if Dc transform is to be done or not
    484 *   1 -> Dc transform will be done
    485 *   0 -> Dc transform will not be done
    486 *
    487 * @remarks
    488 *
    489 *******************************************************************************
    490 */
    491 void ih264e_chroma_8x8_resi_trans_dctrans_quant(codec_t *ps_codec,
    492                                                 UWORD8 *pu1_src,
    493                                                 UWORD8 *pu1_pred,
    494                                                 WORD16 *pi2_out,
    495                                                 WORD32 src_strd,
    496                                                 WORD32 pred_strd,
    497                                                 WORD32 out_strd,
    498                                                 const UWORD16 *pu2_scale_matrix,
    499                                                 const UWORD16 *pu2_threshold_matrix,
    500                                                 UWORD32 u4_qbits,
    501                                                 UWORD32 u4_round_factor,
    502                                                 UWORD8 *pu1_nnz_c)
    503 {
    504     WORD32 blk_cntr;
    505     WORD32 i4_offsetx, i4_offsety;
    506     UWORD8 *pu1_curr_src, *pu1_curr_pred;
    507 
    508     WORD16 pi2_dc_str[8];
    509     UWORD8 au1_dcnnz[2];
    510 
    511     /* Move to the ac addresses */
    512     pu1_nnz_c++;
    513     pi2_out += out_strd;
    514 
    515     for (blk_cntr = 0; blk_cntr < NUM_CHROMA4x4_BLOCKS_IN_MB; blk_cntr++)
    516     {
    517         IND2SUB_CHROMA_MB(blk_cntr, i4_offsetx, i4_offsety);
    518 
    519         pu1_curr_src = pu1_src + i4_offsetx + i4_offsety * src_strd;
    520         pu1_curr_pred = pu1_pred + i4_offsetx + i4_offsety * pred_strd;
    521 
    522         /* For chroma, v plane nnz is populated from position 5 */
    523         ps_codec->pf_resi_trans_quant_chroma_4x4(
    524                         pu1_curr_src, pu1_curr_pred,
    525                         pi2_out + blk_cntr * out_strd, src_strd, pred_strd,
    526                         pu2_scale_matrix, pu2_threshold_matrix, u4_qbits,
    527                         u4_round_factor, &pu1_nnz_c[blk_cntr + (blk_cntr > 3)],
    528                         &pi2_dc_str[blk_cntr]);
    529     }
    530 
    531     /* Adjust pointers to point to dc values */
    532     pi2_out -= out_strd;
    533     pu1_nnz_c--;
    534 
    535     u4_qbits++;
    536     u4_round_factor <<= 1;
    537 
    538     ps_codec->pf_hadamard_quant_2x2_uv(pi2_dc_str, pi2_out, pu2_scale_matrix,
    539                                        pu2_threshold_matrix, u4_qbits,
    540                                        u4_round_factor, au1_dcnnz);
    541 
    542     /* Copy the dc nnzs */
    543     pu1_nnz_c[0] = au1_dcnnz[0];
    544     pu1_nnz_c[5] = au1_dcnnz[1];
    545 
    546 }
    547 
    548 /**
    549 *******************************************************************************
    550 * @brief
    551 *  This function performs the inverse transform with process for chroma MB of H264
    552 *
    553 * @par Description:
    554 *  Does inverse DC transform ,inverse quantization inverse transform
    555 *
    556 * @param[in] pi2_src
    557 *  Input data, 16x16 size
    558 *  The input is in the form of, first 4 locations will contain DC coeffs of
    559 *  U plane, next 4 will contain DC coeffs of V plane, then AC blocks of U plane
    560 *  in raster scan order will follow, each block as linear array in raster scan order.
    561 *  After a stride next AC block will follow. After all AC blocks of U plane
    562 *  V plane AC blocks will follow in exact same order.
    563 *
    564 * @param[in] pu1_pred
    565 *  The predicted data, 8x16 size, U and V interleaved
    566 *
    567 * @param[in] pu1_out
    568 *  Output 8x16, U and V interleaved
    569 *
    570 * @param[in] src_strd
    571 *  Source stride
    572 *
    573 * @param[in] pred_strd
    574 *  input stride for prediction buffer
    575 *
    576 * @param[in] out_strd
    577 *  input stride for output buffer
    578 *
    579 * @param[in] pu2_iscale_mat
    580 *  Inverse quantization martix for 4x4 transform
    581 *
    582 * @param[in] pu2_weigh_mat
    583 *  weight matrix of 4x4 transform
    584 *
    585 * @param[in] qp_div
    586 *  QP/6
    587 *
    588 * @param[in] pi4_tmp
    589 *  Input temporary buffer
    590 *  needs to be at least COFF_CNT_SUB_BLK_4x4 + Number of Dc cofss for chroma * number of planes
    591 *  in size
    592 *
    593 * @param[in] pu4_cntrl
    594 *  Controls the transform path
    595 *  the 15 th bit will correspond to DC block of U plane , 14th will indicate the V plane Dc block
    596 *  32-28 bits will indicate AC blocks of U plane in raster scan order
    597 *  27-23 bits will indicate AC blocks of V plane in rater scan order
    598 *  The bit 1 implies that there is at least one non zero coeff in a block
    599 *
    600 * @returns
    601 *  none
    602 *
    603 * @remarks
    604 *******************************************************************************
    605 */
    606 void ih264e_chroma_8x8_idctrans_iquant_itrans_recon(codec_t *ps_codec,
    607                                                     WORD16 *pi2_src,
    608                                                     UWORD8 *pu1_pred,
    609                                                     UWORD8 *pu1_out,
    610                                                     WORD32 src_strd,
    611                                                     WORD32 pred_strd,
    612                                                     WORD32 out_strd,
    613                                                     const UWORD16 *pu2_iscale_mat,
    614                                                     const UWORD16 *pu2_weigh_mat,
    615                                                     UWORD32 qp_div,
    616                                                     UWORD32 u4_cntrl,
    617                                                     WORD32 *pi4_tmp)
    618 {
    619     /* Cntrl bits for 4x4 transforms
    620      * u4_blk_cntrl       : controls if a 4x4 block should be processed in ac path
    621      * u4_dc_cntrl        : controls is a 4x4 block is to be processed in dc path
    622      *                    : dc block must contain only single dc coefficient
    623      * u4_empty_blk_cntrl : control fot 4x4 block with no coeffs, ie no dc and ac
    624      *                    : ie not (ac or dc)
    625      */
    626 
    627     UWORD32 u4_blk_cntrl, u4_dc_cntrl, u4_empty_blk_cntrl;
    628 
    629     /* tmp registers for block ids */
    630     WORD32 u4_blk_id;
    631 
    632     /* Offsets for pointers */
    633     WORD32 i4_offset_x, i4_offset_y;
    634 
    635     /* Pointer to 4x4 blocks */
    636     UWORD8 *pu1_cur_4x4_prd_blk, *pu1_cur_4x4_out_blk;
    637 
    638     /* Tmp register for pointer to dc coffs */
    639     WORD16 *pi2_dc_src;
    640 
    641     WORD16 i2_zero = 0;
    642 
    643     /* Increment for dc block */
    644     WORD32 i4_dc_inc;
    645 
    646     /*
    647      * Lets do the inverse transform for dc coeffs in chroma
    648      */
    649     if (u4_cntrl & CNTRL_FLAG_DCBLK_MASK_CHROMA)
    650     {
    651         UWORD32 cntr, u4_dc_cntrl;
    652         /* Do inv hadamard for u an v block */
    653 
    654         ps_codec->pf_ihadamard_scaling_2x2_uv(pi2_src, pi2_src, pu2_iscale_mat,
    655                                               pu2_weigh_mat, qp_div, NULL);
    656         /*
    657          * Update the cntrl flag
    658          * Flag is updated as follows bits 15-11 -> u block dc bits
    659          */
    660         u4_dc_cntrl = 0;
    661         for (cntr = 0; cntr < 8; cntr++)
    662         {
    663             u4_dc_cntrl |= ((pi2_src[cntr] != 0) << (15 - cntr));
    664         }
    665 
    666         /* Mark dc bits as 1 if corresponding ac bit is 0 */
    667         u4_dc_cntrl = (~(u4_cntrl >> 16) & u4_dc_cntrl);
    668         /* Combine both ac and dc bits */
    669         u4_cntrl = (u4_cntrl & CNTRL_FLAG_AC_MASK_CHROMA)
    670                         | (u4_dc_cntrl & CNTRL_FLAG_DC_MASK_CHROMA);
    671 
    672         /* Since we populated the dc coffs, we have to read them from there */
    673         pi2_dc_src = pi2_src;
    674         i4_dc_inc = 1;
    675     }
    676     else
    677     {
    678         u4_cntrl = u4_cntrl & CNTRL_FLAG_AC_MASK_CHROMA;
    679         pi2_dc_src = &i2_zero;
    680         i4_dc_inc = 0;
    681     }
    682 
    683     /* Get the block bits */
    684     u4_blk_cntrl = (u4_cntrl & CNTRL_FLAG_AC_MASK_CHROMA);
    685     u4_dc_cntrl = (u4_cntrl & CNTRL_FLAG_DC_MASK_CHROMA) << 16;
    686     u4_empty_blk_cntrl = (~(u4_dc_cntrl | u4_blk_cntrl)) & 0xFF000000;
    687 
    688     /* The AC blocks starts from 2nd row */
    689     pi2_src += src_strd;
    690 
    691     DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id);
    692     while (u4_blk_id < 8)
    693     {
    694         WORD32 dc_src_offset = u4_blk_id * i4_dc_inc;
    695 
    696         IND2SUB_CHROMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
    697 
    698         pu1_cur_4x4_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
    699         pu1_cur_4x4_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
    700 
    701         ps_codec->pf_iquant_itrans_recon_chroma_4x4_dc(
    702                         pi2_dc_src + dc_src_offset, pu1_cur_4x4_prd_blk,
    703                         pu1_cur_4x4_out_blk, pred_strd, out_strd, NULL, NULL, 0,
    704                         NULL, pi2_dc_src + dc_src_offset);
    705         /* Get next DC block to process */
    706         DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id);
    707     }
    708 
    709     /* now process ac/mixed blocks */
    710     DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id);
    711     while (u4_blk_id < 8)
    712     {
    713         WORD32 i4_src_offset = src_strd * u4_blk_id;
    714         WORD32 dc_src_offset = i4_dc_inc * u4_blk_id;
    715 
    716         IND2SUB_CHROMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
    717 
    718         pu1_cur_4x4_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
    719         pu1_cur_4x4_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
    720 
    721         ps_codec->pf_iquant_itrans_recon_chroma_4x4(pi2_src + i4_src_offset,
    722                                                     pu1_cur_4x4_prd_blk,
    723                                                     pu1_cur_4x4_out_blk,
    724                                                     pred_strd, out_strd,
    725                                                     pu2_iscale_mat,
    726                                                     pu2_weigh_mat, qp_div,
    727                                                     (WORD16 *) pi4_tmp,
    728                                                     pi2_dc_src + dc_src_offset);
    729 
    730         DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id);
    731     }
    732 
    733     /* Now process empty blocks */
    734     DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id);
    735     while (u4_blk_id < 8)
    736     {
    737         IND2SUB_CHROMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
    738 
    739         pu1_cur_4x4_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
    740         pu1_cur_4x4_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
    741 
    742         ps_codec->pf_interleave_copy(pu1_cur_4x4_prd_blk, pu1_cur_4x4_out_blk,
    743                                      pred_strd, out_strd, SIZE_4X4_BLK_VERT,
    744                                      SIZE_4X4_BLK_HRZ);
    745 
    746         DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id);
    747     }
    748 }
    749 
    750 /**
    751 ******************************************************************************
    752 *
    753 * @brief  This function packs residue of an i16x16 luma mb for entropy coding
    754 *
    755 * @par   Description
    756 *  An i16 macro block contains two classes of units, dc 4x4 block and
    757 *  4x4 ac blocks. while packing the mb, the dc block is sent first, and
    758 *  the 16 ac blocks are sent next in scan order. Each and every block is
    759 *  represented by 3 parameters (nnz, significant coefficient map and the
    760 *  residue coefficients itself). If a 4x4 unit does not have any coefficients
    761 *  then only nnz is sent. Inside a 4x4 block the individual coefficients are
    762 *  sent in scan order.
    763 *
    764 *  The first byte of each block will be nnz of the block, if it is non zero,
    765 *  a 2 byte significance map is sent. This is followed by nonzero coefficients.
    766 *  This is repeated for 1 dc + 16 ac blocks.
    767 *
    768 * @param[in]  pi2_res_mb
    769 *  pointer to residue mb
    770 *
    771 * @param[in, out]  pv_mb_coeff_data
    772 *  buffer pointing to packed residue coefficients
    773 *
    774 * @param[in]  u4_res_strd
    775 *  residual block stride
    776 *
    777 * @param[out]  u1_cbp_l
    778 *  coded block pattern luma
    779 *
    780 * @param[in]   pu1_nnz
    781 *  number of non zero coefficients in each 4x4 unit
    782 *
    783 * @param[out]
    784 *  Control signal for inverse transform of 16x16 blocks
    785 *
    786 * @return none
    787 *
    788 * @ remarks
    789 *
    790 ******************************************************************************
    791 */
    792 void ih264e_pack_l_mb_i16(WORD16 *pi2_res_mb,
    793                           void **pv_mb_coeff_data,
    794                           WORD32 i4_res_strd,
    795                           UWORD8 *u1_cbp_l,
    796                           UWORD8 *pu1_nnz,
    797                           UWORD32 *pu4_cntrl)
    798 {
    799     /* pointer to packed sub block buffer space */
    800     tu_sblk_coeff_data_t *ps_mb_coeff_data = (*pv_mb_coeff_data), *ps_mb_coeff_data_ac;
    801 
    802     /* no of non zero coefficients in the current sub block */
    803     UWORD32 u4_nnz_cnt;
    804 
    805     /* significant coefficient map */
    806     UWORD32 u4_s_map;
    807 
    808     /* pointer to scanning matrix */
    809     const UWORD8 *pu1_scan_order;
    810 
    811     /* number of non zeros in sub block */
    812     UWORD32 u4_nnz;
    813 
    814     /* coeff scan order */
    815     const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
    816 
    817     /* temp var */
    818     UWORD32 coeff_cnt, mask, b4,u4_cntrl=0;
    819 
    820     /*DC and AC coeff pointers*/
    821     WORD16 *pi2_res_mb_ac,*pi2_res_mb_dc;
    822 
    823     /********************************************************/
    824     /*  pack dc coeff data for entropy coding               */
    825     /********************************************************/
    826 
    827     pi2_res_mb_dc = pi2_res_mb;
    828     pu1_scan_order = gu1_luma_scan_order_dc;
    829 
    830     u4_nnz = *pu1_nnz;
    831     u4_cntrl = 0;
    832 
    833     /* write number of non zero coefficients */
    834     ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
    835 
    836     if (u4_nnz)
    837     {
    838         for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++)
    839         {
    840             if (pi2_res_mb_dc[pu1_scan_order[coeff_cnt]])
    841             {
    842                 /* write residue */
    843                 ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = pi2_res_mb_dc[pu1_scan_order[coeff_cnt]];
    844                 u4_s_map |= mask;
    845             }
    846             mask <<= 1;
    847         }
    848         /* write significant coeff map */
    849         ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
    850         (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
    851 
    852         u4_cntrl = 0x00008000;// Set DC bit in ctrl code
    853     }
    854     else
    855     {
    856         (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
    857     }
    858 
    859     /********************************************************/
    860     /*  pack ac coeff data for entropy coding               */
    861     /********************************************************/
    862 
    863     pu1_nnz ++;
    864     pu1_scan_order = gu1_luma_scan_order;
    865     pi2_res_mb += i4_res_strd; /*Move to AC block*/
    866 
    867     ps_mb_coeff_data_ac = (*pv_mb_coeff_data);
    868 
    869     for (b4 = 0; b4 < 16; b4++)
    870     {
    871         ps_mb_coeff_data = (*pv_mb_coeff_data);
    872 
    873         u4_nnz = pu1_nnz[u1_scan_order[b4]];
    874 
    875         /* Jump according to the scan order */
    876         pi2_res_mb_ac = pi2_res_mb + (i4_res_strd * u1_scan_order[b4]);
    877 
    878         /*
    879          * Since this is a i16x16 block, we should not count dc coeff on indi
    880          * vidual 4x4 blocks to nnz. But due to the implementation of 16x16
    881          * trans function, we add dc's nnz to u4_nnz too. Hence we adjust that
    882          * here
    883          */
    884         u4_nnz -= (pi2_res_mb_ac[0] != 0);
    885 
    886         /* write number of non zero coefficients */
    887         ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
    888 
    889         if (u4_nnz)
    890         {
    891             for (u4_nnz_cnt = 0, coeff_cnt = 1, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++)
    892             {
    893                 if (pi2_res_mb_ac[pu1_scan_order[coeff_cnt]])
    894                 {
    895                     /* write residue */
    896                     ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = pi2_res_mb_ac[pu1_scan_order[coeff_cnt]];
    897                     u4_s_map |= mask;
    898                 }
    899                 mask <<= 1;
    900             }
    901             /* write significant coeff map */
    902             ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
    903             (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
    904             *u1_cbp_l = 15;
    905 
    906             u4_cntrl |= (1 << (31 - u1_scan_order[b4]));
    907         }
    908         else
    909         {
    910             (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
    911         }
    912 
    913     }
    914 
    915     if (!(*u1_cbp_l))
    916     {
    917         (*pv_mb_coeff_data) = ps_mb_coeff_data_ac;
    918     }
    919 
    920     /* Store the cntrl signal */
    921     (*pu4_cntrl) = u4_cntrl;
    922     return;
    923 }
    924 
    925 /**
    926 ******************************************************************************
    927 *
    928 * @brief  This function packs residue of an p16x16 luma mb for entropy coding
    929 *
    930 * @par   Description
    931 *  A p16x16 macro block contains two classes of units 16  4x4 ac blocks.
    932 *  while packing the mb, the dc block is sent first, and
    933 *  the 16 ac blocks are sent next in scan order. Each and every block is
    934 *  represented by 3 parameters (nnz, significant coefficient map and the
    935 *  residue coefficients itself). If a 4x4 unit does not have any coefficients
    936 *  then only nnz is sent. Inside a 4x4 block the individual coefficients are
    937 *  sent in scan order.
    938 *
    939 *  The first byte of each block will be nnz of the block, if it is non zero,
    940 *  a 2 byte significance map is sent. This is followed by nonzero coefficients.
    941 *  This is repeated for 1 dc + 16 ac blocks.
    942 *
    943 * @param[in]  pi2_res_mb
    944 *  pointer to residue mb
    945 *
    946 * @param[in, out]  pv_mb_coeff_data
    947 *  buffer pointing to packed residue coefficients
    948 *
    949 * @param[in]  i4_res_strd
    950 *  residual block stride
    951 *
    952 * @param[out]  u1_cbp_l
    953 *  coded block pattern luma
    954 *
    955 * @param[in]   pu1_nnz
    956 *  number of non zero coefficients in each 4x4 unit
    957 *
    958 * @param[out] pu4_cntrl
    959 *  Control signal for inverse transform
    960 *
    961 * @return none
    962 *
    963 * @remarks Killing coffs not yet coded
    964 *
    965 ******************************************************************************
    966 */
    967 void ih264e_pack_l_mb(WORD16 *pi2_res_mb,
    968                       void **pv_mb_coeff_data,
    969                       WORD32 i4_res_strd,
    970                       UWORD8 *u1_cbp_l,
    971                       UWORD8 *pu1_nnz,
    972                       UWORD32 u4_thres_resi,
    973                       UWORD32 *pu4_cntrl)
    974 {
    975     /* pointer to packed sub block buffer space */
    976     tu_sblk_coeff_data_t *ps_mb_coeff_data, *ps_mb_coeff_data_b8, *ps_mb_coeff_data_mb;
    977 
    978     /* no of non zero coefficients in the current sub block */
    979     UWORD32 u4_nnz_cnt;
    980 
    981     /* significant coefficient map */
    982     UWORD32 u4_s_map;
    983 
    984     /* pointer to scanning matrix */
    985     const UWORD8 *pu1_scan_order = gu1_luma_scan_order;
    986 
    987     /* number of non zeros in sub block */
    988     UWORD32 u4_nnz;
    989 
    990     /* pointer to residual sub block */
    991     WORD16  *pi2_res_sb;
    992 
    993     /* coeff scan order */
    994     const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
    995 
    996     /* coeff cost */
    997     const UWORD8  *pu1_coeff_cost = gu1_coeff_cost;
    998 
    999     /* temp var */
   1000     UWORD32 u4_mb_coeff_cost = 0, u4_b8_coeff_cost = 0, coeff_cnt, mask, u4_cntrl = 0, b4, b8;
   1001 
   1002     /* temp var */
   1003     WORD32 i4_res_val, i4_run = -1, dcac_block;
   1004 
   1005     /* When Hadamard transform is disabled, first row values are dont care, ignore them */
   1006     pi2_res_mb += i4_res_strd;
   1007 
   1008     /* When Hadamard transform is disabled, first unit value is dont care, ignore this */
   1009     pu1_nnz ++;
   1010 
   1011     ps_mb_coeff_data_mb = ps_mb_coeff_data_b8 = (*pv_mb_coeff_data);
   1012 
   1013     /********************************************************/
   1014     /*  pack coeff data for entropy coding                  */
   1015     /********************************************************/
   1016 
   1017     for (b4 = 0; b4 < 16; b4++)
   1018     {
   1019         ps_mb_coeff_data = (*pv_mb_coeff_data);
   1020 
   1021         b8 = b4 >> 2;
   1022 
   1023         u4_nnz = pu1_nnz[u1_scan_order[b4]];
   1024 
   1025         /* Jump according to the scan order */
   1026         pi2_res_sb = pi2_res_mb + (i4_res_strd * u1_scan_order[b4]);
   1027 
   1028         /* write number of non zero coefficients */
   1029         ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
   1030 
   1031         if (u4_nnz)
   1032         {
   1033             for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++)
   1034             {
   1035                 /* number of runs of zero before, this is used to compute coeff cost */
   1036                 i4_run++;
   1037 
   1038                 i4_res_val = pi2_res_sb[pu1_scan_order[coeff_cnt]];
   1039 
   1040                 if (i4_res_val)
   1041                 {
   1042                     /* write residue */
   1043                     ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = i4_res_val;
   1044                     u4_s_map |= mask;
   1045 
   1046                     if (u4_thres_resi)
   1047                     {
   1048                         /* compute coeff cost */
   1049                         if (i4_res_val == 1 || i4_res_val == -1)
   1050                         {
   1051                             if (i4_run < 6)
   1052                                 u4_b8_coeff_cost += pu1_coeff_cost[i4_run];
   1053                         }
   1054                         else
   1055                             u4_b8_coeff_cost += 9;
   1056 
   1057                         i4_run = -1;
   1058                     }
   1059                 }
   1060 
   1061                 mask <<= 1;
   1062             }
   1063 
   1064             /* write significant coeff map */
   1065             ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
   1066             (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
   1067 
   1068             /* cbp */
   1069             *u1_cbp_l |= (1 << b8);
   1070 
   1071             /* Cntrl map for inverse transform computation
   1072              *
   1073              * If coeff_cnt is zero, it means that only nonzero was a dc coeff
   1074              * Hence we have to set the 16 - u1_scan_order[b4]) position instead
   1075              * of 31 - u1_scan_order[b4]
   1076              */
   1077             dcac_block = (coeff_cnt == 0)?16:31;
   1078             u4_cntrl |= (1 << (dcac_block - u1_scan_order[b4]));
   1079         }
   1080         else
   1081         {
   1082             (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
   1083         }
   1084 
   1085         /* Decide if the 8x8 unit has to be sent for entropy coding? */
   1086         if ((b4+1) % 4 == 0)
   1087         {
   1088             if ( u4_thres_resi && (u4_b8_coeff_cost <= LUMA_SUB_BLOCK_SKIP_THRESHOLD) &&
   1089                             (*u1_cbp_l & (1 << b8)) )
   1090             {
   1091 
   1092 
   1093                 /*
   1094                  * When we want to reset the full 8x8 block, we have to reset
   1095                  * both the dc and ac coeff bits hence we have the symmetric
   1096                  * arrangement of bits
   1097                  */
   1098                 const UWORD32 cntrl_mask_map[4] = {0xcc00cc00, 0x33003300, 0x00cc00cc, 0x00330033};
   1099 
   1100                 /* restore cbp */
   1101                 *u1_cbp_l = (*u1_cbp_l & (~(1 << b8)));
   1102 
   1103                 /* correct cntrl flag */
   1104                 u4_cntrl = u4_cntrl & (~cntrl_mask_map[(b4 >> 2)]);
   1105 
   1106                 /* correct nnz */
   1107                 pu1_nnz[u1_scan_order[b4 - 3]] = 0;
   1108                 pu1_nnz[u1_scan_order[b4 - 2]] = 0;
   1109                 pu1_nnz[u1_scan_order[b4 - 1]] = 0;
   1110                 pu1_nnz[u1_scan_order[b4]] = 0;
   1111 
   1112                 /* reset blk cost */
   1113                 u4_b8_coeff_cost = 0;
   1114             }
   1115 
   1116             if (!(*u1_cbp_l & (1 << b8)))
   1117             {
   1118                 (*pv_mb_coeff_data) = ps_mb_coeff_data_b8;
   1119             }
   1120 
   1121             u4_mb_coeff_cost += u4_b8_coeff_cost;
   1122 
   1123             u4_b8_coeff_cost = 0;
   1124             i4_run = -1;
   1125             ps_mb_coeff_data_b8 = (*pv_mb_coeff_data);
   1126         }
   1127     }
   1128 
   1129     if (u4_thres_resi && (u4_mb_coeff_cost <= LUMA_BLOCK_SKIP_THRESHOLD)
   1130                     && (*u1_cbp_l))
   1131     {
   1132         (*pv_mb_coeff_data) = ps_mb_coeff_data_mb;
   1133         *u1_cbp_l = 0;
   1134         u4_cntrl = 0;
   1135         memset(pu1_nnz, 0, 16);
   1136     }
   1137 
   1138     (*pu4_cntrl) = u4_cntrl;
   1139 
   1140     return;
   1141 }
   1142 
   1143 /**
   1144 ******************************************************************************
   1145 *
   1146 * @brief  This function packs residue of an i8x8 chroma mb for entropy coding
   1147 *
   1148 * @par   Description
   1149 *  An i8 chroma macro block contains two classes of units, dc 2x2 block and
   1150 *  4x4 ac blocks. while packing the mb, the dc block is sent first, and
   1151 *  the 4 ac blocks are sent next in scan order. Each and every block is
   1152 *  represented by 3 parameters (nnz, significant coefficient map and the
   1153 *  residue coefficients itself). If a 4x4 unit does not have any coefficients
   1154 *  then only nnz is sent. Inside a 4x4 block the individual coefficients are
   1155 *  sent in scan order.
   1156 *
   1157 *  The first byte of each block will be nnz of the block, if it is non zero,
   1158 *  a 2 byte significance map is sent. This is followed by nonzero coefficients.
   1159 *  This is repeated for 1 dc + 4 ac blocks.
   1160 *
   1161 * @param[in]  pi2_res_mb
   1162 *  pointer to residue mb
   1163 *
   1164 * @param[in, out]  pv_mb_coeff_data
   1165 *  buffer pointing to packed residue coefficients
   1166 *
   1167 * @param[in]  u4_res_strd
   1168 *  residual block stride
   1169 *
   1170 * @param[out]  u1_cbp_c
   1171 *  coded block pattern chroma
   1172 *
   1173 * @param[in]   pu1_nnz
   1174 *  number of non zero coefficients in each 4x4 unit
   1175 *
   1176 * @param[out]   pu1_nnz
   1177 *  Control signal for inverse transform
   1178 *
   1179 * @param[in]   u4_swap_uv
   1180 *  Swaps the order of U and V planes in entropy bitstream
   1181 *
   1182 * @return none
   1183 *
   1184 * @ remarks
   1185 *
   1186 ******************************************************************************
   1187 */
   1188 void ih264e_pack_c_mb(WORD16 *pi2_res_mb,
   1189                       void **pv_mb_coeff_data,
   1190                       WORD32 i4_res_strd,
   1191                       UWORD8 *u1_cbp_c,
   1192                       UWORD8 *pu1_nnz,
   1193                       UWORD32 u4_thres_resi,
   1194                       UWORD32 *pu4_cntrl,
   1195                       UWORD32 u4_swap_uv)
   1196 {
   1197     /* pointer to packed sub block buffer space */
   1198     tu_sblk_coeff_data_t *ps_mb_coeff_data = (*pv_mb_coeff_data);
   1199     tu_sblk_coeff_data_t *ps_mb_coeff_data_dc, *ps_mb_coeff_data_ac;
   1200 
   1201     /* nnz pointer */
   1202     UWORD8 *pu1_nnz_ac, *pu1_nnz_dc;
   1203 
   1204     /* nnz counter */
   1205     UWORD32 u4_nnz_cnt;
   1206 
   1207     /* significant coefficient map */
   1208     UWORD32 u4_s_map;
   1209 
   1210     /* pointer to scanning matrix */
   1211     const UWORD8 *pu1_scan_order;
   1212 
   1213     /* no of non zero coefficients in the current sub block */
   1214     UWORD32 u4_nnz;
   1215 
   1216     /* pointer to residual sub block, res val */
   1217     WORD16 *pi2_res_sb, i2_res_val;
   1218 
   1219     /* temp var */
   1220     UWORD32 coeff_cnt, mask, b4,plane;
   1221 
   1222     /* temp var */
   1223     UWORD32 u4_coeff_cost;
   1224     WORD32 i4_run;
   1225 
   1226     /* coeff cost */
   1227     const UWORD8 *pu1_coeff_cost = gu1_coeff_cost;
   1228 
   1229     /* pointer to packed buffer space */
   1230     UWORD32 *pu4_mb_coeff_data = NULL;
   1231 
   1232     /* ac coded block pattern */
   1233     UWORD8 u1_cbp_ac;
   1234 
   1235     /* Variable to store the current bit pos in cntrl variable*/
   1236     UWORD32 cntrl_pos = 0;
   1237 
   1238     /********************************************************/
   1239     /*  pack dc coeff data for entropy coding               */
   1240     /********************************************************/
   1241     pu1_scan_order = gu1_chroma_scan_order_dc;
   1242     pi2_res_sb = pi2_res_mb;
   1243     pu1_nnz_dc = pu1_nnz;
   1244     (*pu4_cntrl) = 0;
   1245     cntrl_pos = 15;
   1246     ps_mb_coeff_data_dc = (*pv_mb_coeff_data);
   1247 
   1248     /* Color space conversion between SP_UV and SP_VU
   1249      * We always assume SP_UV for all the processing
   1250      * Hence to get proper stream output we need to swap U and V channels here
   1251      *
   1252      * For that there are two paths we need to look for
   1253      * One is the path to bitstream , these variables should have the proper input
   1254      * configured UV or VU
   1255      * For the other path the inverse transform variables should have what ever ordering the
   1256      * input had
   1257      */
   1258 
   1259     if (u4_swap_uv)
   1260     {
   1261         pu1_nnz_dc += 5;/* Move to NNZ of V planve */
   1262         pi2_res_sb += 4;/* Move to DC coff of V plane */
   1263 
   1264         cntrl_pos = 14; /* Control bit for V plane */
   1265     }
   1266 
   1267     for (plane = 0; plane < 2; plane++)
   1268     {
   1269         ps_mb_coeff_data = (*pv_mb_coeff_data);
   1270 
   1271         u4_nnz = *pu1_nnz_dc;
   1272         /* write number of non zero coefficients U/V */
   1273         ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
   1274 
   1275         if (u4_nnz)
   1276         {
   1277             for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++)
   1278             {
   1279                 i2_res_val = pi2_res_sb[pu1_scan_order[coeff_cnt]];
   1280                 if (i2_res_val)
   1281                 {
   1282                     /* write residue U/V */
   1283                     ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = i2_res_val;
   1284                     u4_s_map |= mask;
   1285                 }
   1286                 mask <<= 1;
   1287             }
   1288             /* write significant coeff map U/V */
   1289             ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
   1290             (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
   1291             *u1_cbp_c = 1;
   1292 
   1293             (*pu4_cntrl) |= (1 << cntrl_pos);
   1294         }
   1295         else
   1296         {
   1297             (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
   1298         }
   1299 
   1300         if (u4_swap_uv)
   1301         {
   1302             cntrl_pos++; /* Control bit for U plane */
   1303             pu1_nnz_dc -= 5; /* Move to NNZ of U plane */
   1304             pi2_res_sb -= 4; /* Move to DC coff of U plane */
   1305 
   1306         }
   1307         else
   1308         {
   1309             cntrl_pos--; /* Control bit for U plane */
   1310             pu1_nnz_dc += 5; /* 4 for AC NNZ and 1 for DC */
   1311             pi2_res_sb += 4; /* Move to DC coff of V plane */
   1312         }
   1313     }
   1314 
   1315     /********************************************************/
   1316     /*  pack ac coeff data for entropy coding               */
   1317     /********************************************************/
   1318 
   1319     pu1_scan_order = gu1_chroma_scan_order;
   1320     ps_mb_coeff_data_ac = (*pv_mb_coeff_data);
   1321 
   1322     if (u4_swap_uv)
   1323     {
   1324         pi2_res_sb = pi2_res_mb + i4_res_strd * 5; /* Move to V plane ,ie 1dc row+ 4 ac row */
   1325         cntrl_pos = 27; /* The control bits are to be added for V bloc ie 31-4 th bit */
   1326         pu1_nnz_ac = pu1_nnz + 6;/*Move the nnz to V block NNZ 1 dc + 1dc + 4 ac */
   1327     }
   1328     else
   1329     {
   1330         pi2_res_sb = pi2_res_mb + i4_res_strd; /* Move to U plane ,ie 1dc row */
   1331         cntrl_pos = 31;
   1332         pu1_nnz_ac = pu1_nnz + 1; /* Move the nnz to V block NNZ 1 dc */
   1333     }
   1334 
   1335     for (plane = 0; plane < 2; plane++)
   1336     {
   1337         pu4_mb_coeff_data = (*pv_mb_coeff_data);
   1338 
   1339         u4_coeff_cost = 0;
   1340         i4_run = -1;
   1341 
   1342         /* get the current cbp, so that it automatically
   1343          * gets reverted in case of zero ac values */
   1344         u1_cbp_ac = *u1_cbp_c;
   1345 
   1346         for (b4 = 0; b4 < 4; b4++)
   1347         {
   1348             ps_mb_coeff_data = (*pv_mb_coeff_data);
   1349 
   1350             u4_nnz = *pu1_nnz_ac;
   1351 
   1352             /*
   1353              * We are scanning only ac coeffs, but the nnz is for the
   1354              * complete 4x4 block. Hence we have to discount the nnz contributed
   1355              * by the dc coefficient
   1356              */
   1357             u4_nnz -= (pi2_res_sb[0]!=0);
   1358 
   1359             /* write number of non zero coefficients U/V */
   1360             ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
   1361 
   1362             if (u4_nnz)
   1363             {
   1364                 for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++)
   1365                 {
   1366                     i2_res_val = pi2_res_sb[pu1_scan_order[coeff_cnt]];
   1367 
   1368                     i4_run++;
   1369 
   1370                     if (i2_res_val)
   1371                     {
   1372                         /* write residue U/V */
   1373                         ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = i2_res_val;
   1374                         u4_s_map |= mask;
   1375 
   1376                         if ( u4_thres_resi && (u4_coeff_cost < CHROMA_BLOCK_SKIP_THRESHOLD) )
   1377                         {
   1378                             /* compute coeff cost */
   1379                             if (i2_res_val == 1 || i2_res_val == -1)
   1380                             {
   1381                                 if (i4_run < 6)
   1382                                     u4_coeff_cost += pu1_coeff_cost[i4_run];
   1383                             }
   1384                             else
   1385                                 u4_coeff_cost += 9;
   1386 
   1387                             i4_run = -1;
   1388                         }
   1389                     }
   1390                     mask <<= 1;
   1391                 }
   1392 
   1393                 /* write significant coeff map U/V */
   1394                 ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
   1395                 (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
   1396                 u1_cbp_ac = 2;
   1397 
   1398                 (*pu4_cntrl) |= 1 << cntrl_pos;
   1399             }
   1400             else
   1401             {
   1402                 (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
   1403             }
   1404 
   1405             pu1_nnz_ac++;
   1406             pi2_res_sb += i4_res_strd;
   1407             cntrl_pos--;
   1408         }
   1409 
   1410         /* reset block */
   1411         if (u4_thres_resi && (u4_coeff_cost < CHROMA_BLOCK_SKIP_THRESHOLD))
   1412         {
   1413             pu4_mb_coeff_data[0] = 0;
   1414             pu4_mb_coeff_data[1] = 0;
   1415             pu4_mb_coeff_data[2] = 0;
   1416             pu4_mb_coeff_data[3] = 0;
   1417             (*pv_mb_coeff_data) = pu4_mb_coeff_data + 4;
   1418 
   1419             /* Generate the control signal */
   1420             /* Zero out the current plane's AC coefficients */
   1421             (*pu4_cntrl) &= ((plane == u4_swap_uv) ? 0x0FFFFFFF : 0xF0FFFFFF);
   1422 
   1423             /* Similarly do for the NNZ also */
   1424             *(pu1_nnz_ac - 4) = 0;
   1425             *(pu1_nnz_ac - 3) = 0;
   1426             *(pu1_nnz_ac - 2) = 0;
   1427             *(pu1_nnz_ac - 1) = 0;
   1428         }
   1429         else
   1430         {
   1431             *u1_cbp_c = u1_cbp_ac;
   1432         }
   1433 
   1434         if (u4_swap_uv)
   1435         {
   1436             pi2_res_sb = pi2_res_mb + i4_res_strd; /* Move to V plane ,ie 1dc row+ 4 ac row + 1 dc row */
   1437             cntrl_pos = 31; /* The control bits are to be added for V bloc ie 31-4 th bit */
   1438             pu1_nnz_ac = pu1_nnz + 1; /* Move the nnz to V block NNZ 1 dc + 1dc + 4 ac */
   1439 
   1440             pu1_nnz_ac = pu1_nnz + 1;
   1441         }
   1442         else
   1443             pu1_nnz_ac = pu1_nnz + 6; /* Go to nnz of V plane */
   1444     }
   1445 
   1446     /* restore the ptr basing on cbp */
   1447     if (*u1_cbp_c == 0)
   1448     {
   1449         (*pv_mb_coeff_data) = ps_mb_coeff_data_dc;
   1450     }
   1451     else if (*u1_cbp_c == 1)
   1452     {
   1453         (*pv_mb_coeff_data) = ps_mb_coeff_data_ac;
   1454     }
   1455 
   1456     return ;
   1457 }
   1458 
   1459 /**
   1460 *******************************************************************************
   1461 *
   1462 * @brief performs luma core coding when intra mode is i16x16
   1463 *
   1464 * @par Description:
   1465 *  If the current mb is to be coded as intra of mb type i16x16, the mb is first
   1466 *  predicted using one of i16x16 prediction filters, basing on the intra mode
   1467 *  chosen. Then, error is computed between the input blk and the estimated blk.
   1468 *  This error is transformed (hierarchical transform i.e., dct followed by hada-
   1469 *  -mard), quantized. The quantized coefficients are packed in scan order for
   1470 *  entropy coding.
   1471 *
   1472 * @param[in] ps_proc
   1473 *  pointer to the current macro block context
   1474 *
   1475 * @returns u1_cbp_l
   1476 *  coded block pattern luma
   1477 *
   1478 * @remarks none
   1479 *
   1480 *******************************************************************************
   1481 */
   1482 
   1483 UWORD8 ih264e_code_luma_intra_macroblock_16x16(process_ctxt_t *ps_proc)
   1484 {
   1485     /* Codec Context */
   1486     codec_t *ps_codec = ps_proc->ps_codec;
   1487 
   1488     /* pointer to ref macro block */
   1489     UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_luma;
   1490 
   1491     /* pointer to src macro block */
   1492     UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_luma;
   1493 
   1494     /* pointer to prediction macro block */
   1495     UWORD8 *pu1_pred_mb = NULL;
   1496 
   1497     /* pointer to residual macro block */
   1498     WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
   1499 
   1500     /* strides */
   1501     WORD32 i4_src_strd = ps_proc->i4_src_strd;
   1502     WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
   1503     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
   1504     WORD32 i4_res_strd = ps_proc->i4_res_strd;
   1505 
   1506     /* intra mode */
   1507     UWORD8 u1_intra_mode = ps_proc->u1_l_i16_mode;
   1508 
   1509     /* coded block pattern */
   1510     UWORD8 u1_cbp_l = 0;
   1511 
   1512     /* number of non zero coeffs*/
   1513     UWORD32 au4_nnz[5];
   1514     UWORD8  *pu1_nnz = (UWORD8 *)au4_nnz;
   1515 
   1516     /*Cntrol signal for itrans*/
   1517     UWORD32 u4_cntrl;
   1518 
   1519     /* quantization parameters */
   1520     quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0];
   1521 
   1522     /* pointer to packed mb coeff data */
   1523     void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
   1524 
   1525     /* init nnz */
   1526     au4_nnz[0] = 0;
   1527     au4_nnz[1] = 0;
   1528     au4_nnz[2] = 0;
   1529     au4_nnz[3] = 0;
   1530     au4_nnz[4] = 0;
   1531 
   1532     if (u1_intra_mode == PLANE_I16x16)
   1533     {
   1534         pu1_pred_mb = ps_proc->pu1_pred_mb_intra_16x16_plane;
   1535     }
   1536     else
   1537     {
   1538         pu1_pred_mb = ps_proc->pu1_pred_mb_intra_16x16;
   1539     }
   1540 
   1541     /********************************************************/
   1542     /*  error estimation,                                   */
   1543     /*  transform                                           */
   1544     /*  quantization                                        */
   1545     /********************************************************/
   1546     ih264e_luma_16x16_resi_trans_dctrans_quant(ps_codec, pu1_curr_mb,
   1547                                                pu1_pred_mb, pi2_res_mb,
   1548                                                i4_src_strd, i4_pred_strd,
   1549                                                i4_res_strd,
   1550                                                ps_qp_params->pu2_scale_mat,
   1551                                                ps_qp_params->pu2_thres_mat,
   1552                                                ps_qp_params->u1_qbits,
   1553                                                ps_qp_params->u4_dead_zone,
   1554                                                pu1_nnz, ENABLE_DC_TRANSFORM);
   1555 
   1556     /********************************************************/
   1557     /*  pack coeff data for entropy coding                  */
   1558     /********************************************************/
   1559     ih264e_pack_l_mb_i16(pi2_res_mb, pv_mb_coeff_data, i4_res_strd, &u1_cbp_l,
   1560                          pu1_nnz, &u4_cntrl);
   1561 
   1562     /********************************************************/
   1563     /*  ierror estimation,                                  */
   1564     /*  itransform                                          */
   1565     /*  iquantization                                       */
   1566     /********************************************************/
   1567     /*
   1568      *if refernce frame is not to be computed
   1569      *we only need the right and bottom border 4x4 blocks to predict next intra
   1570      *blocks, hence only compute them
   1571      */
   1572     if (!ps_proc->u4_compute_recon)
   1573     {
   1574         u4_cntrl &= 0x111F8000;
   1575     }
   1576 
   1577     if (u4_cntrl)
   1578     {
   1579         ih264e_luma_16x16_idctrans_iquant_itrans_recon(
   1580                         ps_codec, pi2_res_mb, pu1_pred_mb, pu1_ref_mb,
   1581                         i4_res_strd, i4_pred_strd, i4_rec_strd,
   1582                         ps_qp_params->pu2_iscale_mat,
   1583                         ps_qp_params->pu2_weigh_mat, ps_qp_params->u1_qp_div,
   1584                         u4_cntrl, ENABLE_DC_TRANSFORM,
   1585                         ps_proc->pv_scratch_buff);
   1586     }
   1587     else
   1588     {
   1589         ps_codec->pf_inter_pred_luma_copy(pu1_pred_mb, pu1_ref_mb, i4_pred_strd,
   1590                                           i4_rec_strd, MB_SIZE, MB_SIZE, NULL,
   1591                                           0);
   1592     }
   1593 
   1594     return (u1_cbp_l);
   1595 }
   1596 
   1597 
   1598 /**
   1599 *******************************************************************************
   1600 *
   1601 * @brief performs luma core coding when intra mode is i4x4
   1602 *
   1603 * @par Description:
   1604 *  If the current mb is to be coded as intra of mb type i4x4, the mb is first
   1605 *  predicted using one of i4x4 prediction filters, basing on the intra mode
   1606 *  chosen. Then, error is computed between the input blk and the estimated blk.
   1607 *  This error is dct transformed and quantized. The quantized coefficients are
   1608 *  packed in scan order for entropy coding.
   1609 *
   1610 * @param[in] ps_proc_ctxt
   1611 *  pointer to the current macro block context
   1612 *
   1613 * @returns u1_cbp_l
   1614 *  coded block pattern luma
   1615 *
   1616 * @remarks
   1617 *  The traversal of 4x4 subblocks in the 16x16 macroblock is as per the scan order
   1618 *  mentioned in h.264 specification
   1619 *
   1620 *******************************************************************************
   1621 */
   1622 UWORD8 ih264e_code_luma_intra_macroblock_4x4(process_ctxt_t *ps_proc)
   1623 {
   1624     /* Codec Context */
   1625     codec_t *ps_codec = ps_proc->ps_codec;
   1626 
   1627     /* pointer to ref macro block */
   1628     UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_luma;
   1629 
   1630     /* pointer to src macro block */
   1631     UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_luma;
   1632 
   1633     /* pointer to prediction macro block */
   1634     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
   1635 
   1636     /* pointer to residual macro block */
   1637     WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
   1638 
   1639     /* strides */
   1640     WORD32 i4_src_strd = ps_proc->i4_src_strd;
   1641     WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
   1642     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
   1643 
   1644     /* pointer to neighbors: left, top, top-left */
   1645     UWORD8 *pu1_mb_a;
   1646     UWORD8 *pu1_mb_b;
   1647     UWORD8 *pu1_mb_c;
   1648     UWORD8 *pu1_mb_d;
   1649 
   1650     /* intra mode */
   1651     UWORD8 u1_intra_mode = ps_proc->u1_l_i16_mode;
   1652 
   1653     /* neighbor availability */
   1654     WORD32 i4_ngbr_avbl;
   1655 
   1656     /* neighbor pels for intra prediction */
   1657     UWORD8 *pu1_ngbr_pels_i4 = ps_proc->au1_ngbr_pels;
   1658 
   1659     /* coded block pattern */
   1660     UWORD8 u1_cbp_l = 0;
   1661 
   1662     /* number of non zero coeffs*/
   1663     UWORD8  u1_nnz;
   1664 
   1665     /* quantization parameters */
   1666     quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0];
   1667 
   1668     /* pointer to packed mb coeff data */
   1669     void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
   1670 
   1671     /* pointer to packed mb coeff data */
   1672     tu_sblk_coeff_data_t *ps_mb_coeff_data, *ps_mb_coeff_data_b8;
   1673 
   1674     /* no of non zero coefficients in the current sub block */
   1675     UWORD32 u4_nnz_cnt;
   1676 
   1677     /* significant coefficient map */
   1678     UWORD32 u4_s_map;
   1679 
   1680     /* pointer to scanning matrix */
   1681     const UWORD8 *pu1_scan_order = gu1_luma_scan_order;
   1682 
   1683     /*Dummy variable for 4x4 trans fucntion*/
   1684     WORD16 i2_dc_dummy;
   1685 
   1686     /* temp var */
   1687     UWORD32 i, b8, b4, u1_blk_x, u1_blk_y, u1_pix_x, u1_pix_y, coeff_cnt, mask;
   1688 
   1689     /* Process 16 4x4 lum sub-blocks of the MB in scan order */
   1690     for (b8 = 0; b8 < 4; b8++)
   1691     {
   1692         u1_blk_x = GET_BLK_RASTER_POS_X(b8) << 3;
   1693         u1_blk_y = GET_BLK_RASTER_POS_Y(b8) << 3;
   1694 
   1695         /* if in case cbp for the 8x8 block is zero, send no residue */
   1696         ps_mb_coeff_data_b8 = *pv_mb_coeff_data;
   1697 
   1698         for (b4 = 0; b4 < 4; b4++)
   1699         {
   1700             /* index of pel in MB */
   1701             u1_pix_x = u1_blk_x + (GET_SUB_BLK_RASTER_POS_X(b4) << 2);
   1702             u1_pix_y = u1_blk_y + (GET_SUB_BLK_RASTER_POS_Y(b4) << 2);
   1703 
   1704             /* Initialize source and reference pointers */
   1705             pu1_curr_mb = ps_proc->pu1_src_buf_luma + u1_pix_x + (u1_pix_y * i4_src_strd);
   1706             pu1_ref_mb = ps_proc->pu1_rec_buf_luma + u1_pix_x + (u1_pix_y * i4_rec_strd);
   1707 
   1708             /* pointer to left of ref macro block */
   1709             pu1_mb_a = pu1_ref_mb - 1;
   1710             /* pointer to top of ref macro block */
   1711             pu1_mb_b = pu1_ref_mb - i4_rec_strd;
   1712             /* pointer to topright of ref macro block */
   1713             pu1_mb_c = pu1_mb_b + 4;
   1714             /* pointer to topleft macro block */
   1715             pu1_mb_d = pu1_mb_b - 1;
   1716 
   1717             /* compute neighbor availability */
   1718             i4_ngbr_avbl = ps_proc->au1_ngbr_avbl_4x4_subblks[(b8 << 2) + b4];
   1719 
   1720             /* sub block intra mode */
   1721             u1_intra_mode = ps_proc->au1_intra_luma_mb_4x4_modes[(b8 << 2) + b4];
   1722 
   1723             /********************************************************/
   1724             /* gather prediction pels from neighbors for prediction */
   1725             /********************************************************/
   1726             /* left pels */
   1727             if (i4_ngbr_avbl & LEFT_MB_AVAILABLE_MASK)
   1728             {
   1729                 for (i = 0; i < 4; i++)
   1730                     pu1_ngbr_pels_i4[4 - 1 - i] = pu1_mb_a[i * i4_rec_strd];
   1731             }
   1732             else
   1733             {
   1734                 memset(pu1_ngbr_pels_i4, 0, 4);
   1735             }
   1736 
   1737             /* top pels */
   1738             if (i4_ngbr_avbl & TOP_MB_AVAILABLE_MASK)
   1739             {
   1740                 memcpy(pu1_ngbr_pels_i4 + 4 + 1, pu1_mb_b, 4);
   1741             }
   1742             else
   1743             {
   1744                 memset(pu1_ngbr_pels_i4 + 5, 0, 4);
   1745             }
   1746             /* top left pels */
   1747             if (i4_ngbr_avbl & TOP_LEFT_MB_AVAILABLE_MASK)
   1748             {
   1749                 pu1_ngbr_pels_i4[4] = *pu1_mb_d;
   1750             }
   1751             else
   1752             {
   1753                 pu1_ngbr_pels_i4[4] = 0;
   1754             }
   1755             /* top right pels */
   1756             if (i4_ngbr_avbl & TOP_RIGHT_MB_AVAILABLE_MASK)
   1757             {
   1758                 memcpy(pu1_ngbr_pels_i4+8+1,pu1_mb_c,4);
   1759             }
   1760             else if (i4_ngbr_avbl & TOP_MB_AVAILABLE_MASK)
   1761             {
   1762                 memset(pu1_ngbr_pels_i4+8+1,pu1_ngbr_pels_i4[8],4);
   1763             }
   1764 
   1765             /********************************************************/
   1766             /*  prediction                                          */
   1767             /********************************************************/
   1768             (ps_codec->apf_intra_pred_4_l)[u1_intra_mode](pu1_ngbr_pels_i4,
   1769                                                           pu1_pred_mb, 0,
   1770                                                           i4_pred_strd,
   1771                                                           i4_ngbr_avbl);
   1772 
   1773             /********************************************************/
   1774             /*  error estimation,                                   */
   1775             /*  transform                                           */
   1776             /*  quantization                                        */
   1777             /********************************************************/
   1778             ps_codec->pf_resi_trans_quant_4x4(pu1_curr_mb, pu1_pred_mb,
   1779                                               pi2_res_mb, i4_src_strd,
   1780                                               i4_pred_strd,
   1781                                               ps_qp_params->pu2_scale_mat,
   1782                                               ps_qp_params->pu2_thres_mat,
   1783                                               ps_qp_params->u1_qbits,
   1784                                               ps_qp_params->u4_dead_zone,
   1785                                               &u1_nnz, &i2_dc_dummy);
   1786 
   1787             /********************************************************/
   1788             /*  pack coeff data for entropy coding                  */
   1789             /********************************************************/
   1790             ps_mb_coeff_data = *pv_mb_coeff_data;
   1791 
   1792             /* write number of non zero coefficients */
   1793             ps_mb_coeff_data->i4_sig_map_nnz = u1_nnz;
   1794 
   1795             if (u1_nnz)
   1796             {
   1797                 for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u1_nnz; coeff_cnt++)
   1798                 {
   1799                     if (pi2_res_mb[pu1_scan_order[coeff_cnt]])
   1800                     {
   1801                         /* write residue */
   1802                         ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = pi2_res_mb[pu1_scan_order[coeff_cnt]];
   1803                         u4_s_map |= mask;
   1804                     }
   1805                     mask <<= 1;
   1806                 }
   1807                 /* write significant coeff map */
   1808                 ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
   1809 
   1810                 /* update ptr to coeff data */
   1811                 (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
   1812 
   1813                 /* cbp */
   1814                 u1_cbp_l |= (1 << b8);
   1815             }
   1816             else
   1817             {
   1818                 (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
   1819             }
   1820 
   1821             /********************************************************/
   1822             /*  ierror estimation,                                  */
   1823             /*  itransform                                          */
   1824             /*  iquantization                                       */
   1825             /********************************************************/
   1826             if (u1_nnz)
   1827                 ps_codec->pf_iquant_itrans_recon_4x4(
   1828                                 pi2_res_mb, pu1_pred_mb, pu1_ref_mb,
   1829                                 /*No input stride,*/i4_pred_strd,
   1830                                 i4_rec_strd, ps_qp_params->pu2_iscale_mat,
   1831                                 ps_qp_params->pu2_weigh_mat,
   1832                                 ps_qp_params->u1_qp_div,
   1833                                 ps_proc->pv_scratch_buff, 0, 0);
   1834             else
   1835                 ps_codec->pf_inter_pred_luma_copy(pu1_pred_mb, pu1_ref_mb,
   1836                                                   i4_pred_strd, i4_rec_strd,
   1837                                                   BLK_SIZE, BLK_SIZE, NULL,
   1838                                                   0);
   1839 
   1840         }
   1841 
   1842         /* if the 8x8 block has no residue, nothing needs to be sent to entropy */
   1843         if (!(u1_cbp_l & (1 << b8)))
   1844         {
   1845             *pv_mb_coeff_data = ps_mb_coeff_data_b8;
   1846         }
   1847     }
   1848 
   1849     return (u1_cbp_l);
   1850 }
   1851 
   1852 /**
   1853 *******************************************************************************
   1854 *
   1855 * @brief performs luma core coding when intra mode is i4x4
   1856 *
   1857 * @par Description:
   1858 *  If the current mb is to be coded as intra of mb type i4x4, the mb is first
   1859 *  predicted using one of i4x4 prediction filters, basing on the intra mode
   1860 *  chosen. Then, error is computed between the input blk and the estimated blk.
   1861 *  This error is dct transformed and quantized. The quantized coefficients are
   1862 *  packed in scan order for entropy coding.
   1863 *
   1864 * @param[in] ps_proc_ctxt
   1865 *  pointer to the current macro block context
   1866 *
   1867 * @returns u1_cbp_l
   1868 *  coded block pattern luma
   1869 *
   1870 * @remarks
   1871 *  The traversal of 4x4 subblocks in the 16x16 macroblock is as per the scan order
   1872 *  mentioned in h.264 specification
   1873 *
   1874 *******************************************************************************
   1875 */
   1876 UWORD8 ih264e_code_luma_intra_macroblock_4x4_rdopt_on(process_ctxt_t *ps_proc)
   1877 {
   1878     /* Codec Context */
   1879     codec_t *ps_codec = ps_proc->ps_codec;
   1880 
   1881     /* pointer to ref macro block */
   1882     UWORD8 *pu1_ref_mb_intra_4x4 = ps_proc->pu1_ref_mb_intra_4x4;
   1883 
   1884     /* pointer to recon buffer */
   1885     UWORD8 *pu1_rec_mb = ps_proc->pu1_rec_buf_luma;
   1886 
   1887     /* pointer to residual macro block */
   1888     WORD16 *pi2_res_mb = ps_proc->pi2_res_buf_intra_4x4;
   1889 
   1890     /* strides */
   1891     WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
   1892 
   1893     /* number of non zero coeffs*/
   1894     UWORD8  *pu1_nnz = (UWORD8 *)ps_proc->au4_nnz_intra_4x4;
   1895 
   1896     /* coded block pattern */
   1897     UWORD8 u1_cbp_l = 0;
   1898 
   1899     /* pointer to packed mb coeff data */
   1900     void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
   1901 
   1902     /* pointer to packed mb coeff data */
   1903     tu_sblk_coeff_data_t *ps_mb_coeff_data, *ps_mb_coeff_data_b8;
   1904 
   1905     /* no of non zero coefficients in the current sub block */
   1906     UWORD32 u4_nnz_cnt;
   1907 
   1908     /* significant coefficient map */
   1909     UWORD32 u4_s_map;
   1910 
   1911     /* pointer to scanning matrix */
   1912     const UWORD8 *pu1_scan_order = gu1_luma_scan_order;
   1913 
   1914     /* temp var */
   1915     UWORD32 b8, b4, coeff_cnt, mask;
   1916 
   1917     /* Process 16 4x4 lum sub-blocks of the MB in scan order */
   1918     for (b8 = 0; b8 < 4; b8++)
   1919     {
   1920         /* if in case cbp for the 8x8 block is zero, send no residue */
   1921         ps_mb_coeff_data_b8 = *pv_mb_coeff_data;
   1922 
   1923         for (b4 = 0; b4 < 4; b4++, pu1_nnz++, pi2_res_mb += MB_SIZE)
   1924         {
   1925             /********************************************************/
   1926             /*  pack coeff data for entropy coding                  */
   1927             /********************************************************/
   1928             ps_mb_coeff_data = *pv_mb_coeff_data;
   1929 
   1930             /* write number of non zero coefficients */
   1931             ps_mb_coeff_data->i4_sig_map_nnz = *pu1_nnz;
   1932 
   1933             if (*pu1_nnz)
   1934             {
   1935                 for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < *pu1_nnz; coeff_cnt++)
   1936                 {
   1937                     if (pi2_res_mb[pu1_scan_order[coeff_cnt]])
   1938                     {
   1939                         /* write residue */
   1940                         ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = pi2_res_mb[pu1_scan_order[coeff_cnt]];
   1941                         u4_s_map |= mask;
   1942                     }
   1943                     mask <<= 1;
   1944                 }
   1945                 /* write significant coeff map */
   1946                 ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
   1947 
   1948                 /* update ptr to coeff data */
   1949                 (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
   1950 
   1951                 /* cbp */
   1952                 u1_cbp_l |= (1 << b8);
   1953             }
   1954             else
   1955             {
   1956                 (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
   1957             }
   1958         }
   1959 
   1960         /* if the 8x8 block has no residue, nothing needs to be sent to entropy */
   1961         if (!(u1_cbp_l & (1 << b8)))
   1962         {
   1963             *pv_mb_coeff_data = ps_mb_coeff_data_b8;
   1964         }
   1965     }
   1966 
   1967     /* memcpy recon */
   1968     ps_codec->pf_inter_pred_luma_copy(pu1_ref_mb_intra_4x4, pu1_rec_mb, MB_SIZE, i4_rec_strd, MB_SIZE, MB_SIZE, NULL, 0);
   1969 
   1970     return (u1_cbp_l);
   1971 }
   1972 
   1973 
   1974 /**
   1975 *******************************************************************************
   1976 *
   1977 * @brief performs chroma core coding for intra macro blocks
   1978 *
   1979 * @par Description:
   1980 *  If the current MB is to be intra coded with mb type chroma I8x8, the MB is
   1981 *  first predicted using intra 8x8 prediction filters. The predicted data is
   1982 *  compared with the input for error and the error is transformed. The DC
   1983 *  coefficients of each transformed sub blocks are further transformed using
   1984 *  Hadamard transform. The resulting coefficients are quantized, packed and sent
   1985 *  for entropy coding.
   1986 *
   1987 * @param[in] ps_proc_ctxt
   1988 *  pointer to the current macro block context
   1989 *
   1990 * @returns u1_cbp_c
   1991 *  coded block pattern chroma
   1992 *
   1993 * @remarks
   1994 *  The traversal of 4x4 subblocks in the 8x8 macroblock is as per the scan order
   1995 *  mentioned in h.264 specification
   1996 *
   1997 *******************************************************************************
   1998 */
   1999 UWORD8 ih264e_code_chroma_intra_macroblock_8x8(process_ctxt_t *ps_proc)
   2000 {
   2001     /* Codec Context */
   2002     codec_t *ps_codec = ps_proc->ps_codec;
   2003 
   2004     /* pointer to ref macro block */
   2005     UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_chroma;
   2006 
   2007     /* pointer to src macro block */
   2008     UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_chroma;
   2009 
   2010     /* pointer to prediction macro block */
   2011     UWORD8 *pu1_pred_mb = NULL;
   2012 
   2013     /* pointer to residual macro block */
   2014     WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
   2015 
   2016     /* strides */
   2017     WORD32 i4_src_strd = ps_proc->i4_src_chroma_strd;
   2018     WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
   2019     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
   2020     WORD32 i4_res_strd = ps_proc->i4_res_strd;
   2021 
   2022     /* intra mode */
   2023     UWORD8 u1_intra_mode = ps_proc->u1_c_i8_mode;
   2024 
   2025     /* coded block pattern */
   2026     UWORD8 u1_cbp_c = 0;
   2027 
   2028     /* number of non zero coeffs*/
   2029     UWORD8 au1_nnz[18] = {0};
   2030 
   2031     /* quantization parameters */
   2032     quant_params_t *ps_qp_params = ps_proc->ps_qp_params[1];
   2033 
   2034     /* Control signal for inverse transform */
   2035     UWORD32 u4_cntrl;
   2036 
   2037     /* pointer to packed mb coeff data */
   2038     void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
   2039 
   2040     /* See if we need to swap U and V plances for entropy */
   2041     UWORD32 u4_swap_uv = ps_codec->s_cfg.e_inp_color_fmt == IV_YUV_420SP_VU;
   2042 
   2043     if (PLANE_CH_I8x8 == u1_intra_mode)
   2044     {
   2045         pu1_pred_mb = ps_proc->pu1_pred_mb_intra_chroma_plane;
   2046     }
   2047     else
   2048     {
   2049         pu1_pred_mb = ps_proc->pu1_pred_mb_intra_chroma;
   2050     }
   2051 
   2052     /********************************************************/
   2053     /*  error estimation,                                   */
   2054     /*  transform                                           */
   2055     /*  quantization                                        */
   2056     /********************************************************/
   2057     ih264e_chroma_8x8_resi_trans_dctrans_quant(ps_codec, pu1_curr_mb,
   2058                                                pu1_pred_mb, pi2_res_mb,
   2059                                                i4_src_strd, i4_pred_strd,
   2060                                                i4_res_strd,
   2061                                                ps_qp_params->pu2_scale_mat,
   2062                                                ps_qp_params->pu2_thres_mat,
   2063                                                ps_qp_params->u1_qbits,
   2064                                                ps_qp_params->u4_dead_zone,
   2065                                                au1_nnz);
   2066 
   2067     /********************************************************/
   2068     /*  pack coeff data for entropy coding                  */
   2069     /********************************************************/
   2070     ih264e_pack_c_mb(pi2_res_mb, pv_mb_coeff_data, i4_res_strd, &u1_cbp_c,
   2071                      au1_nnz, ps_codec->u4_thres_resi, &u4_cntrl, u4_swap_uv);
   2072 
   2073     /********************************************************/
   2074     /*  ierror estimation,                                  */
   2075     /*  itransform                                          */
   2076     /*  iquantization                                       */
   2077     /********************************************************/
   2078     ih264e_chroma_8x8_idctrans_iquant_itrans_recon(ps_codec, pi2_res_mb,
   2079                                                    pu1_pred_mb, pu1_ref_mb,
   2080                                                    i4_res_strd, i4_pred_strd,
   2081                                                    i4_rec_strd,
   2082                                                    ps_qp_params->pu2_iscale_mat,
   2083                                                    ps_qp_params->pu2_weigh_mat,
   2084                                                    ps_qp_params->u1_qp_div,
   2085                                                    u4_cntrl,
   2086                                                    ps_proc->pv_scratch_buff);
   2087     return (u1_cbp_c);
   2088 }
   2089 
   2090 
   2091 /**
   2092 *******************************************************************************
   2093 *
   2094 * @brief performs luma core coding when  mode is inter
   2095 *
   2096 * @par Description:
   2097 *  If the current mb is to be coded as inter the mb is predicted based on the
   2098 *  sub mb partitions and corresponding motion vectors generated by ME. Then,
   2099 *  error is computed between the input blk and the estimated blk. This error is
   2100 *  transformed, quantized. The quantized coefficients are packed in scan order
   2101 *  for entropy coding
   2102 *
   2103 * @param[in] ps_proc_ctxt
   2104 *  pointer to the current macro block context
   2105 *
   2106 * @returns u1_cbp_l
   2107 *  coded block pattern luma
   2108 *
   2109 * @remarks none
   2110 *
   2111 *******************************************************************************
   2112 */
   2113 
   2114 UWORD8 ih264e_code_luma_inter_macroblock_16x16(process_ctxt_t *ps_proc)
   2115 {
   2116     /* Codec Context */
   2117     codec_t *ps_codec = ps_proc->ps_codec;
   2118 
   2119     /* pointer to ref macro block */
   2120     UWORD8 *pu1_rec_mb = ps_proc->pu1_rec_buf_luma;
   2121 
   2122     /* pointer to src macro block */
   2123     UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_luma;
   2124 
   2125     /* pointer to prediction macro block */
   2126     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
   2127 
   2128     /* pointer to residual macro block */
   2129     WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
   2130 
   2131     /* strides */
   2132     WORD32 i4_src_strd = ps_proc->i4_src_strd;
   2133     WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
   2134     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
   2135     WORD32 i4_res_strd = ps_proc->i4_res_strd;
   2136 
   2137     /* coded block pattern */
   2138     UWORD8 u1_cbp_l = 0;
   2139 
   2140     /*Control signal of itrans*/
   2141     UWORD32 u4_cntrl;
   2142 
   2143     /* number of non zero coeffs*/
   2144     UWORD8  *pu1_nnz = (UWORD8 *)ps_proc->au4_nnz;
   2145 
   2146     /* quantization parameters */
   2147     quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0];
   2148 
   2149     /* pointer to packed mb coeff data */
   2150     void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
   2151 
   2152     /* pseudo pred buffer */
   2153     UWORD8 *pu1_pseudo_pred = pu1_pred_mb;
   2154 
   2155     /* pseudo pred buffer stride */
   2156     WORD32 i4_pseudo_pred_strd = i4_pred_strd;
   2157 
   2158     /* init nnz */
   2159     ps_proc->au4_nnz[0] = 0;
   2160     ps_proc->au4_nnz[1] = 0;
   2161     ps_proc->au4_nnz[2] = 0;
   2162     ps_proc->au4_nnz[3] = 0;
   2163     ps_proc->au4_nnz[4] = 0;
   2164 
   2165     /********************************************************/
   2166     /*  prediction                                          */
   2167     /********************************************************/
   2168     ih264e_motion_comp_luma(ps_proc, &pu1_pseudo_pred, &i4_pseudo_pred_strd);
   2169 
   2170     /********************************************************/
   2171     /*  error estimation,                                   */
   2172     /*  transform                                           */
   2173     /*  quantization                                        */
   2174     /********************************************************/
   2175     if (ps_proc->u4_min_sad_reached == 0 || ps_proc->u4_min_sad != 0)
   2176     {
   2177         ih264e_luma_16x16_resi_trans_dctrans_quant(ps_codec, pu1_curr_mb,
   2178                                                    pu1_pseudo_pred, pi2_res_mb,
   2179                                                    i4_src_strd,
   2180                                                    i4_pseudo_pred_strd,
   2181                                                    i4_res_strd,
   2182                                                    ps_qp_params->pu2_scale_mat,
   2183                                                    ps_qp_params->pu2_thres_mat,
   2184                                                    ps_qp_params->u1_qbits,
   2185                                                    ps_qp_params->u4_dead_zone,
   2186                                                    pu1_nnz,
   2187                                                    DISABLE_DC_TRANSFORM);
   2188 
   2189         /********************************************************/
   2190         /*  pack coeff data for entropy coding                  */
   2191         /********************************************************/
   2192         ih264e_pack_l_mb(pi2_res_mb, pv_mb_coeff_data, i4_res_strd, &u1_cbp_l,
   2193                          pu1_nnz, ps_codec->u4_thres_resi, &u4_cntrl);
   2194     }
   2195     else
   2196     {
   2197         u1_cbp_l = 0;
   2198         u4_cntrl = 0;
   2199     }
   2200 
   2201     /********************************************************/
   2202     /*  ierror estimation,                                  */
   2203     /*  itransform                                          */
   2204     /*  iquantization                                       */
   2205     /********************************************************/
   2206 
   2207     /*If the frame is not to be used for P frame reference or dumping recon
   2208      * we only will use the reocn for only predicting intra Mbs
   2209      * THis will need only right and bottom edge 4x4 blocks recon
   2210      * Hence we selectively enable them using control signal(including DC)
   2211      */
   2212     if (ps_proc->u4_compute_recon != 1)
   2213     {
   2214         u4_cntrl &= 0x111F0000;
   2215     }
   2216 
   2217     if (u4_cntrl)
   2218     {
   2219         ih264e_luma_16x16_idctrans_iquant_itrans_recon(
   2220                         ps_codec, pi2_res_mb, pu1_pseudo_pred, pu1_rec_mb,
   2221                         i4_res_strd, i4_pseudo_pred_strd, i4_rec_strd,
   2222                         ps_qp_params->pu2_iscale_mat,
   2223                         ps_qp_params->pu2_weigh_mat, ps_qp_params->u1_qp_div,
   2224                         u4_cntrl /*Cntrl*/, DISABLE_DC_TRANSFORM,
   2225                         ps_proc->pv_scratch_buff);
   2226     }
   2227     else
   2228     {
   2229         ps_codec->pf_inter_pred_luma_copy(pu1_pseudo_pred, pu1_rec_mb,
   2230                                           i4_pseudo_pred_strd, i4_rec_strd,
   2231                                           MB_SIZE, MB_SIZE, NULL, 0);
   2232     }
   2233 
   2234 
   2235     return (u1_cbp_l);
   2236 }
   2237 
   2238 /**
   2239 *******************************************************************************
   2240 *
   2241 * @brief performs chroma core coding for inter macro blocks
   2242 *
   2243 * @par Description:
   2244 *  If the current mb is to be coded as inter predicted mb,based on the sub mb partitions
   2245 *  and corresponding motion vectors generated by ME  ,prediction is done.
   2246 *  Then, error is computed between the input blk and the estimated blk.
   2247 *  This error is transformed , quantized. The quantized coefficients
   2248 *  are packed in scan order for
   2249 *  entropy coding.
   2250 *
   2251 * @param[in] ps_proc_ctxt
   2252 *  pointer to the current macro block context
   2253 *
   2254 * @returns u1_cbp_l
   2255 *  coded block pattern chroma
   2256 *
   2257 * @remarks none
   2258 *
   2259 *******************************************************************************
   2260 */
   2261 UWORD8 ih264e_code_chroma_inter_macroblock_8x8(process_ctxt_t *ps_proc)
   2262 {
   2263     /* Codec Context */
   2264     codec_t *ps_codec = ps_proc->ps_codec;
   2265 
   2266     /* pointer to ref macro block */
   2267     UWORD8 *pu1_rec_mb = ps_proc->pu1_rec_buf_chroma;
   2268 
   2269     /* pointer to src macro block */
   2270     UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_chroma;
   2271 
   2272     /* pointer to prediction macro block */
   2273     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
   2274 
   2275     /* pointer to residual macro block */
   2276     WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
   2277 
   2278     /* strides */
   2279     WORD32 i4_src_strd = ps_proc->i4_src_chroma_strd;
   2280     WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
   2281     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
   2282     WORD32 i4_res_strd = ps_proc->i4_res_strd;
   2283 
   2284     /* coded block pattern */
   2285     UWORD8 u1_cbp_c = 0;
   2286 
   2287     /*Control signal for inverse transform*/
   2288     UWORD32 u4_cntrl;
   2289 
   2290     /* number of non zero coeffs*/
   2291     UWORD8 au1_nnz[10] = {0};
   2292 
   2293     /* quantization parameters */
   2294     quant_params_t *ps_qp_params = ps_proc->ps_qp_params[1];
   2295 
   2296     /* pointer to packed mb coeff data */
   2297     void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
   2298 
   2299     /*See if we need to swap U and V plances for entropy*/
   2300     UWORD32 u4_swap_uv = ps_codec->s_cfg.e_inp_color_fmt == IV_YUV_420SP_VU;
   2301 
   2302     /********************************************************/
   2303     /*  prediction                                          */
   2304     /********************************************************/
   2305     ih264e_motion_comp_chroma(ps_proc);
   2306 
   2307     /********************************************************/
   2308     /*  error estimation,                                   */
   2309     /*  transform                                           */
   2310     /*  quantization                                        */
   2311     /********************************************************/
   2312     ih264e_chroma_8x8_resi_trans_dctrans_quant(ps_codec, pu1_curr_mb,
   2313                                                pu1_pred_mb, pi2_res_mb,
   2314                                                i4_src_strd, i4_pred_strd,
   2315                                                i4_res_strd,
   2316                                                ps_qp_params->pu2_scale_mat,
   2317                                                ps_qp_params->pu2_thres_mat,
   2318                                                ps_qp_params->u1_qbits,
   2319                                                ps_qp_params->u4_dead_zone,
   2320                                                au1_nnz);
   2321 
   2322     /********************************************************/
   2323     /*  pack coeff data for entropy coding                  */
   2324     /********************************************************/
   2325     ih264e_pack_c_mb(pi2_res_mb, pv_mb_coeff_data, i4_res_strd, &u1_cbp_c,
   2326                      au1_nnz, ps_codec->u4_thres_resi, &u4_cntrl, u4_swap_uv);
   2327 
   2328     /********************************************************/
   2329     /*  ierror estimation,                                  */
   2330     /*  itransform                                          */
   2331     /*  iquantization                                       */
   2332     /********************************************************/
   2333 
   2334     /* If the frame is not to be used for P frame reference or dumping recon
   2335      * we only will use the reocn for only predicting intra Mbs
   2336      * THis will need only right and bottom edge 4x4 blocks recon
   2337      * Hence we selectively enable them using control signal(including DC)
   2338      */
   2339     if (!ps_proc->u4_compute_recon)
   2340     {
   2341         u4_cntrl &= 0x7700C000;
   2342     }
   2343 
   2344     if (u4_cntrl)
   2345     {
   2346         ih264e_chroma_8x8_idctrans_iquant_itrans_recon(
   2347                         ps_codec, pi2_res_mb, pu1_pred_mb, pu1_rec_mb,
   2348                         i4_res_strd, i4_pred_strd, i4_rec_strd,
   2349                         ps_qp_params->pu2_iscale_mat,
   2350                         ps_qp_params->pu2_weigh_mat, ps_qp_params->u1_qp_div,
   2351                         u4_cntrl, ps_proc->pv_scratch_buff);
   2352     }
   2353     else
   2354     {
   2355         ps_codec->pf_inter_pred_luma_copy(pu1_pred_mb, pu1_rec_mb, i4_pred_strd,
   2356                                           i4_rec_strd, MB_SIZE >> 1, MB_SIZE,
   2357                                           NULL, 0);
   2358     }
   2359 
   2360     return (u1_cbp_c);
   2361 }
   2362