Home | History | Annotate | Download | only in encoder
      1 /******************************************************************************
      2  *
      3  * Copyright (C) 2018 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at:
      8  *
      9  * http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  *****************************************************************************
     18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 */
     20 
     21 /*!
     22 ******************************************************************************
     23 * \file ihevce_recur_bracketing.c
     24 *
     25 * \brief
     26 *    This file contains interface functions of recursive bracketing
     27 *    module
     28 * \date
     29 *    12/02/2012
     30 *
     31 * \author
     32 *    Ittiam
     33 *
     34 * List of Functions
     35 *
     36 *
     37 ******************************************************************************
     38 */
     39 
     40 /*****************************************************************************/
     41 /* File Includes                                                             */
     42 /*****************************************************************************/
     43 /* System include files */
     44 #include <stdio.h>
     45 #include <string.h>
     46 #include <stdlib.h>
     47 #include <assert.h>
     48 #include <stdarg.h>
     49 #include <math.h>
     50 
     51 /* User include files */
     52 #include "ihevc_typedefs.h"
     53 #include "itt_video_api.h"
     54 #include "ihevce_api.h"
     55 
     56 #include "rc_cntrl_param.h"
     57 #include "rc_frame_info_collector.h"
     58 #include "rc_look_ahead_params.h"
     59 
     60 #include "ihevc_defs.h"
     61 #include "ihevc_structs.h"
     62 #include "ihevc_platform_macros.h"
     63 #include "ihevc_deblk.h"
     64 #include "ihevc_itrans_recon.h"
     65 #include "ihevc_chroma_itrans_recon.h"
     66 #include "ihevc_chroma_intra_pred.h"
     67 #include "ihevc_intra_pred.h"
     68 #include "ihevc_inter_pred.h"
     69 #include "ihevc_mem_fns.h"
     70 #include "ihevc_padding.h"
     71 #include "ihevc_weighted_pred.h"
     72 #include "ihevc_sao.h"
     73 #include "ihevc_resi_trans.h"
     74 #include "ihevc_quant_iquant_ssd.h"
     75 #include "ihevc_cabac_tables.h"
     76 
     77 #include "ihevce_defs.h"
     78 #include "ihevce_lap_enc_structs.h"
     79 #include "ihevce_multi_thrd_structs.h"
     80 #include "ihevce_me_common_defs.h"
     81 #include "ihevce_had_satd.h"
     82 #include "ihevce_error_codes.h"
     83 #include "ihevce_bitstream.h"
     84 #include "ihevce_cabac.h"
     85 #include "ihevce_rdoq_macros.h"
     86 #include "ihevce_function_selector.h"
     87 #include "ihevce_enc_structs.h"
     88 #include "ihevce_entropy_structs.h"
     89 #include "ihevce_cmn_utils_instr_set_router.h"
     90 #include "ihevce_enc_loop_structs.h"
     91 #include "ihevce_ipe_instr_set_router.h"
     92 #include "ihevce_ipe_structs.h"
     93 #include "ihevce_ipe_pass.h"
     94 #include "ihevce_recur_bracketing.h"
     95 #include "ihevce_nbr_avail.h"
     96 #include "ihevc_common_tables.h"
     97 #include "ihevce_decomp_pre_intra_structs.h"
     98 #include "ihevce_decomp_pre_intra_pass.h"
     99 
    100 #include "cast_types.h"
    101 #include "osal.h"
    102 #include "osal_defaults.h"
    103 
    104 /*****************************************************************************/
    105 /* Constant Macros                                                           */
    106 /*****************************************************************************/
    107 #define IP_DBG_L1_l2 0
    108 #define CHILD_BIAS 12
    109 
    110 /*****************************************************************************/
    111 /* Globals                                                                   */
    112 /*****************************************************************************/
    113 extern pf_intra_pred g_apf_lum_ip[10];
    114 
    115 extern WORD32 g_i4_ip_funcs[MAX_NUM_IP_MODES];
    116 
    117 UWORD8 gau1_cu_pos_x[64] = { 0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7,
    118                              6, 7, 4, 5, 4, 5, 6, 7, 6, 7, 0, 1, 0, 1, 2, 3, 2, 3, 0, 1, 0, 1,
    119                              2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 4, 5, 4, 5, 6, 7, 6, 7 };
    120 
    121 UWORD8 gau1_cu_pos_y[64] = { 0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 3, 3, 2, 2, 3, 3, 0, 0, 1, 1, 0, 0,
    122                              1, 1, 2, 2, 3, 3, 2, 2, 3, 3, 4, 4, 5, 5, 4, 4, 5, 5, 6, 6, 7, 7,
    123                              6, 6, 7, 7, 4, 4, 5, 5, 4, 4, 5, 5, 6, 6, 7, 7, 6, 6, 7, 7 };
    124 
    125 #define RESET_BIT(x, bit) (x = x & ~((WORD32)1 << bit))
    126 
    127 /*****************************************************************************/
    128 /* Function Definitions                                                      */
    129 /*****************************************************************************/
    130 
    131 /*!
    132 ******************************************************************************
    133 * \if Function name : ihevce_update_cand_list \endif
    134 *
    135 * \brief
    136 *    Final Candidate list population, nbr flag andd nbr mode update function
    137 *
    138 * \param[in] ps_row_cu : pointer to cu analyse struct
    139 * \param[in] ps_cu_node : pointer to cu node info buffer
    140 * \param[in] ps_ed_blk_l1 : pointer to level 1 and 2 decision buffer
    141 * \param[in] pu1_cand_mode_list  : pointer to candidate list buffer
    142 *
    143 * \return
    144 *    None
    145 *
    146 * \author
    147 *  Ittiam
    148 *
    149 *****************************************************************************
    150 */
    151 void ihevce_update_cand_list(
    152     ihevce_ipe_cu_tree_t *ps_cu_node, ihevce_ed_blk_t *ps_ed_blk_l1, ihevce_ipe_ctxt_t *ps_ctxt)
    153 {
    154     WORD32 row, col, x, y, size;
    155 
    156     /* Candidate mode Update */
    157     (void)ps_ed_blk_l1;
    158     /* Update CTB mode map for the finalised CU */
    159     x = ((ps_cu_node->u2_x0 << 3) >> 2) + 1;
    160     y = ((ps_cu_node->u2_y0 << 3) >> 2) + 1;
    161     size = ps_cu_node->u1_cu_size >> 2;
    162     for(row = y; row < (y + size); row++)
    163     {
    164         for(col = x; col < (x + size); col++)
    165         {
    166             ps_ctxt->au1_ctb_mode_map[row][col] = ps_cu_node->best_mode;
    167         }
    168     }
    169     return;
    170 }
    171 
    172 /*!
    173 ******************************************************************************
    174 * \if Function name : ihevce_intra_populate_mode_bits_cost_bracketing \endif
    175 *
    176 * \brief
    177 *    Mpm indx calc function based on left and top available modes
    178 *
    179 * \param[in] top_intra_mode : Top available intra mode
    180 * \param[in] left_intra_mode : Left available intra mode
    181 * \param[in] available_top : Top availability flag
    182 * \param[in] available_left : Left availability flag
    183 * \param[in] cu_pos_y : cu position wrt to CTB
    184 * \param[in] mode_bits_cost : pointer to mode bits buffer
    185 * \param[in] lambda : Lambda value (SAD/SATD)
    186 * \param[in] cand_mode_list  : pointer to candidate list buffer
    187 *
    188 * \return
    189 *    None
    190 *
    191 * \author
    192 *  Ittiam
    193 *
    194 *****************************************************************************
    195 */
    196 void ihevce_intra_populate_mode_bits_cost_bracketing(
    197     WORD32 top_intra_mode,
    198     WORD32 left_intra_mode,
    199     WORD32 available_top,
    200     WORD32 available_left,
    201     WORD32 cu_pos_y,
    202     UWORD16 *mode_bits_cost,
    203     UWORD16 *mode_bits,
    204     WORD32 lambda,
    205     WORD32 *cand_mode_list)
    206 {
    207     /* local variables */
    208     WORD32 i;
    209     WORD32 cand_intra_pred_mode_left, cand_intra_pred_mode_top;
    210 
    211     UWORD16 one_bits_cost =
    212         COMPUTE_RATE_COST_CLIP30(4, lambda, (LAMBDA_Q_SHIFT + 1));  //1.5 * lambda
    213     UWORD16 two_bits_cost =
    214         COMPUTE_RATE_COST_CLIP30(6, lambda, (LAMBDA_Q_SHIFT + 1));  //2.5 * lambda
    215     UWORD16 five_bits_cost =
    216         COMPUTE_RATE_COST_CLIP30(12, lambda, (LAMBDA_Q_SHIFT + 1));  //5.5 * lambda
    217 
    218     for(i = 0; i < 35; i++)
    219     {
    220         mode_bits_cost[i] = five_bits_cost;
    221         mode_bits[i] = 5;
    222     }
    223 
    224     /* EIID: set availability flag to zero if modes are invalid.
    225        Required since some CU's might be skipped (though available)
    226        and their modes will be set to 255 (-1)*/
    227     if(35 < top_intra_mode || 0 > top_intra_mode)
    228         available_top = 0;
    229     if(35 < left_intra_mode || 0 > left_intra_mode)
    230         available_left = 0;
    231 
    232     /* Calculate cand_intra_pred_mode_N as per sec. 8.4.2 in JCTVC-J1003_d7 */
    233     /* N = top */
    234     if(0 == available_top)
    235     {
    236         cand_intra_pred_mode_top = INTRA_DC;
    237     }
    238     /* for neighbour != INTRA, setting DC is done outside */
    239     else if(0 == cu_pos_y) /* It's on the CTB boundary */
    240     {
    241         cand_intra_pred_mode_top = INTRA_DC;
    242     }
    243     else
    244     {
    245         cand_intra_pred_mode_top = top_intra_mode;
    246     }
    247 
    248     /* N = left */
    249     if(0 == available_left)
    250     {
    251         cand_intra_pred_mode_left = INTRA_DC;
    252         //cand_intra_pred_mode_left = cand_intra_pred_mode_top;
    253     }
    254     /* for neighbour != INTRA, setting DC is done outside */
    255     else
    256     {
    257         cand_intra_pred_mode_left = left_intra_mode;
    258     }
    259 
    260     /* Calculate cand_mode_list as per sec. 8.4.2 in JCTVC-J1003_d7 */
    261     if(cand_intra_pred_mode_left == cand_intra_pred_mode_top)
    262     {
    263         if(cand_intra_pred_mode_left < 2)
    264         {
    265             cand_mode_list[0] = INTRA_PLANAR;
    266             cand_mode_list[1] = INTRA_DC;
    267             cand_mode_list[2] = INTRA_ANGULAR(26); /* angular 26 = Vertical */
    268         }
    269         else
    270         {
    271             cand_mode_list[0] = cand_intra_pred_mode_left;
    272             cand_mode_list[1] = 2 + ((cand_intra_pred_mode_left + 29) % 32);
    273             cand_mode_list[2] = 2 + ((cand_intra_pred_mode_left - 2 + 1) % 32);
    274         }
    275     }
    276     else
    277     {
    278         if(0 == available_left)
    279         {
    280             cand_mode_list[0] = cand_intra_pred_mode_top;
    281             cand_mode_list[1] = cand_intra_pred_mode_left;
    282         }
    283         else
    284         {
    285             cand_mode_list[0] = cand_intra_pred_mode_left;
    286             cand_mode_list[1] = cand_intra_pred_mode_top;
    287         }
    288         if((cand_intra_pred_mode_left != INTRA_PLANAR) &&
    289            (cand_intra_pred_mode_top != INTRA_PLANAR))
    290         {
    291             cand_mode_list[2] = INTRA_PLANAR;
    292         }
    293         else if((cand_intra_pred_mode_left != INTRA_DC) && (cand_intra_pred_mode_top != INTRA_DC))
    294         {
    295             cand_mode_list[2] = INTRA_DC;
    296         }
    297         else
    298         {
    299             cand_mode_list[2] = INTRA_ANGULAR(26);
    300         }
    301     }
    302     mode_bits_cost[cand_mode_list[0]] = one_bits_cost;
    303     mode_bits_cost[cand_mode_list[1]] = two_bits_cost;
    304     mode_bits_cost[cand_mode_list[2]] = two_bits_cost;
    305 
    306     mode_bits[cand_mode_list[0]] = 2;
    307     mode_bits[cand_mode_list[1]] = 3;
    308     mode_bits[cand_mode_list[2]] = 3;
    309 }
    310 
    311 /*!
    312 ******************************************************************************
    313 * \if Function name : ihevce_pu_calc_4x4_blk \endif
    314 *
    315 * \brief
    316 *    4x4 pu (8x8 CU) mode decision using step 8421 method
    317 *
    318 * \param[in] ps_cu_node : pointer to cu node info buffer
    319 * \param[in] pu1_src : pointer to src pixels
    320 * \param[in] src_stride : frm source stride
    321 * \param[in] ref : pointer to reference pixels for prediction
    322 * \param[in] cand_mode_list  : pointer to candidate list buffer
    323 * \param[in] best_costs_4x4  : pointer to 3 best cost buffer
    324 * \param[in] best_modes_4x4  : pointer to 3 best mode buffer
    325 *
    326 * \return
    327 *    None
    328 *
    329 * \author
    330 *  Ittiam
    331 *
    332 *****************************************************************************
    333 */
    334 void ihevce_pu_calc_4x4_blk(
    335     ihevce_ipe_ctxt_t *ps_ctxt,
    336     ihevce_ipe_cu_tree_t *ps_cu_node,
    337     UWORD8 *pu1_src,
    338     WORD32 src_stride,
    339     UWORD8 *ref,
    340     UWORD16 *mode_bits_cost,
    341     WORD32 *best_costs_4x4,
    342     UWORD8 *best_modes_4x4,
    343     func_selector_t *ps_func_selector)
    344 {
    345     WORD16 *pi2_trans_tmp = ps_ctxt->pi2_trans_tmp;
    346     WORD16 *pi2_trans_out = ps_ctxt->pi2_trans_out;
    347     UWORD8 u1_use_satd = ps_ctxt->u1_use_satd;
    348     UWORD8 u1_level_1_refine_on = ps_ctxt->u1_level_1_refine_on;
    349 
    350     WORD32 i, j = 0, i_end;
    351     UWORD8 mode, best_amode = 255;
    352     UWORD8 pred[16];
    353 
    354     UWORD16 sad;
    355     WORD32 sad_cost = 0;
    356     WORD32 best_asad_cost = 0xFFFFF;
    357     WORD32 temp;
    358     UWORD8 modes_to_eval[5];
    359     WORD32 costs_4x4[5];
    360     UWORD8 modes_4x4[5] = { 0, 1, 2, 3, 4 };
    361 
    362     /* LO resolution hence low resolution disable */
    363     WORD32 u1_low_resol = 0;
    364     UWORD8 au1_best_modes[1] = { 0 };
    365     WORD32 ai4_best_sad_costs[1] = { 0 };
    366 
    367     WORD16 *pi2_tmp = &pi2_trans_tmp[0];
    368 
    369     ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list =
    370         &ps_ctxt->s_ipe_optimised_function_list;
    371 
    372     //apf_resd_trns[0] = &ihevc_resi_trans_4x4_ttype1;
    373     //apf_resd_trns[0] = &ihevc_HAD_4x4_8bit;
    374 
    375     for(i = 0; i < 5; i++)
    376     {
    377         costs_4x4[i] = MAX_INTRA_COST_IPE;
    378     }
    379 
    380     ps_ipe_optimised_function_list->pf_ed_4x4_find_best_modes(
    381         pu1_src,
    382         src_stride,
    383         ref,
    384         mode_bits_cost,
    385         au1_best_modes,
    386         ai4_best_sad_costs,
    387         u1_low_resol,
    388         ps_ipe_optimised_function_list->pf_4x4_sad_computer);
    389 
    390     best_amode = au1_best_modes[0];
    391     best_asad_cost = ai4_best_sad_costs[0];
    392 
    393     ASSERT(best_amode != 255);
    394     /* Around best level 4 angular mode, search for best level 2 mode */
    395     modes_to_eval[0] = best_amode - 2;
    396     modes_to_eval[1] = best_amode + 2;
    397     i = 0;
    398     i_end = 2;
    399     if(best_amode == 2)
    400         i = 1;
    401     else if(best_amode == 34)
    402         i_end = 1;
    403     for(; i < i_end; i++)
    404     {
    405         mode = modes_to_eval[i];
    406 
    407         g_apf_lum_ip[g_i4_ip_funcs[mode]](&ref[0], 0, &pred[0], 4, 4, mode);
    408 
    409         sad = ps_ipe_optimised_function_list->pf_4x4_sad_computer(pu1_src, &pred[0], src_stride, 4);
    410 
    411         sad_cost = sad;
    412         sad_cost += mode_bits_cost[mode];
    413 
    414         if(sad_cost < best_asad_cost)
    415         {
    416             best_amode = mode;
    417             best_asad_cost = sad_cost;
    418         }
    419     }
    420 
    421     /* Around best level 2 angular mode, search for best level 1 mode */
    422     /* Also evaluate for non-angular mode */
    423 
    424     i = 0;
    425     /*Level 1 refinement is disabled for ES preset */
    426     if(1 == u1_level_1_refine_on)
    427     {
    428         if(best_amode != 2)
    429             modes_to_eval[i++] = best_amode - 1;
    430         modes_to_eval[i++] = best_amode;
    431     }
    432 
    433     modes_to_eval[i++] = 0;
    434     modes_to_eval[i++] = 1;
    435 
    436     if(1 == u1_level_1_refine_on)
    437     {
    438         if(best_amode != 34)
    439             modes_to_eval[i++] = best_amode + 1;
    440     }
    441     i_end = i;
    442     i = 0;
    443 
    444     for(; i < i_end; i++)
    445     {
    446         mode = modes_to_eval[i];
    447 
    448         g_apf_lum_ip[g_i4_ip_funcs[mode]](&ref[0], 0, &pred[0], 4, 4, mode);
    449 
    450         /* Hard coding to use SATD */
    451         if(u1_use_satd)
    452         {
    453             ps_func_selector->ihevc_resi_trans_4x4_ttype1_fptr(
    454                 pu1_src, &pred[0], (WORD32 *)pi2_tmp, pi2_trans_out, src_stride, 4, (4 << 16) | 0);
    455 
    456             sad = ihevce_ipe_pass_satd(pi2_trans_out, 4, 4);
    457         }
    458         else
    459         {
    460             sad = ps_ipe_optimised_function_list->pf_4x4_sad_computer(
    461                 pu1_src, &pred[0], src_stride, 4);
    462         }
    463         sad_cost = sad;
    464         sad_cost += mode_bits_cost[mode];
    465 
    466         costs_4x4[i] = sad_cost;
    467     }
    468 
    469     /* Arrange the reference array in ascending order */
    470     for(i = 0; i < (i_end - 1); i++)
    471     {
    472         for(j = i + 1; j < i_end; j++)
    473         {
    474             if(costs_4x4[i] > costs_4x4[j])
    475             {
    476                 temp = costs_4x4[i];
    477                 costs_4x4[i] = costs_4x4[j];
    478                 costs_4x4[j] = temp;
    479 
    480                 temp = modes_4x4[i];
    481                 modes_4x4[i] = modes_4x4[j];
    482                 modes_4x4[j] = temp;
    483             }
    484         }
    485     }
    486     for(i = 0; i < 3; i++)
    487     {
    488         best_costs_4x4[i] = costs_4x4[i];
    489         best_modes_4x4[i] = modes_to_eval[modes_4x4[i]];
    490     }
    491 
    492     {
    493         ps_cu_node->best_mode = best_modes_4x4[0];
    494         ps_cu_node->best_cost = best_costs_4x4[0];
    495         ps_cu_node->best_satd = best_costs_4x4[0] - mode_bits_cost[ps_cu_node->best_mode];
    496     }
    497 }
    498 
    499 /*!
    500 ******************************************************************************
    501 * \if Function name : ihevce_pu_calc_8x8_blk \endif
    502 *
    503 * \brief
    504 *    4x4 pu (8x8 CU) mode decision loop using step 8421 method
    505 *
    506 * \param[in] ps_curr_src : pointer to src pixels struct
    507 * \param[in] ps_ctxt : pointer to IPE context struct
    508 * \param[in] ps_cu_node : pointer to cu node info buffer
    509 *
    510 * \return
    511 *    None
    512 *
    513 * \author
    514 *  Ittiam
    515 *
    516 *****************************************************************************
    517 */
    518 void ihevce_pu_calc_8x8_blk(
    519     iv_enc_yuv_buf_t *ps_curr_src,
    520     ihevce_ipe_ctxt_t *ps_ctxt,
    521     ihevce_ipe_cu_tree_t *ps_cu_node,
    522     func_selector_t *ps_func_selector)
    523 {
    524     WORD32 i, j;
    525     WORD32 nbr_flags;
    526     nbr_avail_flags_t s_nbr;
    527     WORD32 trans_size = ps_cu_node->ps_parent->u1_cu_size >> 1;
    528 
    529     UWORD8 *pu1_src_4x4;
    530     WORD32 xA, xB, yA, yB;
    531     //WORD32 x, y, size;
    532     WORD32 top_intra_mode;
    533     WORD32 left_intra_mode;
    534     //    WORD8 *top_intra_mode_ptr;
    535     //  WORD8 *left_intra_mode_ptr;
    536     UWORD8 *pu1_orig;
    537     WORD32 src_strd = ps_curr_src->i4_y_strd;
    538 
    539     WORD32 cu_pos_x = ps_cu_node->ps_parent->u2_x0 << 1;
    540     WORD32 cu_pos_y = ps_cu_node->ps_parent->u2_y0 << 1;
    541     ihevc_intra_pred_luma_ref_substitution_ft *ihevc_intra_pred_luma_ref_substitution_fptr;
    542 
    543     ihevc_intra_pred_luma_ref_substitution_fptr =
    544         ps_ctxt->ps_func_selector->ihevc_intra_pred_luma_ref_substitution_fptr;
    545 
    546     pu1_orig = (UWORD8 *)(ps_curr_src->pv_y_buf) +
    547                ((ps_cu_node->ps_parent->u2_y0 << 3) * src_strd) +
    548                (ps_cu_node->ps_parent->u2_x0 << 3);
    549     for(i = 0; i < 2; i++)
    550     {
    551         for(j = 0; j < 2; j++)
    552         {
    553             WORD32 cand_mode_list[3];
    554             pu1_src_4x4 = pu1_orig + (i * trans_size * src_strd) + (j * trans_size);
    555             /* get the neighbour availability flags */
    556             nbr_flags = ihevce_get_nbr_intra(
    557                 &s_nbr,
    558                 ps_ctxt->pu1_ctb_nbr_map,
    559                 ps_ctxt->i4_nbr_map_strd,
    560                 cu_pos_x + ((j) * (trans_size >> 2)),
    561                 cu_pos_y + ((i) * (trans_size >> 2)),
    562                 trans_size >> 2);
    563 
    564             /* call the function which populates sad cost for all the modes */
    565             xA = ((ps_cu_node->ps_parent->u2_x0 << 3) >> 2) + j;
    566             yA = ((ps_cu_node->ps_parent->u2_y0 << 3) >> 2) + 1 + i;
    567             xB = xA + 1;
    568             yB = yA - 1;
    569             left_intra_mode = ps_ctxt->au1_ctb_mode_map[yA][xA];
    570             top_intra_mode = ps_ctxt->au1_ctb_mode_map[yB][xB];
    571 
    572             ihevce_intra_populate_mode_bits_cost_bracketing(
    573                 top_intra_mode,
    574                 left_intra_mode,
    575                 s_nbr.u1_top_avail,
    576                 s_nbr.u1_left_avail,
    577                 ps_cu_node->ps_parent->u2_y0,
    578                 &ps_ctxt->au2_mode_bits_cost_8x8pu[i * 2 + j][0],
    579                 &ps_ctxt->au2_mode_bits_8x8_pu[0],
    580                 ps_ctxt->i4_ol_sad_lambda,
    581                 cand_mode_list);
    582 
    583             /* call the function which populates ref data for intra predicion */
    584             ihevc_intra_pred_luma_ref_substitution_fptr(
    585                 pu1_src_4x4 - src_strd - 1,
    586                 pu1_src_4x4 - src_strd,
    587                 pu1_src_4x4 - 1,
    588                 src_strd,
    589                 4,
    590                 nbr_flags,
    591                 &ps_ctxt->au1_ref_8x8pu[i * 2 + j][0],
    592                 0);
    593 
    594             ihevce_pu_calc_4x4_blk(
    595                 ps_ctxt,
    596                 ps_cu_node->ps_sub_cu[(i * 2) + j],
    597                 pu1_src_4x4,
    598                 src_strd,
    599                 &ps_ctxt->au1_ref_8x8pu[i * 2 + j][0],
    600                 &ps_ctxt->au2_mode_bits_cost_8x8pu[i * 2 + j][0],
    601                 &ps_cu_node->ps_sub_cu[(i * 2) + j]->au4_best_cost_1tu[0],
    602                 &ps_cu_node->ps_sub_cu[(i * 2) + j]->au1_best_mode_1tu[0],
    603                 ps_func_selector);
    604 
    605             /*&au4_cost_4x4[i*2 + j][0],
    606                 &au1_modes_4x4[i*2 + j][0]);*/ //TTODO : mode will change for the four partition
    607 
    608             ihevce_set_nbr_map(
    609                 ps_ctxt->pu1_ctb_nbr_map,
    610                 ps_ctxt->i4_nbr_map_strd,
    611                 cu_pos_x + ((j) * (trans_size >> 2)),
    612                 cu_pos_y + ((i) * (trans_size >> 2)),
    613                 (trans_size >> 2),
    614                 1);
    615 
    616             xA = ((ps_cu_node->ps_parent->u2_x0 << 3) >> 2) + 1 + j;
    617             yA = ((ps_cu_node->ps_parent->u2_y0 << 3) >> 2) + 1 + i;
    618             ps_ctxt->au1_ctb_mode_map[yA][xA] = ps_cu_node->ps_sub_cu[i * 2 + j]->best_mode;
    619             ps_cu_node->ps_sub_cu[i * 2 + j]->u2_mode_bits_cost =
    620                 ps_ctxt->au2_mode_bits_8x8_pu[ps_cu_node->ps_sub_cu[i * 2 + j]->best_mode];
    621         }
    622     }
    623 }
    624 
    625 /*!
    626 ******************************************************************************
    627 * \if Function name : ihevce_bracketing_analysis \endif
    628 *
    629 * \brief
    630 *    Interface function that evaluates MAX cu and MAX - 1 cu, with MAX cu size
    631 *    info decided coarse resolution mode decision. Compares the SATD/SAD cost btwn
    632 *    2 CUS and determines the actual CU size and best 3 modes to be given to rdopt
    633 *
    634 * \param[in] ps_ctxt : pointer to IPE context struct
    635 * \param[in] ps_cu_node : pointer to cu node info buffer
    636 * \param[in] ps_curr_src : pointer to src pixels struct
    637 * \param[in] ps_ctb_out : pointer to ip ctb out struct
    638 * \param[in] ps_row_cu : pointer to cu analyse struct
    639 * \param[in] ps_ed_l1_ctb : pointer to level 1 early deci struct
    640 * \param[in] ps_ed_l2_ctb : pointer to level 2 early deci struct
    641 * \param[in] ps_l0_ipe_out_ctb : pointer to ipe_l0_ctb_analyse_for_me_t struct
    642 *
    643 * \return
    644 *    None
    645 *
    646 * \author
    647 *  Ittiam
    648 *
    649 *****************************************************************************
    650 */
    651 void ihevce_bracketing_analysis(
    652     ihevce_ipe_ctxt_t *ps_ctxt,
    653     ihevce_ipe_cu_tree_t *ps_cu_node,
    654     iv_enc_yuv_buf_t *ps_curr_src,
    655     ctb_analyse_t *ps_ctb_out,
    656     //cu_analyse_t         *ps_row_cu,
    657     ihevce_ed_blk_t *ps_ed_l1_ctb,
    658     ihevce_ed_blk_t *ps_ed_l2_ctb,
    659     ihevce_ed_ctb_l1_t *ps_ed_ctb_l1,
    660     ipe_l0_ctb_analyse_for_me_t *ps_l0_ipe_out_ctb)
    661 {
    662     WORD32 cu_pos_x = 0;
    663     WORD32 cu_pos_y = 0;
    664 
    665     UWORD8 u1_curr_ctb_wdt = ps_cu_node->u1_width;
    666     UWORD8 u1_curr_ctb_hgt = ps_cu_node->u1_height;
    667     WORD32 num_8x8_blks_x = (u1_curr_ctb_wdt >> 3);
    668     WORD32 num_8x8_blks_y = (u1_curr_ctb_hgt >> 3);
    669 
    670     ihevce_ed_blk_t *ps_ed_blk_l1 = ps_ed_l1_ctb;
    671     ihevce_ed_blk_t *ps_ed_blk_l2 = ps_ed_l2_ctb;
    672 
    673     WORD32 i;
    674     WORD32 cand_mode_list[3];
    675     //cu_analyse_t *ps_curr_cu = ps_row_cu;
    676     WORD32 blk_cnt = 0;
    677     WORD32 j = 0;
    678     WORD32 merge_32x32_l1, merge_32x32_l2;
    679 
    680     WORD32 i4_skip_intra_eval_32x32_l1;
    681     //EIID: flag indicating number of 16x16 blocks to be skipped for intra evaluation within 32x32 block
    682 
    683     WORD32 parent_cost = 0;
    684     WORD32 child_cost[4] = { 0 };
    685     WORD32 child_cost_least = 0;
    686     WORD32 child_satd[4] = { 0 };
    687     WORD32 x, y, size;
    688     WORD32 merge_64x64 = 1;
    689     UWORD8 au1_best_32x32_modes[4];
    690     WORD32 au4_best_32x32_cost[4];
    691     WORD32 parent_best_mode;
    692     UWORD8 best_mode;
    693 
    694     WORD32 i4_quality_preset = ps_ctxt->i4_quality_preset;
    695     /* flag to control 1CU-4TU modes based on quality preset                */
    696     /* if set 1CU-4TU are explicity evaluated else 1CU-1TU modes are copied */
    697     WORD32 i4_enable_1cu_4tu = (i4_quality_preset == IHEVCE_QUALITY_P2) ||
    698                                (i4_quality_preset == IHEVCE_QUALITY_P0);
    699 
    700     /* flag to control 4CU-16TU mode based on quality preset                */
    701     /* if set 4CU-16TU are explicity evaluated else 4CU-4TU modes are copied*/
    702     WORD32 i4_enable_4cu_16tu = (i4_quality_preset == IHEVCE_QUALITY_P2) ||
    703                                 (i4_quality_preset == IHEVCE_QUALITY_P0);
    704 
    705     WORD32 i4_mod_factor_num, i4_mod_factor_den = QP_MOD_FACTOR_DEN;  //2;
    706     float f_strength;
    707     /* Accumalte satd */
    708     LWORD64 i8_frame_acc_satd_cost = 0, i8_frame_acc_satd_by_modqp_q10 = 0;
    709     WORD32 i4_ctb_acc_satd = 0;
    710 
    711     /* Accumalate Mode bits cost */
    712     LWORD64 i8_frame_acc_mode_bits_cost = 0;
    713 
    714     /* Step2 is bypassed for parent, uses children modes*/
    715     WORD32 step2_bypass = 1;
    716 
    717     if(1 == ps_ctxt->u1_disable_child_cu_decide)
    718         step2_bypass = 0;
    719 
    720     ps_cu_node->ps_parent = ps_ctxt->ps_ipe_cu_tree;
    721     for(i = 0; i < 4; i++)
    722     {
    723         ps_cu_node->ps_sub_cu[i] = ps_ctxt->ps_ipe_cu_tree + 1 + i;
    724     }
    725 
    726     /* Loop for all 8x8 block in a CTB */
    727     ps_ctb_out->u4_cu_split_flags = 0x1;
    728 
    729     /* Initialize intra 64x64, 32x32 and 16x16 costs to max value */
    730     for(i = 0; i < (MAX_CU_IN_CTB >> 4); i++)
    731     {
    732         ps_l0_ipe_out_ctb->ai4_best32x32_intra_cost[i] = MAX_INTRA_COST_IPE;
    733     }
    734 
    735     for(i = 0; i < (MAX_CU_IN_CTB >> 2); i++)
    736     {
    737         ps_l0_ipe_out_ctb->ai4_best16x16_intra_cost[i] = MAX_INTRA_COST_IPE;
    738     }
    739 
    740     for(i = 0; i < (MAX_CU_IN_CTB); i++)
    741     {
    742         ps_l0_ipe_out_ctb->ai4_best8x8_intra_cost[i] = MAX_INTRA_COST_IPE;
    743     }
    744 
    745     ps_l0_ipe_out_ctb->i4_best64x64_intra_cost = MAX_INTRA_COST_IPE;
    746 
    747     /* by default 64x64 modes are set to default values DC and Planar */
    748     ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[0] = 0;
    749     ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[1] = 1;
    750     ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[2] = 255;
    751 
    752     /* by default 64x4 split is set to 1 */
    753     ps_l0_ipe_out_ctb->u1_split_flag = 1;
    754 
    755     /* Modulation factor calculated based on spatial variance instead of hardcoded val*/
    756     i4_mod_factor_num = ps_ctxt->ai4_mod_factor_derived_by_variance[1];  //16;
    757 
    758     f_strength = ps_ctxt->f_strength;
    759 
    760     /* ------------------------------------------------ */
    761     /* populate the early decisions done by L1 analysis */
    762     /* ------------------------------------------------ */
    763     {
    764         ihevce_ed_blk_t *ps_ed_blk_l1_curr = ps_ed_l1_ctb;
    765         WORD32 ctr_8x8;
    766         WORD8 *pi1_ed_buf;
    767 
    768         /* set all the decisions to invalid */
    769         memset(
    770             &ps_l0_ipe_out_ctb->ai1_early_intra_inter_decision[0],
    771             0,
    772             sizeof(UWORD8) * MAX_CU_IN_CTB);
    773 
    774         pi1_ed_buf = &ps_l0_ipe_out_ctb->ai1_early_intra_inter_decision[0];
    775 
    776         for(ctr_8x8 = 0; ctr_8x8 < MAX_CTB_SIZE; ctr_8x8++)
    777         {
    778             WORD32 pos_x_8x8, pos_y_8x8;
    779 
    780             pos_x_8x8 = gau1_cu_pos_x[ctr_8x8];
    781             pos_y_8x8 = gau1_cu_pos_y[ctr_8x8];
    782 
    783             pi1_ed_buf[pos_x_8x8 + (pos_y_8x8 * MAX_CU_IN_CTB_ROW)] =
    784                 ps_ed_blk_l1_curr->intra_or_inter;
    785             ps_ed_blk_l1_curr++;
    786         }
    787 
    788         for(ctr_8x8 = 0; ctr_8x8 < (MAX_CU_IN_CTB >> 2); ctr_8x8++)
    789         {
    790             ps_l0_ipe_out_ctb->ai4_best_sad_8x8_l1_ipe[ctr_8x8] =
    791                 ps_ed_ctb_l1->i4_best_sad_8x8_l1_ipe[ctr_8x8];
    792 
    793             ps_l0_ipe_out_ctb->ai4_best_sad_cost_8x8_l1_ipe[ctr_8x8] =
    794                 ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_ipe[ctr_8x8];
    795 
    796             /*Earlier only me sad was getting populated, now best of ipe and me is populated*/
    797             ps_l0_ipe_out_ctb->ai4_best_sad_8x8_l1_me[ctr_8x8] =
    798                 ps_ed_ctb_l1->i4_best_sad_8x8_l1_me[ctr_8x8];
    799             //ps_ed_ctb_l1->i4_sad_me_for_ref[ctr_8x8];
    800 
    801             ps_l0_ipe_out_ctb->ai4_best_sad_cost_8x8_l1_me[ctr_8x8] =
    802                 ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_me[ctr_8x8];
    803             //ps_ed_ctb_l1->i4_sad_cost_me_for_ref[ctr_8x8];
    804         }
    805 
    806         /* Init CTB level accumulated SATD and MPM bits */
    807         ps_l0_ipe_out_ctb->i4_ctb_acc_satd = 0;
    808         ps_l0_ipe_out_ctb->i4_ctb_acc_mpm_bits = 0;
    809     }
    810 
    811     /* ------------------------------------------------ */
    812     /* Loop over all the blocks in current CTB          */
    813     /* ------------------------------------------------ */
    814 
    815     {
    816         /* 64 8x8 blocks should be encountered for the do,while loop to exit */
    817         do
    818         {
    819             intra32_analyse_t *ps_intra32_analyse;
    820             intra16_analyse_t *ps_intra16_analyse;
    821             WORD32 *pi4_intra_32_cost;
    822             WORD32 *pi4_intra_16_cost;
    823             WORD32 *pi4_intra_8_cost;
    824             WORD32 merge_16x16_l1;
    825 
    826             /* Given the blk_cnt, get the CU's top-left 8x8 block's x and y positions within the CTB */
    827             cu_pos_x = gau1_cu_pos_x[blk_cnt];
    828             cu_pos_y = gau1_cu_pos_y[blk_cnt];
    829 
    830             /* default value for 32x32 best mode - blk_cnt increases by 16 for each 32x32 */
    831             au1_best_32x32_modes[blk_cnt >> 4] = 255;
    832 
    833             /* get the corresponding intra 32 analyse pointer  use (blk_cnt / 16) */
    834             /* blk cnt is in terms of 8x8 units so a 32x32 will have 16 8x8 units */
    835             ps_intra32_analyse = &ps_l0_ipe_out_ctb->as_intra32_analyse[blk_cnt >> 4];
    836 
    837             /* get the corresponding intra 16 analyse pointer: use ((blk_cnt & 0xF) / 4) */
    838             /* blk cnt is in terms of 8x8 units so a 16x16 will have 4 8x8 units */
    839             ps_intra16_analyse = &ps_intra32_analyse->as_intra16_analyse[(blk_cnt & 0xF) >> 2];
    840 
    841             /* Line below assumes min_cu_size of 8 - checks whether CU starts are within picture */
    842             if((cu_pos_x < num_8x8_blks_x) && (cu_pos_y < num_8x8_blks_y))
    843             {
    844                 /* Reset to zero for every cu decision */
    845                 merge_32x32_l1 = 0;
    846 
    847                 child_cost_least = 0;
    848 
    849                 /* At L2, each 4x4 corresponds to 16x16 at L0. Every 4 16x16 stores a merge_success flag */
    850                 ps_ed_blk_l2 = ps_ed_l2_ctb + (blk_cnt >> 2);
    851 
    852                 pi4_intra_32_cost = &ps_l0_ipe_out_ctb->ai4_best32x32_intra_cost[blk_cnt >> 4];
    853 
    854                 /* by default 32x32 modes are set to default values DC and Planar */
    855                 ps_intra32_analyse->au1_best_modes_32x32_tu[0] = 0;
    856                 ps_intra32_analyse->au1_best_modes_32x32_tu[1] = 1;
    857                 ps_intra32_analyse->au1_best_modes_32x32_tu[2] = 255;
    858 
    859                 /* By default 32x32 split is set to 1 */
    860                 ps_intra32_analyse->b1_split_flag = 1;
    861 
    862                 ps_intra32_analyse->au1_best_modes_16x16_tu[0] = 0;
    863                 ps_intra32_analyse->au1_best_modes_16x16_tu[1] = 1;
    864                 ps_intra32_analyse->au1_best_modes_16x16_tu[2] = 255;
    865 
    866                 /* 16x16 cost & 8x8 cost are stored in Raster scan order */
    867                 /* stride of 16x16 buffer is MAX_CU_IN_CTB_ROW >> 1      */
    868                 /* stride of 8x8 buffer is MAX_CU_IN_CTB_ROW             */
    869                 {
    870                     WORD32 pos_x_8x8, pos_y_8x8;
    871 
    872                     pos_x_8x8 = gau1_cu_pos_x[blk_cnt];
    873                     pos_y_8x8 = gau1_cu_pos_y[blk_cnt];
    874 
    875                     pi4_intra_16_cost = &ps_l0_ipe_out_ctb->ai4_best16x16_intra_cost[0];
    876 
    877                     pi4_intra_16_cost +=
    878                         ((pos_x_8x8 >> 1) + ((pos_y_8x8 >> 1) * (MAX_CU_IN_CTB_ROW >> 1)));
    879 
    880                     pi4_intra_8_cost = &ps_l0_ipe_out_ctb->ai4_best8x8_intra_cost[0];
    881 
    882                     pi4_intra_8_cost += (pos_x_8x8 + (pos_y_8x8 * MAX_CU_IN_CTB_ROW));
    883                 }
    884 
    885                 merge_32x32_l1 = 0;
    886                 merge_32x32_l2 = 0;
    887                 i4_skip_intra_eval_32x32_l1 = 0;
    888 
    889                 /* Enable 16x16 merge iff sufficient 8x8 blocks remain in the current CTB */
    890                 merge_16x16_l1 = 0;
    891                 if(((num_8x8_blks_x - cu_pos_x) >= 2) && ((num_8x8_blks_y - cu_pos_y) >= 2))
    892                 {
    893 #if !ENABLE_UNIFORM_CU_SIZE_8x8
    894                     merge_16x16_l1 = ps_ed_blk_l1->merge_success;
    895 #else
    896                     merge_16x16_l1 = 0;
    897 #endif
    898                 }
    899 
    900                 /* Enable 32x32 merge iff sufficient 8x8 blocks remain in the current CTB */
    901                 if(((num_8x8_blks_x - cu_pos_x) >= 4) && ((num_8x8_blks_y - cu_pos_y) >= 4))
    902                 {
    903                     /* Check 4 flags of L1(8x8) say merge */
    904                     for(i = 0; i < 4; i++)
    905                     {
    906                         merge_32x32_l1 += (ps_ed_blk_l1 + (i * 4))->merge_success;
    907 
    908                         //EIDD: num 16x16 blocks for which inter_intra flag says eval only inter, i.e. skip intra eval
    909                         i4_skip_intra_eval_32x32_l1 +=
    910                             ((ps_ed_blk_l1 + (i * 4))->intra_or_inter == 2) ? 1 : 0;
    911                     }
    912 
    913 #if !ENABLE_UNIFORM_CU_SIZE_8x8
    914                     /* Check 1 flag from L2(16x16) say merge */
    915                     merge_32x32_l2 = ps_ed_blk_l2->merge_success;
    916 #else
    917                     merge_32x32_l1 = 0;
    918                     merge_32x32_l2 = 0;
    919 #endif
    920                 }
    921 
    922 #if DISABLE_L2_IPE_IN_PB_L1_IN_B
    923                 if((i4_quality_preset == IHEVCE_QUALITY_P6) && (ps_ctxt->i4_slice_type != ISLICE))
    924                 {
    925                     merge_32x32_l2 = 0;
    926                     ps_ed_blk_l2->merge_success = 0;
    927                 }
    928 #endif
    929 
    930                 ps_intra32_analyse->b1_valid_cu = 1;
    931 
    932                 /* If Merge success from all 4 L1 and L2, max CU size 32x32 is chosen */
    933                 /* EIID: if all blocks to be skipped then skip entire 32x32 for intra eval,
    934                 if no blocks to be skipped then eval entire 32x32,
    935                 else break the merge and go to 16x16 level eval */
    936                 if((merge_32x32_l1 == 4) && merge_32x32_l2 &&
    937                    ((i4_skip_intra_eval_32x32_l1 == 0) ||
    938                     (i4_skip_intra_eval_32x32_l1 == 4))  //comment this line to disable break-merge
    939                 )
    940                 {
    941 #if IP_DBG_L1_l2
    942                     /* Populate params for 32x32 block analysis */
    943                     ps_cu_node->ps_parent->best_cost = MAX_INTRA_COST_IPE;
    944 
    945                     ps_cu_node->ps_parent->u1_cu_size = 32;
    946                     ps_cu_node->ps_parent->u2_x0 = gau1_cu_pos_x[blk_cnt]; /* Populate properly */
    947                     ps_cu_node->ps_parent->u2_y0 = gau1_cu_pos_y[blk_cnt]; /* Populate properly */
    948                     ps_cu_node->ps_parent->best_mode = ps_ed_blk_l2->best_merge_mode;
    949                     /* CU size 32x32 and fill the final cu params */
    950 
    951                     ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
    952 
    953                     /* Increment pointers */
    954                     ps_ed_blk_l1 += 16;
    955                     blk_cnt += 16;
    956                     ps_row_cu++;
    957                     merge_64x64 &= 1;
    958 #else
    959 
    960                     /* EIID: dont evaluate if all 4 blocks at L1 said inter is winning*/
    961                     if(4 == i4_skip_intra_eval_32x32_l1 && (ps_ctxt->i4_slice_type != ISLICE))
    962                     {
    963                         WORD32 i4_local_ctr1, i4_local_ctr2;
    964 
    965                         ps_cu_node->ps_parent->best_cost = MAX_INTRA_COST_IPE;
    966 
    967                         ps_cu_node->ps_parent->u1_cu_size = 32;
    968                         ps_cu_node->ps_parent->u2_x0 =
    969                             gau1_cu_pos_x[blk_cnt]; /* Populate properly */
    970                         ps_cu_node->ps_parent->u2_y0 =
    971                             gau1_cu_pos_y[blk_cnt]; /* Populate properly */
    972                         ps_cu_node->ps_parent->best_mode =
    973                             INTRA_DC;  //ps_ed_blk_l2->best_merge_mode;
    974                         /* CU size 32x32 and fill the final cu params */
    975 
    976                         /* fill in the first modes as invalid */
    977                         ps_cu_node->ps_parent->au1_best_mode_1tu[0] = INTRA_DC;
    978                         ps_cu_node->ps_parent->au1_best_mode_1tu[1] =
    979                             INTRA_DC;  //for safery. Since update_cand_list will set num_modes as 3
    980                         ps_cu_node->ps_parent->au1_best_mode_1tu[2] = INTRA_DC;
    981 
    982                         ps_cu_node->ps_parent->au1_best_mode_4tu[0] = INTRA_DC;
    983                         ps_cu_node->ps_parent->au1_best_mode_4tu[1] = INTRA_DC;
    984                         ps_cu_node->ps_parent->au1_best_mode_4tu[2] = INTRA_DC;
    985 
    986                         ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
    987 
    988                         //ps_row_cu->s_cu_intra_cand.b6_num_intra_cands = 0;
    989                         //ps_row_cu->u1_num_intra_rdopt_cands = 0;
    990 
    991                         ps_intra32_analyse->b1_valid_cu = 0;
    992                         ps_intra32_analyse->b1_split_flag = 0;
    993                         ps_intra32_analyse->b1_merge_flag = 0;
    994                         /*memset (&ps_intra32_analyse->au1_best_modes_32x32_tu,
    995                         255,
    996                         NUM_BEST_MODES);
    997                         memset (&ps_intra32_analyse->au1_best_modes_16x16_tu,
    998                         255,
    999                         NUM_BEST_MODES);*/
   1000                         //set only first mode since if it's 255. it wont go ahead
   1001                         ps_intra32_analyse->au1_best_modes_32x32_tu[0] = 255;
   1002                         ps_intra32_analyse->au1_best_modes_16x16_tu[0] = 255;
   1003                         ps_intra32_analyse->i4_best_intra_cost = MAX_INTRA_COST_IPE;
   1004 
   1005                         *pi4_intra_32_cost = MAX_INTRA_COST_IPE;
   1006 
   1007                         /*since ME will start evaluating from bottom up, set the lower
   1008                         cu size data invalid */
   1009                         for(i4_local_ctr1 = 0; i4_local_ctr1 < 4; i4_local_ctr1++)
   1010                         {
   1011                             WORD32 *pi4_intra_8_cost_curr16;
   1012 
   1013                             ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
   1014                                 .au1_best_modes_16x16_tu[0] = 255;
   1015                             ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
   1016                                 .au1_best_modes_8x8_tu[0] = 255;
   1017                             ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
   1018                                 .i4_best_intra_cost = MAX_INTRA_COST_IPE;
   1019                             ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1].b1_merge_flag = 0;
   1020                             ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1].b1_valid_cu = 0;
   1021                             ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1].b1_split_flag = 0;
   1022 
   1023                             pi4_intra_16_cost
   1024                                 [(i4_local_ctr1 & 1) + ((MAX_CU_IN_CTB_ROW >> 1) *
   1025                                                         (i4_local_ctr1 >> 1))] = MAX_INTRA_COST_IPE;
   1026 
   1027                             pi4_intra_8_cost_curr16 = pi4_intra_8_cost + ((i4_local_ctr1 & 1) << 1);
   1028                             pi4_intra_8_cost_curr16 +=
   1029                                 ((i4_local_ctr1 >> 1) << 1) * MAX_CU_IN_CTB_ROW;
   1030 
   1031                             for(i4_local_ctr2 = 0; i4_local_ctr2 < 4; i4_local_ctr2++)
   1032                             {
   1033                                 ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
   1034                                     .as_intra8_analyse[i4_local_ctr2]
   1035                                     .au1_4x4_best_modes[0][0] = 255;
   1036                                 ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
   1037                                     .as_intra8_analyse[i4_local_ctr2]
   1038                                     .au1_4x4_best_modes[1][0] = 255;
   1039                                 ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
   1040                                     .as_intra8_analyse[i4_local_ctr2]
   1041                                     .au1_4x4_best_modes[2][0] = 255;
   1042                                 ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
   1043                                     .as_intra8_analyse[i4_local_ctr2]
   1044                                     .au1_4x4_best_modes[3][0] = 255;
   1045                                 ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
   1046                                     .as_intra8_analyse[i4_local_ctr2]
   1047                                     .au1_best_modes_8x8_tu[0] = 255;
   1048                                 ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
   1049                                     .as_intra8_analyse[i4_local_ctr2]
   1050                                     .au1_best_modes_4x4_tu[0] = 255;
   1051                                 ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
   1052                                     .as_intra8_analyse[i4_local_ctr2]
   1053                                     .i4_best_intra_cost = MAX_INTRA_COST_IPE;
   1054                                 ps_intra32_analyse->as_intra16_analyse[i4_local_ctr1]
   1055                                     .as_intra8_analyse[i4_local_ctr2]
   1056                                     .b1_valid_cu = 0;
   1057 
   1058                                 pi4_intra_8_cost_curr16
   1059                                     [(i4_local_ctr2 & 1) +
   1060                                      (MAX_CU_IN_CTB_ROW * (i4_local_ctr2 >> 1))] =
   1061                                         MAX_INTRA_COST_IPE;
   1062                             }
   1063                         }
   1064 
   1065                         /* set neighbours even if intra is not evaluated, since source is always available. */
   1066                         ihevce_set_nbr_map(
   1067                             ps_ctxt->pu1_ctb_nbr_map,
   1068                             ps_ctxt->i4_nbr_map_strd,
   1069                             ps_cu_node->ps_parent->u2_x0 << 1,
   1070                             ps_cu_node->ps_parent->u2_y0 << 1,
   1071                             (ps_cu_node->ps_parent->u1_cu_size >> 2),
   1072                             1);
   1073 
   1074                         /* cost accumulation of best cu size candidate */
   1075                         /*i8_frame_acc_satd_cost += parent_cost;*/
   1076 
   1077                         /* Mode bits cost accumulation for best cu size and cu mode */
   1078                         /*i8_frame_acc_mode_bits_cost += ps_cu_node->ps_parent->u2_mode_bits_cost;*/
   1079 
   1080                         /*satd/mod_qp accumulation of best cu */
   1081                         /*i8_frame_acc_satd_by_modqp_q10 += ((LWORD64)ps_cu_node->ps_parent->best_satd << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3))/i4_q_scale_q3_mod;*/
   1082 
   1083                         /* Increment pointers */
   1084                         ps_ed_blk_l1 += 16;
   1085                         blk_cnt += 16;
   1086                         //ps_row_cu++;
   1087                         merge_64x64 = 0;
   1088 
   1089                         /* increment for stat purpose only. Increment is valid only on single thread */
   1090                         ps_ctxt->u4_num_16x16_skips_at_L0_IPE += 4;
   1091                     }
   1092                     else
   1093                     {
   1094                         /* Revaluation of 4 16x16 blocks at 8x8 prediction level */
   1095                         //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
   1096 
   1097                         if((ps_ctxt->i4_quality_preset == IHEVCE_QUALITY_P6) &&
   1098                            (ps_ctxt->i4_slice_type == PSLICE))
   1099                         {
   1100                             ps_ctxt->u1_disable_child_cu_decide = 1;
   1101                             step2_bypass = 0;
   1102                         }
   1103 
   1104                         /* Based on the flag, Child modes decision can be disabled*/
   1105                         if(0 == ps_ctxt->u1_disable_child_cu_decide)
   1106                         {
   1107                             for(j = 0; j < 4; j++)
   1108                             {
   1109                                 ps_cu_node->ps_sub_cu[j]->u2_x0 =
   1110                                     gau1_cu_pos_x[blk_cnt + (j * 4)]; /* Populate properly */
   1111                                 ps_cu_node->ps_sub_cu[j]->u2_y0 =
   1112                                     gau1_cu_pos_y[blk_cnt + (j * 4)]; /* Populate properly */
   1113                                 ps_cu_node->ps_sub_cu[j]->u1_cu_size = 16;
   1114 
   1115                                 {
   1116                                     WORD32 best_ang_mode =
   1117                                         (ps_ed_blk_l1 + (j * 4))->best_merge_mode;
   1118 
   1119                                     if(best_ang_mode < 2)
   1120                                         best_ang_mode = 26;
   1121 
   1122                                     ihevce_mode_eval_filtering(
   1123                                         ps_cu_node->ps_sub_cu[j],
   1124                                         ps_cu_node,
   1125                                         ps_ctxt,
   1126                                         ps_curr_src,
   1127                                         best_ang_mode,
   1128                                         &ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0],
   1129                                         &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
   1130                                         !step2_bypass,
   1131                                         1);
   1132 
   1133                                     if(i4_enable_4cu_16tu)
   1134                                     {
   1135                                         ihevce_mode_eval_filtering(
   1136                                             ps_cu_node->ps_sub_cu[j],
   1137                                             ps_cu_node,
   1138                                             ps_ctxt,
   1139                                             ps_curr_src,
   1140                                             best_ang_mode,
   1141                                             &ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
   1142                                             &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
   1143                                             !step2_bypass,
   1144                                             0);
   1145                                     }
   1146                                     else
   1147                                     {
   1148                                         /* 4TU not evaluated :  4tu modes set same as 1tu modes */
   1149                                         memcpy(
   1150                                             &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
   1151                                             &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
   1152                                             NUM_BEST_MODES);
   1153 
   1154                                         /* 4TU not evaluated : currently 4tu cost set same as 1tu cost */
   1155                                         memcpy(
   1156                                             &ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
   1157                                             &ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0],
   1158                                             NUM_BEST_MODES * sizeof(WORD32));
   1159                                     }
   1160 
   1161                                     child_cost[j] =
   1162                                         MIN(ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
   1163                                             ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0]);
   1164 
   1165                                     /* Child cost is sum of costs at 16x16 level  */
   1166                                     child_cost_least += child_cost[j];
   1167 
   1168                                     /* Select the best mode to be populated as top and left nbr depending on the
   1169                                     4tu and 1tu cost */
   1170                                     if(ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0] >
   1171                                        ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0])
   1172                                     {
   1173                                         ps_cu_node->ps_sub_cu[j]->best_mode =
   1174                                             ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0];
   1175                                     }
   1176                                     else
   1177                                     {
   1178                                         ps_cu_node->ps_sub_cu[j]->best_mode =
   1179                                             ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0];
   1180                                     }
   1181 
   1182                                     { /* Update the CTB nodes only for MAX - 1 CU nodes */
   1183                                         WORD32 xA, yA, row, col;
   1184                                         xA = ((ps_cu_node->ps_sub_cu[j]->u2_x0 << 3) >> 2) + 1;
   1185                                         yA = ((ps_cu_node->ps_sub_cu[j]->u2_y0 << 3) >> 2) + 1;
   1186                                         size = ps_cu_node->ps_sub_cu[j]->u1_cu_size >> 2;
   1187                                         for(row = yA; row < (yA + size); row++)
   1188                                         {
   1189                                             for(col = xA; col < (xA + size); col++)
   1190                                             {
   1191                                                 ps_ctxt->au1_ctb_mode_map[row][col] =
   1192                                                     ps_cu_node->ps_sub_cu[j]->best_mode;
   1193                                             }
   1194                                         }
   1195                                     }
   1196                                 }
   1197 
   1198                                 /*Child SATD cost*/
   1199                                 child_satd[j] = ps_cu_node->ps_sub_cu[j]->best_satd;
   1200 
   1201                                 /* store the child 16x16 costs */
   1202                                 pi4_intra_16_cost[(j & 1) + ((MAX_CU_IN_CTB_ROW >> 1) * (j >> 1))] =
   1203                                     child_cost[j];
   1204 
   1205                                 /* set the CU valid flag */
   1206                                 ps_intra16_analyse[j].b1_valid_cu = 1;
   1207 
   1208                                 /* All 16x16 merge is valid, if Cu 32x32 is chosen */
   1209                                 /* To be reset, if CU 64x64 is chosen */
   1210                                 ps_intra16_analyse[j].b1_merge_flag = 1;
   1211 
   1212                                 /* storing the modes to intra 16 analyse */
   1213                                 /* store the best 16x16 modes 8x8 tu */
   1214                                 memcpy(
   1215                                     &ps_intra16_analyse[j].au1_best_modes_8x8_tu[0],
   1216                                     &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
   1217                                     sizeof(UWORD8) * (NUM_BEST_MODES));
   1218                                 ps_intra16_analyse[j].au1_best_modes_8x8_tu[NUM_BEST_MODES] = 255;
   1219 
   1220                                 /* store the best 16x16 modes 16x16 tu */
   1221                                 memcpy(
   1222                                     &ps_intra16_analyse[j].au1_best_modes_16x16_tu[0],
   1223                                     &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
   1224                                     sizeof(UWORD8) * (NUM_BEST_MODES));
   1225                                 ps_intra16_analyse[j].au1_best_modes_16x16_tu[NUM_BEST_MODES] = 255;
   1226 
   1227                                 /* divide the 16x16 costs (pro rating) to 4 8x8 costs */
   1228                                 /* store the same 16x16 modes as 4 8x8 child modes    */
   1229                                 {
   1230                                     WORD32 idx_8x8;
   1231                                     WORD32 *pi4_intra_8_cost_curr16;
   1232                                     intra8_analyse_t *ps_intra8_analyse;
   1233 
   1234                                     pi4_intra_8_cost_curr16 = pi4_intra_8_cost + ((j & 1) << 1);
   1235                                     pi4_intra_8_cost_curr16 += ((j >> 1) << 1) * MAX_CU_IN_CTB_ROW;
   1236 
   1237                                     for(idx_8x8 = 0; idx_8x8 < 4; idx_8x8++)
   1238                                     {
   1239                                         pi4_intra_8_cost_curr16
   1240                                             [(idx_8x8 & 1) + (MAX_CU_IN_CTB_ROW * (idx_8x8 >> 1))] =
   1241                                                 (child_cost[j] + 3) >> 2;
   1242 
   1243                                         ps_intra8_analyse =
   1244                                             &ps_intra16_analyse[j].as_intra8_analyse[idx_8x8];
   1245 
   1246                                         ps_intra8_analyse->b1_enable_nxn = 0;
   1247                                         ps_intra8_analyse->b1_valid_cu = 1;
   1248 
   1249                                         /* store the best 8x8 modes 8x8 tu */
   1250                                         memcpy(
   1251                                             &ps_intra8_analyse->au1_best_modes_8x8_tu[0],
   1252                                             &ps_intra16_analyse[j].au1_best_modes_8x8_tu[0],
   1253                                             sizeof(UWORD8) * (NUM_BEST_MODES + 1));
   1254 
   1255                                         /* store the best 8x8 modes 4x4 tu */
   1256                                         memcpy(
   1257                                             &ps_intra8_analyse->au1_best_modes_4x4_tu[0],
   1258                                             &ps_intra16_analyse[j].au1_best_modes_8x8_tu[0],
   1259                                             sizeof(UWORD8) * (NUM_BEST_MODES + 1));
   1260 
   1261                                         /* NXN modes not evaluated, hence filled with 255 */
   1262                                         memset(
   1263                                             &ps_intra8_analyse->au1_4x4_best_modes[0][0],
   1264                                             255,
   1265                                             sizeof(UWORD8) * 4 * (NUM_BEST_MODES + 1));
   1266                                     }
   1267                                 }
   1268                             }
   1269 
   1270                             ihevce_set_nbr_map(
   1271                                 ps_ctxt->pu1_ctb_nbr_map,
   1272                                 ps_ctxt->i4_nbr_map_strd,
   1273                                 ps_cu_node->ps_sub_cu[0]->u2_x0 << 1,
   1274                                 ps_cu_node->ps_sub_cu[0]->u2_y0 << 1,
   1275                                 (ps_cu_node->ps_sub_cu[0]->u1_cu_size >> 1),
   1276                                 0);
   1277                         }
   1278 #if 1  //DISBLE_CHILD_CU_EVAL_L0_IPE //1
   1279                         else
   1280                         {
   1281                             for(j = 0; j < 4; j++)
   1282                             {
   1283                                 WORD32 idx_8x8;
   1284                                 intra8_analyse_t *ps_intra8_analyse;
   1285                                 ps_intra16_analyse[j].au1_best_modes_8x8_tu[0] = 255;
   1286                                 ps_intra16_analyse[j].au1_best_modes_16x16_tu[0] = 255;
   1287 
   1288                                 ps_intra16_analyse[j].b1_valid_cu = 0;
   1289 
   1290                                 for(idx_8x8 = 0; idx_8x8 < 4; idx_8x8++)
   1291                                 {
   1292                                     ps_intra8_analyse =
   1293                                         &ps_intra16_analyse[j].as_intra8_analyse[idx_8x8];
   1294 
   1295                                     ps_intra8_analyse->au1_best_modes_8x8_tu[0] = 255;
   1296                                     ps_intra8_analyse->au1_best_modes_4x4_tu[0] = 255;
   1297 
   1298                                     ps_intra8_analyse->b1_enable_nxn = 0;
   1299                                     ps_intra8_analyse->b1_valid_cu = 0;
   1300 
   1301                                     /* NXN modes not evaluated, hence filled with 255 */
   1302                                     memset(
   1303                                         &ps_intra8_analyse->au1_4x4_best_modes[0][0],
   1304                                         255,
   1305                                         sizeof(UWORD8) * 4 * (NUM_BEST_MODES + 1));
   1306                                 }
   1307                             }
   1308 
   1309                             child_cost_least = MAX_INTRA_COST_IPE;
   1310                         }
   1311 #endif
   1312 
   1313                         /* Populate params for 32x32 block analysis */
   1314 
   1315                         ps_cu_node->ps_parent->u1_cu_size = 32;
   1316                         ps_cu_node->ps_parent->u2_x0 =
   1317                             gau1_cu_pos_x[blk_cnt]; /* Populate properly */
   1318                         ps_cu_node->ps_parent->u2_y0 =
   1319                             gau1_cu_pos_y[blk_cnt]; /* Populate properly */
   1320 
   1321                         /* Re-evaluation for 32x32 parent block at 16x16 prediction level */
   1322                         //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
   1323 
   1324                         {
   1325                             /* Eval for TUSize = CuSize */
   1326                             ihevce_mode_eval_filtering(
   1327                                 ps_cu_node->ps_parent,
   1328                                 ps_cu_node,
   1329                                 ps_ctxt,
   1330                                 ps_curr_src,
   1331                                 26,
   1332                                 &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
   1333                                 &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
   1334                                 step2_bypass,
   1335                                 1);
   1336 
   1337                             if(i4_enable_1cu_4tu)
   1338                             {
   1339                                 /* Eval for TUSize = CuSize/2 */
   1340                                 ihevce_mode_eval_filtering(
   1341                                     ps_cu_node->ps_parent,
   1342                                     ps_cu_node,
   1343                                     ps_ctxt,
   1344                                     ps_curr_src,
   1345                                     26,
   1346                                     &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
   1347                                     &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
   1348                                     step2_bypass,
   1349                                     0);
   1350                             }
   1351                             else
   1352                             {
   1353                                 /* 4TU not evaluated :  4tu modes set same as 1tu modes */
   1354                                 memcpy(
   1355                                     &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
   1356                                     &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
   1357                                     NUM_BEST_MODES);
   1358 
   1359                                 /* 4TU not evaluated : currently 4tu cost set same as 1tu cost */
   1360                                 memcpy(
   1361                                     &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
   1362                                     &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
   1363                                     NUM_BEST_MODES * sizeof(WORD32));
   1364                             }
   1365                         }
   1366 
   1367                         ps_ctxt->u1_disable_child_cu_decide = 0;
   1368                         step2_bypass = 1;
   1369 
   1370                         /* Update parent cost */
   1371                         parent_cost =
   1372                             MIN(ps_cu_node->ps_parent->au4_best_cost_4tu[0],
   1373                                 ps_cu_node->ps_parent->au4_best_cost_1tu[0]);
   1374 
   1375                         /* Select the best mode to be populated as top and left nbr depending on the
   1376                         4tu and 1tu cost */
   1377                         if(ps_cu_node->ps_parent->au4_best_cost_4tu[0] >
   1378                            ps_cu_node->ps_parent->au4_best_cost_1tu[0])
   1379                         {
   1380                             ps_cu_node->ps_parent->best_mode =
   1381                                 ps_cu_node->ps_parent->au1_best_mode_1tu[0];
   1382                         }
   1383                         else
   1384                         {
   1385                             ps_cu_node->ps_parent->best_mode =
   1386                                 ps_cu_node->ps_parent->au1_best_mode_4tu[0];
   1387                         }
   1388 
   1389                         /* store the 32x32 cost */
   1390                         *pi4_intra_32_cost = parent_cost;
   1391 
   1392                         /* set the CU valid flag */
   1393                         ps_intra32_analyse->b1_valid_cu = 1;
   1394 
   1395                         ps_intra32_analyse->b1_merge_flag = 1;
   1396 
   1397                         /* storing the modes to intra 32 analyse */
   1398                         {
   1399                             /* store the best 32x32 modes 16x16 tu */
   1400                             memcpy(
   1401                                 &ps_intra32_analyse->au1_best_modes_16x16_tu[0],
   1402                                 &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
   1403                                 sizeof(UWORD8) * (NUM_BEST_MODES));
   1404                             ps_intra32_analyse->au1_best_modes_16x16_tu[NUM_BEST_MODES] = 255;
   1405 
   1406                             /* store the best 32x32 modes 32x32 tu */
   1407                             memcpy(
   1408                                 &ps_intra32_analyse->au1_best_modes_32x32_tu[0],
   1409                                 &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
   1410                                 sizeof(UWORD8) * (NUM_BEST_MODES));
   1411                             ps_intra32_analyse->au1_best_modes_32x32_tu[NUM_BEST_MODES] = 255;
   1412                         }
   1413                         parent_best_mode = ps_cu_node->ps_parent->best_mode;
   1414                         if((parent_cost <=
   1415                             child_cost_least + (ps_ctxt->i4_ol_satd_lambda * CHILD_BIAS >>
   1416                                                 LAMBDA_Q_SHIFT)))  //|| identical_modes)
   1417                         {
   1418                             WORD32 i4_q_scale_q3_mod;
   1419                             UWORD8 u1_cu_possible_qp;
   1420                             WORD32 i4_act_factor;
   1421 
   1422                             /* CU size 32x32 and fill the final cu params */
   1423 
   1424                             ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
   1425 
   1426                             if((IHEVCE_QUALITY_P3 > i4_quality_preset))
   1427                             {
   1428                                 for(i = 0; i < 4; i++)
   1429                                 {
   1430                                     intra8_analyse_t *ps_intra8_analyse;
   1431                                     ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[i];
   1432                                     for(j = 0; j < 4; j++)
   1433                                     {
   1434                                         /* Populate best 3 nxn modes */
   1435                                         ps_intra8_analyse->au1_4x4_best_modes[j][0] =
   1436                                             ps_cu_node->ps_sub_cu[i]->au1_best_mode_4tu[0];
   1437                                         ps_intra8_analyse->au1_4x4_best_modes[j][1] =
   1438                                             ps_cu_node->ps_sub_cu[i]
   1439                                                 ->au1_best_mode_4tu[1];  //(ps_ed + 1)->best_mode;
   1440                                         ps_intra8_analyse->au1_4x4_best_modes[j][2] =
   1441                                             ps_cu_node->ps_sub_cu[i]
   1442                                                 ->au1_best_mode_4tu[2];  //(ps_ed + 2)->best_mode;
   1443                                         ps_intra8_analyse->au1_4x4_best_modes[j][3] = 255;
   1444                                     }
   1445                                 }
   1446                             }
   1447                             /* store the 32x32 non split flag */
   1448                             ps_intra32_analyse->b1_split_flag = 0;
   1449                             ps_intra32_analyse->as_intra16_analyse[0].b1_split_flag = 0;
   1450                             ps_intra32_analyse->as_intra16_analyse[1].b1_split_flag = 0;
   1451                             ps_intra32_analyse->as_intra16_analyse[2].b1_split_flag = 0;
   1452                             ps_intra32_analyse->as_intra16_analyse[3].b1_split_flag = 0;
   1453 
   1454                             au1_best_32x32_modes[blk_cnt >> 4] =
   1455                                 ps_cu_node->ps_parent->au1_best_mode_1tu[0];
   1456 
   1457                             au4_best_32x32_cost[blk_cnt >> 4] =
   1458                                 ps_cu_node->ps_parent->au4_best_cost_1tu[0];
   1459                             /*As 32*32 has won, pick L2 8x8 qp which maps
   1460                             to L0 32x32 Qp*/
   1461                             ASSERT(((blk_cnt >> 4) & 3) == (blk_cnt >> 4));
   1462                             ASSERT(ps_ed_ctb_l1->i4_16x16_satd[blk_cnt >> 4][0] != -2);
   1463                             u1_cu_possible_qp = ihevce_cu_level_qp_mod(
   1464                                 ps_ctxt->i4_qscale,
   1465                                 ps_ed_ctb_l1->i4_16x16_satd[blk_cnt >> 4][0],
   1466                                 ps_ctxt->ld_curr_frame_16x16_log_avg[0],
   1467                                 f_strength,
   1468                                 &i4_act_factor,
   1469                                 &i4_q_scale_q3_mod,
   1470                                 ps_ctxt->ps_rc_quant_ctxt);
   1471                             /* cost accumulation of best cu size candidate */
   1472                             i8_frame_acc_satd_cost += parent_cost;
   1473 
   1474                             /* satd and mpm bits accumulation of best cu size candidate */
   1475                             i4_ctb_acc_satd += ps_cu_node->ps_parent->best_satd;
   1476 
   1477                             /* Mode bits cost accumulation for best cu size and cu mode */
   1478                             i8_frame_acc_mode_bits_cost += ps_cu_node->ps_parent->u2_mode_bits_cost;
   1479 
   1480                             /*satd/mod_qp accumulation of best cu */
   1481                             i8_frame_acc_satd_by_modqp_q10 +=
   1482                                 ((LWORD64)ps_cu_node->ps_parent->best_satd
   1483                                  << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
   1484                                 i4_q_scale_q3_mod;
   1485 
   1486                             /* Increment pointers */
   1487                             ps_ed_blk_l1 += 16;
   1488                             blk_cnt += 16;
   1489                             //ps_row_cu++;
   1490                             merge_64x64 &= 1;
   1491                         }
   1492                         else
   1493                         {
   1494                             /* store the 32x32 split flag */
   1495                             ps_intra32_analyse->b1_split_flag = 1;
   1496 
   1497                             /* CU size 16x16 and fill the final cu params for all 4 blocks */
   1498                             for(j = 0; j < 4; j++)
   1499                             {
   1500                                 WORD32 i4_q_scale_q3_mod;
   1501                                 UWORD8 u1_cu_possible_qp;
   1502                                 WORD32 i4_act_factor;
   1503 
   1504                                 /* Set CU split flag */
   1505                                 ASSERT(blk_cnt % 4 == 0);
   1506 
   1507                                 ihevce_update_cand_list(
   1508                                     ps_cu_node->ps_sub_cu[j], ps_ed_blk_l1, ps_ctxt);
   1509 
   1510                                 /* store the 16x16 non split flag  */
   1511                                 ps_intra16_analyse[j].b1_split_flag = 0;
   1512 
   1513                                 ASSERT(((blk_cnt >> 2) & 0xF) == (blk_cnt >> 2));
   1514                                 ASSERT(ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][0] != -2);
   1515                                 /*As 16*16 has won, pick L1 8x8 qp which maps
   1516                                 to L0 16x16 Qp*/
   1517                                 u1_cu_possible_qp = ihevce_cu_level_qp_mod(
   1518                                     ps_ctxt->i4_qscale,
   1519                                     ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][0],
   1520                                     ps_ctxt->ld_curr_frame_8x8_log_avg[0],
   1521                                     f_strength,
   1522                                     &i4_act_factor,
   1523                                     &i4_q_scale_q3_mod,
   1524                                     ps_ctxt->ps_rc_quant_ctxt);
   1525 
   1526                                 /*accum satd/qp for all child block*/
   1527                                 i8_frame_acc_satd_by_modqp_q10 +=
   1528                                     ((LWORD64)child_satd[j]
   1529                                      << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
   1530                                     i4_q_scale_q3_mod;
   1531 
   1532                                 /* Accumulate mode bits for all child blocks */
   1533                                 i8_frame_acc_mode_bits_cost +=
   1534                                     ps_cu_node->ps_sub_cu[j]->u2_mode_bits_cost;
   1535 
   1536                                 /* satd and mpm bits accumulation of best cu size candidate */
   1537                                 i4_ctb_acc_satd += child_satd[j];
   1538 
   1539                                 /* Increment pointers */
   1540                                 //ps_row_cu++;
   1541                                 ps_ed_blk_l1 += 4;
   1542                                 blk_cnt += 4;
   1543                             }
   1544 
   1545                             /* cost accumulation of best cu size candidate */
   1546                             i8_frame_acc_satd_cost += child_cost_least;
   1547 
   1548                             /* 64x64 merge is not possible */
   1549                             merge_64x64 = 0;
   1550                         }
   1551 
   1552                         //ps_ed_blk_l2 += 4;
   1553 
   1554                     }  //end of EIID's else
   1555 #endif
   1556                 }
   1557                 /* If Merge success for L1 max CU size 16x16 is chosen */
   1558                 else if(merge_16x16_l1)
   1559                 {
   1560 #if IP_DBG_L1_l2
   1561                     ps_cu_node->ps_parent->u1_cu_size = 16;
   1562                     ps_cu_node->ps_parent->u2_x0 = gau1_cu_pos_x[blk_cnt]; /* Populate properly */
   1563                     ps_cu_node->ps_parent->u2_y0 = gau1_cu_pos_y[blk_cnt]; /* Populate properly */
   1564                     ps_cu_node->ps_parent->best_mode = ps_ed_blk_l1->best_merge_mode;
   1565                     ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
   1566 
   1567                     blk_cnt += 4;
   1568                     ps_ed_blk_l1 += 4;
   1569                     ps_row_cu++;
   1570                     merge_64x64 = 0;
   1571 #else
   1572 
   1573                     /*EIID: evaluate only if L1 early-inter-intra decision is not favouring inter*/
   1574                     /* enable this only in B pictures */
   1575                     if(ps_ed_blk_l1->intra_or_inter == 2 && (ps_ctxt->i4_slice_type != ISLICE))
   1576                     {
   1577                         WORD32 i4_q_scale_q3_mod, i4_local_ctr;
   1578                         WORD8 i1_cu_possible_qp;
   1579                         WORD32 i4_act_factor;
   1580                         /* make cost infinity. */
   1581                         /* make modes invalid */
   1582                         /* update loop variables */
   1583                         /* set other output variables */
   1584                         /* dont set neighbour flag so that next blocks wont access this cu */
   1585                         /* what happens to ctb_mode_map?? */
   1586 
   1587                         ps_cu_node->ps_parent->u1_cu_size = 16;
   1588                         ps_cu_node->ps_parent->u2_x0 =
   1589                             gau1_cu_pos_x[blk_cnt]; /* Populate properly */
   1590                         ps_cu_node->ps_parent->u2_y0 =
   1591                             gau1_cu_pos_y[blk_cnt]; /* Populate properly */
   1592                         ps_cu_node->ps_parent->best_mode =
   1593                             INTRA_DC;  //ps_ed_blk_l1->best_merge_mode;
   1594 
   1595                         /* fill in the first modes as invalid */
   1596 
   1597                         ps_cu_node->ps_parent->au1_best_mode_1tu[0] = INTRA_DC;
   1598                         ps_cu_node->ps_parent->au1_best_mode_1tu[1] =
   1599                             INTRA_DC;  //for safery. Since update_cand_list will set num_modes as 3
   1600                         ps_cu_node->ps_parent->au1_best_mode_1tu[2] = INTRA_DC;
   1601 
   1602                         ps_cu_node->ps_parent->au1_best_mode_4tu[0] = INTRA_DC;
   1603                         ps_cu_node->ps_parent->au1_best_mode_4tu[1] = INTRA_DC;
   1604                         ps_cu_node->ps_parent->au1_best_mode_4tu[2] = INTRA_DC;
   1605 
   1606                         ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
   1607 
   1608                         //ps_row_cu->s_cu_intra_cand.b6_num_intra_cands = 0;
   1609                         //ps_row_cu->u1_num_intra_rdopt_cands = 0;
   1610 
   1611                         ps_intra32_analyse->b1_split_flag = 1;
   1612                         ps_intra32_analyse->b1_merge_flag = 0;
   1613 
   1614                         ps_intra16_analyse->b1_valid_cu = 0;
   1615                         ps_intra16_analyse->b1_split_flag = 0;
   1616                         ps_intra16_analyse->b1_merge_flag = 1;
   1617                         //memset (&ps_intra16_analyse->au1_best_modes_16x16_tu,
   1618                         //  255,
   1619                         //  NUM_BEST_MODES);
   1620                         //memset (&ps_intra16_analyse->au1_best_modes_8x8_tu,
   1621                         //  255,
   1622                         //  NUM_BEST_MODES);
   1623                         //set only first mode since if it's 255. it wont go ahead
   1624                         ps_intra16_analyse->au1_best_modes_16x16_tu[0] = 255;
   1625                         ps_intra16_analyse->au1_best_modes_8x8_tu[0] = 255;
   1626                         ps_intra16_analyse->i4_best_intra_cost = MAX_INTRA_COST_IPE;
   1627                         *pi4_intra_16_cost = MAX_INTRA_COST_IPE;
   1628 
   1629                         /*since ME will start evaluating from bottom up, set the lower
   1630                         cu size data invalid */
   1631                         for(i4_local_ctr = 0; i4_local_ctr < 4; i4_local_ctr++)
   1632                         {
   1633                             ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
   1634                                 .au1_4x4_best_modes[0][0] = 255;
   1635                             ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
   1636                                 .au1_4x4_best_modes[1][0] = 255;
   1637                             ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
   1638                                 .au1_4x4_best_modes[2][0] = 255;
   1639                             ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
   1640                                 .au1_4x4_best_modes[3][0] = 255;
   1641                             ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
   1642                                 .au1_best_modes_8x8_tu[0] = 255;
   1643                             ps_intra16_analyse->as_intra8_analyse[i4_local_ctr]
   1644                                 .au1_best_modes_4x4_tu[0] = 255;
   1645                             ps_intra16_analyse->as_intra8_analyse[i4_local_ctr].i4_best_intra_cost =
   1646                                 MAX_INTRA_COST_IPE;
   1647 
   1648                             pi4_intra_8_cost
   1649                                 [(i4_local_ctr & 1) + (MAX_CU_IN_CTB_ROW * (i4_local_ctr >> 1))] =
   1650                                     MAX_INTRA_COST_IPE;
   1651                         }
   1652 
   1653                         /* set neighbours even if intra is not evaluated, since source is always available. */
   1654                         ihevce_set_nbr_map(
   1655                             ps_ctxt->pu1_ctb_nbr_map,
   1656                             ps_ctxt->i4_nbr_map_strd,
   1657                             ps_cu_node->ps_parent->u2_x0 << 1,
   1658                             ps_cu_node->ps_parent->u2_y0 << 1,
   1659                             (ps_cu_node->ps_parent->u1_cu_size >> 2),
   1660                             1);
   1661 
   1662                         //what happens to RC variables??
   1663                         /* run only constant Qp */
   1664                         ASSERT(((blk_cnt >> 2) & 0xF) == (blk_cnt >> 2));
   1665                         ASSERT(ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][0] != -2);
   1666                         i1_cu_possible_qp = ihevce_cu_level_qp_mod(
   1667                             ps_ctxt->i4_qscale,
   1668                             ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][0],
   1669                             ps_ctxt->ld_curr_frame_8x8_log_avg[0],
   1670                             f_strength,
   1671                             &i4_act_factor,
   1672                             &i4_q_scale_q3_mod,
   1673                             ps_ctxt->ps_rc_quant_ctxt);
   1674 
   1675                         /* cost accumulation of best cu size candidate */
   1676                         i8_frame_acc_satd_cost += 0;  //parent_cost;  //incorrect accumulation
   1677 
   1678                         /*satd/mod_qp accumulation of best cu */
   1679                         i8_frame_acc_satd_by_modqp_q10 += 0;  //incorrect accumulation
   1680                         //((LWORD64)ps_cu_node->ps_parent->best_satd << SATD_BY_ACT_Q_FAC)/i4_q_scale_q3_mod;
   1681 
   1682                         /* Accumulate mode bits for all child blocks */
   1683                         i8_frame_acc_mode_bits_cost +=
   1684                             0;  //ps_cu_node->ps_parent->u2_mode_bits_cost;
   1685                         //incorrect accumulation
   1686 
   1687                         blk_cnt += 4;
   1688                         ps_ed_blk_l1 += 4;
   1689                         //ps_row_cu++;
   1690                         merge_64x64 = 0;
   1691 
   1692                         /* increment for stat purpose only. Increment is valid only on single thread */
   1693                         ps_ctxt->u4_num_16x16_skips_at_L0_IPE += 1;
   1694                     }
   1695                     else
   1696                     {
   1697                         /* 64x64 merge is not possible */
   1698                         merge_64x64 = 0;
   1699 
   1700                         /* set the 32x32 split flag to 1 */
   1701                         ps_intra32_analyse->b1_split_flag = 1;
   1702 
   1703                         ps_intra32_analyse->b1_merge_flag = 0;
   1704 
   1705                         ps_intra16_analyse->b1_merge_flag = 1;
   1706 
   1707                         if((ps_ctxt->i4_quality_preset == IHEVCE_QUALITY_P6) &&
   1708                            (ps_ctxt->i4_slice_type == PSLICE))
   1709                         {
   1710                             ps_ctxt->u1_disable_child_cu_decide = 1;
   1711                             step2_bypass = 0;
   1712                         }
   1713                         //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
   1714                         /* Based on the flag, Child modes decision can be disabled*/
   1715                         if(0 == ps_ctxt->u1_disable_child_cu_decide)
   1716                         {
   1717                             for(j = 0; j < 4; j++)
   1718                             {
   1719                                 intra8_analyse_t *ps_intra8_analyse;
   1720                                 WORD32 best_ang_mode = (ps_ed_blk_l1 + j)->best_mode;
   1721 
   1722                                 if(best_ang_mode < 2)
   1723                                     best_ang_mode = 26;
   1724 
   1725                                 //ps_cu_node->ps_sub_cu[j]->best_cost = MAX_INTRA_COST_IPE;
   1726                                 //ps_cu_node->ps_sub_cu[j]->best_mode = (ps_ed_blk_l1 + j)->best_mode;
   1727 
   1728                                 ps_cu_node->ps_sub_cu[j]->u2_x0 =
   1729                                     gau1_cu_pos_x[blk_cnt + j]; /* Populate properly */
   1730                                 ps_cu_node->ps_sub_cu[j]->u2_y0 =
   1731                                     gau1_cu_pos_y[blk_cnt + j]; /* Populate properly */
   1732                                 ps_cu_node->ps_sub_cu[j]->u1_cu_size = 8;
   1733 
   1734                                 ihevce_mode_eval_filtering(
   1735                                     ps_cu_node->ps_sub_cu[j],
   1736                                     ps_cu_node,
   1737                                     ps_ctxt,
   1738                                     ps_curr_src,
   1739                                     best_ang_mode,
   1740                                     &ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0],
   1741                                     &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
   1742                                     !step2_bypass,
   1743                                     1);
   1744 
   1745                                 if(i4_enable_4cu_16tu)
   1746                                 {
   1747                                     ihevce_mode_eval_filtering(
   1748                                         ps_cu_node->ps_sub_cu[j],
   1749                                         ps_cu_node,
   1750                                         ps_ctxt,
   1751                                         ps_curr_src,
   1752                                         best_ang_mode,
   1753                                         &ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
   1754                                         &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
   1755                                         !step2_bypass,
   1756                                         0);
   1757                                 }
   1758                                 else
   1759                                 {
   1760                                     /* 4TU not evaluated :  4tu modes set same as 1tu modes */
   1761                                     memcpy(
   1762                                         &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
   1763                                         &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
   1764                                         NUM_BEST_MODES);
   1765 
   1766                                     /* 4TU not evaluated : currently 4tu cost set same as 1tu cost */
   1767                                     memcpy(
   1768                                         &ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
   1769                                         &ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0],
   1770                                         NUM_BEST_MODES * sizeof(WORD32));
   1771                                 }
   1772 
   1773                                 child_cost[j] =
   1774                                     MIN(ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0],
   1775                                         ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0]);
   1776 
   1777                                 child_cost_least += child_cost[j];
   1778 
   1779                                 /* Select the best mode to be populated as top and left nbr depending on the
   1780                                 4tu and 1tu cost */
   1781                                 if(ps_cu_node->ps_sub_cu[j]->au4_best_cost_4tu[0] >
   1782                                    ps_cu_node->ps_sub_cu[j]->au4_best_cost_1tu[0])
   1783                                 {
   1784                                     ps_cu_node->ps_sub_cu[j]->best_mode =
   1785                                         ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0];
   1786                                 }
   1787                                 else
   1788                                 {
   1789                                     ps_cu_node->ps_sub_cu[j]->best_mode =
   1790                                         ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0];
   1791                                 }
   1792                                 { /* Update the CTB nodes only for MAX - 1 CU nodes */
   1793                                     WORD32 xA, yA, row, col;
   1794                                     xA = ((ps_cu_node->ps_sub_cu[j]->u2_x0 << 3) >> 2) + 1;
   1795                                     yA = ((ps_cu_node->ps_sub_cu[j]->u2_y0 << 3) >> 2) + 1;
   1796                                     size = ps_cu_node->ps_sub_cu[j]->u1_cu_size >> 2;
   1797                                     for(row = yA; row < (yA + size); row++)
   1798                                     {
   1799                                         for(col = xA; col < (xA + size); col++)
   1800                                         {
   1801                                             ps_ctxt->au1_ctb_mode_map[row][col] =
   1802                                                 ps_cu_node->ps_sub_cu[j]->best_mode;
   1803                                         }
   1804                                     }
   1805                                 }
   1806 
   1807                                 /*collect individual child satd for final SATD/qp accum*/
   1808                                 child_satd[j] = ps_cu_node->ps_sub_cu[j]->best_satd;
   1809 
   1810                                 ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[j];
   1811 
   1812                                 /* store the child 8x8 costs */
   1813                                 pi4_intra_8_cost[(j & 1) + (MAX_CU_IN_CTB_ROW * (j >> 1))] =
   1814                                     child_cost[j];
   1815 
   1816                                 /* set the CU valid flag */
   1817                                 ps_intra8_analyse->b1_valid_cu = 1;
   1818                                 ps_intra8_analyse->b1_enable_nxn = 0;
   1819 
   1820                                 /* storing the modes to intra8  analyse */
   1821 
   1822                                 /* store the best 8x8 modes 8x8 tu */
   1823                                 memcpy(
   1824                                     &ps_intra8_analyse->au1_best_modes_8x8_tu[0],
   1825                                     &ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0],
   1826                                     sizeof(UWORD8) * (NUM_BEST_MODES));
   1827                                 ps_intra8_analyse->au1_best_modes_8x8_tu[NUM_BEST_MODES] = 255;
   1828 
   1829                                 /* store the best 8x8 modes 4x4 tu */
   1830                                 memcpy(
   1831                                     &ps_intra8_analyse->au1_best_modes_4x4_tu[0],
   1832                                     &ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0],
   1833                                     sizeof(UWORD8) * (NUM_BEST_MODES));
   1834                                 ps_intra8_analyse->au1_best_modes_4x4_tu[NUM_BEST_MODES] = 255;
   1835 
   1836                                 /* NXN modes not evaluated hence set to 255 */
   1837                                 memset(
   1838                                     &ps_intra8_analyse->au1_4x4_best_modes[0][0],
   1839                                     255,
   1840                                     sizeof(UWORD8) * 4 * (NUM_BEST_MODES + 1));
   1841                             }
   1842 
   1843                             ihevce_set_nbr_map(
   1844                                 ps_ctxt->pu1_ctb_nbr_map,
   1845                                 ps_ctxt->i4_nbr_map_strd,
   1846                                 ps_cu_node->ps_sub_cu[0]->u2_x0 << 1,
   1847                                 ps_cu_node->ps_sub_cu[0]->u2_y0 << 1,
   1848                                 (ps_cu_node->ps_sub_cu[0]->u1_cu_size >> 1),
   1849                                 0);
   1850                         }
   1851 #if 1  //DISBLE_CHILD_CU_EVAL_L0_IPE //1
   1852                         else
   1853                         {
   1854                             for(j = 0; j < 4; j++)
   1855                             {
   1856                                 intra8_analyse_t *ps_intra8_analyse;
   1857                                 ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[j];
   1858                                 ps_intra8_analyse->au1_best_modes_8x8_tu[0] = 255;
   1859                                 ps_intra8_analyse->au1_best_modes_4x4_tu[0] = 255;
   1860                                 /* NXN modes not evaluated hence set to 255 */
   1861                                 memset(
   1862                                     &ps_intra8_analyse->au1_4x4_best_modes[0][0],
   1863                                     255,
   1864                                     sizeof(UWORD8) * 4 * (NUM_BEST_MODES + 1));
   1865 
   1866                                 ps_intra8_analyse->b1_valid_cu = 0;
   1867                                 ps_intra8_analyse->b1_enable_nxn = 0;
   1868                             }
   1869                             child_cost_least = MAX_INTRA_COST_IPE;
   1870                         }
   1871 #endif
   1872                         //ps_cu_node->ps_parent->best_mode = ps_ed_blk_l1->best_mode;
   1873                         //ps_cu_node->ps_parent->best_cost = MAX_INTRA_COST_IPE;
   1874 
   1875                         ps_cu_node->ps_parent->u1_cu_size = 16;
   1876                         ps_cu_node->ps_parent->u2_x0 =
   1877                             gau1_cu_pos_x[blk_cnt]; /* Populate properly */
   1878                         ps_cu_node->ps_parent->u2_y0 =
   1879                             gau1_cu_pos_y[blk_cnt]; /* Populate properly */
   1880 
   1881                         //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
   1882 
   1883                         /* Eval for TUSize = CuSize */
   1884                         ihevce_mode_eval_filtering(
   1885                             ps_cu_node->ps_parent,
   1886                             ps_cu_node,
   1887                             ps_ctxt,
   1888                             ps_curr_src,
   1889                             26,
   1890                             &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
   1891                             &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
   1892                             step2_bypass,
   1893                             1);
   1894 
   1895                         if(i4_enable_1cu_4tu)
   1896                         {
   1897                             /* Eval for TUSize = CuSize/2 */
   1898                             ihevce_mode_eval_filtering(
   1899                                 ps_cu_node->ps_parent,
   1900                                 ps_cu_node,
   1901                                 ps_ctxt,
   1902                                 ps_curr_src,
   1903                                 26,
   1904                                 &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
   1905                                 &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
   1906                                 step2_bypass,
   1907                                 0);
   1908                         }
   1909                         else
   1910                         {
   1911                             /* 4TU not evaluated :  4tu modes set same as 1tu modes */
   1912                             memcpy(
   1913                                 &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
   1914                                 &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
   1915                                 NUM_BEST_MODES);
   1916 
   1917                             /* 4TU not evaluated : currently 4tu cost set same as 1tu cost */
   1918                             memcpy(
   1919                                 &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
   1920                                 &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
   1921                                 NUM_BEST_MODES * sizeof(WORD32));
   1922                         }
   1923 
   1924                         ps_ctxt->u1_disable_child_cu_decide = 0;
   1925                         step2_bypass = 1;
   1926 
   1927                         /* Update parent cost */
   1928                         parent_cost =
   1929                             MIN(ps_cu_node->ps_parent->au4_best_cost_4tu[0],
   1930                                 ps_cu_node->ps_parent->au4_best_cost_1tu[0]);
   1931 
   1932                         /* Select the best mode to be populated as top and left nbr depending on the
   1933                         4tu and 1tu cost */
   1934                         if(ps_cu_node->ps_parent->au4_best_cost_4tu[0] >
   1935                            ps_cu_node->ps_parent->au4_best_cost_1tu[0])
   1936                         {
   1937                             ps_cu_node->ps_parent->best_mode =
   1938                                 ps_cu_node->ps_parent->au1_best_mode_1tu[0];
   1939                         }
   1940                         else
   1941                         {
   1942                             ps_cu_node->ps_parent->best_mode =
   1943                                 ps_cu_node->ps_parent->au1_best_mode_4tu[0];
   1944                         }
   1945 
   1946                         /* store the 16x16 cost */
   1947                         *pi4_intra_16_cost = parent_cost;
   1948 
   1949                         /* accumulate the 32x32 cost */
   1950                         if(MAX_INTRA_COST_IPE == *pi4_intra_32_cost)
   1951                         {
   1952                             *pi4_intra_32_cost = parent_cost;
   1953                         }
   1954                         else
   1955                         {
   1956                             *pi4_intra_32_cost += parent_cost;
   1957                         }
   1958 
   1959                         /* set the CU valid flag */
   1960                         ps_intra16_analyse->b1_valid_cu = 1;
   1961 
   1962                         /* storing the modes to intra 16 analyse */
   1963                         {
   1964                             /* store the best 16x16 modes 16x16 tu */
   1965                             memcpy(
   1966                                 &ps_intra16_analyse->au1_best_modes_16x16_tu[0],
   1967                                 &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
   1968                                 sizeof(UWORD8) * NUM_BEST_MODES);
   1969                             ps_intra16_analyse->au1_best_modes_16x16_tu[NUM_BEST_MODES] = 255;
   1970 
   1971                             /* store the best 16x16 modes 8x8 tu */
   1972                             memcpy(
   1973                                 &ps_intra16_analyse->au1_best_modes_8x8_tu[0],
   1974                                 &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
   1975                                 sizeof(UWORD8) * NUM_BEST_MODES);
   1976                             ps_intra16_analyse->au1_best_modes_8x8_tu[NUM_BEST_MODES] = 255;
   1977                         }
   1978 
   1979                         parent_best_mode = ps_cu_node->ps_parent->best_mode;
   1980                         if(parent_cost <=
   1981                            child_cost_least + (ps_ctxt->i4_ol_satd_lambda * CHILD_BIAS >>
   1982                                                LAMBDA_Q_SHIFT))  //|| identical_modes)
   1983                         {
   1984                             WORD32 i4_q_scale_q3_mod;
   1985                             WORD8 i1_cu_possible_qp;
   1986                             WORD32 i4_act_factor;
   1987                             //choose parent CU
   1988 
   1989                             ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
   1990 
   1991                             /* set the 16x16 non split flag */
   1992                             ps_intra16_analyse->b1_split_flag = 0;
   1993 
   1994                             /*As 16*16 has won, pick L1 8x8 qp which maps
   1995                             to L0 16x16 Qp*/
   1996                             ASSERT(((blk_cnt >> 4) & 3) == (blk_cnt >> 4));
   1997                             ASSERT(ps_ed_ctb_l1->i4_16x16_satd[blk_cnt >> 4][0] != -2);
   1998                             i1_cu_possible_qp = ihevce_cu_level_qp_mod(
   1999                                 ps_ctxt->i4_qscale,
   2000                                 ps_ed_ctb_l1->i4_16x16_satd[blk_cnt >> 4][0],
   2001                                 ps_ctxt->ld_curr_frame_8x8_log_avg[0],
   2002                                 f_strength,
   2003                                 &i4_act_factor,
   2004                                 &i4_q_scale_q3_mod,
   2005                                 ps_ctxt->ps_rc_quant_ctxt);
   2006 
   2007                             /* cost accumulation of best cu size candidate */
   2008                             i8_frame_acc_satd_cost += parent_cost;
   2009 
   2010                             /* satd and mpm bits accumulation of best cu size candidate */
   2011                             i4_ctb_acc_satd += ps_cu_node->ps_parent->best_satd;
   2012 
   2013                             /*satd/mod_qp accumulation of best cu */
   2014                             i8_frame_acc_satd_by_modqp_q10 +=
   2015                                 ((LWORD64)ps_cu_node->ps_parent->best_satd
   2016                                  << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
   2017                                 i4_q_scale_q3_mod;
   2018 
   2019                             /* Accumulate mode bits of the parent (16x16) block */
   2020                             i8_frame_acc_mode_bits_cost += ps_cu_node->ps_parent->u2_mode_bits_cost;
   2021 
   2022                             blk_cnt += 4;
   2023                             ps_ed_blk_l1 += 4;
   2024                             //ps_row_cu++;
   2025                         }
   2026                         else
   2027                         {
   2028                             //choose child CU
   2029                             WORD8 i1_cu_possible_qp;
   2030                             WORD32 i4_act_factor;
   2031                             WORD32 i4_q_scale_q3_mod;
   2032 
   2033                             ASSERT(((blk_cnt >> 2) & 0xF) == (blk_cnt >> 2));
   2034                             ASSERT(ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][1] != -2);
   2035                             i1_cu_possible_qp = ihevce_cu_level_qp_mod(
   2036                                 ps_ctxt->i4_qscale,
   2037                                 ps_ed_ctb_l1->i4_8x8_satd[blk_cnt >> 2][1],
   2038                                 ps_ctxt->ld_curr_frame_8x8_log_avg[1],
   2039                                 f_strength,
   2040                                 &i4_act_factor,
   2041                                 &i4_q_scale_q3_mod,
   2042                                 ps_ctxt->ps_rc_quant_ctxt);
   2043 
   2044                             /* set the 16x16 split flag */
   2045                             ps_intra16_analyse->b1_split_flag = 1;
   2046 
   2047                             for(j = 0; j < 4; j++)
   2048                             {
   2049                                 ihevce_update_cand_list(
   2050                                     ps_cu_node->ps_sub_cu[j], ps_ed_blk_l1, ps_ctxt);
   2051 
   2052                                 if((IHEVCE_QUALITY_P3 > i4_quality_preset))
   2053                                 {
   2054                                     WORD32 k;
   2055                                     intra8_analyse_t *ps_intra8_analyse;
   2056                                     ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[j];
   2057 
   2058                                     for(k = 0; k < 4; k++)
   2059                                     {
   2060                                         /* Populate best 3 nxn modes */
   2061                                         ps_intra8_analyse->au1_4x4_best_modes[k][0] =
   2062                                             ps_cu_node->ps_sub_cu[j]->au1_best_mode_4tu[0];
   2063                                         ps_intra8_analyse->au1_4x4_best_modes[k][1] =
   2064                                             ps_cu_node->ps_sub_cu[j]
   2065                                                 ->au1_best_mode_4tu[1];  //(ps_ed + 1)->best_mode;
   2066                                         ps_intra8_analyse->au1_4x4_best_modes[k][2] =
   2067                                             ps_cu_node->ps_sub_cu[j]
   2068                                                 ->au1_best_mode_4tu[2];  //(ps_ed + 2)->best_mode;
   2069                                         ps_intra8_analyse->au1_4x4_best_modes[k][3] = 255;
   2070                                     }
   2071                                 }
   2072                                 /*accum satd/qp for all child block*/
   2073                                 i8_frame_acc_satd_by_modqp_q10 +=
   2074                                     ((LWORD64)child_satd[j]
   2075                                      << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
   2076                                     i4_q_scale_q3_mod;
   2077 
   2078                                 /* Accumulate mode bits for all child blocks */
   2079                                 i8_frame_acc_mode_bits_cost +=
   2080                                     ps_cu_node->ps_sub_cu[j]->u2_mode_bits_cost;
   2081 
   2082                                 /* satd and mpm bits accumulation of best cu size candidate */
   2083                                 i4_ctb_acc_satd += child_satd[j];
   2084 
   2085                                 blk_cnt += 1;
   2086                                 ps_ed_blk_l1 += 1;
   2087                                 //ps_row_cu++;
   2088                             }
   2089 
   2090                             /* cost accumulation of best cu size candidate */
   2091                             i8_frame_acc_satd_cost += child_cost_least;
   2092                         }
   2093 
   2094                     }  //else of EIID
   2095 #endif
   2096                 }  // if(merge_16x16_l1)
   2097                 /* MAX CU SIZE 8x8 */
   2098                 else
   2099                 {
   2100 #if IP_DBG_L1_l2
   2101                     for(i = 0; i < 4; i++)
   2102                     {
   2103                         ps_cu_node->ps_parent->u1_cu_size = 8;
   2104                         ps_cu_node->ps_parent->u2_x0 =
   2105                             gau1_cu_pos_x[blk_cnt]; /* Populate properly */
   2106                         ps_cu_node->ps_parent->u2_y0 =
   2107                             gau1_cu_pos_y[blk_cnt]; /* Populate properly */
   2108                         ps_cu_node->ps_parent->best_mode = ps_ed_blk_l1->best_mode;
   2109 
   2110                         ihevce_update_cand_list(ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
   2111                         blk_cnt++;
   2112                         ps_ed_blk_l1++;
   2113                         ps_row_cu++;
   2114                         merge_64x64 = 0;
   2115                     }
   2116 #else
   2117 
   2118                     /* EIID: Skip all four 8x8 blocks if the L1 decision says skip intra */
   2119                     if(ps_ed_blk_l1->intra_or_inter == 2 && (ps_ctxt->i4_slice_type != ISLICE))
   2120                     {
   2121                         WORD32 i4_q_scale_q3_mod;
   2122                         WORD8 i1_cu_possible_qp;
   2123                         WORD32 i4_act_factor;
   2124 
   2125                         merge_64x64 = 0;
   2126 
   2127                         ps_intra32_analyse->b1_merge_flag = 0;
   2128 
   2129                         ps_intra16_analyse->au1_best_modes_8x8_tu[0] = 255;
   2130                         ps_intra16_analyse->au1_best_modes_8x8_tu[1] = 255;
   2131                         ps_intra16_analyse->au1_best_modes_8x8_tu[2] = 255;
   2132 
   2133                         ps_intra16_analyse->au1_best_modes_16x16_tu[0] = 255;
   2134                         ps_intra16_analyse->au1_best_modes_16x16_tu[1] = 255;
   2135                         ps_intra16_analyse->au1_best_modes_16x16_tu[2] = 255;
   2136                         ps_intra16_analyse->b1_split_flag = 1;
   2137                         ps_intra16_analyse->b1_valid_cu = 0;
   2138                         ps_intra16_analyse->b1_merge_flag = 0;
   2139 
   2140                         ps_intra16_analyse->i4_best_intra_cost = MAX_INTRA_COST_IPE;
   2141 
   2142                         for(i = 0; i < 4; i++)
   2143                         {
   2144                             intra8_analyse_t *ps_intra8_analyse;
   2145                             WORD32 ctr_sub_cu;
   2146 
   2147                             cu_pos_x = gau1_cu_pos_x[blk_cnt];
   2148                             cu_pos_y = gau1_cu_pos_y[blk_cnt];
   2149 
   2150                             if((cu_pos_x < num_8x8_blks_x) && (cu_pos_y < num_8x8_blks_y))
   2151                             {
   2152                                 ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[i];
   2153 
   2154                                 ps_intra8_analyse->b1_valid_cu = 0;
   2155                                 ps_intra8_analyse->b1_enable_nxn = 0;
   2156                                 ps_intra8_analyse->au1_4x4_best_modes[0][0] = 255;
   2157                                 ps_intra8_analyse->au1_4x4_best_modes[1][0] = 255;
   2158                                 ps_intra8_analyse->au1_4x4_best_modes[2][0] = 255;
   2159                                 ps_intra8_analyse->au1_4x4_best_modes[3][0] = 255;
   2160                                 ps_intra8_analyse->au1_best_modes_4x4_tu[0] = 255;
   2161                                 ps_intra8_analyse->au1_best_modes_8x8_tu[0] = 255;
   2162                                 ps_intra8_analyse->i4_best_intra_cost = MAX_INTRA_COST_IPE;
   2163 
   2164                                 ps_cu_node->ps_parent->u1_cu_size = 8;
   2165                                 ps_cu_node->ps_parent->u2_x0 =
   2166                                     gau1_cu_pos_x[blk_cnt]; /* Populate properly */
   2167                                 ps_cu_node->ps_parent->u2_y0 =
   2168                                     gau1_cu_pos_y[blk_cnt]; /* Populate properly */
   2169                                 ps_cu_node->ps_parent->best_mode =
   2170                                     INTRA_DC;  //ps_ed_blk_l1->best_mode;
   2171 
   2172                                 /* fill in the first modes as invalid */
   2173 
   2174                                 ps_cu_node->ps_parent->au1_best_mode_1tu[0] = INTRA_DC;
   2175                                 ps_cu_node->ps_parent->au1_best_mode_1tu[1] =
   2176                                     INTRA_DC;  //for safery. Since update_cand_list will set num_modes as 3
   2177                                 ps_cu_node->ps_parent->au1_best_mode_1tu[2] = INTRA_DC;
   2178 
   2179                                 ps_cu_node->ps_parent->au1_best_mode_4tu[0] = INTRA_DC;
   2180                                 ps_cu_node->ps_parent->au1_best_mode_4tu[1] = INTRA_DC;
   2181                                 ps_cu_node->ps_parent->au1_best_mode_4tu[2] = INTRA_DC;
   2182 
   2183                                 ihevce_update_cand_list(
   2184                                     ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
   2185 
   2186                                 //ps_row_cu->s_cu_intra_cand.b6_num_intra_cands = 0;
   2187                                 //ps_row_cu->u1_num_intra_rdopt_cands = 0;
   2188 
   2189                                 for(ctr_sub_cu = 0; ctr_sub_cu < 4; ctr_sub_cu++)
   2190                                 {
   2191                                     ps_cu_node->ps_sub_cu[ctr_sub_cu]->au1_best_mode_1tu[0] =
   2192                                         INTRA_DC;
   2193                                     ps_cu_node->ps_sub_cu[ctr_sub_cu]->au1_best_mode_4tu[0] =
   2194                                         INTRA_DC;
   2195                                     ps_cu_node->ps_sub_cu[ctr_sub_cu]->au4_best_cost_1tu[0] =
   2196                                         MAX_INTRA_COST_IPE;
   2197 
   2198                                     ps_cu_node->ps_sub_cu[ctr_sub_cu]->au4_best_cost_4tu[0] =
   2199                                         MAX_INTRA_COST_IPE;
   2200                                     ps_cu_node->ps_sub_cu[ctr_sub_cu]->best_cost =
   2201                                         MAX_INTRA_COST_IPE;
   2202                                 }
   2203 
   2204                                 pi4_intra_8_cost[(i & 1) + (MAX_CU_IN_CTB_ROW * (i >> 1))] =
   2205                                     MAX_INTRA_COST_IPE;
   2206 
   2207                                 ASSERT(((blk_cnt >> 2) & 0xF) == (blk_cnt >> 2));
   2208                                 ASSERT(ps_ed_ctb_l1->i4_8x8_satd[(blk_cnt >> 2)][1] != -2);
   2209                                 i1_cu_possible_qp = ihevce_cu_level_qp_mod(
   2210                                     ps_ctxt->i4_qscale,
   2211                                     ps_ed_ctb_l1->i4_8x8_satd[(blk_cnt >> 2)][1],
   2212                                     ps_ctxt->ld_curr_frame_8x8_log_avg[1],
   2213                                     f_strength,
   2214                                     &i4_act_factor,
   2215                                     &i4_q_scale_q3_mod,
   2216                                     ps_ctxt->ps_rc_quant_ctxt);
   2217 
   2218                                 /* set neighbours even if intra is not evaluated, since source is always available. */
   2219                                 ihevce_set_nbr_map(
   2220                                     ps_ctxt->pu1_ctb_nbr_map,
   2221                                     ps_ctxt->i4_nbr_map_strd,
   2222                                     ps_cu_node->ps_parent->u2_x0 << 1,
   2223                                     ps_cu_node->ps_parent->u2_y0 << 1,
   2224                                     (ps_cu_node->ps_parent->u1_cu_size >> 2),
   2225                                     1);
   2226 
   2227                                 //ps_row_cu++;
   2228                             }
   2229                             blk_cnt++;
   2230                             ps_ed_blk_l1++;
   2231                         }
   2232                     }
   2233                     else
   2234                     {
   2235                         //cu_intra_cand_t *ps_cu_intra_cand;
   2236                         WORD8 i1_cu_possible_qp;
   2237                         WORD32 i4_act_factor;
   2238                         WORD32 i4_q_scale_q3_mod;
   2239 
   2240                         ASSERT(((blk_cnt >> 2) & 0xF) == (blk_cnt >> 2));
   2241                         ASSERT(ps_ed_ctb_l1->i4_8x8_satd[(blk_cnt >> 2)][1] != -2);
   2242                         i1_cu_possible_qp = ihevce_cu_level_qp_mod(
   2243                             ps_ctxt->i4_qscale,
   2244                             ps_ed_ctb_l1->i4_8x8_satd[(blk_cnt >> 2)][1],
   2245                             ps_ctxt->ld_curr_frame_8x8_log_avg[1],
   2246                             f_strength,
   2247                             &i4_act_factor,
   2248                             &i4_q_scale_q3_mod,
   2249                             ps_ctxt->ps_rc_quant_ctxt);
   2250 
   2251                         /* 64x64 merge is not possible */
   2252                         merge_64x64 = 0;
   2253 
   2254                         ps_intra32_analyse->b1_merge_flag = 0;
   2255 
   2256                         ps_intra16_analyse->b1_merge_flag = 0;
   2257 
   2258                         /* by default 16x16 modes are set to default values DC and Planar */
   2259                         ps_intra16_analyse->au1_best_modes_8x8_tu[0] = 0;
   2260                         ps_intra16_analyse->au1_best_modes_8x8_tu[1] = 1;
   2261                         ps_intra16_analyse->au1_best_modes_8x8_tu[2] = 255;
   2262 
   2263                         ps_intra16_analyse->au1_best_modes_16x16_tu[0] = 0;
   2264                         ps_intra16_analyse->au1_best_modes_16x16_tu[1] = 1;
   2265                         ps_intra16_analyse->au1_best_modes_16x16_tu[2] = 255;
   2266                         ps_intra16_analyse->b1_split_flag = 1;
   2267                         ps_intra16_analyse->b1_valid_cu = 1;
   2268 
   2269                         for(i = 0; i < 4; i++)
   2270                         {
   2271                             intra8_analyse_t *ps_intra8_analyse;
   2272                             cu_pos_x = gau1_cu_pos_x[blk_cnt];
   2273                             cu_pos_y = gau1_cu_pos_y[blk_cnt];
   2274                             if((cu_pos_x < num_8x8_blks_x) && (cu_pos_y < num_8x8_blks_y))
   2275                             {
   2276                                 //ps_cu_intra_cand = &ps_row_cu->s_cu_intra_cand;
   2277                                 //ps_cu_node->ps_parent->best_cost = MAX_INTRA_COST_IPE;
   2278 
   2279                                 //ps_cu_node->ps_parent->best_mode = ps_ed_blk_l1->best_mode;
   2280 
   2281                                 child_cost_least = 0;
   2282 
   2283                                 ps_intra8_analyse = &ps_intra16_analyse->as_intra8_analyse[i];
   2284                                 ps_cu_node->ps_parent->u1_cu_size = 8;
   2285                                 ps_cu_node->ps_parent->u2_x0 =
   2286                                     gau1_cu_pos_x[blk_cnt]; /* Populate properly */
   2287                                 ps_cu_node->ps_parent->u2_y0 =
   2288                                     gau1_cu_pos_y[blk_cnt]; /* Populate properly */
   2289 
   2290                                 //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
   2291 
   2292                                 /*EARLY DECISION 8x8 block */
   2293                                 ihevce_pu_calc_8x8_blk(
   2294                                     ps_curr_src, ps_ctxt, ps_cu_node, ps_ctxt->ps_func_selector);
   2295                                 for(j = 0; j < 4; j++)
   2296                                 {
   2297                                     child_cost_least += ps_cu_node->ps_sub_cu[j]->best_cost;
   2298                                     child_satd[j] = ps_cu_node->ps_sub_cu[j]->best_satd;
   2299                                 }
   2300 
   2301                                 /* Based on the flag, CU = 4TU modes decision can be disabled, CU = 4PU is retained */
   2302                                 if(0 == ps_ctxt->u1_disable_child_cu_decide)
   2303                                 {
   2304                                     ihevce_set_nbr_map(
   2305                                         ps_ctxt->pu1_ctb_nbr_map,
   2306                                         ps_ctxt->i4_nbr_map_strd,
   2307                                         ps_cu_node->ps_parent->u2_x0 << 1,
   2308                                         ps_cu_node->ps_parent->u2_y0 << 1,
   2309                                         (ps_cu_node->ps_parent->u1_cu_size >> 2),
   2310                                         0);
   2311 
   2312                                     //memcpy(ps_ctxt->ai1_ctb_mode_map_temp, ps_ctxt->ai1_ctb_mode_map, sizeof(ps_ctxt->ai1_ctb_mode_map));
   2313 
   2314                                     /* Eval for TUSize = CuSize */
   2315                                     ihevce_mode_eval_filtering(
   2316                                         ps_cu_node->ps_parent,
   2317                                         ps_cu_node,
   2318                                         ps_ctxt,
   2319                                         ps_curr_src,
   2320                                         26,
   2321                                         &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
   2322                                         &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
   2323                                         step2_bypass,
   2324                                         1);
   2325 
   2326                                     if(i4_enable_1cu_4tu)
   2327                                     {
   2328                                         /* Eval for TUSize = CuSize/2 */
   2329                                         ihevce_mode_eval_filtering(
   2330                                             ps_cu_node->ps_parent,
   2331                                             ps_cu_node,
   2332                                             ps_ctxt,
   2333                                             ps_curr_src,
   2334                                             26,
   2335                                             &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
   2336                                             &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
   2337                                             step2_bypass,
   2338                                             0);
   2339                                     }
   2340                                     else
   2341                                     {
   2342                                         /* 4TU not evaluated :  4tu modes set same as 1tu modes */
   2343                                         memcpy(
   2344                                             &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
   2345                                             &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
   2346                                             NUM_BEST_MODES);
   2347 
   2348                                         /* 4TU not evaluated : currently 4tu cost set same as 1tu cost */
   2349                                         memcpy(
   2350                                             &ps_cu_node->ps_parent->au4_best_cost_4tu[0],
   2351                                             &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
   2352                                             NUM_BEST_MODES * sizeof(WORD32));
   2353                                     }
   2354 
   2355                                     /* Update parent cost */
   2356                                     parent_cost =
   2357                                         MIN(ps_cu_node->ps_parent->au4_best_cost_4tu[0],
   2358                                             ps_cu_node->ps_parent->au4_best_cost_1tu[0]);
   2359 
   2360                                     /* Select the best mode to be populated as top and left nbr depending on the
   2361                             4tu and 1tu cost */
   2362                                     if(ps_cu_node->ps_parent->au4_best_cost_4tu[0] >
   2363                                        ps_cu_node->ps_parent->au4_best_cost_1tu[0])
   2364                                     {
   2365                                         ps_cu_node->ps_parent->best_mode =
   2366                                             ps_cu_node->ps_parent->au1_best_mode_1tu[0];
   2367                                     }
   2368                                     else
   2369                                     {
   2370                                         ps_cu_node->ps_parent->best_mode =
   2371                                             ps_cu_node->ps_parent->au1_best_mode_4tu[0];
   2372                                     }
   2373                                 }
   2374 
   2375                                 /* set the CU valid flag */
   2376                                 ps_intra8_analyse->b1_valid_cu = 1;
   2377                                 ps_intra8_analyse->b1_enable_nxn = 0;
   2378 
   2379                                 /* storing the modes to intra 8 analyse */
   2380 
   2381                                 /* store the best 8x8 modes 8x8 tu */
   2382                                 memcpy(
   2383                                     &ps_intra8_analyse->au1_best_modes_8x8_tu[0],
   2384                                     &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
   2385                                     sizeof(UWORD8) * (NUM_BEST_MODES));
   2386                                 ps_intra8_analyse->au1_best_modes_8x8_tu[NUM_BEST_MODES] = 255;
   2387 
   2388                                 /* store the best 8x8 modes 4x4 tu */
   2389                                 memcpy(
   2390                                     &ps_intra8_analyse->au1_best_modes_4x4_tu[0],
   2391                                     &ps_cu_node->ps_parent->au1_best_mode_4tu[0],
   2392                                     sizeof(UWORD8) * (NUM_BEST_MODES));
   2393                                 ps_intra8_analyse->au1_best_modes_4x4_tu[NUM_BEST_MODES] = 255;
   2394 
   2395                                 /*As 8*8 has won, pick L1 4x4 qp which is equal to
   2396                                 L1 8x8 Qp*/
   2397                                 //ps_row_cu->u1_cu_possible_qp[0] = u1_cu_possible_qp;
   2398                                 //ps_row_cu->i4_act_factor[0][1] = i4_act_factor;
   2399 
   2400                                 parent_best_mode = ps_cu_node->ps_parent->best_mode;
   2401                                 if(parent_cost <=
   2402                                    child_cost_least +
   2403                                        (ps_ctxt->i4_ol_satd_lambda * CHILD_BIAS >> LAMBDA_Q_SHIFT))
   2404                                 {
   2405                                     /*CU = 4TU */
   2406                                     ihevce_update_cand_list(
   2407                                         ps_cu_node->ps_parent, ps_ed_blk_l1, ps_ctxt);
   2408 
   2409                                     /* store the child 8x8 costs */
   2410                                     pi4_intra_8_cost[(i & 1) + (MAX_CU_IN_CTB_ROW * (i >> 1))] =
   2411                                         parent_cost;
   2412 
   2413                                     /* cost accumulation of best cu size candidate */
   2414                                     i8_frame_acc_satd_cost += parent_cost;
   2415 
   2416                                     /*satd/mod_qp accumulation of best cu */
   2417                                     i8_frame_acc_satd_by_modqp_q10 +=
   2418                                         ((LWORD64)ps_cu_node->ps_parent->best_satd
   2419                                          << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
   2420                                         i4_q_scale_q3_mod;
   2421 
   2422                                     /* Accumulate mode bits of the parent CU */
   2423                                     i8_frame_acc_mode_bits_cost +=
   2424                                         ps_cu_node->ps_parent->u2_mode_bits_cost;
   2425 
   2426                                     /* satd and mpm bits accumulation of best cu size candidate */
   2427                                     i4_ctb_acc_satd += ps_cu_node->ps_parent->best_satd;
   2428 
   2429                                     /* accumulate the 16x16 cost*/
   2430                                     if(MAX_INTRA_COST_IPE == *pi4_intra_16_cost)
   2431                                     {
   2432                                         *pi4_intra_16_cost = parent_cost;
   2433                                     }
   2434                                     else
   2435                                     {
   2436                                         *pi4_intra_16_cost += parent_cost;
   2437                                     }
   2438 
   2439                                     /* accumulate the 32x32 cost*/
   2440                                     if(MAX_INTRA_COST_IPE == *pi4_intra_32_cost)
   2441                                     {
   2442                                         *pi4_intra_32_cost = parent_cost;
   2443                                     }
   2444                                     else
   2445                                     {
   2446                                         *pi4_intra_32_cost += parent_cost;
   2447                                     }
   2448                                 }
   2449                                 else
   2450                                 {
   2451                                     /*CU = 4PU*/
   2452                                     //ps_row_cu->b3_cu_pos_x = (UWORD8) ps_cu_node->ps_parent->u2_x0;
   2453                                     //ps_row_cu->b3_cu_pos_y = (UWORD8) ps_cu_node->ps_parent->u2_y0;
   2454                                     //ps_row_cu->u1_cu_size  = ps_cu_node->ps_parent->u1_cu_size;
   2455 
   2456                                     /* store the child 8x8 costs with 4x4 pu summed cost */
   2457                                     pi4_intra_8_cost[(i & 1) + (MAX_CU_IN_CTB_ROW * (i >> 1))] =
   2458                                         (child_cost_least);
   2459 
   2460                                     /* accumulate the 16x16 cost*/
   2461                                     if(MAX_INTRA_COST_IPE == *pi4_intra_16_cost)
   2462                                     {
   2463                                         *pi4_intra_16_cost = child_cost_least;
   2464                                     }
   2465                                     else
   2466                                     {
   2467                                         *pi4_intra_16_cost += child_cost_least;
   2468                                     }
   2469 
   2470                                     /* cost accumulation of best cu size candidate */
   2471                                     i8_frame_acc_satd_cost += child_cost_least;
   2472 
   2473                                     for(j = 0; j < 4; j++)
   2474                                     {
   2475                                         /*satd/qp accumulation*/
   2476                                         i8_frame_acc_satd_by_modqp_q10 +=
   2477                                             ((LWORD64)child_satd[j]
   2478                                              << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
   2479                                             i4_q_scale_q3_mod;
   2480 
   2481                                         /* Accumulate mode bits for all child blocks */
   2482                                         i8_frame_acc_mode_bits_cost +=
   2483                                             ps_cu_node->ps_sub_cu[j]->u2_mode_bits_cost;
   2484 
   2485                                         /* satd and mpm bits accumulation of best cu size candidate */
   2486                                         i4_ctb_acc_satd += child_satd[j];
   2487                                     }
   2488 
   2489                                     /* accumulate the 32x32 cost*/
   2490                                     if(MAX_INTRA_COST_IPE == *pi4_intra_32_cost)
   2491                                     {
   2492                                         *pi4_intra_32_cost = child_cost_least;
   2493                                     }
   2494                                     else
   2495                                     {
   2496                                         *pi4_intra_32_cost += child_cost_least;
   2497                                     }
   2498 
   2499                                     ps_intra8_analyse->b1_enable_nxn = 1;
   2500 
   2501                                     /* Insert the best 8x8 modes unconditionally */
   2502 
   2503                                     x = ((ps_cu_node->u2_x0 << 3) >> 2) + 1;
   2504                                     y = ((ps_cu_node->u2_y0 << 3) >> 2) + 1;
   2505                                     size = ps_cu_node->u1_cu_size >> 2;
   2506 
   2507                                     ps_ctxt->au1_ctb_mode_map[y][x] =
   2508                                         ps_cu_node->ps_sub_cu[0]->best_mode;
   2509                                     ps_ctxt->au1_ctb_mode_map[y][x + 1] =
   2510                                         ps_cu_node->ps_sub_cu[1]->best_mode;
   2511                                     ps_ctxt->au1_ctb_mode_map[y + 1][x] =
   2512                                         ps_cu_node->ps_sub_cu[2]->best_mode;
   2513                                     ps_ctxt->au1_ctb_mode_map[y + 1][x + 1] =
   2514                                         ps_cu_node->ps_sub_cu[3]->best_mode;
   2515                                 }
   2516                                 /* NXN mode population */
   2517                                 for(j = 0; j < 4; j++)
   2518                                 {
   2519                                     cand_mode_list[0] =
   2520                                         ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[0];
   2521                                     cand_mode_list[1] =
   2522                                         ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[1];
   2523                                     cand_mode_list[2] =
   2524                                         ps_cu_node->ps_sub_cu[j]->au1_best_mode_1tu[2];
   2525 
   2526                                     if(1)
   2527                                     {
   2528                                         /* Populate best 3 nxn modes */
   2529                                         ps_intra8_analyse->au1_4x4_best_modes[j][0] =
   2530                                             cand_mode_list[0];
   2531                                         ps_intra8_analyse->au1_4x4_best_modes[j][1] =
   2532                                             cand_mode_list[1];  //(ps_ed + 1)->best_mode;
   2533                                         ps_intra8_analyse->au1_4x4_best_modes[j][2] =
   2534                                             cand_mode_list[2];  //(ps_ed + 2)->best_mode;
   2535                                         ps_intra8_analyse->au1_4x4_best_modes[j][3] = 255;
   2536 
   2537                                         //memcpy(ps_intra8_analyse->au1_4x4_best_modes[j], ps_row_cu->s_cu_intra_cand.au1_intra_luma_modes_nxn[j], 4);
   2538                                     }
   2539                                     /* For HQ, all 35 modes to be used for RDOPT, removed from here for memory clean-up */
   2540 
   2541                                     else /* IHEVCE_QUALITY_P0 == i4_quality_preset */
   2542                                     {
   2543                                         /* To indicate to enc loop that NXN is enabled in HIGH QUALITY fior CU 8x8*/
   2544                                         ps_intra8_analyse->au1_4x4_best_modes[j][0] = 0;
   2545                                     }
   2546 
   2547                                     ps_intra8_analyse
   2548                                         ->au1_4x4_best_modes[j][MAX_INTRA_CU_CANDIDATES] = 255;
   2549                                 }
   2550 
   2551                                 //ps_row_cu++;
   2552                             }
   2553                             else
   2554                             {
   2555                                 /* For Incomplete CTB, 16x16 is not valid */
   2556                                 ps_intra16_analyse->b1_valid_cu = 0;
   2557                             }
   2558                             blk_cnt++;
   2559                             ps_ed_blk_l1++;
   2560                         }
   2561                         //ps_ed_blk_l2 ++;
   2562                     }  //else of EIID
   2563 #endif
   2564                 }
   2565             }
   2566             else
   2567             {
   2568                 /* For incomplete CTB, init valid CU to 0 */
   2569                 ps_ed_blk_l1++;
   2570                 ps_intra32_analyse->b1_valid_cu = 0;
   2571                 ps_intra16_analyse[0].b1_valid_cu = 0;
   2572                 blk_cnt++;
   2573                 merge_64x64 = 0;
   2574             }
   2575         } while(blk_cnt != MAX_CTB_SIZE);
   2576         /* if 64x64 merge is possible then check for 32x32 having same best modes */
   2577         if(1 == merge_64x64)
   2578         {
   2579             WORD32 act_mode = au1_best_32x32_modes[0];
   2580 
   2581             ps_ed_blk_l2 = ps_ed_l2_ctb;
   2582             best_mode = ps_ed_blk_l2->best_mode;
   2583             merge_64x64 =
   2584                 ((act_mode == au1_best_32x32_modes[0]) + (act_mode == au1_best_32x32_modes[1]) +
   2585                      (act_mode == au1_best_32x32_modes[2]) +
   2586                      (act_mode == au1_best_32x32_modes[3]) ==
   2587                  4);
   2588             if(merge_64x64 == 1)
   2589                 best_mode = au1_best_32x32_modes[0];
   2590             else
   2591                 best_mode = ps_ed_blk_l2->best_mode;
   2592             /* All 32x32 costs are accumulated to 64x64 cost */
   2593             ps_l0_ipe_out_ctb->i4_best64x64_intra_cost = 0;
   2594             for(i = 0; i < 4; i++)
   2595             {
   2596                 ps_l0_ipe_out_ctb->i4_best64x64_intra_cost +=
   2597                     ps_l0_ipe_out_ctb->ai4_best32x32_intra_cost[i];
   2598             }
   2599 
   2600             /* If all modes of 32x32 block is not same */
   2601             if(0 == merge_64x64)
   2602             {
   2603                 /* Child cost for 64x64: sum of the four best 32x32 costs */
   2604                 WORD32 child_cost_64x64 = au4_best_32x32_cost[0] + au4_best_32x32_cost[1] +
   2605                                           au4_best_32x32_cost[2] + au4_best_32x32_cost[3];
   2606                 WORD32 cost = MAX_INTRA_COST_IPE;
   2607 
   2608                 WORD32 best_mode_temp = 0;
   2609                 /*Compute 64x64 cost for each mode of 32x32*/
   2610                 for(i = 0; i < 4; i++)
   2611                 {
   2612                     WORD32 mode = au1_best_32x32_modes[i];
   2613                     if(mode < 2)
   2614                         mode = 26;
   2615                     ps_cu_node->ps_parent->u1_cu_size = 64;
   2616                     ps_cu_node->ps_parent->u2_x0 = gau1_cu_pos_x[0]; /* Populate properly */
   2617                     ps_cu_node->ps_parent->u2_y0 = gau1_cu_pos_y[0]; /* Populate properly */
   2618 
   2619                     ihevce_set_nbr_map(
   2620                         ps_ctxt->pu1_ctb_nbr_map,
   2621                         ps_ctxt->i4_nbr_map_strd,
   2622                         (ps_cu_node->ps_parent->u2_x0 << 1),
   2623                         (ps_cu_node->ps_parent->u2_y0 << 1),
   2624                         (ps_cu_node->ps_parent->u1_cu_size >> 2),
   2625                         0);
   2626 
   2627                     ihevce_mode_eval_filtering(
   2628                         ps_cu_node->ps_parent,
   2629                         ps_cu_node,
   2630                         ps_ctxt,
   2631                         ps_curr_src,
   2632                         mode,
   2633                         &ps_cu_node->ps_parent->au4_best_cost_1tu[0],
   2634                         &ps_cu_node->ps_parent->au1_best_mode_1tu[0],
   2635                         !step2_bypass,
   2636                         0);
   2637 
   2638                     parent_cost = ps_cu_node->ps_parent->best_cost;
   2639                     if(cost > parent_cost)
   2640                     {
   2641                         cost = parent_cost;
   2642                         best_mode_temp = ps_cu_node->ps_parent->best_mode;
   2643                     }
   2644                 }
   2645                 if(cost < child_cost_64x64)
   2646                 {
   2647                     merge_64x64 = 1;
   2648                     best_mode = best_mode_temp;
   2649 
   2650                     /* Update 64x64 cost if CU 64x64 is chosen  */
   2651                     ps_l0_ipe_out_ctb->i4_best64x64_intra_cost = cost;
   2652 
   2653                     /* CU 64x64 chosen: replace the accumulated costs with the 64x64 cost and mode bits */
   2654                     i8_frame_acc_satd_cost = cost;
   2655                     i8_frame_acc_mode_bits_cost = ps_cu_node->ps_parent->u2_mode_bits_cost;
   2656 
   2657                     /* satd and mpm bits accumulation of best cu size candidate */
   2658                     i4_ctb_acc_satd = ps_cu_node->ps_parent->best_satd;
   2659                 }
   2660             }
   2661         }
   2662 
   2663         if(merge_64x64)
   2664         {
   2665             WORD32 i, j;
   2666             intra32_analyse_t *ps_intra32_analyse;
   2667             intra16_analyse_t *ps_intra16_analyse;
   2668             WORD32 row, col;
   2669             WORD32 i4_q_scale_q3_mod;
   2670             WORD8 i1_cu_possible_qp;
   2671             WORD32 i4_act_factor;
   2672             //ps_row_cu = ps_curr_cu;
   2673             ps_ctb_out->u4_cu_split_flags = 0x0;
   2674             ps_ed_blk_l1 = ps_ed_l1_ctb;
   2675             ps_ed_blk_l2 = ps_ed_l2_ctb;
   2676 
   2677             ps_l0_ipe_out_ctb->u1_split_flag = 0;
   2678 
   2679             /* If CU size of 64x64 is chosen, disable the merge flag of every 16x16 block */
   2680             for(i = 0; i < 4; i++)
   2681             {
   2682                 /* get the corresponding intra 32 analyse pointer  use (blk_cnt / 16) */
   2683                 /* blk cnt is in terms of 8x8 units so a 32x32 will have 16 8x8 units */
   2684                 ps_intra32_analyse = &ps_l0_ipe_out_ctb->as_intra32_analyse[i];
   2685 
   2686                 for(j = 0; j < 4; j++)
   2687                 {
   2688                     /* get the corresponding intra 16 analyse pointer use (blk_cnt & 0xF / 4)*/
   2689                     /* blk cnt is in terms of 8x8 units so a 16x16 will have 4 8x8 units */
   2690                     ps_intra16_analyse = &ps_intra32_analyse->as_intra16_analyse[j];
   2691                     ps_intra16_analyse->b1_merge_flag = 0;
   2692                 }
   2693             }
   2694 
   2695             /* CU size 64x64 and fill the final cu params */
   2696             //ps_row_cu->b3_cu_pos_x = gau1_cu_pos_x[0];
   2697             //ps_row_cu->b3_cu_pos_y = gau1_cu_pos_y[0];
   2698             //ps_row_cu->u1_cu_size  = 64;
   2699 
   2700             /* Candidate mode Update */
   2701             cand_mode_list[0] = best_mode;
   2702             if(cand_mode_list[0] > 1)
   2703             {
   2704                 if(cand_mode_list[0] == 2)
   2705                 {
   2706                     cand_mode_list[1] = 34;
   2707                     cand_mode_list[2] = 3;
   2708                 }
   2709                 else if(cand_mode_list[0] == 34)
   2710                 {
   2711                     cand_mode_list[1] = 2;
   2712                     cand_mode_list[2] = 33;
   2713                 }
   2714                 else
   2715                 {
   2716                     cand_mode_list[1] = cand_mode_list[0] - 1;
   2717                     cand_mode_list[2] = cand_mode_list[0] + 1;
   2718                 }
   2719                 //cand_mode_list[1] = ps_ed_blk_l1->nang_attr.best_mode;
   2720                 //cand_mode_list[2] = ps_ed_blk_l1->ang_attr.best_mode;
   2721             }
   2722             else
   2723             {
   2724                 cand_mode_list[0] = 0;
   2725                 cand_mode_list[1] = 1;
   2726                 cand_mode_list[2] = 26;
   2727                 //cand_mode_list[2] = ps_ed_blk_l1->nang_attr.best_mode;
   2728             }
   2729 
   2730             /* All 32x32 costs are accumulated to 64x64 cost */
   2731             ps_l0_ipe_out_ctb->i4_best64x64_intra_cost = 0;
   2732             for(i = 0; i < 4; i++)
   2733             {
   2734                 ps_l0_ipe_out_ctb->i4_best64x64_intra_cost +=
   2735                     ps_l0_ipe_out_ctb->ai4_best32x32_intra_cost[i];
   2736             }
   2737             /* 64x64 CU: store the three candidate modes from cand_mode_list, followed by 255 */
   2738             ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[0] = cand_mode_list[0];
   2739             ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[1] = cand_mode_list[1];
   2740             ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[2] = cand_mode_list[2];
   2741             ps_l0_ipe_out_ctb->au1_best_modes_32x32_tu[3] = 255;
   2742 
   2743             /* Update CTB mode map for the finalised CU */
   2744             x = ((ps_cu_node->u2_x0 << 3) >> 2) + 1;
   2745             y = ((ps_cu_node->u2_y0 << 3) >> 2) + 1;
   2746             size = ps_cu_node->u1_cu_size >> 2;
   2747 
   2748             for(row = y; row < (y + size); row++)
   2749             {
   2750                 for(col = x; col < (x + size); col++)
   2751                 {
   2752                     ps_ctxt->au1_ctb_mode_map[row][col] = best_mode;
   2753                 }
   2754             }
   2755 
   2756             ihevce_set_nbr_map(
   2757                 ps_ctxt->pu1_ctb_nbr_map,
   2758                 ps_ctxt->i4_nbr_map_strd,
   2759                 (ps_cu_node->u2_x0 << 1),
   2760                 (ps_cu_node->u2_y0 << 1),
   2761                 (ps_cu_node->u1_cu_size >> 2),
   2762                 1);
   2763 
   2764             /*As 64*64 has won, pick L1 32x32 qp*/
   2765             //ASSERT(((blk_cnt>>6) & 0xF) == (blk_cnt>>6));
   2766             //ASSERT((blk_cnt>>6) == 0);
   2767             ASSERT(ps_ed_ctb_l1->i4_32x32_satd[0][0] != -2);
   2768             i1_cu_possible_qp = ihevce_cu_level_qp_mod(
   2769                 ps_ctxt->i4_qscale,
   2770                 ps_ed_ctb_l1->i4_32x32_satd[0][0],
   2771                 ps_ctxt->ld_curr_frame_32x32_log_avg[0],
   2772                 f_strength,
   2773                 &i4_act_factor,
   2774                 &i4_q_scale_q3_mod,
   2775                 ps_ctxt->ps_rc_quant_ctxt);
   2776 
   2777             i8_frame_acc_satd_by_modqp_q10 =
   2778                 (i8_frame_acc_satd_cost << (SATD_BY_ACT_Q_FAC + QSCALE_Q_FAC_3)) /
   2779                 i4_q_scale_q3_mod;
   2780             /* Increment pointers */
   2781             ps_ed_blk_l1 += 64;
   2782             ps_ed_blk_l2 += 16;
   2783             //ps_row_cu++;
   2784         }
   2785     }
   2786 
   2787     //ps_ctb_out->u1_num_cus_in_ctb = (UWORD8)(ps_row_cu - ps_curr_cu);
   2788 
   2789     {
   2790         WORD32 i4_i, i4_j;
   2791         WORD32 dummy;
   2792         WORD8 i1_cu_qp;
   2793         (void)i1_cu_qp;
   2794         /*MAM_VAR_L1*/
   2795         for(i4_j = 0; i4_j < 2; i4_j++)
   2796         {
   2797             i4_mod_factor_num = ps_ctxt->ai4_mod_factor_derived_by_variance[i4_j];
   2798             f_strength = ps_ctxt->f_strength;
   2799 
   2800             //i4_mod_factor_num = 4;
   2801 
   2802             ps_ed_blk_l1 = ps_ed_l1_ctb;
   2803             ps_ed_blk_l2 = ps_ed_l2_ctb;
   2804             //ps_row_cu = ps_curr_cu;
   2805 
   2806             /*Valid only for complete CTB */
   2807             if((64 == u1_curr_ctb_wdt) && (64 == u1_curr_ctb_hgt))
   2808             {
   2809                 ASSERT(ps_ed_ctb_l1->i4_32x32_satd[0][0] != -2);
   2810                 ASSERT(ps_ed_ctb_l1->i4_32x32_satd[0][1] != -2);
   2811                 ASSERT(ps_ed_ctb_l1->i4_32x32_satd[0][2] != -2);
   2812                 ASSERT(ps_ed_ctb_l1->i4_32x32_satd[0][3] != -2);
   2813 
   2814                 i1_cu_qp = ihevce_cu_level_qp_mod(
   2815                     ps_ctxt->i4_qscale,
   2816                     ps_ed_ctb_l1->i4_32x32_satd[0][0],
   2817                     ps_ctxt->ld_curr_frame_32x32_log_avg[0],
   2818                     f_strength,
   2819                     &ps_l0_ipe_out_ctb->i4_64x64_act_factor[0][i4_j],
   2820                     &dummy,
   2821                     ps_ctxt->ps_rc_quant_ctxt);
   2822 
   2823                 i1_cu_qp = ihevce_cu_level_qp_mod(
   2824                     ps_ctxt->i4_qscale,
   2825                     ps_ed_ctb_l1->i4_32x32_satd[0][1],
   2826                     ps_ctxt->ld_curr_frame_32x32_log_avg[1],
   2827                     f_strength,
   2828                     &ps_l0_ipe_out_ctb->i4_64x64_act_factor[1][i4_j],
   2829                     &dummy,
   2830                     ps_ctxt->ps_rc_quant_ctxt);
   2831                 i1_cu_qp = ihevce_cu_level_qp_mod(
   2832                     ps_ctxt->i4_qscale,
   2833                     ps_ed_ctb_l1->i4_32x32_satd[0][2],
   2834                     ps_ctxt->ld_curr_frame_32x32_log_avg[2],
   2835                     f_strength,
   2836                     &ps_l0_ipe_out_ctb->i4_64x64_act_factor[2][i4_j],
   2837                     &dummy,
   2838                     ps_ctxt->ps_rc_quant_ctxt);
   2839 
   2840                 i1_cu_qp = ihevce_cu_level_qp_mod(
   2841                     ps_ctxt->i4_qscale,
   2842                     ps_ed_ctb_l1->i4_32x32_satd[0][3],
   2843                     2.0 + ps_ctxt->ld_curr_frame_16x16_log_avg[0],
   2844                     f_strength,
   2845                     &ps_l0_ipe_out_ctb->i4_64x64_act_factor[3][i4_j],
   2846                     &dummy,
   2847                     ps_ctxt->ps_rc_quant_ctxt);
   2848 
   2849                 ASSERT(ps_l0_ipe_out_ctb->i4_64x64_act_factor[3][i4_j] > 0);
   2850             }
   2851             else
   2852             {
   2853                 ps_l0_ipe_out_ctb->i4_64x64_act_factor[0][i4_j] = 1024;
   2854                 ps_l0_ipe_out_ctb->i4_64x64_act_factor[1][i4_j] = 1024;
   2855                 ps_l0_ipe_out_ctb->i4_64x64_act_factor[2][i4_j] = 1024;
   2856                 ps_l0_ipe_out_ctb->i4_64x64_act_factor[3][i4_j] = 1024;
   2857             }
   2858 
   2859             /*Store the 32x32 activity factors (in raster order), evaluated from the 16x16 SATDs of
   2860             ps_ed_ctb_l1, as output of intra prediction for the usage by ME*/
   2861 
   2862             {
   2863                 WORD32 pos_x_32, pos_y_32, pos;
   2864                 //WORD32 i4_incomplete_ctb_val_8;
   2865                 pos_x_32 = u1_curr_ctb_wdt / 16;
   2866                 pos_y_32 = u1_curr_ctb_hgt / 16;
   2867 
   2868                 pos = (pos_x_32 < pos_y_32) ? pos_x_32 : pos_y_32;
   2869 
   2870                 for(i4_i = 0; i4_i < 4; i4_i++)
   2871                 {
   2872                     if(i4_i < pos)
   2873                     {
   2874                         ASSERT(ps_ed_ctb_l1->i4_16x16_satd[i4_i][0] != -2);
   2875                         ASSERT(ps_ed_ctb_l1->i4_16x16_satd[i4_i][1] != -2);
   2876                         ASSERT(ps_ed_ctb_l1->i4_16x16_satd[i4_i][2] != -2);
   2877                         i1_cu_qp = ihevce_cu_level_qp_mod(
   2878                             ps_ctxt->i4_qscale,
   2879                             ps_ed_ctb_l1->i4_16x16_satd[i4_i][0],
   2880                             ps_ctxt->ld_curr_frame_16x16_log_avg[0],
   2881                             f_strength,
   2882                             &ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][0][i4_j],
   2883                             &dummy,
   2884                             ps_ctxt->ps_rc_quant_ctxt);
   2885                         i1_cu_qp = ihevce_cu_level_qp_mod(
   2886                             ps_ctxt->i4_qscale,
   2887                             ps_ed_ctb_l1->i4_16x16_satd[i4_i][1],
   2888                             ps_ctxt->ld_curr_frame_16x16_log_avg[1],
   2889                             f_strength,
   2890                             &ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][1][i4_j],
   2891                             &dummy,
   2892                             ps_ctxt->ps_rc_quant_ctxt);
   2893                         i1_cu_qp = ihevce_cu_level_qp_mod(
   2894                             ps_ctxt->i4_qscale,
   2895                             ps_ed_ctb_l1->i4_16x16_satd[i4_i][2],
   2896                             ps_ctxt->ld_curr_frame_16x16_log_avg[2],
   2897                             f_strength,
   2898                             &ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][2][i4_j],
   2899                             &dummy,
   2900                             ps_ctxt->ps_rc_quant_ctxt);
   2901                     }
   2902                     else
   2903                     {
   2904                         /*For incomplete CTB */
   2905                         ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][0][i4_j] = 1024;
   2906                         ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][1][i4_j] = 1024;
   2907                         ps_l0_ipe_out_ctb->i4_32x32_act_factor[i4_i][2][i4_j] = 1024;
   2908                     }
   2909                 }
   2910             }
   2911 
   2912             /*Store the 16x16 activity factors (in raster order), evaluated from the 8x8 SATDs of
   2913             ps_ed_ctb_l1, as output of intra prediction for the usage by ME*/
   2914             {
   2915                 WORD32 pos_x_16, pos_y_16, pos;
   2916                 //WORD32 i4_incomplete_ctb_val_8;
   2917                 pos_x_16 = u1_curr_ctb_wdt / 4;
   2918                 pos_y_16 = u1_curr_ctb_hgt / 4;
   2919 
   2920                 pos = (pos_x_16 < pos_y_16) ? pos_x_16 : pos_y_16;
   2921                 for(i4_i = 0; i4_i < 16; i4_i++)
   2922                 {
   2923                     if(i4_i < pos)
   2924                     {
   2925                         ASSERT(ps_ed_ctb_l1->i4_8x8_satd[i4_i][0] != -2);
   2926                         ASSERT(ps_ed_ctb_l1->i4_8x8_satd[i4_i][1] != -2);
   2927                         i1_cu_qp = ihevce_cu_level_qp_mod(
   2928                             ps_ctxt->i4_qscale,
   2929                             ps_ed_ctb_l1->i4_8x8_satd[i4_i][0],
   2930                             ps_ctxt->ld_curr_frame_8x8_log_avg[0],
   2931                             f_strength,
   2932                             &ps_l0_ipe_out_ctb->i4_16x16_act_factor[i4_i][0][i4_j],
   2933                             &dummy,
   2934                             ps_ctxt->ps_rc_quant_ctxt);
   2935                         i1_cu_qp = ihevce_cu_level_qp_mod(
   2936                             ps_ctxt->i4_qscale,
   2937                             ps_ed_ctb_l1->i4_8x8_satd[i4_i][1],
   2938                             ps_ctxt->ld_curr_frame_8x8_log_avg[1],
   2939                             f_strength,
   2940                             &ps_l0_ipe_out_ctb->i4_16x16_act_factor[i4_i][1][i4_j],
   2941                             &dummy,
   2942                             ps_ctxt->ps_rc_quant_ctxt);
   2943                     }
   2944                     else
   2945                     {
   2946                         /*For incomplete CTB */
   2947                         ps_l0_ipe_out_ctb->i4_16x16_act_factor[i4_i][0][i4_j] = 1024;
   2948                         ps_l0_ipe_out_ctb->i4_16x16_act_factor[i4_i][1][i4_j] = 1024;
   2949                     }
   2950                 }
   2951             }
   2952         }  //for loop
   2953 
   2954         /* Accumalate the cost of ctb to the total cost */
   2955         ps_ctxt->i8_frame_acc_satd_cost += i8_frame_acc_satd_cost;
   2956         ps_ctxt->i8_frame_acc_satd_by_modqp_q10 += i8_frame_acc_satd_by_modqp_q10;
   2957 
   2958         ps_ctxt->i8_frame_acc_mode_bits_cost += i8_frame_acc_mode_bits_cost;
   2959 
   2960         /* satd and mpm bits accumalation of best cu size candiate for the ctb */
   2961         ps_l0_ipe_out_ctb->i4_ctb_acc_satd = i4_ctb_acc_satd;
   2962         ps_l0_ipe_out_ctb->i4_ctb_acc_mpm_bits = i8_frame_acc_mode_bits_cost;
   2963 
   2964         ps_ctxt->i8_frame_acc_satd += i4_ctb_acc_satd;
   2965     }
   2966 
   2967     {
   2968         WORD32 ctr_8x8;
   2969         for(ctr_8x8 = 0; ctr_8x8 < (MAX_CU_IN_CTB >> 2); ctr_8x8++)
   2970         {
   2971             /*Accumalate activity factor for Intra and Inter*/
   2972             if(ps_l0_ipe_out_ctb->ai4_best_sad_cost_8x8_l1_ipe[ctr_8x8] <
   2973                ps_ed_ctb_l1->i4_sad_me_for_ref[ctr_8x8])
   2974             {
   2975                 ps_l0_ipe_out_ctb->ai4_8x8_act_factor[ctr_8x8] =
   2976                     ps_l0_ipe_out_ctb->i4_16x16_act_factor[ctr_8x8][1][0];
   2977             }
   2978             else
   2979             {
   2980                 ps_l0_ipe_out_ctb->ai4_8x8_act_factor[ctr_8x8] =
   2981                     ps_l0_ipe_out_ctb->i4_16x16_act_factor[ctr_8x8][1][0];
   2982             }
   2983 
   2984             /*Accumalate activity factor at frame level*/
   2985             ps_ctxt->i8_frame_acc_act_factor += ps_l0_ipe_out_ctb->ai4_8x8_act_factor[ctr_8x8];
   2986         }
   2987     }
   2988     return;
   2989 }
   2990 
   2991 WORD32 ihevce_nxn_sad_computer(
   2992     UWORD8 *pu1_inp, WORD32 i4_inp_stride, UWORD8 *pu1_ref, WORD32 i4_ref_stride, WORD32 trans_size)
   2993 {
   2994     WORD32 wd, ht, i, j;
   2995     WORD32 sad = 0;
   2996 
   2997     wd = trans_size;
   2998     ht = trans_size;
   2999 
   3000     for(i = 0; i < ht; i++)
   3001     {
   3002         for(j = 0; j < wd; j++)
   3003         {
   3004             sad += (ABS(((WORD32)pu1_inp[j] - (WORD32)pu1_ref[j])));
   3005         }
   3006         pu1_inp += i4_inp_stride;
   3007         pu1_ref += i4_ref_stride;
   3008     }
   3009 
   3010     return sad;
   3011 }
   3012 
   3013 /*!
   3014 ******************************************************************************
   3015 * \if Function name : ihevce_mode_eval_filtering \endif
   3016 *
   3017 * \brief
   3018 *    Evaluates best 3 modes for the given CU size with probable modes from,
   3019 *    early decision structure, mpm candidates and dc, planar mode
   3020 *
   3021 * \param[in] ps_cu_node : pointer to MAX cu node info buffer
   3022 * \param[in] ps_child_cu_node : pointer to (MAX - 1) cu node info buffer
   3023 * \param[in] ps_ctxt : pointer to IPE context struct
   3024 * \param[in] ps_curr_src : pointer to src pixels struct
   3025 * \param[in] best_amode : best angular mode from l1 layer or
   3026                             from (MAX - 1) CU mode
   3027 * \param[in] best_costs_4x4  : pointer to 3 best cost buffer
   3028 * \param[in] best_modes_4x4  : pointer to 3 best mode buffer
   3029 * \param[in] step2_bypass : if 0, (MAX - 1) CU is evaluated
   3030 *                           if 1, (MAX CU) sugested is evaluated
   3031 * \param[in] tu_eq_cu     : indicates if tu size is same as cu or cu/2
   3032 *
   3033 * \return
   3034 *    None
   3035 *
   3036 * \author
   3037 *  Ittiam
   3038 *
   3039 *****************************************************************************
   3040 */
   3041 void ihevce_mode_eval_filtering(
   3042     ihevce_ipe_cu_tree_t *ps_cu_node,
   3043     ihevce_ipe_cu_tree_t *ps_child_cu_node,
   3044     ihevce_ipe_ctxt_t *ps_ctxt,
   3045     iv_enc_yuv_buf_t *ps_curr_src,
   3046     WORD32 best_amode,
   3047     WORD32 *best_costs_4x4,
   3048     UWORD8 *best_modes_4x4,
   3049     WORD32 step2_bypass,
   3050     WORD32 tu_eq_cu)
   3051 {
   3052     UWORD8 *pu1_origin, *pu1_orig;
   3053     WORD32 src_strd = ps_curr_src->i4_y_strd;
   3054     WORD32 nbr_flags;
   3055     nbr_avail_flags_t s_nbr;
   3056     WORD32 trans_size = tu_eq_cu ? ps_cu_node->u1_cu_size : ps_cu_node->u1_cu_size >> 1;
   3057     WORD32 num_tu_in_x = tu_eq_cu ? 1 : 2;
   3058     WORD32 num_tu_in_y = tu_eq_cu ? 1 : 2;
   3059     UWORD8 mode;
   3060 
   3061     WORD32 cost_ang_mode = MAX_INTRA_COST_IPE;
   3062     WORD32 filter_flag;
   3063     WORD32 cost_amode_step2[7] = { 0 };
   3064     /*WORD32 best_sad[5];  // NOTE_A01: Not getting consumed at present */
   3065     WORD32 sad = 0;
   3066     WORD32 cu_pos_x, cu_pos_y;
   3067     WORD32 temp;
   3068     WORD32 i = 0, j, k, i_end, z;
   3069     //WORD32 row, col, size;
   3070     UWORD8 *pu1_ref;
   3071     WORD32 xA, yA, xB, yB;
   3072     WORD32 top_intra_mode;
   3073     WORD32 left_intra_mode;
   3074     UWORD8 *pu1_ref_orig = &ps_ctxt->au1_ref_samples[0];
   3075     UWORD8 *pu1_ref_filt = &ps_ctxt->au1_filt_ref_samples[0];
   3076 
   3077     UWORD8 modes_4x4[5] = { 0, 1, 2, 3, 4 };
   3078     WORD32 count;
   3079 
   3080     pf_ipe_res_trans_had apf_resd_trns_had[4];
   3081 
   3082     WORD32 cand_mode_satd_list[3];
   3083     ihevc_intra_pred_luma_ref_substitution_ft *ihevc_intra_pred_luma_ref_substitution_fptr;
   3084 
   3085     ihevc_intra_pred_luma_ref_substitution_fptr =
   3086         ps_ctxt->ps_func_selector->ihevc_intra_pred_luma_ref_substitution_fptr;
   3087 
   3088     apf_resd_trns_had[0] = ps_ctxt->s_cmn_opt_func.pf_HAD_4x4_8bit;
   3089     apf_resd_trns_had[1] = ps_ctxt->s_cmn_opt_func.pf_HAD_8x8_8bit;
   3090     apf_resd_trns_had[2] = ps_ctxt->s_cmn_opt_func.pf_HAD_16x16_8bit;
   3091     apf_resd_trns_had[3] = ps_ctxt->s_cmn_opt_func.pf_HAD_32x32_8bit;
   3092 
   3093     /* initialize modes_to_eval as zero */
   3094     memset(&ps_ctxt->au1_modes_to_eval, 0, MAX_NUM_IP_MODES);
   3095 
   3096     /* Compute the Parent Cost */
   3097 
   3098     /* Pointer to top-left of the CU - y0,x0 in 8x8 granularity */
   3099     pu1_orig = (UWORD8 *)(ps_curr_src->pv_y_buf) + ((ps_cu_node->u2_y0 << 3) * src_strd) +
   3100                (ps_cu_node->u2_x0 << 3);
   3101 
   3102     /* Get position of CU within CTB at 4x4 granularity */
   3103     cu_pos_x = ps_cu_node->u2_x0 << 1;
   3104     cu_pos_y = ps_cu_node->u2_y0 << 1;
   3105 
   3106     /* get the neighbour availability flags */
   3107     ihevce_get_only_nbr_flag(
   3108         &s_nbr,
   3109         ps_ctxt->pu1_ctb_nbr_map,
   3110         ps_ctxt->i4_nbr_map_strd,
   3111         cu_pos_x,
   3112         cu_pos_y,
   3113         trans_size >> 2,
   3114         trans_size >> 2);
   3115 
   3116     /* Traverse for all 4 child blocks in the parent block */
   3117     xA = (ps_cu_node->u2_x0 << 3) >> 2;
   3118     yA = ((ps_cu_node->u2_y0 << 3) >> 2) + 1;
   3119     xB = xA + 1;
   3120     yB = yA - 1;
   3121     left_intra_mode = ps_ctxt->au1_ctb_mode_map[yA][xA];
   3122     top_intra_mode = ps_ctxt->au1_ctb_mode_map[yB][xB];
   3123     /* call the function which populates sad cost for all the modes */
   3124 
   3125     ihevce_intra_populate_mode_bits_cost_bracketing(
   3126         top_intra_mode,
   3127         left_intra_mode,
   3128         s_nbr.u1_top_avail,
   3129         s_nbr.u1_left_avail,
   3130         ps_cu_node->u2_y0,
   3131         &ps_ctxt->au2_mode_bits_satd_cost[0],
   3132         &ps_ctxt->au2_mode_bits_satd[0],
   3133         ps_ctxt->i4_ol_satd_lambda,
   3134         cand_mode_satd_list);
   3135 
   3136     for(k = 0; k < num_tu_in_y; k++)
   3137     {
   3138         for(j = 0; j < num_tu_in_x; j++)
   3139         {
   3140             /* get the neighbour availability flags */
   3141             nbr_flags = ihevce_get_nbr_intra(
   3142                 &s_nbr,
   3143                 ps_ctxt->pu1_ctb_nbr_map,
   3144                 ps_ctxt->i4_nbr_map_strd,
   3145                 cu_pos_x + ((j) * (trans_size >> 2)),
   3146                 cu_pos_y + ((k) * (trans_size >> 2)),
   3147                 trans_size >> 2);
   3148 
   3149             pu1_origin = pu1_orig + (k * trans_size * src_strd) + (j * trans_size);
   3150 
   3151             /* Create reference samples array */
   3152             ihevc_intra_pred_luma_ref_substitution_fptr(
   3153                 pu1_origin - src_strd - 1,
   3154                 pu1_origin - src_strd,
   3155                 pu1_origin - 1,
   3156                 src_strd,
   3157                 trans_size,
   3158                 nbr_flags,
   3159                 pu1_ref_orig,
   3160                 0);
   3161 
   3162             /* Perform reference samples filtering */
   3163             ihevce_intra_pred_ref_filtering(pu1_ref_orig, trans_size, pu1_ref_filt);
   3164 
   3165             ihevce_set_nbr_map(
   3166                 ps_ctxt->pu1_ctb_nbr_map,
   3167                 ps_ctxt->i4_nbr_map_strd,
   3168                 cu_pos_x + ((j) * (trans_size >> 2)),
   3169                 cu_pos_y + ((k) * (trans_size >> 2)),
   3170                 (trans_size >> 2),
   3171                 1);
   3172 
   3173             pu1_ref_orig += (4 * MAX_CTB_SIZE + 1);
   3174             pu1_ref_filt += (4 * MAX_CTB_SIZE + 1);
   3175         }
   3176     }
   3177 
   3178     /* Revaluation for angular mode */
   3179     //if(ps_ed_blk->ang_attr.mode_present == 1)
   3180     //if(((best_amode & 0x1) != 1))
   3181 
   3182     {
   3183         WORD32 u1_trans_idx = trans_size >> 3;
   3184         if(trans_size == 32)
   3185             u1_trans_idx = 3;
   3186         //best_amode = ps_ed_blk->ang_attr.best_mode;
   3187 
   3188         i = 0;
   3189         if(!step2_bypass)
   3190         {
   3191             /* Around best level 4 angular mode, search for best level 2 mode */
   3192             ASSERT((best_amode >= 2) && (best_amode <= 34));
   3193 
   3194             if(ps_ctxt->i4_quality_preset <= IHEVCE_QUALITY_P3)
   3195             {
   3196                 if(best_amode >= 4)
   3197                     ps_ctxt->au1_modes_to_eval_temp[i++] = best_amode - 2;
   3198             }
   3199 
   3200             ps_ctxt->au1_modes_to_eval_temp[i++] = best_amode;
   3201 
   3202             if(ps_ctxt->i4_quality_preset <= IHEVCE_QUALITY_P3)
   3203             {
   3204                 if(best_amode <= 32)
   3205                     ps_ctxt->au1_modes_to_eval_temp[i++] = best_amode + 2;
   3206             }
   3207         }
   3208         else
   3209         {
   3210             ps_ctxt->au1_modes_to_eval_temp[i++] = ps_child_cu_node->ps_sub_cu[0]->best_mode;
   3211             ps_ctxt->au1_modes_to_eval_temp[i++] = ps_child_cu_node->ps_sub_cu[1]->best_mode;
   3212             ps_ctxt->au1_modes_to_eval_temp[i++] = ps_child_cu_node->ps_sub_cu[2]->best_mode;
   3213             ps_ctxt->au1_modes_to_eval_temp[i++] = ps_child_cu_node->ps_sub_cu[3]->best_mode;
   3214         }
   3215 
   3216         /* Add the left and top MPM modes for computation*/
   3217 
   3218         ps_ctxt->au1_modes_to_eval_temp[i++] = cand_mode_satd_list[0];
   3219         ps_ctxt->au1_modes_to_eval_temp[i++] = cand_mode_satd_list[1];
   3220 
   3221         i_end = i;
   3222         count = 0;
   3223 
   3224         /*Remove duplicate modes from modes_to_eval_temp[] */
   3225         for(j = 0; j < i_end; j++)
   3226         {
   3227             for(k = 0; k < count; k++)
   3228             {
   3229                 if(ps_ctxt->au1_modes_to_eval_temp[j] == ps_ctxt->au1_modes_to_eval[k])
   3230                     break;
   3231             }
   3232             if((k == count) && (ps_ctxt->au1_modes_to_eval_temp[j] > 1))
   3233             {
   3234                 ps_ctxt->au1_modes_to_eval[count] = ps_ctxt->au1_modes_to_eval_temp[j];
   3235                 count++;
   3236             }
   3237         }
   3238         i_end = count;
   3239         if(count == 0)
   3240         {
   3241             ps_ctxt->au1_modes_to_eval[0] = 26;
   3242             i_end = 1;
   3243         }
   3244 
   3245         for(i = 0; i < i_end; i++)
   3246         {
   3247             pu1_ref_orig = &ps_ctxt->au1_ref_samples[0];
   3248             pu1_ref_filt = &ps_ctxt->au1_filt_ref_samples[0];
   3249 
   3250             mode = ps_ctxt->au1_modes_to_eval[i];
   3251             ASSERT((mode >= 2) && (mode <= 34));
   3252             cost_amode_step2[i] = ps_ctxt->au2_mode_bits_satd_cost[mode];
   3253             filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(trans_size) - 2));
   3254 
   3255             for(k = 0; k < num_tu_in_y; k++)
   3256             {
   3257                 for(j = 0; j < num_tu_in_x; j++)
   3258                 {
   3259                     pu1_origin = pu1_orig + (k * trans_size * src_strd) + (j * trans_size);
   3260 
   3261                     if(0 == filter_flag)
   3262                         pu1_ref = pu1_ref_orig;
   3263                     else
   3264                         pu1_ref = pu1_ref_filt;
   3265 
   3266                     g_apf_lum_ip[g_i4_ip_funcs[mode]](
   3267                         pu1_ref, 0, &ps_ctxt->au1_pred_samples[0], trans_size, trans_size, mode);
   3268 
   3269                     if(ps_ctxt->u1_use_satd)
   3270                     {
   3271                         sad = apf_resd_trns_had[u1_trans_idx](
   3272                             pu1_origin,
   3273                             ps_curr_src->i4_y_strd,
   3274                             &ps_ctxt->au1_pred_samples[0],
   3275                             trans_size,
   3276                             NULL,
   3277                             0
   3278 
   3279                         );
   3280                     }
   3281                     else
   3282                     {
   3283                         sad = ps_ctxt->s_ipe_optimised_function_list.pf_nxn_sad_computer(
   3284                             pu1_origin,
   3285                             ps_curr_src->i4_y_strd,
   3286                             &ps_ctxt->au1_pred_samples[0],
   3287                             trans_size,
   3288                             trans_size);
   3289                     }
   3290 
   3291                     cost_amode_step2[i] += sad;
   3292 
   3293                     pu1_ref_orig += (4 * MAX_CTB_SIZE + 1);
   3294                     pu1_ref_filt += (4 * MAX_CTB_SIZE + 1);
   3295                 }
   3296             }
   3297         }
   3298         best_amode = ps_ctxt->au1_modes_to_eval[0];
   3299         /*Init cost indx */
   3300         cost_ang_mode = MAX_INTRA_COST_IPE;  //cost_amode_step2[0];
   3301         for(z = 0; z < i_end; z++)
   3302         {
   3303             /* Least cost of all 3 angles are stored in cost_amode_step2[0] and corr. mode*/
   3304             if(cost_ang_mode >= cost_amode_step2[z])
   3305             {
   3306                 if(cost_ang_mode == cost_amode_step2[z])
   3307                 {
   3308                     if(best_amode > ps_ctxt->au1_modes_to_eval[z])
   3309                         best_amode = ps_ctxt->au1_modes_to_eval[z];
   3310                 }
   3311                 else
   3312                 {
   3313                     best_amode = ps_ctxt->au1_modes_to_eval[z];
   3314                 }
   3315                 cost_ang_mode = cost_amode_step2[z];
   3316             }
   3317         }
   3318 
   3319         /*Modify mode bits for the angular modes */
   3320     }
   3321 
   3322     {
   3323         /* Step - I modification */
   3324         ASSERT((best_amode >= 2) && (best_amode <= 34));
   3325         i_end = 0;
   3326         z = 0;
   3327 
   3328         /* Around best level 3 angular mode, search for best level 1 mode */
   3329         ps_ctxt->au1_modes_to_eval[i_end++] = 0;
   3330         ps_ctxt->au1_modes_to_eval[i_end++] = 1;
   3331 
   3332         if(best_amode != 2)
   3333             ps_ctxt->au1_modes_to_eval[i_end++] = best_amode - 1;
   3334 
   3335         ps_ctxt->au1_modes_to_eval[i_end++] = best_amode;
   3336 
   3337         if(best_amode != 34)
   3338             ps_ctxt->au1_modes_to_eval[i_end++] = best_amode + 1;
   3339 
   3340         /* Inserting step_2's best mode at last to avoid
   3341         recalculation of it's SATD cost */
   3342 
   3343         //ps_ctxt->au1_modes_to_eval[i_end] = best_amode; //Bugfix: HSAD compared with SAD
   3344         //cost_amode_step2[i_end] = cost_ang_mode;
   3345 
   3346         /*best_sad[i_end] = cost_ang_mode
   3347                 - mode_bits_satd_cost[best_amode]; //See NOTE_A01 above */
   3348 
   3349         cost_ang_mode = MAX_INTRA_COST_IPE; /* Init cost */
   3350 
   3351         for(i = 0; i < i_end; i++)
   3352         {
   3353             WORD32 u1_trans_idx = trans_size >> 3;
   3354             if(trans_size == 32)
   3355                 u1_trans_idx = 3;
   3356             pu1_ref_orig = &ps_ctxt->au1_ref_samples[0];
   3357             pu1_ref_filt = &ps_ctxt->au1_filt_ref_samples[0];
   3358 
   3359             /*best_sad[i] = 0; //See NOTE_A01 above */
   3360             mode = ps_ctxt->au1_modes_to_eval[i];
   3361             cost_amode_step2[i] = ps_ctxt->au2_mode_bits_satd_cost[mode];
   3362             filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(trans_size) - 2));
   3363 
   3364             for(k = 0; k < num_tu_in_y; k++)
   3365             {
   3366                 for(j = 0; j < num_tu_in_x; j++)
   3367                 {
   3368                     pu1_origin = pu1_orig + (k * trans_size * src_strd) + (j * trans_size);
   3369 
   3370                     if(0 == filter_flag)
   3371                         pu1_ref = pu1_ref_orig;
   3372                     else
   3373                         pu1_ref = pu1_ref_filt;
   3374 
   3375                     g_apf_lum_ip[g_i4_ip_funcs[mode]](
   3376                         pu1_ref, 0, &ps_ctxt->au1_pred_samples[0], trans_size, trans_size, mode);
   3377 
   3378                     //if(trans_size != 4)
   3379                     {
   3380                         sad = apf_resd_trns_had[u1_trans_idx](
   3381                             pu1_origin,
   3382                             ps_curr_src->i4_y_strd,
   3383                             &ps_ctxt->au1_pred_samples[0],
   3384                             trans_size,
   3385                             NULL,
   3386                             0);
   3387                     }
   3388 
   3389                     /*accumualting SATD though name says it is sad*/
   3390                     cost_amode_step2[i] += sad;
   3391                     /*best_sad[i] +=sad; //See NOTE_A01 above */
   3392                     pu1_ref_orig += (4 * MAX_CTB_SIZE + 1);
   3393                     pu1_ref_filt += (4 * MAX_CTB_SIZE + 1);
   3394                 }
   3395             }
   3396         }
   3397         /* Updating i_end for the step_2's inserted mode*/
   3398         //        i_end++;
   3399 
   3400         /* Arrange the reference array in ascending order */
   3401 
   3402         for(i = 0; i < (i_end - 1); i++)
   3403         {
   3404             for(j = i + 1; j < i_end; j++)
   3405             {
   3406                 if(cost_amode_step2[i] > cost_amode_step2[j])
   3407                 {
   3408                     temp = cost_amode_step2[i];
   3409                     cost_amode_step2[i] = cost_amode_step2[j];
   3410                     cost_amode_step2[j] = temp;
   3411 
   3412                     temp = modes_4x4[i];
   3413                     modes_4x4[i] = modes_4x4[j];
   3414                     modes_4x4[j] = temp;
   3415                 }
   3416             }
   3417         }
   3418 
   3419         /* Least cost of all 3 angles are stored in cost_amode_step2[0] and corr. mode*/
   3420         best_amode = ps_ctxt->au1_modes_to_eval[modes_4x4[0]];
   3421         cost_ang_mode = cost_amode_step2[0];
   3422         ps_cu_node->best_satd = cost_ang_mode - ps_ctxt->au2_mode_bits_satd_cost[best_amode];
   3423         ps_cu_node->best_cost = cost_amode_step2[0];
   3424         ps_cu_node->best_mode = ps_ctxt->au1_modes_to_eval[modes_4x4[0]];
   3425         ps_cu_node->best_satd =
   3426             ps_cu_node->best_cost - ps_ctxt->au2_mode_bits_satd_cost[ps_cu_node->best_mode];
   3427 
   3428         /*Accumalate best mode bits cost for RC*/
   3429         ps_cu_node->u2_mode_bits_cost = ps_ctxt->au2_mode_bits_satd[ps_cu_node->best_mode];
   3430 
   3431         /* Store the best three candidates */
   3432         for(i = 0; i < 3; i++)
   3433         {
   3434             best_costs_4x4[i] = cost_amode_step2[i];
   3435             best_modes_4x4[i] = ps_ctxt->au1_modes_to_eval[modes_4x4[i]];
   3436         }
   3437     }
   3438 
   3439     return;
   3440 }
   3441