Home | History | Annotate | Download | only in encoder
      1 /******************************************************************************
      2  *
      3  * Copyright (C) 2018 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at:
      8  *
      9  * http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  *****************************************************************************
     18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 */
     20 
     21 /*!
     22 ******************************************************************************
     23 * \file ihevce_decomp_pre_intra_pass.c
     24 *
     25 * \brief
     26 *    This file contains definitions related to frame decomposition done during
     27 *    pre intra processing
     28 *
     29 * \date
     30 *    19/02/2013
     31 *
     32 * \author
     33 *    Ittiam
     34 *
     35 * List of Functions
     36 *    ihevce_intra_populate_mode_bits_cost()
     37 *    ihevce_8x8_sad_computer()
     38 *    ihevce_4x4_sad_computer()
     39 *    ihevce_ed_4x4_find_best_modes()
     40 *    ihevce_ed_calc_4x4_blk()
     41 *    ihevce_ed_calc_8x8_blk()
     42 *    ihevce_ed_calc_incomplete_ctb()
     43 *    ihevce_cu_level_qp_mod()
     44 *    ihevce_ed_calc_ctb()
     45 *    ihevce_ed_frame_init()
     46 *    ihevce_scale_by_2()
     47 *    ihevce_decomp_pre_intra_process_row()
     48 *    ihevce_decomp_pre_intra_process()
     49 *    ihevce_decomp_pre_intra_get_num_mem_recs()
     50 *    ihevce_decomp_pre_intra_get_mem_recs()
     51 *    ihevce_decomp_pre_intra_init()
     52 *    ihevce_decomp_pre_intra_frame_init()
     53 *    ihevce_merge_sort()
     54 *    ihevce_decomp_pre_intra_curr_frame_pre_intra_deinit()
     55 *
     56 ******************************************************************************
     57 */
     58 
     59 /*****************************************************************************/
     60 /* File Includes                                                             */
     61 /*****************************************************************************/
     62 /* System include files */
     63 #include <stdio.h>
     64 #include <string.h>
     65 #include <stdlib.h>
     66 #include <assert.h>
     67 #include <stdarg.h>
     68 #include <math.h>
     69 #include <limits.h>
     70 
     71 /* User include files */
     72 #include "ihevc_typedefs.h"
     73 #include "itt_video_api.h"
     74 #include "ihevce_api.h"
     75 
     76 #include "rc_cntrl_param.h"
     77 #include "rc_frame_info_collector.h"
     78 #include "rc_look_ahead_params.h"
     79 
     80 #include "ihevc_defs.h"
     81 #include "ihevc_debug.h"
     82 #include "ihevc_structs.h"
     83 #include "ihevc_platform_macros.h"
     84 #include "ihevc_deblk.h"
     85 #include "ihevc_itrans_recon.h"
     86 #include "ihevc_chroma_itrans_recon.h"
     87 #include "ihevc_chroma_intra_pred.h"
     88 #include "ihevc_intra_pred.h"
     89 #include "ihevc_inter_pred.h"
     90 #include "ihevc_mem_fns.h"
     91 #include "ihevc_padding.h"
     92 #include "ihevc_weighted_pred.h"
     93 #include "ihevc_sao.h"
     94 #include "ihevc_resi_trans.h"
     95 #include "ihevc_quant_iquant_ssd.h"
     96 #include "ihevc_cabac_tables.h"
     97 
     98 #include "ihevce_defs.h"
     99 #include "ihevce_hle_interface.h"
    100 #include "ihevce_lap_enc_structs.h"
    101 #include "ihevce_multi_thrd_structs.h"
    102 #include "ihevce_multi_thrd_funcs.h"
    103 #include "ihevce_me_common_defs.h"
    104 #include "ihevce_had_satd.h"
    105 #include "ihevce_error_codes.h"
    106 #include "ihevce_bitstream.h"
    107 #include "ihevce_cabac.h"
    108 #include "ihevce_rdoq_macros.h"
    109 #include "ihevce_function_selector.h"
    110 #include "ihevce_enc_structs.h"
    111 #include "ihevce_entropy_structs.h"
    112 #include "ihevce_cmn_utils_instr_set_router.h"
    113 #include "ihevce_ipe_instr_set_router.h"
    114 #include "ihevce_decomp_pre_intra_structs.h"
    115 #include "ihevce_decomp_pre_intra_pass.h"
    116 #include "ihevce_enc_loop_structs.h"
    117 #include "hme_datatype.h"
    118 #include "hme_interface.h"
    119 #include "hme_common_defs.h"
    120 #include "ihevce_global_tables.h"
    121 
    122 /*****************************************************************************/
    123 /* Typedefs                                                                  */
    124 /*****************************************************************************/
    /* Pointer-to-function type: returns void and takes the 13 parameters
     * listed below (context, two output block pointers, source pointer and
     * stride, block counts in x and y, neighbour flags, layer id, row and
     * column block numbers, and the two optimised-function tables).
     * NOTE(review): the name and the file's function list
     * (ihevce_ed_calc_ctb / ihevce_ed_calc_incomplete_ctb) suggest it is used
     * to pick between those two routines; the use site is outside the portion
     * of the file reviewed here - confirm. */
    125 typedef void (*pf_ed_calc_ctb)(
    126     ihevce_ed_ctxt_t *ps_ed_ctxt,
    127     ihevce_ed_blk_t *ps_ed_ctb,
    128     ihevce_ed_ctb_l1_t *ps_ed_ctb_l1,
    129     UWORD8 *pu1_src,
    130     WORD32 src_stride,
    131     WORD32 num_4x4_blks_x,
    132     WORD32 num_4x4_blks_y,
    133     WORD32 *nbr_flags,
    134     WORD32 i4_layer_id,
    135     WORD32 row_block_no,
    136     WORD32 col_block_no,
    137     ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list,
    138     ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list);
    139 
    140 /*****************************************************************************/
    141 /* Constant Macros                                                           */
    142 /*****************************************************************************/
    /* File-local tuning constants.
     * NOTE(review): none of these macros is referenced in the portion of the
     * file reviewed here. Going by the names alone they look like SATD
     * noise-floor / variance thresholds and scale factors - confirm the
     * meaning and units at each use site before changing a value. */
    143 #define SATD_NOISE_FLOOR_THRESHOLD 16
    144 #define MINIMUM_VARIANCE 15
    145 #define SCALE_FACTOR_VARIANCE 20
    146 #define SCALE_FACTOR_VARIANCE_8x8 60
    147 #define MIN_SATD_THRSHLD 0
    148 #define MAX_SATD_THRSHLD 64
    149 #define SUB_NOISE_THRSHLD 0
    150 #define MIN_BLKS 2
    151 
    152 /*****************************************************************************/
    153 /* Global variables                                                          */
    154 /*****************************************************************************/
    155 
    156 /**
    157 *****************************************************************************
    158 * @brief  list of pointers to luma intra pred functions
        *
        * The table has NUM_IP_FUNCS entries. Code in this file does not index
        * it by intra mode directly; it goes through a second table:
        *     g_apf_lum_ip[g_i4_ip_funcs[mode]](ref, 0, pred, 4, 4, mode);
        * (the 8x8 path passes 8, 8 in place of 4, 4).
        *
        * Declared here without `static` and without an initializer.
        * NOTE(review): presumably populated by an init routine elsewhere
        * before the first call - confirm.
    159 *****************************************************************************
    160 */
    161 pf_intra_pred g_apf_lum_ip[NUM_IP_FUNCS];
    162 
    163 /*****************************************************************************/
    164 /* Function Definitions                                                      */
    165 /*****************************************************************************/
    166 
    167 /*!
    168 ******************************************************************************
    169 * \if Function name : ihevce_intra_populate_mode_bits_cost \endif
    170 *
    171 * \brief: look-up table of cost of signalling an intra mode in the
    172 *  bitstream
    173 *
    174 *****************************************************************************
    175 */
    176 void ihevce_intra_populate_mode_bits_cost(
    177     WORD32 top_intra_mode,
    178     WORD32 left_intra_mode,
    179     WORD32 available_top,
    180     WORD32 available_left,
    181     WORD32 cu_pos_y,
    182     UWORD16 *mode_bits_cost,
    183     WORD32 lambda)
    184 {
    185     WORD32 i;
    186     // 5.5 * lambda
    187     UWORD16 five_bits_cost = COMPUTE_RATE_COST_CLIP30(11, lambda, (LAMBDA_Q_SHIFT + 1));
    188 
    189     (void)top_intra_mode;
    190     (void)left_intra_mode;
    191     (void)available_top;
    192     (void)available_left;
    193     (void)cu_pos_y;
    194     for(i = 0; i < NUM_MODES; i++)
    195     {
    196         mode_bits_cost[i] = five_bits_cost;
    197     }
    198 }
    199 
    200 /*!
    201 ******************************************************************************
    202 * \if Function name : ihevce_8x8_sad_computer \endif
    203 *
    204 * \brief: compute sad between 2 8x8 blocks
    205 *
    206 *****************************************************************************
    207 */
    208 UWORD16
    209     ihevce_8x8_sad_computer(UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd)
    210 {
    211     UWORD16 sad = 0;
    212     WORD32 i, j;
    213 
    214     for(i = 0; i < 8; i++)
    215     {
    216         for(j = 0; j < 8; j++)
    217         {
    218             sad += ABS(*pu1_src - *pu1_pred);
    219             pu1_src++;
    220             pu1_pred++;
    221         }
    222         pu1_src = pu1_src + (src_strd - 8);
    223         pu1_pred = pu1_pred + (pred_strd - 8);
    224     }
    225 
    226     return sad;
    227 }
    228 
    229 /*!
    230 ******************************************************************************
    231 * \if Function name : ihevce_4x4_sad_computer \endif
    232 *
    233 * \brief: compute sad between 2 4x4 blocks
    234 *
    235 *****************************************************************************
    236 */
    237 UWORD16
    238     ihevce_4x4_sad_computer(UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd)
    239 {
    240     UWORD16 sad = 0;
    241     WORD32 i, j;
    242 
    243     for(i = 0; i < 4; i++)
    244     {
    245         for(j = 0; j < 4; j++)
    246         {
    247             sad += ABS(*pu1_src - *pu1_pred);
    248             pu1_src++;
    249             pu1_pred++;
    250         }
    251         pu1_src = pu1_src + (src_strd - 4);
    252         pu1_pred = pu1_pred + (pred_strd - 4);
    253     }
    254 
    255     return sad;
    256 }
    257 
    258 /*!
    259 ******************************************************************************
    260 * \if Function name : ihevce_ed_4x4_find_best_modes \endif
    261 *
    262 * \brief: evaluate input 4x4 block for pre-selected list of angular and normal
    263 *  intra modes and return best sad, cost
    264 *
    265 *****************************************************************************
    266 */
    267 void ihevce_ed_4x4_find_best_modes(
    268     UWORD8 *pu1_src,
    269     WORD32 src_stride,
    270     UWORD8 *ref,
    271     UWORD16 *mode_bits_cost,
    272     UWORD8 *pu1_best_modes,
    273     WORD32 *pu1_best_sad_costs,
    274     WORD32 u1_low_resol,
    275     FT_SAD_COMPUTER *pf_4x4_sad_computer)
    276 {
    277     WORD32 i;
    278     UWORD8 mode = 0, best_amode = 0, best_nmode = 0;
    279     UWORD8 pred[16];
    280     WORD32 sad = 0;
    281     WORD32 sad_cost = 0;
    282     WORD32 best_asad_cost = 0xFFFFF;
    283     WORD32 best_nsad_cost = 0xFFFFF;
    284 
    285     /* If lower layers, l1 or l2, all the 11 modes are evaluated */
    286     /* If L0 layer, all modes excluding DC and Planar are evaluated */
    287     if(1 == u1_low_resol)
    288         i = 0;
    289     else
    290         i = 2;
    291 
    292     /* Find the best non-angular and angular mode till level 4 */
    293     for(; i < 11; i++)
    294     {
    295         mode = gau1_modes_to_eval[i];
    296         g_apf_lum_ip[g_i4_ip_funcs[mode]](&ref[0], 0, &pred[0], 4, 4, mode);
    297         sad = pf_4x4_sad_computer(pu1_src, &pred[0], src_stride, 4);
    298         sad_cost = sad;
    299         sad_cost += mode_bits_cost[mode];
    300         if(mode < 2)
    301         {
    302             if(sad_cost < best_nsad_cost)
    303             {
    304                 best_nmode = mode;
    305                 best_nsad_cost = sad_cost;
    306             }
    307         }
    308         else
    309         {
    310             if(sad_cost < best_asad_cost)
    311             {
    312                 best_amode = mode;
    313                 best_asad_cost = sad_cost;
    314             }
    315         }
    316     }
    317 
    318     pu1_best_modes[0] = best_amode;
    319     pu1_best_sad_costs[0] = best_asad_cost;
    320 
    321     /* Accumalate the best non-angular mode and cost for the l1 and l2 layers */
    322     if(1 == u1_low_resol)
    323     {
    324         pu1_best_modes[1] = best_nmode;
    325         pu1_best_sad_costs[1] = best_nsad_cost;
    326     }
    327 }
    328 
    329 /*!
    330 ******************************************************************************
    331 * \if Function name : ihevce_ed_calc_4x4_blk \endif
    332 *
    333 * \brief: evaluate input 4x4 block for all intra modes and return best sad &
    334 *  cost
    335 *
    336 *****************************************************************************
    337 */
    338 static void ihevce_ed_calc_4x4_blk(
    339     ihevce_ed_blk_t *ps_ed,
    340     UWORD8 *pu1_src,
    341     WORD32 src_stride,
    342     UWORD8 *ref,
    343     UWORD16 *mode_bits_cost,
    344     WORD32 *sad_ptr,
    345     WORD32 *pi4_best_satd,
    346     WORD32 i4_quality_preset,
    347     WORD32 *pi4_best_sad_cost,
    348     ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list)
    349 {
    350     WORD32 i, i_end;
    351     UWORD8 mode, best_amode, best_nmode;
    352     UWORD8 pred[16];
    353 
    354     UWORD16 sad;
    355     WORD32 sad_cost = 0;
    356     WORD32 best_asad_cost = 0xFFFFF;
    357     WORD32 best_nsad_cost = 0xFFFFF;
    358 
    359     UWORD8 au1_best_modes[2];
    360     WORD32 ai4_best_sad_costs[2];
    361 
    362     /* L1/L2 resolution hence low resolution enable */
    363     WORD32 u1_low_resol = 1;
    364 
    365     UWORD8 modes_to_eval[2];
    366 
    367     /* The *pi4_best_satd will be consumed only if current
    368     layer has odd number of 4x4 blocks in either x or y
    369     direction. But the function hme_derive_num_layers() makes
    370     sure that every layer has width and height such that each one
    371     is a multiple of 16. Which makes pi4_best_satd useless. Hence
    372     feel free to remove pi4_best_satd. Concluded on 29th Aug13 */
    373     *pi4_best_satd = -1;
    374     ps_ipe_optimised_function_list->pf_ed_4x4_find_best_modes(
    375         pu1_src,
    376         src_stride,
    377         ref,
    378         mode_bits_cost,
    379         au1_best_modes,
    380         ai4_best_sad_costs,
    381         u1_low_resol,
    382         ps_ipe_optimised_function_list->pf_4x4_sad_computer);
    383 
    384     best_nmode = au1_best_modes[1];
    385     best_amode = au1_best_modes[0];
    386     best_nsad_cost = ai4_best_sad_costs[1];
    387     best_asad_cost = ai4_best_sad_costs[0];
    388 
    389     /* Updation of pi4_best_satd here needed iff the mode given by
    390     ihevce_ed_4x4_find_best_modes() comes out to be
    391     the best mode at the end of the function */
    392     *pi4_best_satd = best_asad_cost - mode_bits_cost[best_amode];
    393 
    394     /* Around best level 4 angular mode, search for best level 2 mode */
    395     modes_to_eval[0] = best_amode - 2;
    396     modes_to_eval[1] = best_amode + 2;
    397     i = 0;
    398     i_end = 2;
    399     if(best_amode == 2)
    400         i = 1;
    401     else if(best_amode == 34)
    402         i_end = 1;
    403     for(; i < i_end; i++)
    404     {
    405         mode = modes_to_eval[i];
    406         g_apf_lum_ip[g_i4_ip_funcs[mode]](&ref[0], 0, &pred[0], 4, 4, mode);
    407         sad = ps_ipe_optimised_function_list->pf_4x4_sad_computer(pu1_src, &pred[0], src_stride, 4);
    408         sad_cost = sad;
    409         sad_cost += mode_bits_cost[mode];
    410         if(sad_cost < best_asad_cost)
    411         {
    412             best_amode = mode;
    413             best_asad_cost = sad_cost;
    414             *pi4_best_satd = sad;
    415         }
    416         sad_ptr[mode] = sad;
    417     }
    418 
    419     /*To be done : Add a flag here instead of preset condn*/
    420     if((i4_quality_preset < IHEVCE_QUALITY_P4))
    421     {
    422         /* Around best level 2 angular mode, search for best level 1 mode */
    423         modes_to_eval[0] = best_amode - 1;
    424         modes_to_eval[1] = best_amode + 1;
    425         i = 0;
    426         i_end = 2;
    427         if(best_amode == 2)
    428             i = 1;
    429         else if(best_amode == 34)
    430             i_end = 1;
    431         for(; i < i_end; i++)
    432         {
    433             mode = modes_to_eval[i];
    434             g_apf_lum_ip[g_i4_ip_funcs[mode]](&ref[0], 0, &pred[0], 4, 4, mode);
    435             sad = ps_ipe_optimised_function_list->pf_4x4_sad_computer(
    436                 pu1_src, &pred[0], src_stride, 4);
    437             sad_cost = sad;
    438             sad_cost += mode_bits_cost[mode];
    439             if(sad_cost < best_asad_cost)
    440             {
    441                 best_amode = mode;
    442                 best_asad_cost = sad_cost;
    443                 *pi4_best_satd = sad;
    444             }
    445             sad_ptr[mode] = sad;
    446         }
    447     }
    448 
    449     if(best_asad_cost < best_nsad_cost)
    450     {
    451         ps_ed->best_mode = best_amode;
    452         *pi4_best_sad_cost = best_asad_cost;
    453     }
    454     else
    455     {
    456         ps_ed->best_mode = best_nmode;
    457         *pi4_best_sad_cost = best_nsad_cost;
    458     }
    459     ps_ed->intra_or_inter = 0;
    460     ps_ed->merge_success = 0;
    461 }
    462 
    463 /*!
    464 ******************************************************************************
    465 * \if Function name : ihevce_ed_calc_8x8_blk \endif
    466 *
    467 * \brief: evaluate input 8x8 block for intra modes basing on the intra mode
    468 *  decisions made at 4x4 level. This function also makes a decision whether
    469 *  to split blk in to 4x4 partitions or not.
    470 *
    471 *****************************************************************************
    472 */
    473 static void ihevce_ed_calc_8x8_blk(
    474     ihevce_ed_ctxt_t *ps_ed_ctxt,
    475     ihevce_ed_blk_t *ps_ed_8x8,
    476     UWORD8 *pu1_src,
    477     WORD32 src_stride,
    478     WORD32 *nbr_flags_ptr,
    479     WORD32 *top_intra_mode_ptr,
    480     WORD32 *left_intra_mode_ptr,
    481     WORD32 cu_pos_y,
    482     WORD32 lambda,
    483     WORD32 *sad_ptr_8x8,
    484     WORD32 *pi4_best_satd,
    485     WORD32 i4_layer_id,
    486     WORD32 i4_quality_preset,
    487     WORD32 i4_slice_type,
    488     WORD32 *pi4_best_sad_cost_8x8_l1_ipe,
    489     WORD32 *pi4_best_sad_8x8_l1_ipe,
    490     WORD32 *pi4_sum_4x4_satd,
    491     WORD32 *pi4_min_4x4_satd,
    492     ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list,
    493     ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list)
    494 {
    495     WORD32 i, j;
    496     WORD32 nbr_flags, nbr_flags_TR;
    497     UWORD8 *pu1_src_4x4;
    498     WORD32 top_available;
    499     WORD32 left_available;
    500     ihevce_ed_blk_t *ps_ed_4x4 = ps_ed_8x8;
    501     WORD32 top_intra_mode;
    502     WORD32 left_intra_mode;
    503     WORD32 next_left_intra_mode;
    504     WORD32 *sad_ptr = sad_ptr_8x8;
    505     UWORD8 *pu1_src_arr[4];
    506     WORD32 i4_4x4_best_sad_cost[4];
    507     func_selector_t *ps_func_selector = ps_ed_ctxt->ps_func_selector;
    508     ihevc_intra_pred_luma_ref_substitution_ft *pf_intra_pred_luma_ref_substitution =
    509         ps_func_selector->ihevc_intra_pred_luma_ref_substitution_fptr;
    510 
    511     (void)i4_slice_type;
    512 
    513     /* Compute ref samples for 8x8 merge block */
    514     nbr_flags = nbr_flags_ptr[0];
    515     nbr_flags_TR = nbr_flags_ptr[1];
    516 
    517     if(CHECK_TR_AVAILABLE(nbr_flags_TR))
    518     {
    519         SET_TR_AVAILABLE(nbr_flags);
    520     }
    521     else
    522     {
    523         SET_TR_UNAVAILABLE(nbr_flags);
    524     }
    525 
    526     if(CHECK_BL_AVAILABLE(nbr_flags))
    527     {
    528         SET_BL_AVAILABLE(nbr_flags);
    529     }
    530     else
    531     {
    532         SET_BL_UNAVAILABLE(nbr_flags);
    533     }
    534 
    535     /* call the function which populates ref data for intra predicion */
    536     pf_intra_pred_luma_ref_substitution(
    537         pu1_src - src_stride - 1,
    538         pu1_src - src_stride,
    539         pu1_src - 1,
    540         src_stride,
    541         8,
    542         nbr_flags,
    543         &ps_ed_ctxt->au1_ref_8x8[0][0],
    544         0);
    545 
    546     for(i = 0; i < 2; i++)
    547     {
    548         pu1_src_4x4 = pu1_src + i * 4 * src_stride;
    549         cu_pos_y += i * 4;
    550         next_left_intra_mode = left_intra_mode_ptr[i];
    551         for(j = 0; j < 2; j++)
    552         {
    553             WORD32 i4_best_satd;
    554             pu1_src_arr[i * 2 + j] = pu1_src_4x4;
    555             nbr_flags = nbr_flags_ptr[i * 8 + j];
    556             top_intra_mode = top_intra_mode_ptr[j];
    557             left_intra_mode = next_left_intra_mode;
    558             /* call the function which populates ref data for intra predicion */
    559             pf_intra_pred_luma_ref_substitution(
    560                 pu1_src_4x4 - src_stride - 1,
    561                 pu1_src_4x4 - src_stride,
    562                 pu1_src_4x4 - 1,
    563                 src_stride,
    564                 4,
    565                 nbr_flags,
    566                 &ps_ed_ctxt->au1_ref_full_ctb[i * 2 + j][0],
    567                 0);
    568 
    569             top_available = CHECK_T_AVAILABLE(nbr_flags);
    570             left_available = CHECK_L_AVAILABLE(nbr_flags);
    571             /* call the function which populates sad cost for all the modes */
    572             ihevce_intra_populate_mode_bits_cost(
    573                 top_intra_mode,
    574                 left_intra_mode,
    575                 top_available,
    576                 left_available,
    577                 cu_pos_y,
    578                 &ps_ed_ctxt->au2_mode_bits_cost_full_ctb[i * 2 + j][0],
    579                 lambda);
    580             ihevce_ed_calc_4x4_blk(
    581                 ps_ed_4x4,
    582                 pu1_src_4x4,
    583                 src_stride,
    584                 &ps_ed_ctxt->au1_ref_full_ctb[i * 2 + j][0],
    585                 &ps_ed_ctxt->au2_mode_bits_cost_full_ctb[i * 2 + j][0],
    586                 sad_ptr,
    587                 &i4_best_satd,
    588                 i4_quality_preset,
    589                 &i4_4x4_best_sad_cost[i * 2 + j],
    590                 ps_ipe_optimised_function_list);
    591 
    592             top_intra_mode_ptr[j] = ps_ed_4x4->best_mode;
    593             next_left_intra_mode = ps_ed_4x4->best_mode;
    594             pu1_src_4x4 += 4;
    595             ps_ed_4x4 += 1;
    596             sad_ptr += NUM_MODES;
    597         }
    598         left_intra_mode_ptr[i] = next_left_intra_mode;
    599     }
    600 
    601     /* 8x8 merge */
    602     {
    603         UWORD8 modes_to_eval[6];
    604         WORD32 sad;
    605         UWORD8 pred[16];
    606         UWORD8 pred_8x8[64] = { 0 };
    607         WORD32 merge_success;
    608         UWORD8 mode;
    609 
    610         ps_ed_4x4 = ps_ed_8x8;
    611         mode = (ps_ed_4x4)->best_mode;
    612 
    613         *pi4_best_satd = -1;
    614 
    615         merge_success =
    616             ((((ps_ed_4x4)->best_mode == (ps_ed_4x4 + 1)->best_mode) +
    617               ((ps_ed_4x4)->best_mode == (ps_ed_4x4 + 2)->best_mode) +
    618               ((ps_ed_4x4)->best_mode == (ps_ed_4x4 + 3)->best_mode)) == 3);
    619 
    620         {
    621             WORD32 i4_satd;
    622             //UWORD16 au2_4x4_sad_cost_array[4];/*SAD of 4x4 blocks*/
    623             UWORD16 u2_sum_best_4x4_sad_cost; /*Sum of 4x4 sad costs*/
    624             UWORD16 u2_sum_best_4x4_satd_cost; /*Sum of 4x4 satd costs*/
    625             UWORD8 u1_best_8x8_mode; /*8x8 mode.*/
    626             UWORD16 u2_best_8x8_cost; /*8x8 Cost. Can store SATD/SAD cost*/
    627             WORD32 i4_best_8x8_sad_satd; /* SATD/SAD value of 8x8 block*/
    628             UWORD16 au2_8x8_costs[6] = { 0 }; /*Cost of 8x8 block for 6 modes*/
    629             UWORD8 u1_cond_4x4_satd; /*condition if 4x4 SATD needs to be done*/
    630             UWORD8 u1_cond_8x8_satd; /*condition if 8x8 SATD needs to be done*/
    631             UWORD8 u1_good_quality;
    632             WORD32 i4_merge_success_stage2;
    633 
    634             /*Initiallization*/
    635             *pi4_best_satd = 0;
    636             u2_best_8x8_cost = (UWORD16)(-1) /*max value*/;
    637             u2_sum_best_4x4_sad_cost = 0;
    638             *pi4_sum_4x4_satd = -1;
    639             *pi4_min_4x4_satd = 0x7FFFFFFF;
    640             i4_best_8x8_sad_satd = 0;
    641             u2_sum_best_4x4_satd_cost = 0;
    642             u1_best_8x8_mode = ps_ed_4x4->best_mode;
    643 
    644             /*We thought of "replacing" SATDs by SADs for 4x4 vs 8x8 decision
    645             for speed improvement, but it gave opposite results. Setting
    646             good_quality to 1 in order to throw away the idea of "replacing".*/
    647             u1_good_quality = 1;
    648             //u1_good_quality = ((i4_quality_preset != IHEVCE_QUALITY_P5)
    649             //  && (i4_quality_preset != IHEVCE_QUALITY_P4));
    650 
    651             /*Needed to disable some processing based on speed preset*/
    652             i4_merge_success_stage2 = 0;
    653 
    654             /*Store SAD cost of 4x4 blocks */
    655             for(i = 0; i < 4; i++)
    656             {
    657                 //au2_4x4_sad_cost_array[i] = (ps_ed_4x4 + i)->best_sad_cost;
    658                 u2_sum_best_4x4_sad_cost +=
    659                     i4_4x4_best_sad_cost[i];  //(ps_ed_4x4 + i)->best_sad_cost;
    660                 modes_to_eval[i] = (ps_ed_4x4 + i)->best_mode;
    661                 /*NOTE_01: i4_4x4_satd is not used anywhere at present.
    662                 Setting it to zero to avoid ASSERT failure */
    663                 /*Now taken care of incomplete CTB*/
    664                 //(ps_ed_4x4 + i)->i4_4x4_satd = 0;
    665             }
    666 
    667             /*Calculate SATD/SAd for 4x4 blocks*/
    668             /*For (layer_2 && high_speed): No need to get 4x4 SATDs bcoz
    669             it won't have any impact on quality but speed will improve.*/
    670             u1_cond_4x4_satd = ((1 == i4_layer_id) || (u1_good_quality && (!merge_success)));
    671 
    672             if(u1_cond_4x4_satd)
    673             {
    674                 *pi4_sum_4x4_satd = 0;
    675                 /*FYI: 1. Level 2 doesn't need the SATD.
    676                 2. The 4x4 vs. 8x8 decision for high_speed will
    677                 happen based on SAD. */
    678                 /*Get SATD for 4x4 blocks */
    679                 for(i = 0; i < 4; i++)
    680                 {
    681                     mode = modes_to_eval[i];
    682                     g_apf_lum_ip[g_i4_ip_funcs[mode]](
    683                         &ps_ed_ctxt->au1_ref_full_ctb[i][0], 0, &pred[0], 4, 4, mode);
    684 
    685                     i4_satd = ps_cmn_utils_optimised_function_list->pf_HAD_4x4_8bit(
    686                         pu1_src_arr[i], src_stride, &pred[0], 4, NULL, 0);
    687 
    688                     {
    689                         /*Save 4x4x satd in ed blk struct */
    690                         (ps_ed_4x4 + i)->i4_4x4_satd = i4_satd;
    691                     }
    692 
    693                     /*(ps_ed_4x4 + i)->i4_4x4_satd = i4_satd; // See NOTE_01*/
    694                     u2_sum_best_4x4_satd_cost +=
    695                         ((UWORD16)i4_satd + ps_ed_ctxt->au2_mode_bits_cost_full_ctb[i][mode]);
    696                     *pi4_best_satd += i4_satd;
    697                 }
    698             }
    699             /* Not being used in current code */
    700             else /* (Level_2 && extreme_speed) */
    701             {
    702                 /******DONT ENTER HERE AT aNY COST***************************/
    703                 /* Transistor killers lie ahead!!!!!!! */
    704                 /*This else part is not getting executed as of now*/
    705                 if(2 != i4_layer_id)
    706                     ASSERT(0);
    707                 /*Update values by SAD_cost_array */
    708                 for(i = 0; i < 4; i++)
    709                 {
    710                     mode = modes_to_eval[i];
    711                     //u2_sum_best_4x4_satd_cost += au2_4x4_sad_cost_array[i];
    712                     //sad = (WORD32)((ps_ed_4x4 + i)->best_sad_cost - ps_ed_ctxt->au2_mode_bits_cost_full_ctb[i][mode]);
    713                     sad = (WORD32)(
    714                         i4_4x4_best_sad_cost[i] - ps_ed_ctxt->au2_mode_bits_cost_full_ctb[i][mode]);
    715                     *pi4_sum_4x4_satd += sad;
    716                     /*(ps_ed_4x4 + i)->i4_4x4_satd = sad;// See NOTE_01*/
    717                     *pi4_best_satd += sad;
    718 
    719                     if(*pi4_min_4x4_satd > sad)
    720                         *pi4_min_4x4_satd = sad;
    721                 }
    722             }
    723             if(!merge_success) /*If the modes are not identical*/
    724             {
    725                 UWORD8 i1_start; /* no of modes to evaluate */
    726                 UWORD8 ai1_modes[6];
    727 
    728                 /* Prepare 6 candidates for 8x8 block. Two are DC and planar */
    729                 ai1_modes[4] = 0;
    730                 ai1_modes[5] = 1;
    731                 i1_start = 4;
    732 
    733                 /*Assign along with removing duplicates rest 4 candidates. */
    734                 for(i = 3; i >= 0; i--)
    735                 {
    736                     WORD8 i1_fresh_mode_flag = 1;
    737                     mode = modes_to_eval[i];
    738                     /*Check if duplicate already exists in ai1_modes*/
    739                     for(j = i1_start; j < 6; j++)
    740                     {
    741                         if(mode == ai1_modes[j])
    742                             i1_fresh_mode_flag = 0;
    743                     }
    744                     if(i1_fresh_mode_flag)
    745                     {
    746                         i1_start--;
    747                         ai1_modes[i1_start] = mode;
    748                     }
    749                 }
    750 
    751                 /*Calculate SATD/SAD of 8x8 block for all modes*/
    752                 /*If (u1_good_quality == 0) then SATD gets replaced by SAD*/
    753                 if(u1_good_quality && (i4_quality_preset <= IHEVCE_QUALITY_P4))
    754                 {
    755                     //7.5 * lambda to incorporate transfrom flags
    756                     u2_sum_best_4x4_satd_cost +=
    757                         (COMPUTE_RATE_COST_CLIP30(12, lambda, (LAMBDA_Q_SHIFT + 1)));
    758 
    759                     /*Loop over all modes for calculating SATD*/
    760                     for(i = i1_start; i < 6; i++)
    761                     {
    762                         mode = ai1_modes[i];
    763                         g_apf_lum_ip[g_i4_ip_funcs[mode]](
    764                             &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred_8x8[0], 8, 8, mode);
    765 
    766                         i4_satd = ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
    767                             pu1_src_arr[0], src_stride, &pred_8x8[0], 8, NULL, 0);
    768 
    769                         au2_8x8_costs[i] =
    770                             ((UWORD16)i4_satd + ps_ed_ctxt->au2_mode_bits_cost_full_ctb[0][mode]);
    771 
    772                         /*Update data correspoinding to least 8x8 cost */
    773                         if(au2_8x8_costs[i] <= u2_best_8x8_cost)
    774                         {
    775                             u2_best_8x8_cost = au2_8x8_costs[i];
    776                             i4_best_8x8_sad_satd = i4_satd;
    777                             u1_best_8x8_mode = mode;
    778                         }
    779                     }
    780                     /*8x8 vs 4x4 decision based on SATD values*/
    781                     if((u2_best_8x8_cost <= u2_sum_best_4x4_satd_cost) || (u2_best_8x8_cost <= 300))
    782                     {
    783                         i4_merge_success_stage2 = 1;
    784                     }
    785 
    786                     /* EIID: Early inter-intra decision */
    787                     /* Find the SAD based cost for 8x8 block for best mode */
    788                     if(/*(ISLICE != i4_slice_type) && */ (1 == i4_layer_id))
    789                     {
    790                         UWORD8 i4_best_8x8_mode = u1_best_8x8_mode;
    791                         WORD32 i4_best_8x8_sad_curr;
    792 
    793                         g_apf_lum_ip[g_i4_ip_funcs[i4_best_8x8_mode]](
    794                             &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred_8x8[0], 8, 8, i4_best_8x8_mode);
    795 
    796                         i4_best_8x8_sad_curr = ps_ipe_optimised_function_list->pf_8x8_sad_computer(
    797                             pu1_src_arr[0], &pred_8x8[0], src_stride, 8);
    798 
    799                         //register best sad in the context
    800                         //ps_ed_8x8->i4_best_sad_8x8_l1_ipe = i4_best_8x8_sad_curr;
    801 
    802                         //register the best cost in the context
    803                         //[0]th index is used since all 4 blocks are having same cost right now
    804                         //also it doesnt depends on mode. It only depends on the lambda
    805 
    806                         *pi4_best_sad_cost_8x8_l1_ipe =
    807                             i4_best_8x8_sad_curr +
    808                             ps_ed_ctxt->au2_mode_bits_cost_full_ctb[0][i4_best_8x8_mode];
    809                         *pi4_best_sad_8x8_l1_ipe = i4_best_8x8_sad_curr;
    810                     }
    811                 }
    812                 else /*If high_speed or extreme speed*/
    813                 {
    814                     //7.5 * lambda to incorporate transfrom flags
    815                     u2_sum_best_4x4_sad_cost +=
    816                         (COMPUTE_RATE_COST_CLIP30(12, lambda, (LAMBDA_Q_SHIFT + 1)));
    817 
    818                     /*Loop over all modes for calculating SAD*/
    819                     for(i = i1_start; i < 6; i++)
    820                     {
    821                         mode = ai1_modes[i];
    822                         g_apf_lum_ip[g_i4_ip_funcs[mode]](
    823                             &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred_8x8[0], 8, 8, mode);
    824 
    825                         sad = ps_ipe_optimised_function_list->pf_8x8_sad_computer(
    826                             pu1_src_arr[0], &pred_8x8[0], src_stride, 8);
    827 
    828                         au2_8x8_costs[i] +=
    829                             ((UWORD16)sad + ps_ed_ctxt->au2_mode_bits_cost_full_ctb[0][mode]);
    830 
    831                         /*Find the data correspoinding to least cost */
    832                         if(au2_8x8_costs[i] <= u2_best_8x8_cost)
    833                         {
    834                             u2_best_8x8_cost = au2_8x8_costs[i];
    835                             i4_best_8x8_sad_satd = sad;
    836                             u1_best_8x8_mode = mode;
    837                         }
    838                     }
    839                     /*8x8 vs 4x4 decision based on SAD values*/
    840                     if((u2_best_8x8_cost <= u2_sum_best_4x4_sad_cost) || (u2_best_8x8_cost <= 300))
    841                     {
    842                         i4_merge_success_stage2 = 1;
    843                     }
    844 
    845                     /* EIID: Early inter-intra decision */
    846                     /* Find the SAD based cost for 8x8 block for best mode */
    847                     if(/*(ISLICE != i4_slice_type) && */ (1 == i4_layer_id))
    848                     {
    849                         //UWORD8 i4_best_8x8_mode = u1_best_8x8_mode;
    850                         WORD32 i4_best_8x8_sad_cost_curr = u2_best_8x8_cost;
    851 
    852                         //register best sad in the context
    853                         //ps_ed_8x8->i4_best_sad_8x8_l1_ipe = i4_best_8x8_sad_curr;
    854 
    855                         //register the best cost in the context
    856                         *pi4_best_sad_cost_8x8_l1_ipe = i4_best_8x8_sad_cost_curr;
    857                         *pi4_best_sad_8x8_l1_ipe =
    858                             i4_best_8x8_sad_satd;  //i4_best_8x8_sad_cost_curr;
    859                     }
    860                 }
    861             }
    862 
    863             /***** Modes for 4x4 and 8x8 are decided before this point ****/
    864             if(merge_success || i4_merge_success_stage2)
    865             {
    866                 /*FYI: 1. 8x8 SATD is not needed if merge is failed.
    867                 2. For layer_2: SATD won't be calculated for 8x8. So
    868                 the best_8x8_cost is SAD-cost. */
    869 
    870                 /* Store the 8x8 level data in the first 4x4 block*/
    871                 ps_ed_4x4->merge_success = 1;
    872                 ps_ed_4x4->best_merge_mode = u1_best_8x8_mode;
    873                 /* ps_ed_4x4->best_merge_sad_cost = u2_best_8x8_cost;
    874                 This data is not getting consumed anywhere at present */
    875 
    876                 top_intra_mode_ptr[0] = u1_best_8x8_mode;
    877                 top_intra_mode_ptr[1] = u1_best_8x8_mode;
    878                 left_intra_mode_ptr[0] = u1_best_8x8_mode;
    879                 left_intra_mode_ptr[1] = u1_best_8x8_mode;
    880 
    881                 /*If it is layer_1 and high_speed*/
    882                 u1_cond_8x8_satd =
    883                     ((1 == i4_layer_id) &&
    884                      (merge_success || ((!u1_good_quality) && i4_merge_success_stage2)));
    885                 if(u1_cond_8x8_satd)
    886                 {
    887                     mode = u1_best_8x8_mode;
    888                     g_apf_lum_ip[g_i4_ip_funcs[mode]](
    889                         &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred_8x8[0], 8, 8, mode);
    890 
    891                     if(i4_quality_preset > IHEVCE_QUALITY_P3)
    892                     {
    893                         i4_satd = ps_ipe_optimised_function_list->pf_8x8_sad_computer(
    894                             pu1_src_arr[0], &pred_8x8[0], src_stride, 8);
    895                     }
    896                     else
    897                     {
    898                         i4_satd = ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
    899                             pu1_src_arr[0], src_stride, &pred_8x8[0], 8, NULL, 0);
    900                     }
    901                     /* u2_best_8x8_cost = ((UWORD16)i4_satd + mode_bits_cost[0][mode]);
    902                     This data is not getting consumed at present */
    903                     i4_best_8x8_sad_satd = i4_satd;
    904                 }
    905                 *pi4_best_satd = i4_best_8x8_sad_satd;
    906 
    907                 /* EIID: Early inter-intra decision */
    908                 /* Find the SAD based cost for 8x8 block for best mode */
    909                 if(/*(ISLICE != i4_slice_type) && */ (1 == i4_layer_id))
    910                 {
    911                     UWORD8 i4_best_8x8_mode = u1_best_8x8_mode;
    912                     WORD32 i4_best_8x8_sad_curr;
    913 
    914                     g_apf_lum_ip[g_i4_ip_funcs[i4_best_8x8_mode]](
    915                         &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred_8x8[0], 8, 8, i4_best_8x8_mode);
    916 
    917                     i4_best_8x8_sad_curr = ps_ipe_optimised_function_list->pf_8x8_sad_computer(
    918                         pu1_src_arr[0], &pred_8x8[0], src_stride, 8);
    919                     //register best sad in the context
    920                     //ps_ed_8x8->i4_best_sad_8x8_l1_ipe = i4_best_8x8_sad_curr;
    921 
    922                     //register the best cost in the context
    923                     //[0]th index is used since all 4 blocks are having same cost right now
    924                     //also it doesnt depends on mode. It only depends on the lambda
    925 
    926                     *pi4_best_sad_cost_8x8_l1_ipe =
    927                         i4_best_8x8_sad_curr +
    928                         ps_ed_ctxt->au2_mode_bits_cost_full_ctb[0][i4_best_8x8_mode];
    929                     *pi4_best_sad_8x8_l1_ipe = i4_best_8x8_sad_curr;
    930 
    931                 }  // EIID ends
    932 
    933             }  //if(merge_success || i4_merge_success_stage2)
    934         }
    935     }
    936 }
    937 
    938 /*!
    939 ******************************************************************************
    940 * \if Function name : ihevce_ed_calc_incomplete_ctb \endif
    941 *
    942 * \brief: performs L1 8x8 and 4x4 intra mode analysis
        *
        * \details
        *   Variant for a CTB whose size in 4x4 units (num_4x4_blks_x /
        *   num_4x4_blks_y) may be odd. The work is done in three stages:
        *     1. every complete 8x8 block is analysed by ihevce_ed_calc_8x8_blk();
        *     2. if num_4x4_blks_x is odd, the two 4x4 blocks of the left-over
        *        right column of each 8x8 row are analysed one by one with
        *        ihevce_ed_calc_4x4_blk();
        *     3. if num_4x4_blks_y is odd, the left-over bottom row of 4x4 blocks
        *        is analysed one by one with ihevce_ed_calc_4x4_blk().
        *   Source is walked in raster order; results in ps_ed_ctb are addressed
        *   through gau1_ctb_raster_to_zscan.
        *
        * \param[in,out] ps_ed_ctxt  context; supplies lambda, the sad[] scratch,
        *                 quality preset, slice type, the reference-substitution
        *                 function pointer and the *_ic_ctb scratch buffers. The
        *                 top/left intra mode arrays are updated, and for
        *                 i4_layer_id == 1 i8_sum_best_satd / i8_sum_sq_best_satd
        *                 are accumulated.
        * \param[in,out] ps_ed_ctb  per 4x4 block results, indexed by z-scan index
        * \param[out] ps_ed_ctb_l1  layer-1 results; touched only when
        *                 i4_layer_id == 1
        * \param[in] pu1_src  top-left source sample of the CTB; samples above and
        *                 to the left are handed to reference substitution
        * \param[in] src_stride  source stride in samples
        * \param[in] num_4x4_blks_x  number of 4x4 blocks horizontally
        * \param[in] num_4x4_blks_y  number of 4x4 blocks vertically
        * \param[in] nbr_flags  neighbour availability flags, one entry per 4x4
        *                 block, addressed here as (4x4 row) * 8 + (4x4 column)
        * \param[in] i4_layer_id  layer id; the value 1 enables the layer-1 stores
        * \param[in] i4_row_block_no  unused
        * \param[in] i4_col_block_no  unused
        * \param[in] ps_ipe_optimised_function_list  forwarded to the block-level
        *                 routines
        * \param[in] ps_cmn_utils_optimised_function_list  forwarded to
        *                 ihevce_ed_calc_8x8_blk()
        *
        * \return none
    943 *
    944 *****************************************************************************
    945 */
    946 void ihevce_ed_calc_incomplete_ctb(
    947     ihevce_ed_ctxt_t *ps_ed_ctxt,
    948     ihevce_ed_blk_t *ps_ed_ctb,
    949     ihevce_ed_ctb_l1_t *ps_ed_ctb_l1,
    950     UWORD8 *pu1_src,
    951     WORD32 src_stride,
    952     WORD32 num_4x4_blks_x,
    953     WORD32 num_4x4_blks_y,
    954     WORD32 *nbr_flags,
    955     WORD32 i4_layer_id,
    956     WORD32 i4_row_block_no,
    957     WORD32 i4_col_block_no,
    958     ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list,
    959     ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list)
    960 {
    961     WORD32 i, j, k;
    962     WORD32 z_scan_idx = 0;
    963     WORD32 z_scan_act_idx = 0;
    964     ihevc_intra_pred_luma_ref_substitution_ft *pf_intra_pred_luma_ref_substitution =
    965         ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_ref_substitution_fptr;
    966 
    967     //UWORD8 ref[18];
    968     //WORD32 top_intra_modes[20];
            /* Per-block SAD scratch; each block uses NUM_MODES entries starting at
               z_scan_idx * NUM_MODES */
    969     WORD32 *sad_ptr = &ps_ed_ctxt->sad[0];
    970     WORD32 lambda = ps_ed_ctxt->lambda;
    971     //UWORD16 mode_bits_cost[NUM_MODES];
    972 
    973     UWORD8 *pu1_src_8x8;
    974     ihevce_ed_blk_t *ps_ed_8x8, *ps_ed_4x4;
    975     WORD32 *top_intra_mode_ptr;
    976     WORD32 *left_intra_mode_ptr = ps_ed_ctxt->left_ctb_intra_modes;
    977     WORD32 *nbr_flags_ptr;
    978     WORD32 top_intra_mode;
    979     WORD32 left_intra_mode;
    980     WORD32 next_left_intra_mode;
    981     WORD32 nbr_flag = 0;
    982     WORD32 top_available;
    983     WORD32 left_available;
    984     UWORD8 *pu1_src_4x4;
    985     WORD32 left_over_4x4_blks;
            /* NOTE(review): the two i4_incomplete_* accumulators below are updated
               but never stored or read elsewhere in this function */
    986     WORD32 i4_incomplete_sum_4x4_satd = 0;
    987     WORD32 i4_incomplete_min_4x4_satd = 0x7FFFFFFF;
    988     WORD32 i4_best_sad_cost_8x8_l1_ipe, i4_best_sad_8x8_l1_ipe, i4_sum_4x4_satd, i4_min_4x4_satd;
    989 
    990     (void)i4_row_block_no;
    991     (void)i4_col_block_no;
    992     /*Find the modulated qp of 16*16 at L2 from 8*8 SATDs in L2
    993     This is used as 64*64 Qp in L0*/
    994     /*For Incomplete CTB, init all SATD fields to sentinel values (-1 / -2 / 0x7FFFFFFF) and then populate for the complete 8x8 blocks (CU 16 in L0)*/
    995     /* Not populated for 4x4 blocks (CU 8 in L0), can be done */
    996     /*Also, 32x32 satd is not populated, as it would correspond to CU 64 and it is not an incomplete CTB */
    997     if(i4_layer_id == 1)
    998     {
    999         WORD32 i4_i;
   1000 
                /* 64 per-4x4 entries */
   1001         for(i4_i = 0; i4_i < 64; i4_i++)
   1002         {
   1003             (ps_ed_ctb + i4_i)->i4_4x4_satd = -1;
   1004             (ps_ed_ctb + i4_i)->i4_4x4_cur_satd = -1;
   1005         }
   1006 
                /* 16 per-8x8 entries */
   1007         for(i4_i = 0; i4_i < 16; i4_i++)
   1008         {
   1009             ps_ed_ctb_l1->i4_sum_4x4_satd[i4_i] = -2;
   1010             ps_ed_ctb_l1->i4_min_4x4_satd[i4_i] = 0x7FFFFFFF;
   1011             ps_ed_ctb_l1->i4_8x8_satd[i4_i][0] = -2;
   1012             ps_ed_ctb_l1->i4_8x8_satd[i4_i][1] = -2;
   1013         }
   1014 
                /* 4 per-16x16 entries */
   1015         for(i4_i = 0; i4_i < 4; i4_i++)
   1016         {
   1017             ps_ed_ctb_l1->i4_16x16_satd[i4_i][0] = -2;
   1018             ps_ed_ctb_l1->i4_16x16_satd[i4_i][1] = -2;
   1019             ps_ed_ctb_l1->i4_16x16_satd[i4_i][2] = -2;
   1020         }
   1021         ps_ed_ctb_l1->i4_32x32_satd[0][0] = -2;
   1022         ps_ed_ctb_l1->i4_32x32_satd[0][1] = -2;
   1023         ps_ed_ctb_l1->i4_32x32_satd[0][2] = -2;
   1024 
   1025         ps_ed_ctb_l1->i4_32x32_satd[0][3] = -2;
   1026 
                /* Best SATD / SAD / cost fields of every 8x8 entry start at -1; the
                   IPE ones are overwritten below for the blocks that get analysed */
   1027         for(i4_i = 0; i4_i < 16; i4_i++)
   1028         {
   1029             ps_ed_ctb_l1->i4_best_satd_8x8[i4_i] = -1;
   1030             ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_ipe[i4_i] = -1;
   1031             ps_ed_ctb_l1->i4_best_sad_8x8_l1_ipe[i4_i] = -1;
   1032             ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_me[i4_i] = -1;
   1033             ps_ed_ctb_l1->i4_sad_cost_me_for_ref[i4_i] = -1;
   1034             ps_ed_ctb_l1->i4_sad_me_for_ref[i4_i] = -1;
   1035             ps_ed_ctb_l1->i4_best_sad_8x8_l1_me[i4_i] = -1;
   1036 
   1037             ps_ed_ctb_l1->i4_best_sad_8x8_l1_me_for_decide[i4_i] = -1;
   1038         }
   1039     }
   1040     /*
   1041     * src scan happens in raster scan order. ps_ed update happens in z-scan order.
   1042     */
            /* Seed the top intra mode of every 4x4 column with INTRA_DC */
   1043     for(i = 0; i < num_4x4_blks_x; i++)
   1044     {
   1045         ps_ed_ctxt->ai4_top_intra_modes_ic_ctb[i] = INTRA_DC;
   1046     }
   1047     next_left_intra_mode = left_intra_mode_ptr[0];
            /* i walks the complete rows of 8x8 blocks (two 4x4 rows each) */
   1048     for(i = 0; i < num_4x4_blks_y / 2; i++)
   1049     {
   1050         pu1_src_8x8 = pu1_src + i * 2 * 4 * src_stride;
   1051         top_intra_mode_ptr = &ps_ed_ctxt->ai4_top_intra_modes_ic_ctb[0];
   1052         nbr_flags_ptr = &nbr_flags[0] + 2 * 8 * i;
   1053 
                /* j walks the complete 8x8 blocks of this row */
   1054         for(j = 0; j < num_4x4_blks_x / 2; j++)
   1055         {
   1056             WORD32 i4_best_satd;
   1057             // Multiply i by 16 since the
   1058             // matrix is prepared for ctb_size = 64
                    /* z_scan_idx: z-scan index of the top-left 4x4 of this 8x8 block,
                       used for ps_ed_ctb and the sad scratch.
                       z_scan_act_idx: index used for the per-8x8 arrays of
                       ps_ed_ctb_l1 (asserted to stay within 0..15). */
   1059             z_scan_idx = gau1_ctb_raster_to_zscan[i * 2 * 16 + j * 2];
   1060             z_scan_act_idx = gau1_ctb_raster_to_zscan[i * 16 + j];
   1061             ASSERT(z_scan_act_idx <= 15);
   1062             ps_ed_8x8 = ps_ed_ctb + z_scan_idx;
   1063 
   1064             ihevce_ed_calc_8x8_blk(
   1065                 ps_ed_ctxt,
   1066                 ps_ed_8x8,
   1067                 pu1_src_8x8,
   1068                 src_stride,
   1069                 nbr_flags_ptr,
   1070                 top_intra_mode_ptr,
   1071                 left_intra_mode_ptr,
   1072                 i * 8,
   1073                 lambda,
   1074                 sad_ptr + z_scan_idx * NUM_MODES,
   1075                 &i4_best_satd,
   1076                 i4_layer_id,
   1077                 ps_ed_ctxt->i4_quality_preset,
   1078                 ps_ed_ctxt->i4_slice_type,
   1079                 &i4_best_sad_cost_8x8_l1_ipe,
   1080                 &i4_best_sad_8x8_l1_ipe,
   1081                 &i4_sum_4x4_satd,
   1082                 &i4_min_4x4_satd,
   1083                 ps_ipe_optimised_function_list,
   1084                 ps_cmn_utils_optimised_function_list);
   1085 
   1086             ASSERT(i4_best_satd >= 0);
   1087             if(i4_layer_id == 1)
   1088             {
   1089                 ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_ipe[z_scan_act_idx] =
   1090                     i4_best_sad_cost_8x8_l1_ipe;
   1091                 ps_ed_ctb_l1->i4_best_sad_8x8_l1_ipe[z_scan_act_idx] = i4_best_sad_8x8_l1_ipe;
   1092                 ps_ed_ctb_l1->i4_best_satd_8x8[z_scan_act_idx] = i4_best_satd;
   1093                 ps_ed_ctxt->i8_sum_best_satd += i4_best_satd;
                        /* NOTE(review): the square is evaluated in WORD32 arithmetic
                           before being added to the accumulator */
   1094                 ps_ed_ctxt->i8_sum_sq_best_satd += (i4_best_satd * i4_best_satd);
   1095                 //ps_ed_ctb_l1->i4_sum_4x4_satd[z_scan_act_idx] = i4_sum_4x4_satd;
   1096                 //ps_ed_ctb_l1->i4_min_4x4_satd[z_scan_act_idx] = i4_min_4x4_satd;
   1097             }
   1098 
                    /* Step one 8x8 block to the right: 8 samples, two 4x4 columns */
   1099             pu1_src_8x8 += 8;
   1100             //ps_ed_8x8  += 4;
   1101             top_intra_mode_ptr += 2;
   1102             nbr_flags_ptr += 2;
   1103         }
   1104 
                /* From here on j == num_4x4_blks_x / 2, i.e. the 8x8 column that
                   holds the left-over 4x4 column (if any) */
   1105         next_left_intra_mode = left_intra_mode_ptr[0];
                /* 0 when num_4x4_blks_x is even, 2 (two stacked 4x4 blocks) when odd */
   1106         left_over_4x4_blks = (num_4x4_blks_x - (2 * (num_4x4_blks_x / 2)));
   1107         left_over_4x4_blks = left_over_4x4_blks * 2;
   1108 
                /* pu1_src_8x8 now points just right of the last complete 8x8 block */
   1109         pu1_src_4x4 = pu1_src_8x8;
   1110 
   1111         i4_incomplete_sum_4x4_satd = 0;
   1112         i4_incomplete_min_4x4_satd = 0x7FFFFFFF;
   1113 
   1114         /* For leftover right 4x4 blks (num_4x4_blks_x - 2 *(num_4x4_blks_x/2))*/
   1115         for(k = 0; k < left_over_4x4_blks; k++)
   1116         {
   1117             WORD32 i4_best_satd;
   1118             WORD32 i4_dummy_sad_cost;
   1119             // Multiply i by 16 since the
   1120             // matrix is prepared for ctb_size = 64
   1121             ASSERT(left_over_4x4_blks == 2);
   1122             z_scan_idx = gau1_ctb_raster_to_zscan[i * 2 * 16 + k * 16 + j * 2];
   1123             ps_ed_4x4 = ps_ed_ctb + z_scan_idx;
   1124 
                    /* NOTE(review): the top mode is read at index j, while the update
                       at the end of this loop writes index j * 2 and nbr_flags is
                       addressed with j * 2 - confirm that [j] is intended */
   1125             top_intra_mode = ps_ed_ctxt->ai4_top_intra_modes_ic_ctb[j];
   1126             left_intra_mode = next_left_intra_mode;
   1127 
   1128             nbr_flag = nbr_flags[i * 2 * 8 + k * 8 + j * 2];
   1129 
   1130             /* call the function which populates ref data for intra prediction */
   1131             pf_intra_pred_luma_ref_substitution(
   1132                 pu1_src_4x4 - src_stride - 1,
   1133                 pu1_src_4x4 - src_stride,
   1134                 pu1_src_4x4 - 1,
   1135                 src_stride,
   1136                 4,
   1137                 nbr_flag,
   1138                 &ps_ed_ctxt->au1_ref_ic_ctb[0],
   1139                 0);
   1140 
   1141             top_available = CHECK_T_AVAILABLE(nbr_flag);
   1142             left_available = CHECK_L_AVAILABLE(nbr_flag);
   1143             /* call the function which populates sad cost for all the modes */
   1144             ihevce_intra_populate_mode_bits_cost(
   1145                 top_intra_mode,
   1146                 left_intra_mode,
   1147                 top_available,
   1148                 left_available,
   1149                 i * 4,
   1150                 &ps_ed_ctxt->au2_mode_bits_cost_ic_ctb[0],
   1151                 lambda);
   1152 
   1153             ihevce_ed_calc_4x4_blk(
   1154                 ps_ed_4x4,
   1155                 pu1_src_4x4,
   1156                 src_stride,
   1157                 &ps_ed_ctxt->au1_ref_ic_ctb[0],
   1158                 &ps_ed_ctxt->au2_mode_bits_cost_ic_ctb[0],
   1159                 sad_ptr + z_scan_idx * NUM_MODES,
   1160                 &i4_best_satd,
   1161                 ps_ed_ctxt->i4_quality_preset,
   1162                 &i4_dummy_sad_cost,
   1163                 ps_ipe_optimised_function_list);
   1164 
   1165             ASSERT(i4_best_satd >= 0);
   1166             if(i4_layer_id == 1)  //Can we ignore this check?
   1167             {
   1168                 z_scan_act_idx = gau1_ctb_raster_to_zscan[i * 16 + j];
   1169                 /*Note : The satd population is not populated for last 4*4 block in incomplete CTB */
   1170                 /* Which corresponds to CU 8 in L0 */
   1171 
   1172                 /*MAM_VAR_L1 */
   1173                 i4_incomplete_sum_4x4_satd = i4_incomplete_sum_4x4_satd + i4_best_satd;
   1174                 if(i4_incomplete_min_4x4_satd >= i4_best_satd)
   1175                     i4_incomplete_min_4x4_satd = i4_best_satd;
   1176                 ps_ed_ctxt->i8_sum_best_satd += i4_best_satd;
   1177                 ps_ed_ctxt->i8_sum_sq_best_satd += (i4_best_satd * i4_best_satd);
                        /* The first 4x4 of the pair resets the 8x8 entry, so after
                           both iterations it holds the sum of the two 4x4 SATDs */
   1178                 if((k & 1) == 0)
   1179                 {
   1180                     ps_ed_ctb_l1->i4_best_satd_8x8[z_scan_act_idx] = 0;
   1181                 }
   1182                 ps_ed_ctb_l1->i4_best_satd_8x8[z_scan_act_idx] += i4_best_satd;
   1183             }
   1184 
                    /* Propagate the chosen mode (ps_ed_4x4->best_mode, presumably
                       filled in by ihevce_ed_calc_4x4_blk) as top and left neighbour
                       for the blocks that follow */
   1185             ps_ed_ctxt->ai4_top_intra_modes_ic_ctb[j * 2] = ps_ed_4x4->best_mode;
   1186             next_left_intra_mode = ps_ed_4x4->best_mode;
                    /* NOTE(review): the pointer advances by one sample row
                       (src_stride), not by four rows - confirm this is the intended
                       position of the second stacked 4x4 block */
   1187             pu1_src_4x4 += src_stride;
   1188             left_intra_mode_ptr[k] = next_left_intra_mode;
   1189         }
                /* Two 4x4 rows consumed: move to the left modes of the next 8x8 row */
   1190         left_intra_mode_ptr += 2;
   1191     }
   1192 
            /* After the loop i == num_4x4_blks_y / 2; it is reused below to address
               the left-over bottom row */
   1193     if(num_4x4_blks_y & 1)
   1194     {
   1195         /* For leftover bottom 4x4 blks. (num_4x4_blks_x) */
   1196         pu1_src_4x4 = pu1_src + i * 2 * 4 * src_stride;
   1197         //memset(&ps_ed_ctb_l1->i4_best_satd_8x8[i][0],0,4*sizeof(WORD32));
   1198         for(j = 0; j < num_4x4_blks_x; j++)
   1199         {
   1200             WORD32 i4_best_satd;
   1201             WORD32 i4_dummy_sad_cost;
   1202             // Multiply i by 16 since the
   1203             // matrix is prepared for ctb_size = 64
   1204             z_scan_idx = gau1_ctb_raster_to_zscan[i * 2 * 16 + j];
   1205             ps_ed_4x4 = ps_ed_ctb + z_scan_idx;
   1206 
                    /* Restart the pair accumulators at every even 4x4 column */
   1207             if((j & 1) == 0)
   1208             {
   1209                 i4_incomplete_sum_4x4_satd = 0;
   1210                 i4_incomplete_min_4x4_satd = 0x7FFFFFFF;
   1211             }
   1212 
   1213             top_intra_mode = ps_ed_ctxt->ai4_top_intra_modes_ic_ctb[j];
   1214             left_intra_mode = next_left_intra_mode;
   1215 
   1216             nbr_flag = nbr_flags[i * 2 * 8 + j];
   1217 
   1218             /* call the function which populates ref data for intra prediction */
   1219             pf_intra_pred_luma_ref_substitution(
   1220                 pu1_src_4x4 - src_stride - 1,
   1221                 pu1_src_4x4 - src_stride,
   1222                 pu1_src_4x4 - 1,
   1223                 src_stride,
   1224                 4,
   1225                 nbr_flag,
   1226                 &ps_ed_ctxt->au1_ref_ic_ctb[0],
   1227                 0);
   1228 
   1229             top_available = CHECK_T_AVAILABLE(nbr_flag);
   1230             left_available = CHECK_L_AVAILABLE(nbr_flag);
   1231             /* call the function which populates sad cost for all the modes */
   1232             ihevce_intra_populate_mode_bits_cost(
   1233                 top_intra_mode,
   1234                 left_intra_mode,
   1235                 top_available,
   1236                 left_available,
   1237                 i * 4,
   1238                 &ps_ed_ctxt->au2_mode_bits_cost_ic_ctb[0],
   1239                 lambda);
   1240 
   1241             ihevce_ed_calc_4x4_blk(
   1242                 ps_ed_4x4,
   1243                 pu1_src_4x4,
   1244                 src_stride,
   1245                 &ps_ed_ctxt->au1_ref_ic_ctb[0],
   1246                 &ps_ed_ctxt->au2_mode_bits_cost_ic_ctb[0],
   1247                 sad_ptr + z_scan_idx * NUM_MODES,
   1248                 &i4_best_satd,
   1249                 ps_ed_ctxt->i4_quality_preset,
   1250                 &i4_dummy_sad_cost,
   1251                 ps_ipe_optimised_function_list);
   1252 
   1253             /*Note : The satd population is not populated for last 4*4 block in incomplete CTB */
   1254             /* Which corresponds to CU 8 in L0 */
   1255 
   1256             /*MAM_VAR_L1 */
   1257             ASSERT(i4_best_satd >= 0);
   1258             if(i4_layer_id == 1)  //Can we ignore this check?
   1259             {
                        /* Two horizontally adjacent 4x4 blocks share one 8x8 entry
                           (j >> 1); the even one resets it, both add their SATD */
   1260                 z_scan_act_idx = gau1_ctb_raster_to_zscan[i * 16 + (j >> 1)];
   1261                 if((j & 1) == 0)
   1262                 {
   1263                     ps_ed_ctb_l1->i4_best_satd_8x8[z_scan_act_idx] = 0;
   1264                 }
   1265                 ps_ed_ctb_l1->i4_best_satd_8x8[z_scan_act_idx] += i4_best_satd;
   1266                 ps_ed_ctxt->i8_sum_best_satd += i4_best_satd;
   1267                 ps_ed_ctxt->i8_sum_sq_best_satd += (i4_best_satd * i4_best_satd);
   1268                 i4_incomplete_sum_4x4_satd = i4_incomplete_sum_4x4_satd + i4_best_satd;
   1269                 if(i4_incomplete_min_4x4_satd >= i4_best_satd)
   1270                     i4_incomplete_min_4x4_satd = i4_best_satd;
   1271             }
   1272 
   1273             ps_ed_ctxt->ai4_top_intra_modes_ic_ctb[j] = ps_ed_4x4->best_mode;
   1274             next_left_intra_mode = ps_ed_4x4->best_mode;
                    /* Step one 4x4 block to the right */
   1275             pu1_src_4x4 += 4;
   1276         }
   1277     }
            /* Record the last propagated mode at the current left-mode position */
   1278     left_intra_mode_ptr[0] = next_left_intra_mode;
   1279 }
   1280 
   1281 /*!
   1282 ******************************************************************************
   1283 * \if Function name : ihevce_cu_level_qp_mod \endif
   1284 *
   1285 * \brief: Performs CU level QP modulation
        *
        * \details
        *   Scales i4_qscale by an activity factor derived from the difference
        *   between the (rounded) log2 of the squared block SATD and the frame
        *   level average, clamps the result to the qscale limits in
        *   ps_rc_quant_ctxt, and converts it to a QP through pi4_qscale_to_qp.
        *   The squared SATD and the power-of-two reference values are computed
        *   in 64-bit unsigned arithmetic so that large SATD values neither
        *   overflow a WORD32 multiplication nor shift an int by 31 bits or more.
        *
        * \param[in] i4_qscale  qscale to be modulated
        * \param[in] i4_satd  block SATD; -1 disables modulation, other values
        *                 below 1 are treated as 1
        * \param[in] ld_curr_frame_log_avg_act  frame level average of the log
        *                 activity; when its integer part is 0 a unity factor is
        *                 used
        * \param[in] f_mod_strength  multiplier applied to the log-domain
        *                 difference before clipping to
        *                 [MIN_QP_MOD_OFFSET, MAX_QP_MOD_OFFSET]
        * \param[out] pi4_act_factor  activity factor in fixed point with
        *                 QP_LEVEL_MOD_ACT_FACTOR fractional bits
        * \param[out] pi4_q_scale_mod  modulated qscale after clamping to
        *                 [i2_min_qscale, i2_max_qscale]
        * \param[in] ps_rc_quant_ctxt  qscale/QP limits and qscale-to-QP table
        *
        * \return QP for the modulated qscale, clamped to [i2_min_qp, i2_max_qp]
   1286 *
   1287 *****************************************************************************
   1288 */
   1289 WORD32 ihevce_cu_level_qp_mod(
   1290     WORD32 i4_qscale,
   1291     WORD32 i4_satd,
   1292     long double ld_curr_frame_log_avg_act,
   1293     float f_mod_strength,
   1294     WORD32 *pi4_act_factor,
   1295     WORD32 *pi4_q_scale_mod,
   1296     rc_quant_t *ps_rc_quant_ctxt)
   1297 {
   1298     WORD32 i4_temp_qscale;
   1299     WORD32 i4_temp_qp;
   1300 
   1301     if(i4_satd != -1)
   1302     {
                /* Guard against zero / negative SATD before squaring it */
   1303         WORD32 i4_loc_satd = i4_satd;
   1304         if(i4_loc_satd < 1)
   1305         {
   1306             i4_loc_satd = 1;
   1307         }
   1308         if((WORD32)ld_curr_frame_log_avg_act == 0)
   1309         {
                    /* No usable frame average: unity factor in fixed point */
   1310             *pi4_act_factor = (1 << (QP_LEVEL_MOD_ACT_FACTOR));
   1311         }
   1312         else
   1313         {
   1314             UWORD32 u4_log2_sq_cur_satd;
   1315             ULWORD64 u8_sq_cur_satd;
   1316             WORD32 qp_offset;
   1317 
   1318             ASSERT(USE_SQRT_AVG_OF_SATD_SQR);
                    /* Widen before multiplying: i4_loc_satd is >= 1 here, and its
                       square does not fit in WORD32 once it exceeds 46340 */
   1319             u8_sq_cur_satd = (ULWORD64)i4_loc_satd * (ULWORD64)i4_loc_satd;
                    /* Presumably yields the bit position of the MSB, i.e. the integer
                       part of log2(u8_sq_cur_satd) - verify against the macro */
   1320             GET_POS_MSB_64(u4_log2_sq_cur_satd, u8_sq_cur_satd);
                    /* Round the log2 up when the value is nearer to 2^(n + 3/4) than
                       to 2^(n + 1/4). The power of two is formed in 64 bits because
                       the MSB position of a squared 31-bit value can reach 61. */
   1321             if(ABS((
   1322                    long double)((((ULWORD64)1 << u4_log2_sq_cur_satd) * POW_2_TO_1_BY_4) - ((long double)u8_sq_cur_satd))) >
   1323                ABS((
   1324                    long double)((((ULWORD64)1 << u4_log2_sq_cur_satd) * POW_2_TO_3_BY_4) - ((long double)u8_sq_cur_satd))))
   1325             {
   1326                 u4_log2_sq_cur_satd += 1;
   1327             }
   1328             qp_offset = (WORD32)(
   1329                 f_mod_strength *
   1330                 (float)((long double)u4_log2_sq_cur_satd - ld_curr_frame_log_avg_act));
   1331             qp_offset = CLIP3(qp_offset, MIN_QP_MOD_OFFSET, MAX_QP_MOD_OFFSET);
                    /* Table is indexed from 0, so shift the offset by |MIN_QP_MOD_OFFSET| */
   1332             *pi4_act_factor = (WORD32)(
   1333                 gad_look_up_activity[qp_offset + ABS(MIN_QP_MOD_OFFSET)] *
   1334                 (1 << QP_LEVEL_MOD_ACT_FACTOR));
   1335         }
   1336 
   1337         ASSERT(*pi4_act_factor > 0);
                /* Fixed point multiply with rounding */
   1338         i4_temp_qscale = ((i4_qscale * (*pi4_act_factor)) + (1 << (QP_LEVEL_MOD_ACT_FACTOR - 1))) >>
   1339                          QP_LEVEL_MOD_ACT_FACTOR;
   1340     }
   1341     else
   1342     {
                /* SATD not available: leave the qscale unmodulated */
   1343         i4_temp_qscale = i4_qscale;
   1344         *pi4_act_factor = (1 << QP_LEVEL_MOD_ACT_FACTOR);
   1345     }
   1346     ASSERT(*pi4_act_factor > 0);
   1347 
            /* Clamp before the value is used as an index into pi4_qscale_to_qp */
   1348     if(i4_temp_qscale > ps_rc_quant_ctxt->i2_max_qscale)
   1349     {
   1350         i4_temp_qscale = ps_rc_quant_ctxt->i2_max_qscale;
   1351     }
   1352     else if(i4_temp_qscale < ps_rc_quant_ctxt->i2_min_qscale)
   1353     {
   1354         i4_temp_qscale = ps_rc_quant_ctxt->i2_min_qscale;
   1355     }
   1356     /*store q scale for stat gen for I frame model*/
   1357     /*Here activity factor is not modified as the cu qp would be clipped in rd-opt stage*/
   1358     *pi4_q_scale_mod = i4_temp_qscale;
   1359     i4_temp_qp = ps_rc_quant_ctxt->pi4_qscale_to_qp[i4_temp_qscale];
   1360     if(i4_temp_qp > ps_rc_quant_ctxt->i2_max_qp)
   1361     {
   1362         i4_temp_qp = ps_rc_quant_ctxt->i2_max_qp;
   1363     }
   1364     else if(i4_temp_qp < ps_rc_quant_ctxt->i2_min_qp)
   1365     {
   1366         i4_temp_qp = ps_rc_quant_ctxt->i2_min_qp;
   1367     }
   1368     return (i4_temp_qp);
   1369 }
   1370 
   1371 /*!
   1372 ******************************************************************************
   1373 * \if Function name : ihevce_ed_calc_ctb \endif
   1374 *
   1375 * \brief: performs L1 8x8 and 4x4 intra mode analysis
   1376 *
   1377 *****************************************************************************
   1378 */
   1379 void ihevce_ed_calc_ctb(
   1380     ihevce_ed_ctxt_t *ps_ed_ctxt,
   1381     ihevce_ed_blk_t *ps_ed_ctb,
   1382     ihevce_ed_ctb_l1_t *ps_ed_ctb_l1,
   1383     UWORD8 *pu1_src,
   1384     WORD32 src_stride,
   1385     WORD32 num_4x4_blks_x,
   1386     WORD32 num_4x4_blks_y,
   1387     WORD32 *nbr_flags,
   1388     WORD32 i4_layer_id,
   1389     WORD32 i4_row_block_no,
   1390     WORD32 i4_col_block_no,
   1391     ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list,
   1392     ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list)
   1393 {
   1394     WORD32 i, j;
   1395     WORD32 z_scan_idx = 0;
   1396     WORD32 z_scan_act_idx = 0;
   1397     ihevce_ed_blk_t *ps_ed_8x8;
   1398     UWORD8 *pu1_src_8x8;
   1399 
   1400     WORD32 top_intra_modes[20];
   1401     WORD32 *top_intra_mode_ptr;
   1402     WORD32 *left_intra_mode_ptr = ps_ed_ctxt->left_ctb_intra_modes;
   1403 
   1404     WORD32 *sad_ptr = &ps_ed_ctxt->sad[0];
   1405     WORD32 lambda = ps_ed_ctxt->lambda;
   1406     WORD32 *nbr_flags_ptr;
   1407     WORD32 i4_best_sad_cost_8x8_l1_ipe, i4_best_sad_8x8_l1_ipe, i4_sum_4x4_satd, i4_min_4x4_satd;
   1408 
   1409     (void)num_4x4_blks_y;
   1410     (void)i4_row_block_no;
   1411     (void)i4_col_block_no;
   1412     ASSERT(num_4x4_blks_x % 2 == 0);
   1413     ASSERT(num_4x4_blks_y % 2 == 0);
   1414     ASSERT((num_4x4_blks_x == 4) || (num_4x4_blks_x == 8));
   1415     ASSERT((num_4x4_blks_y == 4) || (num_4x4_blks_y == 8));
   1416 
   1417     if(i4_layer_id == 1)
   1418     {
   1419         WORD32 i4_i;
   1420 
   1421         for(i4_i = 0; i4_i < 64; i4_i++)
   1422         {
   1423             (ps_ed_ctb + i4_i)->i4_4x4_satd = -1;
   1424             (ps_ed_ctb + i4_i)->i4_4x4_cur_satd = -1;
   1425         }
   1426 
   1427         for(i4_i = 0; i4_i < 16; i4_i++)
   1428         {
   1429             ps_ed_ctb_l1->i4_sum_4x4_satd[i4_i] = -2;
   1430             ps_ed_ctb_l1->i4_min_4x4_satd[i4_i] = 0x7FFFFFFF;
   1431             ps_ed_ctb_l1->i4_8x8_satd[i4_i][0] = -2;
   1432             ps_ed_ctb_l1->i4_8x8_satd[i4_i][1] = -2;
   1433         }
   1434 
   1435         for(i4_i = 0; i4_i < 4; i4_i++)
   1436         {
   1437             ps_ed_ctb_l1->i4_16x16_satd[i4_i][0] = -2;
   1438             ps_ed_ctb_l1->i4_16x16_satd[i4_i][1] = -2;
   1439             ps_ed_ctb_l1->i4_16x16_satd[i4_i][2] = -2;
   1440         }
   1441         ps_ed_ctb_l1->i4_32x32_satd[0][0] = -2;
   1442         ps_ed_ctb_l1->i4_32x32_satd[0][1] = -2;
   1443         ps_ed_ctb_l1->i4_32x32_satd[0][2] = -2;
   1444         ps_ed_ctb_l1->i4_32x32_satd[0][3] = -2;
   1445         for(i4_i = 0; i4_i < 16; i4_i++)
   1446         {
   1447             ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_me[i4_i] = -2;
   1448             ps_ed_ctb_l1->i4_sad_cost_me_for_ref[i4_i] = -2;
   1449             ps_ed_ctb_l1->i4_sad_me_for_ref[i4_i] = -2;
   1450             ps_ed_ctb_l1->i4_best_sad_8x8_l1_me[i4_i] = -2;
   1451 
   1452             ps_ed_ctb_l1->i4_best_sad_8x8_l1_me_for_decide[i4_i] = -2;
   1453 
   1454             ps_ed_ctb_l1->i4_best_satd_8x8[i4_i] = -2;
   1455             ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_ipe[i4_i] = -2;
   1456             ps_ed_ctb_l1->i4_best_sad_8x8_l1_ipe[i4_i] = -2;
   1457         }
   1458     }
   1459     /*
   1460     * src scan happens in raster scan order. ps_ed update happens in z-scan order.
   1461     */
   1462     for(i = 0; i < num_4x4_blks_x; i++)
   1463     {
   1464         top_intra_modes[i] = INTRA_DC;
   1465     }
   1466     for(i = 0; i < num_4x4_blks_x / 2; i++)
   1467     {
   1468         pu1_src_8x8 = pu1_src + i * 2 * 4 * src_stride;
   1469         top_intra_mode_ptr = &top_intra_modes[0];
   1470         nbr_flags_ptr = &nbr_flags[0] + 2 * 8 * i;
   1471 
   1472         for(j = 0; j < num_4x4_blks_x / 2; j++)
   1473         {
   1474             WORD32 i4_best_satd;
   1475             ASSERT(i <= 3);
   1476             ASSERT(j <= 3);
   1477 
   1478             // Multiply i by 16 since the
   1479             // matrix is prepared for ctb_size = 64
   1480             z_scan_idx = gau1_ctb_raster_to_zscan[i * 2 * 16 + j * 2];
   1481             z_scan_act_idx = gau1_ctb_raster_to_zscan[i * 16 + j];
   1482             ASSERT(z_scan_act_idx <= 15);
   1483 
   1484             ps_ed_8x8 = ps_ed_ctb + z_scan_idx;
   1485 
   1486             ihevce_ed_calc_8x8_blk(
   1487                 ps_ed_ctxt,
   1488                 ps_ed_8x8,
   1489                 pu1_src_8x8,
   1490                 src_stride,
   1491                 nbr_flags_ptr,
   1492                 top_intra_mode_ptr,
   1493                 left_intra_mode_ptr,
   1494                 i * 8,
   1495                 lambda,
   1496                 sad_ptr + z_scan_idx * NUM_MODES,
   1497                 &i4_best_satd,
   1498                 i4_layer_id,
   1499                 ps_ed_ctxt->i4_quality_preset,
   1500                 ps_ed_ctxt->i4_slice_type,
   1501                 &i4_best_sad_cost_8x8_l1_ipe,
   1502                 &i4_best_sad_8x8_l1_ipe,
   1503                 &i4_sum_4x4_satd,
   1504                 &i4_min_4x4_satd,
   1505                 ps_ipe_optimised_function_list,
   1506                 ps_cmn_utils_optimised_function_list);
   1507 
   1508             if(i4_layer_id == 1)
   1509             {
   1510                 ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_ipe[z_scan_act_idx] =
   1511                     i4_best_sad_cost_8x8_l1_ipe;
   1512                 ps_ed_ctb_l1->i4_best_sad_8x8_l1_ipe[z_scan_act_idx] = i4_best_sad_8x8_l1_ipe;
   1513                 ps_ed_ctb_l1->i4_best_satd_8x8[z_scan_act_idx] = i4_best_satd;
   1514                 ps_ed_ctxt->i8_sum_best_satd += i4_best_satd;
   1515                 ps_ed_ctxt->i8_sum_sq_best_satd += (i4_best_satd * i4_best_satd);
   1516                 //ps_ed_ctb_l1->i4_sum_4x4_satd[z_scan_act_idx] = i4_sum_4x4_satd;
   1517                 //ps_ed_ctb_l1->i4_min_4x4_satd[z_scan_act_idx] = i4_min_4x4_satd;
   1518             }
   1519 
   1520             pu1_src_8x8 += 8;
   1521             //ps_ed_8x8  += 4;
   1522             top_intra_mode_ptr += 2;
   1523             nbr_flags_ptr += 2;
   1524         }
   1525         left_intra_mode_ptr += 2;
   1526     }
   1527 }
   1528 
   1529 /*!
   1530 ******************************************************************************
   1531 * \if Function name : ihevce_ed_frame_init \endif
   1532 *
   1533 * \brief: Initialize frame context for early decision
   1534 *
   1535 *****************************************************************************
   1536 */
   1537 void ihevce_ed_frame_init(void *pv_ed_ctxt, WORD32 i4_layer_no)
   1538 {
   1539     ihevce_ed_ctxt_t *ps_ed_ctxt = (ihevce_ed_ctxt_t *)pv_ed_ctxt;
   1540 
   1541     g_apf_lum_ip[IP_FUNC_MODE_0] = ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_planar_fptr;
   1542     g_apf_lum_ip[IP_FUNC_MODE_1] = ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_dc_fptr;
   1543     g_apf_lum_ip[IP_FUNC_MODE_2] = ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode2_fptr;
   1544     g_apf_lum_ip[IP_FUNC_MODE_3TO9] =
   1545         ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode_3_to_9_fptr;
   1546     g_apf_lum_ip[IP_FUNC_MODE_10] = ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_horz_fptr;
   1547     g_apf_lum_ip[IP_FUNC_MODE_11TO17] =
   1548         ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode_11_to_17_fptr;
   1549     g_apf_lum_ip[IP_FUNC_MODE_18_34] =
   1550         ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode_18_34_fptr;
   1551     g_apf_lum_ip[IP_FUNC_MODE_19TO25] =
   1552         ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode_19_to_25_fptr;
   1553     g_apf_lum_ip[IP_FUNC_MODE_26] = ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_ver_fptr;
   1554     g_apf_lum_ip[IP_FUNC_MODE_27TO33] =
   1555         ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode_27_to_33_fptr;
   1556 
   1557     if(i4_layer_no == 1)
   1558     {
   1559         ps_ed_ctxt->i8_sum_best_satd = 0;
   1560         ps_ed_ctxt->i8_sum_sq_best_satd = 0;
   1561     }
   1562 }
   1563 
   1564 /**
   1565 ********************************************************************************
   1566 *
   1567 *  @brief  ihevce_scale_by_2(): downscales by 2 in the horizontal and vertical
   1568 *          directions, creating an output of size wd/2 * ht/2
   1569 *
   1570 *  @param[in]  pu1_src : source pointer
   1571 *  @param[in]  src_stride : source stride
   1572 *  @param[out] pu1_dst : destination pointer. Starting of a row.
   1573 *  @param[in]  dst_stride : destination stride
   1574 *  @param[in]  wd : width
   1575 *  @param[in]  ht : height
   1576 *  @param[in]  pu1_wkg_mem : working memory (at least of size CEIL16(wd) * ht)
   1577 *  @param[in]  ht_offset : height offset of the block to be scaled
   1578 *  @param[in]  block_ht : height of the block to be scaled
   1579 *  @param[in]  wd_offset : width offset of the block to be scaled
   1580 *  @param[in]  block_wd : width of the block to be scaled
   1581 *
   1582 *  @return void
   1583 *
   1584 *  @remarks Assumes that block_ht is a multiple of 2. LANCZOS_SCALER
   1585 *
   1586 ********************************************************************************
   1587 */
   1588 void ihevce_scaling_filter_mxn(
   1589     UWORD8 *pu1_src,
   1590     WORD32 src_strd,
   1591     UWORD8 *pu1_scrtch,
   1592     WORD32 scrtch_strd,
   1593     UWORD8 *pu1_dst,
   1594     WORD32 dst_strd,
   1595     WORD32 ht,
   1596     WORD32 wd)
   1597 {
   1598 #define FILT_TAP_Q 8
   1599 #define N_TAPS 7
   1600     const WORD16 i4_ftaps[N_TAPS] = { -18, 0, 80, 132, 80, 0, -18 };
   1601     WORD32 i, j;
   1602     WORD32 tmp;
   1603     UWORD8 *pu1_src_tmp = pu1_src - 3 * src_strd;
   1604     UWORD8 *pu1_scrtch_tmp = pu1_scrtch;
   1605 
   1606     /* horizontal filtering */
   1607     for(i = -3; i < ht + 2; i++)
   1608     {
   1609         for(j = 0; j < wd; j += 2)
   1610         {
   1611             tmp = (i4_ftaps[3] * pu1_src_tmp[j] +
   1612                    i4_ftaps[2] * (pu1_src_tmp[j - 1] + pu1_src_tmp[j + 1]) +
   1613                    i4_ftaps[1] * (pu1_src_tmp[j + 2] + pu1_src_tmp[j - 2]) +
   1614                    i4_ftaps[0] * (pu1_src_tmp[j + 3] + pu1_src_tmp[j - 3]) +
   1615                    (1 << (FILT_TAP_Q - 1))) >>
   1616                   FILT_TAP_Q;
   1617             pu1_scrtch_tmp[j >> 1] = CLIP_U8(tmp);
   1618         }
   1619         pu1_scrtch_tmp += scrtch_strd;
   1620         pu1_src_tmp += src_strd;
   1621     }
   1622     /* vertical filtering */
   1623     pu1_scrtch_tmp = pu1_scrtch + 3 * scrtch_strd;
   1624     for(i = 0; i < ht; i += 2)
   1625     {
   1626         for(j = 0; j < (wd >> 1); j++)
   1627         {
   1628             tmp =
   1629                 (i4_ftaps[3] * pu1_scrtch_tmp[j] +
   1630                  i4_ftaps[2] * (pu1_scrtch_tmp[j + scrtch_strd] + pu1_scrtch_tmp[j - scrtch_strd]) +
   1631                  i4_ftaps[1] *
   1632                      (pu1_scrtch_tmp[j + 2 * scrtch_strd] + pu1_scrtch_tmp[j - 2 * scrtch_strd]) +
   1633                  i4_ftaps[0] *
   1634                      (pu1_scrtch_tmp[j + 3 * scrtch_strd] + pu1_scrtch_tmp[j - 3 * scrtch_strd]) +
   1635                  (1 << (FILT_TAP_Q - 1))) >>
   1636                 FILT_TAP_Q;
   1637             pu1_dst[j] = CLIP_U8(tmp);
   1638         }
   1639         pu1_dst += dst_strd;
   1640         pu1_scrtch_tmp += (scrtch_strd << 1);
   1641     }
   1642 }
   1643 
   1644 void ihevce_scale_by_2(
   1645     UWORD8 *pu1_src,
   1646     WORD32 src_strd,
   1647     UWORD8 *pu1_dst,
   1648     WORD32 dst_strd,
   1649     WORD32 wd,
   1650     WORD32 ht,
   1651     UWORD8 *pu1_wkg_mem,
   1652     WORD32 ht_offset,
   1653     WORD32 block_ht,
   1654     WORD32 wd_offset,
   1655     WORD32 block_wd,
   1656     FT_COPY_2D *pf_copy_2d,
   1657     FT_SCALING_FILTER_BY_2 *pf_scaling_filter_mxn)
   1658 {
   1659 #define N_TAPS 7
   1660 #define MAX_BLK_SZ (MAX_CTB_SIZE + ((N_TAPS >> 1) << 1))
   1661     UWORD8 au1_cpy[MAX_BLK_SZ * MAX_BLK_SZ];
   1662     UWORD32 cpy_strd = MAX_BLK_SZ;
   1663     UWORD8 *pu1_cpy = au1_cpy + cpy_strd * (N_TAPS >> 1) + (N_TAPS >> 1);
   1664 
   1665     UWORD8 *pu1_in, *pu1_out;
   1666     WORD32 in_strd, wkg_mem_strd;
   1667 
   1668     WORD32 row_start, row_end;
   1669     WORD32 col_start, col_end;
   1670     WORD32 i, fun_select;
   1671     WORD32 ht_tmp, wd_tmp;
   1672     FT_SCALING_FILTER_BY_2 *ihevce_scaling_filters[2];
   1673 
   1674     assert((wd & 1) == 0);
   1675     assert((ht & 1) == 0);
   1676     assert(block_wd <= MAX_CTB_SIZE);
   1677     assert(block_ht <= MAX_CTB_SIZE);
   1678 
   1679     /* function pointers for filtering different dimensions */
   1680     ihevce_scaling_filters[0] = ihevce_scaling_filter_mxn;
   1681     ihevce_scaling_filters[1] = pf_scaling_filter_mxn;
   1682 
   1683     /* handle boundary blks */
   1684     col_start = (wd_offset < (N_TAPS >> 1)) ? 1 : 0;
   1685     row_start = (ht_offset < (N_TAPS >> 1)) ? 1 : 0;
   1686     col_end = ((wd_offset + block_wd) > (wd - (N_TAPS >> 1))) ? 1 : 0;
   1687     row_end = ((ht_offset + block_ht) > (ht - (N_TAPS >> 1))) ? 1 : 0;
   1688     if(col_end && (wd % block_wd != 0))
   1689     {
   1690         block_wd = (wd % block_wd);
   1691     }
   1692     if(row_end && (ht % block_ht != 0))
   1693     {
   1694         block_ht = (ht % block_ht);
   1695     }
   1696 
   1697     /* boundary blks needs to be padded, copy src to tmp buffer */
   1698     if(col_start || col_end || row_end || row_start)
   1699     {
   1700         UWORD8 *pu1_src_tmp = pu1_src + wd_offset + ht_offset * src_strd;
   1701 
   1702         pu1_cpy -= (3 * (1 - col_start) + cpy_strd * 3 * (1 - row_start));
   1703         pu1_src_tmp -= (3 * (1 - col_start) + src_strd * 3 * (1 - row_start));
   1704         ht_tmp = block_ht + 3 * (1 - row_start) + 3 * (1 - row_end);
   1705         wd_tmp = block_wd + 3 * (1 - col_start) + 3 * (1 - col_end);
   1706         pf_copy_2d(pu1_cpy, cpy_strd, pu1_src_tmp, src_strd, wd_tmp, ht_tmp);
   1707         pu1_in = au1_cpy + cpy_strd * 3 + 3;
   1708         in_strd = cpy_strd;
   1709     }
   1710     else
   1711     {
   1712         pu1_in = pu1_src + wd_offset + ht_offset * src_strd;
   1713         in_strd = src_strd;
   1714     }
   1715 
   1716     /*top padding*/
   1717     if(row_start)
   1718     {
   1719         UWORD8 *pu1_cpy_tmp = au1_cpy + cpy_strd * 3;
   1720 
   1721         pu1_cpy = au1_cpy + cpy_strd * (3 - 1);
   1722         memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
   1723         pu1_cpy -= cpy_strd;
   1724         memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
   1725         pu1_cpy -= cpy_strd;
   1726         memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
   1727     }
   1728 
   1729     /*bottom padding*/
   1730     if(row_end)
   1731     {
   1732         UWORD8 *pu1_cpy_tmp = au1_cpy + cpy_strd * 3 + (block_ht - 1) * cpy_strd;
   1733 
   1734         pu1_cpy = pu1_cpy_tmp + cpy_strd;
   1735         memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
   1736         pu1_cpy += cpy_strd;
   1737         memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
   1738         pu1_cpy += cpy_strd;
   1739         memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
   1740     }
   1741 
   1742     /*left padding*/
   1743     if(col_start)
   1744     {
   1745         UWORD8 *pu1_cpy_tmp = au1_cpy + 3;
   1746 
   1747         pu1_cpy = au1_cpy;
   1748         for(i = 0; i < block_ht + 6; i++)
   1749         {
   1750             pu1_cpy[0] = pu1_cpy[1] = pu1_cpy[2] = pu1_cpy_tmp[0];
   1751             pu1_cpy += cpy_strd;
   1752             pu1_cpy_tmp += cpy_strd;
   1753         }
   1754     }
   1755 
   1756     /*right padding*/
   1757     if(col_end)
   1758     {
   1759         UWORD8 *pu1_cpy_tmp = au1_cpy + 3 + block_wd - 1;
   1760 
   1761         pu1_cpy = au1_cpy + 3 + block_wd;
   1762         for(i = 0; i < block_ht + 6; i++)
   1763         {
   1764             pu1_cpy[0] = pu1_cpy[1] = pu1_cpy[2] = pu1_cpy_tmp[0];
   1765             pu1_cpy += cpy_strd;
   1766             pu1_cpy_tmp += cpy_strd;
   1767         }
   1768     }
   1769 
   1770     wkg_mem_strd = block_wd >> 1;
   1771     pu1_out = pu1_dst + (wd_offset >> 1);
   1772     fun_select = (block_wd % 16 == 0);
   1773     ihevce_scaling_filters[fun_select](
   1774         pu1_in, in_strd, pu1_wkg_mem, wkg_mem_strd, pu1_out, dst_strd, block_ht, block_wd);
   1775 
   1776     /* Left padding of 16 for 1st block of every row */
   1777     if(wd_offset == 0)
   1778     {
   1779         UWORD8 u1_val;
   1780         WORD32 pad_wd = 16;
   1781         WORD32 pad_ht = block_ht >> 1;
   1782         UWORD8 *dst = pu1_dst;
   1783 
   1784         for(i = 0; i < pad_ht; i++)
   1785         {
   1786             u1_val = dst[0];
   1787             memset(&dst[-pad_wd], u1_val, pad_wd);
   1788             dst += dst_strd;
   1789         }
   1790     }
   1791 
   1792     if(wd == wd_offset + block_wd)
   1793     {
   1794         /* Right padding of (16 + (CEIL16(wd/2))-wd/2) for last block of every row */
   1795         /* Right padding is done only after processing of last block of that row is done*/
   1796         UWORD8 u1_val;
   1797         WORD32 pad_wd = 16 + CEIL16((wd >> 1)) - (wd >> 1) + 4;
   1798         WORD32 pad_ht = block_ht >> 1;
   1799         UWORD8 *dst = pu1_dst + (wd >> 1) - 1;
   1800 
   1801         for(i = 0; i < pad_ht; i++)
   1802         {
   1803             u1_val = dst[0];
   1804             memset(&dst[1], u1_val, pad_wd);
   1805             dst += dst_strd;
   1806         }
   1807 
   1808         if(ht_offset == 0)
   1809         {
   1810             /* Top padding of 16 is done for 1st row only after we reach end of that row */
   1811             WORD32 pad_wd = dst_strd;
   1812             WORD32 pad_ht = 16;
   1813             UWORD8 *dst = pu1_dst - 16;
   1814 
   1815             for(i = 1; i <= pad_ht; i++)
   1816             {
   1817                 memcpy(dst - (i * dst_strd), dst, pad_wd);
   1818             }
   1819         }
   1820 
   1821         /* Bottom padding of (16 + (CEIL16(ht/2)) - ht/2) is done only if we have
   1822          reached end of frame */
   1823         if(ht - ht_offset - block_ht == 0)
   1824         {
   1825             WORD32 pad_wd = dst_strd;
   1826             WORD32 pad_ht = 16 + CEIL16((ht >> 1)) - (ht >> 1) + 4;
   1827             UWORD8 *dst = pu1_dst + (((block_ht >> 1) - 1) * dst_strd) - 16;
   1828 
   1829             for(i = 1; i <= pad_ht; i++)
   1830                 memcpy(dst + (i * dst_strd), dst, pad_wd);
   1831         }
   1832     }
   1833 }
   1834 
   1835 /*!
   1836 ******************************************************************************
   1837 * \if Function name : ihevce_decomp_pre_intra_process_row \endif
   1838 *
   1839 * \brief
   1840 *    Row level function which down scales a given row by 2 in horz and
   1841 *    vertical direction creates output of size wd/2 * ht/2.
   1842 *
   1843 *  @param[in]  pu1_src : soource pointer
   1844 *  @param[in]  src_stride : source stride
   1845 *  @param[out] pu1_dst : desitnation pointer
   1846 *  @param[in]  dst_stride : destination stride
   1847 *  @param[in]  layer_wd : layer width
   1848 *  @param[in]  layer_ht : layer height
   1849 *  @param[in]  ht_offset : height offset of the block to be scaled
   1850 *  @param[in]  block_ht : height of the block to be scaled
   1851 *  @param[in]  wd_offset : width offset of the block to be scaled
   1852 *  @param[in]  block_wd : width of the block to be scaled
   1853 *  @param[in]  num_col_blks : number of col blks in that row
   1854 *
   1855 * \return None
   1856 *
   1857 *  @NOTE : When decompositionis done from L1 to L2 pre intra analysis is
   1858 *          done on L1
   1859 *
   1860 *****************************************************************************
   1861 */
   1862 void ihevce_decomp_pre_intra_process_row(
   1863     UWORD8 *pu1_src,
   1864     WORD32 src_stride,
   1865     UWORD8 *pu1_dst_decomp,
   1866     WORD32 dst_stride,
   1867     WORD32 layer_wd,
   1868     WORD32 layer_ht,
   1869     UWORD8 *pu1_wkg_mem,
   1870     WORD32 ht_offset,
   1871     WORD32 block_ht,
   1872     WORD32 block_wd,
   1873     WORD32 i4_cu_aligned_pic_wd,
   1874     WORD32 i4_cu_aligned_pic_ht,
   1875     WORD32 num_col_blks,
   1876     WORD32 layer_no,
   1877     ihevce_ed_ctxt_t *ps_ed_ctxt,
   1878     ihevce_ed_blk_t *ps_ed_row,
   1879     ihevce_ed_ctb_l1_t *ps_ed_ctb_l1_row,
   1880     ihevce_8x8_L0_satd_t *ps_layer0_cur_satd,
   1881     ihevce_8x8_L0_mean_t *ps_layer0_cur_mean,
   1882     WORD32 num_4x4_blks_ctb_y,
   1883     WORD32 num_4x4_blks_last_ctb_x,
   1884     WORD32 skip_decomp,
   1885     WORD32 skip_pre_intra,
   1886     WORD32 row_block_no,
   1887     WORD32 i4_enable_noise_detection,
   1888     ctb_analyse_t *ps_ctb_analyse,
   1889     ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list,
   1890     ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list)
   1891 {
   1892     WORD32 col_block_no;
   1893 
   1894     //ihevce_ed_ctxt_t *ps_ed_ctxt = (ihevce_ed_ctxt_t *)pv_ed_ctxt;
   1895     UWORD8 *pu1_src_pre_intra = pu1_src + (ht_offset * src_stride);
   1896     WORD32 num_4x4_blks_in_ctb = block_wd >> 2;
   1897     //WORD32 nbr_flags[64];
   1898     WORD32 *nbr_flags_ptr = &ps_ed_ctxt->ai4_nbr_flags[0];
   1899     WORD32 src_inc_pre_intra = num_4x4_blks_in_ctb * 4;
   1900     WORD32 inc_ctb = 0;
   1901     ihevce_ed_blk_t *ps_ed_ctb = ps_ed_row;
   1902     ihevce_ed_ctb_l1_t *ps_ed_ctb_l1 = ps_ed_ctb_l1_row;
   1903     WORD32 i, j;
   1904     WORD32 do_pre_intra_analysis;
   1905     pf_ed_calc_ctb ed_calc_ctb;
   1906     ctb_analyse_t *ps_ctb_analyse_curr;
   1907 
   1908     (void)i4_cu_aligned_pic_wd;
   1909     (void)i4_cu_aligned_pic_ht;
   1910     (void)ps_layer0_cur_satd;
   1911     (void)ps_layer0_cur_mean;
   1912     (void)i4_enable_noise_detection;
   1913     /*increment the struct pointer to point to the first CTB of the current row. */
   1914     ps_ctb_analyse_curr = ps_ctb_analyse + row_block_no * num_col_blks;
   1915 
   1916     //if((num_4x4_blks_ctb_x == num_4x4_blks_ctb_y) && (num_4x4_blks_in_ctb == num_4x4_blks_ctb_x) )
   1917     if(num_4x4_blks_in_ctb == num_4x4_blks_ctb_y)
   1918     {
   1919         ed_calc_ctb = ihevce_ed_calc_ctb;
   1920     }
   1921     else
   1922     {
   1923         ed_calc_ctb = ihevce_ed_calc_incomplete_ctb;
   1924     }
   1925 
   1926     inc_ctb = num_4x4_blks_in_ctb * num_4x4_blks_in_ctb;
   1927 
   1928     do_pre_intra_analysis = ((layer_no == 1) || (layer_no == 2)) && (!skip_pre_intra);
   1929 
   1930     /*
   1931     * For optimal pre intra analysis first block is processed outside
   1932     * the loop.
   1933     */
   1934     if(!skip_decomp)
   1935     {
   1936         ihevce_scale_by_2(
   1937             pu1_src,
   1938             src_stride,
   1939             pu1_dst_decomp,
   1940             dst_stride,
   1941             layer_wd,
   1942             layer_ht,
   1943             pu1_wkg_mem,
   1944             ht_offset,
   1945             block_ht,
   1946             block_wd * 0,
   1947             block_wd,
   1948             ps_cmn_utils_optimised_function_list->pf_copy_2d,
   1949             ps_ipe_optimised_function_list->pf_scaling_filter_mxn);
   1950         /* Disable noise detection */
   1951         ps_ctb_analyse_curr->s_ctb_noise_params.i4_noise_present = 0;
   1952 
   1953         memset(
   1954             ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy,
   1955             0,
   1956             sizeof(ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy));
   1957     }
   1958 
   1959     /*
   1960     * Pre intra analysis for the first ctb.
   1961     * To analyse any given CTB we need to set the availability flags of the
   1962     * following neighbouring CTB: BL,L,TL,T,TR.
   1963     */
   1964     if(do_pre_intra_analysis)
   1965     {
   1966         /*
   1967         * At the beginning of ctb row set left intra modes to default value.
   1968         */
   1969         for(j = 0; j < num_4x4_blks_ctb_y; j++)
   1970         {
   1971             ps_ed_ctxt->left_ctb_intra_modes[j] = INTRA_DC;
   1972         }
   1973 
   1974         /*
   1975         * Copy the neighbor flags for a general ctb (ctb inside the frame; not any corners).
   1976         * The table gau4_nbr_flags_8x8_4x4blks generated for 16x16 4x4 blocks(ctb_size = 64).
   1977         * But the same table holds good for other 4x4 blocks 2d arrays(eg 8x8 4x4 blks,4x4 4x4blks).
   1978         * But the flags must be accessed with stride of 16 since the table has been generated for
   1979         * ctb_size = 64. For odd 4x4 2d arrays(eg 3x3 4x4 blks) the flags needs modification.
   1980         * The flags also need modification for corner ctbs.
   1981         */
   1982         memcpy(
   1983             ps_ed_ctxt->ai4_nbr_flags,
   1984             gau4_nbr_flags_8x8_4x4blks,
   1985             sizeof(gau4_nbr_flags_8x8_4x4blks));
   1986 
   1987         /*
   1988         * Since this is the fist ctb in the ctb row, set left flags unavailable for 1st CTB col
   1989         */
   1990         for(j = 0; j < num_4x4_blks_ctb_y; j++)
   1991         {
   1992             SET_L_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]);
   1993             SET_BL_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]);
   1994             SET_TL_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]);
   1995         }
   1996         /*
   1997         * If this is the fist ctb row, set top flags unavailable.
   1998         */
   1999         if(ht_offset == 0)
   2000         {
   2001             for(j = 0; j < num_4x4_blks_in_ctb; j++)
   2002             {
   2003                 SET_T_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j]);
   2004                 SET_TR_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j]);
   2005                 SET_TL_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j]);
   2006             }
   2007         }
   2008 
   2009         /* If this is last ctb row,set BL as not available. */
   2010         if(ht_offset + block_ht >= layer_ht)
   2011         {
   2012             for(j = 0; j < num_4x4_blks_in_ctb; j++)
   2013             {
   2014                 SET_BL_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[(num_4x4_blks_ctb_y - 1) * 8 + j]);
   2015             }
   2016         }
   2017         col_block_no = 0;
   2018         /* Call intra analysis for the ctb */
   2019         ed_calc_ctb(
   2020             ps_ed_ctxt,
   2021             ps_ed_ctb,
   2022             ps_ed_ctb_l1,
   2023             pu1_src_pre_intra,
   2024             src_stride,
   2025             num_4x4_blks_in_ctb,
   2026             num_4x4_blks_ctb_y,
   2027             nbr_flags_ptr,
   2028             layer_no,
   2029             row_block_no,
   2030             col_block_no,
   2031             ps_ipe_optimised_function_list,
   2032             ps_cmn_utils_optimised_function_list
   2033 
   2034         );
   2035 
   2036         pu1_src_pre_intra += src_inc_pre_intra;
   2037         ps_ed_ctb += inc_ctb;
   2038         ps_ed_ctb_l1 += 1;
   2039         /*
   2040         * For the rest of the ctbs, set left flags available.
   2041         */
   2042         for(j = 0; j < num_4x4_blks_ctb_y; j++)
   2043         {
   2044             SET_L_AVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]);
   2045         }
   2046         for(j = 0; j < num_4x4_blks_ctb_y - 1; j++)
   2047         {
   2048             SET_BL_AVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]);
   2049             SET_TL_AVAILABLE(ps_ed_ctxt->ai4_nbr_flags[(j + 1) * 8]);
   2050         }
   2051         if(ht_offset != 0)
   2052         {
   2053             SET_TL_AVAILABLE(ps_ed_ctxt->ai4_nbr_flags[0]);
   2054         }
   2055     }
   2056 
   2057     /* The first ctb is processed before the loop.
   2058     * The last one is processed after the loop.
   2059     */
   2060     for(col_block_no = 1; col_block_no < num_col_blks - 1; col_block_no++)
   2061     {
   2062         if(!skip_decomp)
   2063         {
   2064             ihevce_scale_by_2(
   2065                 pu1_src,
   2066                 src_stride,
   2067                 pu1_dst_decomp,
   2068                 dst_stride,
   2069                 layer_wd,
   2070                 layer_ht,
   2071                 pu1_wkg_mem,
   2072                 ht_offset,
   2073                 block_ht,
   2074                 block_wd * col_block_no,
   2075                 block_wd,
   2076                 ps_cmn_utils_optimised_function_list->pf_copy_2d,
   2077                 ps_ipe_optimised_function_list->pf_scaling_filter_mxn);
   2078             /* Disable noise detection */
   2079             memset(
   2080                 ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy,
   2081                 0,
   2082                 sizeof(ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy));
   2083 
   2084             ps_ctb_analyse_curr->s_ctb_noise_params.i4_noise_present = 0;
   2085         }
   2086 
   2087         if(do_pre_intra_analysis)
   2088         {
   2089             ed_calc_ctb(
   2090                 ps_ed_ctxt,
   2091                 ps_ed_ctb,
   2092                 ps_ed_ctb_l1,
   2093                 pu1_src_pre_intra,
   2094                 src_stride,
   2095                 num_4x4_blks_in_ctb,
   2096                 num_4x4_blks_ctb_y,
   2097                 nbr_flags_ptr,
   2098                 layer_no,
   2099                 row_block_no,
   2100                 col_block_no,
   2101                 ps_ipe_optimised_function_list,
   2102                 ps_cmn_utils_optimised_function_list);
   2103             pu1_src_pre_intra += src_inc_pre_intra;
   2104             ps_ed_ctb += inc_ctb;
   2105             ps_ed_ctb_l1 += 1;
   2106         }
   2107     }
   2108 
   2109     /* Last ctb in row */
   2110     if((!skip_decomp) && (col_block_no == (num_col_blks - 1)))
   2111     {
   2112         ihevce_scale_by_2(
   2113             pu1_src,
   2114             src_stride,
   2115             pu1_dst_decomp,
   2116             dst_stride,
   2117             layer_wd,
   2118             layer_ht,
   2119             pu1_wkg_mem,
   2120             ht_offset,
   2121             block_ht,
   2122             block_wd * col_block_no,
   2123             block_wd,
   2124             ps_cmn_utils_optimised_function_list->pf_copy_2d,
   2125             ps_ipe_optimised_function_list->pf_scaling_filter_mxn);
   2126         {
   2127             /* Disable noise detection */
   2128             memset(
   2129                 ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy,
   2130                 0,
   2131                 sizeof(ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy));
   2132 
   2133             ps_ctb_analyse_curr->s_ctb_noise_params.i4_noise_present = 0;
   2134         }
   2135     }
   2136 
   2137     if(do_pre_intra_analysis && (col_block_no == (num_col_blks - 1)))
   2138     {
   2139         /*
   2140         * The last ctb can be complete or incomplete. The complete
   2141         * ctb is handled in the if and incomplete is handled in the
   2142         * else case
   2143         */
   2144         //if(num_4x4_blks_last_ctb == num_4x4_blks_in_ctb)
   2145         if((num_4x4_blks_last_ctb_x == num_4x4_blks_ctb_y) &&
   2146            (num_4x4_blks_in_ctb == num_4x4_blks_last_ctb_x))
   2147         {
   2148             /* Last ctb so set top right not available */
   2149             SET_TR_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[num_4x4_blks_in_ctb - 1]);
   2150 
   2151             ed_calc_ctb(
   2152                 ps_ed_ctxt,
   2153                 ps_ed_ctb,
   2154                 ps_ed_ctb_l1,
   2155                 pu1_src_pre_intra,
   2156                 src_stride,
   2157                 num_4x4_blks_in_ctb,
   2158                 num_4x4_blks_in_ctb,
   2159                 nbr_flags_ptr,
   2160                 layer_no,
   2161                 row_block_no,
   2162                 col_block_no,
   2163                 ps_ipe_optimised_function_list,
   2164                 ps_cmn_utils_optimised_function_list);
   2165             pu1_src_pre_intra += src_inc_pre_intra;
   2166             ps_ed_ctb += inc_ctb;
   2167             ps_ed_ctb_l1 += 1;
   2168         }
   2169         else
   2170         {
   2171             /* Last ctb so set top right not available */
   2172             for(i = 0; i < num_4x4_blks_ctb_y; i++)
   2173             {
   2174                 SET_TR_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[i * 8 + num_4x4_blks_in_ctb - 1]);
   2175             }
   2176 
   2177             ihevce_ed_calc_incomplete_ctb(
   2178                 ps_ed_ctxt,
   2179                 ps_ed_ctb,
   2180                 ps_ed_ctb_l1,
   2181                 pu1_src_pre_intra,
   2182                 src_stride,
   2183                 num_4x4_blks_last_ctb_x,
   2184                 num_4x4_blks_ctb_y,
   2185                 nbr_flags_ptr,
   2186                 layer_no,
   2187                 row_block_no,
   2188                 col_block_no,
   2189                 ps_ipe_optimised_function_list,
   2190                 ps_cmn_utils_optimised_function_list);
   2191         }
   2192     }
   2193 }
   2194 
   2195 /*!
   2196 ******************************************************************************
   2197 * \if Function name : ihevce_decomp_pre_intra_process \endif
   2198 *
   2199 * \brief
   2200 *    Frame level function to decompose given layer L0 into coarser layers
   2201 *
   2202 * \param[in] pv_ctxt : pointer to master context of decomp_pre_intra module
   2203 * \param[in] ps_inp  : pointer to input yuv buffer (frame buffer)
   2204 * \param[in] pv_multi_thrd_ctxt : pointer to multithread context
   2205 * \param[out] thrd_id : thread id
   2206 *
   2207 * \return
   2208 *    None
   2209 *
   2210 * \author
   2211 *  Ittiam
   2212 *
   2213 *****************************************************************************
   2214 */
   2215 void ihevce_decomp_pre_intra_process(
   2216     void *pv_ctxt,
   2217     ihevce_lap_output_params_t *ps_lap_out_prms,
   2218     frm_ctb_ctxt_t *ps_frm_ctb_prms,
   2219     void *pv_multi_thrd_ctxt,
   2220     WORD32 thrd_id,
   2221     WORD32 i4_ping_pong,
   2222     ihevce_8x8_L0_satd_t *ps_layer0_cur_satd,
   2223     ihevce_8x8_L0_mean_t *ps_layer0_cur_mean)
   2224 {
   2225     WORD32 i4_layer_no;
   2226     WORD32 i4_num_layers;
   2227     WORD32 end_of_layer;
   2228     UWORD8 *pu1_src, *pu1_dst;
   2229     WORD32 src_stride, dst_stride;
   2230     WORD32 i4_layer_wd, i4_layer_ht;
   2231     WORD32 ht_offset, block_ht;
   2232     WORD32 row_block_no, num_row_blocks;
   2233     UWORD8 *pu1_wkg_mem;
   2234     WORD32 block_wd;
   2235     WORD32 num_col_blks;
   2236     WORD32 skip_decomp, skip_pre_intra;
   2237     WORD32 i4_cu_aligned_pic_wd, i4_cu_aligned_pic_ht;
   2238     ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt =
   2239         (ihevce_decomp_pre_intra_master_ctxt_t *)pv_ctxt;
   2240 
   2241     ihevce_decomp_pre_intra_ctxt_t *ps_ctxt =
   2242         ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thrd_id];
   2243     multi_thrd_ctxt_t *ps_multi_thrd = (multi_thrd_ctxt_t *)pv_multi_thrd_ctxt;
   2244 
   2245     ihevce_ed_ctxt_t *ps_ed_ctxt;
   2246     ihevce_ed_blk_t *ps_ed;
   2247     ihevce_ed_ctb_l1_t *ps_ed_ctb_l1;
   2248     WORD32 inc_ctb = 0;
   2249     WORD32 num_4x4_blks_lyr;
   2250 
   2251     i4_num_layers = ps_ctxt->i4_num_layers;
   2252 
   2253     ASSERT(i4_num_layers >= 3);
   2254 
   2255     /*
   2256      * Always force minimum layers as 4 so that we would have both l1 and l2
   2257      * pre intra analysis
   2258      */
   2259     if(i4_num_layers == 3)
   2260     {
   2261         i4_num_layers = 4;
   2262     }
   2263 
   2264     ps_ctxt->as_layers[0].pu1_inp = (UWORD8 *)ps_lap_out_prms->s_input_buf.pv_y_buf;
   2265     ps_ctxt->as_layers[0].i4_inp_stride = ps_lap_out_prms->s_input_buf.i4_y_strd;
   2266     ps_ctxt->as_layers[0].i4_actual_wd = ps_lap_out_prms->s_input_buf.i4_y_wd;
   2267     ps_ctxt->as_layers[0].i4_actual_ht = ps_lap_out_prms->s_input_buf.i4_y_ht;
   2268 
   2269     /* ------------ Loop over all the layers --------------- */
   2270     /* This loop does only decomp for all layers by picking jobs from job queue */
   2271     /* Decomp for all layers will completed with this for loop */
   2272     for(i4_layer_no = 0; i4_layer_no < (i4_num_layers - 1); i4_layer_no++)
   2273     {
   2274         WORD32 idx = 0;
   2275         src_stride = ps_ctxt->as_layers[i4_layer_no].i4_inp_stride;
   2276         pu1_src = ps_ctxt->as_layers[i4_layer_no].pu1_inp;
   2277         i4_layer_wd = ps_ctxt->as_layers[i4_layer_no].i4_actual_wd;
   2278         i4_layer_ht = ps_ctxt->as_layers[i4_layer_no].i4_actual_ht;
   2279         pu1_dst = ps_ctxt->as_layers[i4_layer_no + 1].pu1_inp;
   2280         dst_stride = ps_ctxt->as_layers[i4_layer_no + 1].i4_inp_stride;
   2281         block_wd = ps_ctxt->as_layers[i4_layer_no].i4_decomp_blk_wd;
   2282         block_ht = ps_ctxt->as_layers[i4_layer_no].i4_decomp_blk_ht;
   2283         num_col_blks = ps_ctxt->as_layers[i4_layer_no].i4_num_col_blks;
   2284         num_row_blocks = ps_ctxt->as_layers[i4_layer_no].i4_num_row_blks;
   2285         i4_cu_aligned_pic_wd = ps_frm_ctb_prms->i4_cu_aligned_pic_wd;
   2286         i4_cu_aligned_pic_ht = ps_frm_ctb_prms->i4_cu_aligned_pic_ht;
   2287 
   2288         /* register ed_ctxt buffer pointer */
   2289         //pv_ed_ctxt =  &ps_ctxt->as_layers[i4_layer_no].s_early_decision;
   2290         //ps_ed_ctxt = (ihevce_ed_ctxt_t *)pv_ed_ctxt;
   2291         //ps_ed = ps_ed_ctxt->ps_ed;
   2292 
   2293         //pv_ed_ctxt = &ps_ctxt->ps_ed_ctxt;
   2294         ps_ed_ctxt = ps_ctxt->ps_ed_ctxt;
   2295 
   2296         /* initialize ed_ctxt here */
   2297         /* init is moved here since now allocation is happening for only one instance
   2298         is allocated. for each layer it is re-used */
   2299         ps_ed_ctxt->lambda = ps_ctxt->ai4_lambda[i4_layer_no];
   2300         ps_ed_ctxt->i4_slice_type = ps_ctxt->i4_slice_type;
   2301         ps_ed_ctxt->level = ps_ctxt->i4_codec_level;
   2302         if(0 == i4_layer_no)
   2303         {
   2304             ps_ed_ctxt->ps_ed_pic = NULL;
   2305             ps_ed_ctxt->ps_ed = NULL;
   2306             ps_ed_ctxt->ps_ed_ctb_l1_pic = NULL;
   2307             ps_ed_ctxt->ps_ed_ctb_l1 = NULL;
   2308         }
   2309         else if(1 == i4_layer_no)
   2310         {
   2311             ps_ed_ctxt->ps_ed_pic = ps_ctxt->ps_layer1_buf;
   2312             ps_ed_ctxt->ps_ed = ps_ctxt->ps_layer1_buf;
   2313             ps_ed_ctxt->ps_ed_ctb_l1_pic = ps_ctxt->ps_ed_ctb_l1;
   2314             ps_ed_ctxt->ps_ed_ctb_l1 = ps_ctxt->ps_ed_ctb_l1;
   2315             ps_ctxt->ps_layer0_cur_satd = NULL;
   2316             ps_ctxt->ps_layer0_cur_mean = NULL;
   2317         }
   2318         else if(2 == i4_layer_no)
   2319         {
   2320             ps_ed_ctxt->ps_ed_pic = ps_ctxt->ps_layer2_buf;
   2321             ps_ed_ctxt->ps_ed = ps_ctxt->ps_layer2_buf;
   2322             ps_ed_ctxt->ps_ed_ctb_l1_pic = NULL;
   2323             ps_ed_ctxt->ps_ed_ctb_l1 = NULL;
   2324             ps_ctxt->ps_layer0_cur_satd = NULL;
   2325             ps_ctxt->ps_layer0_cur_mean = NULL;
   2326         }
   2327 
   2328         /*Calculate the number of 4x4 blocks in a CTB in that layer*/
   2329         /*Divide block_wd by 4. 4 to get no of 4x4 blks*/
   2330         num_4x4_blks_lyr = block_wd >> 2;
   2331         inc_ctb = num_4x4_blks_lyr * num_4x4_blks_lyr;
   2332 
   2333         ps_ed = ps_ed_ctxt->ps_ed;
   2334         ps_ed_ctb_l1 = ps_ed_ctxt->ps_ed_ctb_l1;
   2335 
   2336         end_of_layer = 0;
   2337         skip_decomp = 0;
   2338         skip_pre_intra = 1;
   2339         //if( i4_layer_no >= ps_ctxt->i4_num_layers)
   2340         if(i4_layer_no >= (ps_ctxt->i4_num_layers - 1))
   2341         {
   2342             skip_decomp = 1;
   2343         }
   2344         /* ------------ Loop over all the CTB rows --------------- */
   2345         while(0 == end_of_layer)
   2346         {
   2347             job_queue_t *ps_pre_enc_job;
   2348             WORD32 num_4x4_blks_ctb_y = 0;
   2349             WORD32 num_4x4_blks_last_ctb_x = 0;
   2350 
   2351             /* Get the current row from the job queue */
   2352             ps_pre_enc_job = (job_queue_t *)ihevce_pre_enc_grp_get_next_job(
   2353                 pv_multi_thrd_ctxt, (DECOMP_JOB_LYR0 + i4_layer_no), 1, i4_ping_pong);
   2354 
   2355             pu1_wkg_mem = ps_ctxt->pu1_wkg_mem;
   2356 
   2357             /* If all rows are done, set the end of layer flag to 1, */
   2358             if(NULL == ps_pre_enc_job)
   2359             {
   2360                 end_of_layer = 1;
   2361             }
   2362             else
   2363             {
   2364                 /* Obtain the current row's details from the job */
   2365                 row_block_no = ps_pre_enc_job->s_job_info.s_decomp_job_info.i4_vert_unit_row_no;
   2366                 ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx] = row_block_no;
   2367                 ht_offset = row_block_no * block_ht;
   2368 
   2369                 if(row_block_no < (num_row_blocks))
   2370                 {
   2371                     pu1_dst = ps_ctxt->as_layers[i4_layer_no + 1].pu1_inp +
   2372                               ((block_ht >> 1) * dst_stride * row_block_no);
   2373 
   2374                     /*L0 8x8 curr satd for qp mod*/
   2375                     if(i4_layer_no == 0)
   2376                     {
   2377                         ps_ctxt->ps_layer0_cur_satd =
   2378                             ps_layer0_cur_satd + (row_block_no * num_col_blks /*num ctbs*/ *
   2379                                                   (block_wd >> 3) * (block_ht >> 3));
   2380                         ps_ctxt->ps_layer0_cur_mean =
   2381                             ps_layer0_cur_mean + (row_block_no * num_col_blks /*num ctbs*/ *
   2382                                                   (block_wd >> 3) * (block_ht >> 3));
   2383                     }
   2384 
   2385                     /* call the row level processing function */
   2386                     ihevce_decomp_pre_intra_process_row(
   2387                         pu1_src,
   2388                         src_stride,
   2389                         pu1_dst,
   2390                         dst_stride,
   2391                         i4_layer_wd,
   2392                         i4_layer_ht,
   2393                         pu1_wkg_mem,
   2394                         ht_offset,
   2395                         block_ht,
   2396                         block_wd,
   2397                         i4_cu_aligned_pic_wd,
   2398                         i4_cu_aligned_pic_ht,
   2399                         num_col_blks,
   2400                         i4_layer_no,
   2401                         ps_ed_ctxt,
   2402                         ps_ed,
   2403                         ps_ed_ctb_l1,
   2404                         ps_ctxt->ps_layer0_cur_satd,
   2405                         ps_ctxt->ps_layer0_cur_mean,
   2406                         num_4x4_blks_ctb_y,
   2407                         num_4x4_blks_last_ctb_x,
   2408                         skip_decomp,
   2409                         skip_pre_intra,
   2410                         row_block_no,
   2411                         ps_ctxt->i4_enable_noise_detection,
   2412                         ps_ctxt->ps_ctb_analyse,
   2413                         &ps_ctxt->s_ipe_optimised_function_list,
   2414                         &ps_ctxt->s_cmn_opt_func);
   2415 
   2416                     /*When decompositionis done from L1 to L2
   2417                     pre intra analysis is done on L1*/
   2418                     if(i4_layer_no == 1 || i4_layer_no == 2)
   2419                     {
   2420                         // ps_ed   = ps_ed_ctxt->ps_ed +
   2421                         //          (row_block_no * inc_ctb * (num_col_blks));
   2422                     }
   2423                 }
   2424                 idx++;
   2425                 /* set the output dependency */
   2426                 ihevce_pre_enc_grp_job_set_out_dep(
   2427                     pv_multi_thrd_ctxt, ps_pre_enc_job, i4_ping_pong);
   2428             }
   2429         }
   2430         ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed = idx;
   2431 
   2432         ihevce_ed_frame_init(ps_ed_ctxt, i4_layer_no);
   2433 
   2434         if((1 == i4_layer_no) && (IHEVCE_QUALITY_P6 == ps_ctxt->i4_quality_preset))
   2435         {
   2436             WORD32 vert_ctr, ctb_ctr, i;
   2437             WORD32 ctb_ctr_blks = ps_ctxt->as_layers[1].i4_num_col_blks;
   2438             WORD32 vert_ctr_blks = ps_ctxt->as_layers[1].i4_num_row_blks;
   2439 
   2440             if((ps_ctxt->i4_quality_preset == IHEVCE_QUALITY_P6) &&
   2441                (ps_lap_out_prms->i4_temporal_lyr_id > TEMPORAL_LAYER_DISABLE))
   2442             {
   2443                 for(vert_ctr = 0; vert_ctr < vert_ctr_blks; vert_ctr++)
   2444                 {
   2445                     ihevce_ed_ctb_l1_t *ps_ed_ctb_row_l1 =
   2446                         ps_ctxt->ps_ed_ctb_l1 + vert_ctr * ps_frm_ctb_prms->i4_num_ctbs_horz;
   2447 
   2448                     for(ctb_ctr = 0; ctb_ctr < ctb_ctr_blks; ctb_ctr++)
   2449                     {
   2450                         ihevce_ed_ctb_l1_t *ps_ed_ctb_curr_l1 = ps_ed_ctb_row_l1 + ctb_ctr;
   2451                         for(i = 0; i < 16; i++)
   2452                         {
   2453                             ps_ed_ctb_curr_l1->i4_best_sad_cost_8x8_l1_ipe[i] = 0x7fffffff;
   2454                             ps_ed_ctb_curr_l1->i4_best_sad_8x8_l1_ipe[i] = 0x7fffffff;
   2455                         }
   2456                     }
   2457                 }
   2458             }
   2459         }
   2460 
   2461 #if DISABLE_L2_IPE_IN_PB_L1_IN_B
   2462         if(((2 == i4_layer_no) && (ps_lap_out_prms->i4_pic_type == IV_I_FRAME ||
   2463                                    ps_lap_out_prms->i4_pic_type == IV_IDR_FRAME)) ||
   2464            ((1 == i4_layer_no) &&
   2465             (ps_lap_out_prms->i4_temporal_lyr_id <= TEMPORAL_LAYER_DISABLE)) ||
   2466            ((IHEVCE_QUALITY_P6 != ps_ctxt->i4_quality_preset) && (0 != i4_layer_no)))
   2467 #else
   2468         if((0 != i4_layer_no) &&
   2469            (1 != ((IHEVCE_QUALITY_P6 == ps_ctxt->i4_quality_preset) &&
   2470                   (ps_lap_out_prms->i4_temporal_lyr_id > TEMPORAL_LAYER_DISABLE))))
   2471 #endif
   2472         {
   2473             WORD32 i4_num_rows = ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed;
   2474 
   2475             src_stride = ps_ctxt->as_layers[i4_layer_no].i4_inp_stride;
   2476             pu1_src = ps_ctxt->as_layers[i4_layer_no].pu1_inp;
   2477             i4_layer_wd = ps_ctxt->as_layers[i4_layer_no].i4_actual_wd;
   2478             i4_layer_ht = ps_ctxt->as_layers[i4_layer_no].i4_actual_ht;
   2479             pu1_dst = ps_ctxt->as_layers[i4_layer_no + 1].pu1_inp;
   2480             dst_stride = ps_ctxt->as_layers[i4_layer_no + 1].i4_inp_stride;
   2481             block_wd = ps_ctxt->as_layers[i4_layer_no].i4_decomp_blk_wd;
   2482             block_ht = ps_ctxt->as_layers[i4_layer_no].i4_decomp_blk_ht;
   2483             num_col_blks = ps_ctxt->as_layers[i4_layer_no].i4_num_col_blks;
   2484             num_row_blocks = ps_ctxt->as_layers[i4_layer_no].i4_num_row_blks;
   2485             i4_cu_aligned_pic_wd = ps_frm_ctb_prms->i4_cu_aligned_pic_wd;
   2486             i4_cu_aligned_pic_ht = ps_frm_ctb_prms->i4_cu_aligned_pic_ht;
   2487 
   2488             /* register ed_ctxt buffer pointer */
   2489             ps_ed_ctxt = ps_ctxt->ps_ed_ctxt;
   2490 
   2491             /* initialize ed_ctxt here */
   2492             /* init is moved here since now allocation is happening for only one instance
   2493             is allocated. for each layer it is re-used */
   2494             ps_ed_ctxt->lambda = ps_ctxt->ai4_lambda[i4_layer_no];
   2495             ps_ed_ctxt->i4_slice_type = ps_ctxt->i4_slice_type;
   2496             ps_ed_ctxt->level = ps_ctxt->i4_codec_level;
   2497             if(1 == i4_layer_no)
   2498             {
   2499                 ps_ed_ctxt->ps_ed_pic = ps_ctxt->ps_layer1_buf;
   2500                 ps_ed_ctxt->ps_ed = ps_ctxt->ps_layer1_buf;
   2501                 ps_ed_ctxt->ps_ed_ctb_l1_pic = ps_ctxt->ps_ed_ctb_l1;
   2502                 ps_ed_ctxt->ps_ed_ctb_l1 = ps_ctxt->ps_ed_ctb_l1;
   2503                 ps_ctxt->ps_layer0_cur_satd = NULL;
   2504                 ps_ctxt->ps_layer0_cur_mean = NULL;
   2505             }
   2506             else if(2 == i4_layer_no)
   2507             {
   2508                 ps_ed_ctxt->ps_ed_pic = ps_ctxt->ps_layer2_buf;
   2509                 ps_ed_ctxt->ps_ed = ps_ctxt->ps_layer2_buf;
   2510                 ps_ed_ctxt->ps_ed_ctb_l1_pic = NULL;
   2511                 ps_ed_ctxt->ps_ed_ctb_l1 = NULL;
   2512                 ps_ctxt->ps_layer0_cur_satd = NULL;
   2513                 ps_ctxt->ps_layer0_cur_mean = NULL;
   2514             }
   2515 
   2516             /*Calculate the number of 4x4 blocks in a CTB in that layer*/
   2517             /*Divide block_wd by 4. 4 to get no of 4x4 blks*/
   2518             num_4x4_blks_lyr = block_wd >> 2;
   2519             inc_ctb = num_4x4_blks_lyr * num_4x4_blks_lyr;
   2520 
   2521             ps_ed = ps_ed_ctxt->ps_ed;
   2522             ps_ed_ctb_l1 = ps_ed_ctxt->ps_ed_ctb_l1;
   2523             skip_decomp = 1;
   2524             skip_pre_intra = 0;
   2525             for(idx = 0; idx < i4_num_rows; idx++)
   2526             {
   2527                 WORD32 num_4x4_blks_ctb_y = 0;
   2528                 WORD32 num_4x4_blks_last_ctb_x = 0;
   2529 
   2530                 pu1_wkg_mem = ps_ctxt->pu1_wkg_mem;
   2531 
   2532                 {
   2533                     /* Obtain the current row's details from the job */
   2534                     row_block_no = ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx];
   2535                     ht_offset = row_block_no * block_ht;
   2536 
   2537                     if(row_block_no < (num_row_blocks))
   2538                     {
   2539                         pu1_dst = ps_ctxt->as_layers[i4_layer_no + 1].pu1_inp +
   2540                                   ((block_ht >> 1) * dst_stride * row_block_no);
   2541 
   2542                         if(i4_layer_no == 1 || i4_layer_no == 2)
   2543                         {
   2544                             ps_ed = ps_ed_ctxt->ps_ed + (row_block_no * inc_ctb * (num_col_blks));
   2545                             ps_ed_ctb_l1 = ps_ed_ctxt->ps_ed_ctb_l1 + (row_block_no * num_col_blks);
   2546 
   2547                             ps_ed_ctxt->i4_quality_preset = ps_ctxt->i4_quality_preset;
   2548                             num_4x4_blks_ctb_y = block_ht >> 2;
   2549                             num_4x4_blks_last_ctb_x = block_wd >> 2;
   2550 
   2551                             if(row_block_no == num_row_blocks - 1)
   2552                             {
   2553                                 if(i4_layer_ht % block_ht)
   2554                                 {
   2555                                     num_4x4_blks_ctb_y = ((i4_layer_ht % block_ht) + 3) >> 2;
   2556                                 }
   2557                             }
   2558 
   2559                             if(i4_layer_wd % block_wd)
   2560                             {
   2561                                 num_4x4_blks_last_ctb_x = ((i4_layer_wd % block_wd) + 3) >> 2;
   2562                             }
   2563                         }
   2564 
   2565                         /* call the row level processing function */
   2566                         ihevce_decomp_pre_intra_process_row(
   2567                             pu1_src,
   2568                             src_stride,
   2569                             pu1_dst,
   2570                             dst_stride,
   2571                             i4_layer_wd,
   2572                             i4_layer_ht,
   2573                             pu1_wkg_mem,
   2574                             ht_offset,
   2575                             block_ht,
   2576                             block_wd,
   2577                             i4_cu_aligned_pic_wd,
   2578                             i4_cu_aligned_pic_ht,
   2579                             num_col_blks,
   2580                             i4_layer_no,
   2581                             ps_ed_ctxt,
   2582                             ps_ed,
   2583                             ps_ed_ctb_l1,
   2584                             ps_ctxt->ps_layer0_cur_satd,
   2585                             ps_ctxt->ps_layer0_cur_mean,
   2586                             num_4x4_blks_ctb_y,
   2587                             num_4x4_blks_last_ctb_x,
   2588                             skip_decomp,
   2589                             skip_pre_intra,
   2590                             row_block_no,
   2591                             0,
   2592                             NULL,
   2593                             &ps_ctxt->s_ipe_optimised_function_list,
   2594                             &ps_ctxt->s_cmn_opt_func);
   2595                     }
   2596                 }
   2597                 if(1 == i4_layer_no)
   2598                 {
   2599                     ps_multi_thrd->aai4_l1_pre_intra_done[i4_ping_pong][row_block_no] = 1;
   2600                 }
   2601             }
   2602             for(idx = 0; idx < MAX_NUM_CTB_ROWS_FRM; idx++)
   2603             {
   2604                 ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx] = -1;
   2605             }
   2606             ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed = 0;
   2607         }
   2608 
   2609 #if DISABLE_L2_IPE_IN_PB_L1_IN_B
   2610         if((IHEVCE_QUALITY_P6 == ps_ctxt->i4_quality_preset) &&
   2611            (((i4_layer_no == 2) && (ps_lap_out_prms->i4_pic_type == ISLICE)) ||
   2612             ((i4_layer_no == 1) && (ps_lap_out_prms->i4_temporal_lyr_id > TEMPORAL_LAYER_DISABLE))))
   2613         {
   2614             WORD32 i4_num_rows = ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed;
   2615             if(1 == i4_layer_no)
   2616             {
   2617                 for(idx = 0; idx < i4_num_rows; idx++)
   2618                 {
   2619                     row_block_no = ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx];
   2620 
   2621                     {
   2622                         ps_multi_thrd->aai4_l1_pre_intra_done[i4_ping_pong][row_block_no] = 1;
   2623                     }
   2624                 }
   2625             }
   2626             for(idx = 0; idx < MAX_NUM_CTB_ROWS_FRM; idx++)
   2627             {
   2628                 ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx] = -1;
   2629             }
   2630             ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed = 0;
   2631         }
   2632 #else
   2633         if((i4_layer_no != 0) && ((IHEVCE_QUALITY_P6 == ps_ctxt->i4_quality_preset) &&
   2634                                   (ps_lap_out_prms->i4_temporal_lyr_id > TEMPORAL_LAYER_DISABLE)))
   2635         {
   2636             WORD32 i4_num_rows = ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed;
   2637             for(idx = 0; idx < i4_num_rows; idx++)
   2638             {
   2639                 row_block_no = ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx];
   2640                 if(1 == i4_layer_no)
   2641                 {
   2642                     ps_multi_thrd->aai4_l1_pre_intra_done[i4_ping_pong][row_block_no] = 1;
   2643                 }
   2644             }
   2645             for(idx = 0; idx < MAX_NUM_CTB_ROWS_FRM; idx++)
   2646             {
   2647                 ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx] = -1;
   2648             }
   2649             ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed = 0;
   2650         }
   2651 #endif
   2652     }
   2653 }
   2654 
   2655 /*!
   2656 ************************************************************************
   2657 * \brief
   2658 *    return number of records used by decomp pre intra
   2659 *
   2660 ************************************************************************
   2661 */
   2662 WORD32 ihevce_decomp_pre_intra_get_num_mem_recs(void)
   2663 {
   2664     return (NUM_DECOMP_PRE_INTRA_MEM_RECS);
   2665 }
   2666 
   2667 /*!
   2668 ************************************************************************
   2669 * @brief
   2670 *    return each record attributes of  decomp pre intra
   2671 ************************************************************************
   2672 */
   2673 WORD32 ihevce_decomp_pre_intra_get_mem_recs(
   2674     iv_mem_rec_t *ps_mem_tab, WORD32 i4_num_proc_thrds, WORD32 i4_mem_space)
   2675 {
   2676     /* memories should be requested assuming worst case requirememnts */
   2677 
   2678     /* Module context structure */
   2679     ps_mem_tab[DECOMP_PRE_INTRA_CTXT].i4_mem_size = sizeof(ihevce_decomp_pre_intra_master_ctxt_t);
   2680     ps_mem_tab[DECOMP_PRE_INTRA_CTXT].e_mem_type = (IV_MEM_TYPE_T)i4_mem_space;
   2681     ps_mem_tab[DECOMP_PRE_INTRA_CTXT].i4_mem_alignment = 8;
   2682 
   2683     /* Thread context structure */
   2684     ps_mem_tab[DECOMP_PRE_INTRA_THRDS_CTXT].i4_mem_size =
   2685         i4_num_proc_thrds * sizeof(ihevce_decomp_pre_intra_ctxt_t);
   2686     ps_mem_tab[DECOMP_PRE_INTRA_THRDS_CTXT].e_mem_type = (IV_MEM_TYPE_T)i4_mem_space;
   2687     ps_mem_tab[DECOMP_PRE_INTRA_THRDS_CTXT].i4_mem_alignment = 8;
   2688 
   2689     /* early decision context structure */
   2690     ps_mem_tab[DECOMP_PRE_INTRA_ED_CTXT].i4_mem_size = i4_num_proc_thrds * sizeof(ihevce_ed_ctxt_t);
   2691     ps_mem_tab[DECOMP_PRE_INTRA_ED_CTXT].e_mem_type = (IV_MEM_TYPE_T)i4_mem_space;
   2692     ps_mem_tab[DECOMP_PRE_INTRA_ED_CTXT].i4_mem_alignment = 8;
   2693 
   2694     return (NUM_DECOMP_PRE_INTRA_MEM_RECS);
   2695 }
   2696 
   2697 /*!
   2698 ************************************************************************
   2699 * @brief
   2700 *    Init decomp pre intra context
   2701 ************************************************************************
   2702 */
   2703 void *ihevce_decomp_pre_intra_init(
   2704     iv_mem_rec_t *ps_mem_tab,
   2705     ihevce_static_cfg_params_t *ps_init_prms,
   2706     WORD32 i4_num_proc_thrds,
   2707     func_selector_t *ps_func_selector,
   2708     WORD32 i4_resolution_id,
   2709     UWORD8 u1_is_popcnt_available)
   2710 {
   2711     ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt;
   2712     ihevce_decomp_pre_intra_ctxt_t *ps_ctxt;
   2713     WORD32 thread_no;
   2714     WORD32 n_tot_layers;
   2715     WORD32 count;
   2716     WORD32 a_wd[MAX_NUM_HME_LAYERS], a_ht[MAX_NUM_HME_LAYERS], layer_no;
   2717     WORD32 a_disp_wd[MAX_NUM_LAYERS], a_disp_ht[MAX_NUM_LAYERS];
   2718     ihevce_ed_ctxt_t *ps_ed_ctxt;
   2719     WORD32 min_cu_size;
   2720 
   2721     /* get the min cu size from config params */
   2722     min_cu_size = ps_init_prms->s_config_prms.i4_min_log2_cu_size;
   2723 
   2724     min_cu_size = 1 << min_cu_size;
   2725 
   2726     /* Get the height and width of each layer */
   2727     *a_wd = ps_init_prms->s_tgt_lyr_prms.as_tgt_params[i4_resolution_id].i4_width +
   2728             SET_CTB_ALIGN(
   2729                 ps_init_prms->s_tgt_lyr_prms.as_tgt_params[i4_resolution_id].i4_width, min_cu_size);
   2730     *a_ht =
   2731         ps_init_prms->s_tgt_lyr_prms.as_tgt_params[i4_resolution_id].i4_height +
   2732         SET_CTB_ALIGN(
   2733             ps_init_prms->s_tgt_lyr_prms.as_tgt_params[i4_resolution_id].i4_height, min_cu_size);
   2734 
   2735     n_tot_layers = hme_derive_num_layers(1, a_wd, a_ht, a_disp_wd, a_disp_ht);
   2736 
   2737     /* Decomp state structure */
   2738     ps_master_ctxt =
   2739         (ihevce_decomp_pre_intra_master_ctxt_t *)ps_mem_tab[DECOMP_PRE_INTRA_CTXT].pv_base;
   2740     ps_master_ctxt->i4_num_proc_thrds = i4_num_proc_thrds;
   2741 
   2742     ps_ctxt = (ihevce_decomp_pre_intra_ctxt_t *)ps_mem_tab[DECOMP_PRE_INTRA_THRDS_CTXT].pv_base;
   2743     ps_ed_ctxt = (ihevce_ed_ctxt_t *)ps_mem_tab[DECOMP_PRE_INTRA_ED_CTXT].pv_base;
   2744 
   2745     for(thread_no = 0; thread_no < ps_master_ctxt->i4_num_proc_thrds; thread_no++)
   2746     {
   2747         ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thread_no] = ps_ctxt;
   2748 
   2749         ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thread_no]->i4_num_layers = n_tot_layers;
   2750 
   2751         ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thread_no]->pu1_wkg_mem =
   2752             &ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thread_no]->au1_wkg_mem[0];
   2753 
   2754         ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thread_no]->ps_ed_ctxt = ps_ed_ctxt;
   2755 
   2756         for(layer_no = 0; layer_no < n_tot_layers; layer_no++)
   2757         {
   2758             WORD32 max_ctb_size;
   2759             WORD32 decomp_blk_ht, decomp_blk_wd;
   2760 
   2761             ps_ctxt->as_layers[layer_no].i4_actual_wd = a_wd[layer_no];
   2762             ps_ctxt->as_layers[layer_no].i4_actual_ht = a_ht[layer_no];
   2763             ps_ctxt->as_layers[layer_no].i4_inp_stride = 0;
   2764             ps_ctxt->as_layers[layer_no].pu1_inp = NULL;
   2765             ps_ctxt->as_layers[layer_no].i4_num_rows_processed = 0;
   2766 
   2767             for(count = 0; count < MAX_NUM_CTB_ROWS_FRM; count++)
   2768             {
   2769                 ps_ctxt->as_layers[layer_no].ai4_curr_row_no[count] = -1;
   2770             }
   2771             if(0 == layer_no)
   2772             {
   2773                 ps_ctxt->as_layers[layer_no].i4_padded_ht = a_ht[layer_no];
   2774                 ps_ctxt->as_layers[layer_no].i4_padded_wd = a_wd[layer_no];
   2775             }
   2776             else
   2777             {
   2778                 ps_ctxt->as_layers[layer_no].i4_padded_ht = a_ht[layer_no] + 32 + 4;
   2779                 ps_ctxt->as_layers[layer_no].i4_padded_wd = a_wd[layer_no] + 32 + 4;
   2780             }
   2781 
   2782             /** If CTB size= 64.decomp_blk_wd = 64 for L0, 32 for L1 , 16 for L2, 8 for L3 */
   2783             max_ctb_size = 1 << ps_init_prms->s_config_prms.i4_max_log2_cu_size;
   2784 
   2785             ps_ctxt->as_layers[layer_no].i4_decomp_blk_ht = max_ctb_size >> layer_no;
   2786             ps_ctxt->as_layers[layer_no].i4_decomp_blk_wd = max_ctb_size >> layer_no;
   2787 
   2788             decomp_blk_ht = ps_ctxt->as_layers[layer_no].i4_decomp_blk_ht;
   2789             decomp_blk_wd = ps_ctxt->as_layers[layer_no].i4_decomp_blk_wd;
   2790 
   2791             ps_ctxt->as_layers[layer_no].i4_num_row_blks =
   2792                 ((a_ht[layer_no] + (decomp_blk_ht - 1)) / decomp_blk_ht);
   2793 
   2794             ps_ctxt->as_layers[layer_no].i4_num_col_blks =
   2795                 ((a_wd[layer_no] + (decomp_blk_wd - 1)) / decomp_blk_wd);
   2796         }
   2797         ps_ed_ctxt->ps_func_selector = ps_func_selector;
   2798 
   2799         ps_ctxt->i4_quality_preset =
   2800             ps_init_prms->s_tgt_lyr_prms.as_tgt_params[i4_resolution_id].i4_quality_preset;
   2801 
   2802         if(ps_ctxt->i4_quality_preset == IHEVCE_QUALITY_P7)
   2803         {
   2804             ps_ctxt->i4_quality_preset = IHEVCE_QUALITY_P6;
   2805         }
   2806 
   2807         if(ps_init_prms->s_coding_tools_prms.i4_vqet &
   2808            (1 << BITPOS_IN_VQ_TOGGLE_FOR_CONTROL_TOGGLER))
   2809         {
   2810             if(ps_init_prms->s_coding_tools_prms.i4_vqet &
   2811                (1 << BITPOS_IN_VQ_TOGGLE_FOR_ENABLING_NOISE_PRESERVATION))
   2812             {
   2813                 ps_ctxt->i4_enable_noise_detection = 1;
   2814             }
   2815             else
   2816             {
   2817                 ps_ctxt->i4_enable_noise_detection = 0;
   2818             }
   2819         }
   2820         else
   2821         {
   2822             ps_ctxt->i4_enable_noise_detection = 0;
   2823         }
   2824 
   2825         ihevce_cmn_utils_instr_set_router(
   2826             &ps_ctxt->s_cmn_opt_func, u1_is_popcnt_available, ps_init_prms->e_arch_type);
   2827 
   2828         ihevce_ipe_instr_set_router(
   2829             &ps_ctxt->s_ipe_optimised_function_list, ps_init_prms->e_arch_type);
   2830 
   2831         ps_ctxt++;
   2832         ps_ed_ctxt++;
   2833     }
   2834     /* return the handle to caller */
   2835     return ((void *)ps_master_ctxt);
   2836 }
   2837 
   2838 /*!
   2839 ******************************************************************************
   2840 * \if Function name : ihevce_decomp_pre_intra_frame_init \endif
   2841 *
   2842 * \brief
   2843 *    Frame Initialization for Decomp intra pre analysis.
   2844 *
   2845 * \param[in] pv_ctxt : pointer to module ctxt
   2846 * \param[in] ppu1_decomp_lyr_bufs : pointer to array of layer buffer pointers
   2847 * \param[in] pi4_lyr_buf_stride : pointer to array of layer buffer strides
   2848 *
   2849 * \return
   2850 *    None
   2851 *
   2852 * \author
   2853 *  Ittiam
   2854 *
   2855 *****************************************************************************
   2856 */
   2857 void ihevce_decomp_pre_intra_frame_init(
   2858     void *pv_ctxt,
   2859     UWORD8 **ppu1_decomp_lyr_bufs,
   2860     WORD32 *pi4_lyr_buf_stride,
   2861     ihevce_ed_blk_t *ps_layer1_buf,
   2862     ihevce_ed_blk_t *ps_layer2_buf,
   2863     ihevce_ed_ctb_l1_t *ps_ed_ctb_l1,
   2864     WORD32 i4_ol_sad_lambda_qf,
   2865     WORD32 i4_slice_type,
   2866     ctb_analyse_t *ps_ctb_analyse)
   2867 {
   2868     ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt;
   2869     ihevce_decomp_pre_intra_ctxt_t *ps_ctxt;
   2870     WORD32 thread_no;
   2871 
   2872     /* Decomp state structure */
   2873     ps_master_ctxt = (ihevce_decomp_pre_intra_master_ctxt_t *)pv_ctxt;
   2874 
   2875     for(thread_no = 0; thread_no < ps_master_ctxt->i4_num_proc_thrds; thread_no++)
   2876     {
   2877         WORD32 layer_no;
   2878 
   2879         ps_ctxt = ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thread_no];
   2880 
   2881         /* L0 layer (actual input) is registered in process call */
   2882         for(layer_no = 1; layer_no < ps_ctxt->i4_num_layers; layer_no++)
   2883         {
   2884             ps_ctxt->as_layers[layer_no].i4_inp_stride = pi4_lyr_buf_stride[layer_no - 1];
   2885             ps_ctxt->as_layers[layer_no].pu1_inp = ppu1_decomp_lyr_bufs[layer_no - 1];
   2886 
   2887             /*Populating the buffer pointers for layer1 and layer2 buffers to store the
   2888             structure for each 4x4 block after pre intra analysis on their respective laeyrs*/
   2889 
   2890             if(layer_no == 1)
   2891             {
   2892                 WORD32 sad_lambda_l1 = (3 * i4_ol_sad_lambda_qf >> 2);
   2893                 WORD32 temp = 1 << LAMBDA_Q_SHIFT;
   2894                 WORD32 lambda = ((temp) > sad_lambda_l1) ? temp : sad_lambda_l1;
   2895                 //ps_ctxt->as_layers[1].s_early_decision.ps_ed_pic = ps_layer1_buf;
   2896                 //ps_ctxt->as_layers[1].s_early_decision.ps_ed = ps_layer1_buf;
   2897                 ps_ctxt->ps_layer1_buf = ps_layer1_buf;
   2898                 ps_ctxt->ps_ed_ctb_l1 = ps_ed_ctb_l1;
   2899                 ps_ctxt->ai4_lambda[layer_no] = lambda;
   2900                 ps_ctxt->i4_codec_level = 0;
   2901                 ps_ctxt->i4_slice_type = i4_slice_type;
   2902             }
   2903             else if(layer_no == 2)
   2904             {
   2905                 WORD32 sad_lambda_l2 = i4_ol_sad_lambda_qf >> 1;
   2906                 WORD32 temp = 1 << LAMBDA_Q_SHIFT;
   2907                 WORD32 lambda = ((temp) > sad_lambda_l2) ? temp : sad_lambda_l2;
   2908 
   2909                 //ps_ctxt->as_layers[2].s_early_decision.ps_ed_pic = ps_layer2_buf;
   2910                 //ps_ctxt->as_layers[2].s_early_decision.ps_ed = ps_layer2_buf;
   2911                 ps_ctxt->ps_layer2_buf = ps_layer2_buf;
   2912                 //ihevce_ed_frame_init(ps_ctxt->ps_ed_ctxt);
   2913                 ps_ctxt->ai4_lambda[layer_no] = lambda;
   2914                 ps_ctxt->i4_codec_level = 0;
   2915                 ps_ctxt->i4_slice_type = i4_slice_type;
   2916             }
   2917             else
   2918             {
   2919                 //ps_ctxt->as_layers[0].s_early_decision.ps_ed_pic = NULL;
   2920                 //ps_ctxt->as_layers[0].s_early_decision.ps_ed = NULL;
   2921                 //ps_ctxt->ps_layer1_buf = NULL;
   2922                 ps_ctxt->ai4_lambda[layer_no] = -1;
   2923                 ps_ctxt->i4_codec_level = 0;
   2924                 ps_ctxt->i4_slice_type = i4_slice_type;
   2925             }
   2926         }
   2927 
   2928         /* make the ps_ctb_analyse refernce as a part of the private context */
   2929         ps_ctxt->ps_ctb_analyse = ps_ctb_analyse;
   2930     }
   2931 }
   2932 
   2933 /**
   2934 *******************************************************************************
   2935 *
   2936 * @brief
   2937 *     Merge Sort function.
   2938 *
   2939 * @par Description:
   2940 *     This function sorts the data in the input array in ascending
   2941 *     order using merge sort algorithm. Intermediate data obtained in
   2942 *     merge sort are stored in output 2-D array.
   2943 *
   2944 * @param[in]
   2945 *   pi4_input_val  :   Input 1-D array
   2946 *   aai4_output_val:   Output 2-D array containing elements sorted in sets of
   2947 *                      4,16,64 etc.
   2948 *   i4_length      : length of the array
   2949 *   i4_ip_sort_level: Input sort level. Specifies the level upto which array is sorted.
   2950 *                     It should be 1 if the array is unsorted. Should be 4 if array is sorted
   2951 *                     in sets of 4.
   2952 *   i4_op_sort_level: Output sort level. Specify the level upto which sorting is required.
   2953 *                     If it is given as length of array it sorts for whole array.
   2954 *
   2955 * @returns
   2956 *
   2957 * @remarks
   2958 *  None
   2959 *
   2960 *******************************************************************************
   2961 */
   2962 void ihevce_merge_sort(
   2963     WORD32 *pi4_input_val,
   2964     WORD32 aai4_output_val[][64],
   2965     WORD32 i4_length,
   2966     WORD32 i4_ip_sort_level,
   2967     WORD32 i4_op_sort_level)
   2968 {
   2969     WORD32 i, j, k;
   2970     WORD32 count, level;
   2971     WORD32 temp[64];
   2972     WORD32 *pi4_temp_buf_cpy;
   2973     WORD32 *pi4_temp = &temp[0];
   2974     WORD32 calc_level;
   2975 
   2976     pi4_temp_buf_cpy = pi4_temp;
   2977 
   2978     GETRANGE(calc_level, i4_op_sort_level / i4_ip_sort_level);
   2979 
   2980     calc_level = calc_level - 1;
   2981 
   2982     /*** This function is written under the assumption that we need only intermediate values of
   2983     sort in the range of 4,16,64 etc. ***/
   2984     ASSERT((calc_level % 2) == 0);
   2985 
   2986     /** One iteration of this for loop does 1 sets of sort and produces one intermediate value in 2 iterations **/
   2987     for(level = 0; level < calc_level; level++)
   2988     {
   2989         /** Merges adjacent sets of elements based on current sort level **/
   2990         for(count = 0; count < i4_length; (count = count + (i4_ip_sort_level * 2)))
   2991         {
   2992             i = 0;
   2993             j = 0;
   2994             if(pi4_input_val[i4_ip_sort_level - 1] < pi4_input_val[i4_ip_sort_level])
   2995             {
   2996                 /*** Condition for early exit ***/
   2997                 memcpy(&pi4_temp[0], pi4_input_val, sizeof(WORD32) * i4_ip_sort_level * 2);
   2998             }
   2999             else
   3000             {
   3001                 for(k = 0; k < (i4_ip_sort_level * 2); k++)
   3002                 {
   3003                     if((i < i4_ip_sort_level) && (j < i4_ip_sort_level))
   3004                     {
   3005                         if(pi4_input_val[i] > pi4_input_val[j + i4_ip_sort_level])
   3006                         {
   3007                             /** copy to output array **/
   3008                             pi4_temp[k] = pi4_input_val[j + i4_ip_sort_level];
   3009                             j++;
   3010                         }
   3011                         else
   3012                         {
   3013                             /** copy to output array **/
   3014                             pi4_temp[k] = pi4_input_val[i];
   3015                             i++;
   3016                         }
   3017                     }
   3018                     else if(i == i4_ip_sort_level)
   3019                     {
   3020                         /** copy the remaining data to output array **/
   3021                         pi4_temp[k] = pi4_input_val[j + i4_ip_sort_level];
   3022                         j++;
   3023                     }
   3024                     else
   3025                     {
   3026                         /** copy the remaining data to output array **/
   3027                         pi4_temp[k] = pi4_input_val[i];
   3028                         i++;
   3029                     }
   3030                 }
   3031             }
   3032             pi4_input_val += (i4_ip_sort_level * 2);
   3033             pi4_temp += (i4_ip_sort_level * 2);
   3034         }
   3035         pi4_input_val = pi4_temp - i4_length;
   3036 
   3037         if(level % 2)
   3038         {
   3039             /** Assign a temp address for storing next sort level output as we will not need this data as output **/
   3040             pi4_temp = pi4_temp_buf_cpy;
   3041         }
   3042         else
   3043         {
   3044             /** Assign address for storing the intermediate data into output 2-D array **/
   3045             pi4_temp = aai4_output_val[level / 2];
   3046         }
   3047         i4_ip_sort_level *= 2;
   3048     }
   3049 }
   3050 
   3051 void ihevce_decomp_pre_intra_curr_frame_pre_intra_deinit(
   3052     void *pv_pre_intra_ctxt,
   3053     pre_enc_me_ctxt_t *ps_curr_out,
   3054     WORD32 i4_is_last_thread,
   3055     frm_ctb_ctxt_t *ps_frm_ctb_prms,
   3056     WORD32 i4_temporal_lyr_id,
   3057     WORD32 i4_enable_noise_detection)
   3058 {
   3059     ihevce_decomp_pre_intra_master_ctxt_t *ps_pre_intra_master_ctxt =
   3060         (ihevce_decomp_pre_intra_master_ctxt_t *)pv_pre_intra_ctxt;
   3061     ihevce_decomp_pre_intra_ctxt_t *ps_pre_intra_ctxt =
   3062         ps_pre_intra_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[0];
   3063 
   3064     WORD32 i4_k;
   3065     WORD32 ctb_ctr, vert_ctr;
   3066 
   3067     WORD32 ai4_curr_frame_8x8_sum_act[2] = { 0, 0 };
   3068     LWORD64 ai8_curr_frame_8x8_sum_act_sqr[2] = { 0, 0 };
   3069     WORD32 ai4_curr_frame_8x8_sum_blks[2] = { 0, 0 };
   3070     ULWORD64 u8_curr_frame_8x8_sum_act_sqr = 0;
   3071 
   3072     LWORD64 ai8_curr_frame_16x16_sum_act_sqr[3] = { 0, 0, 0 };
   3073     WORD32 ai4_curr_frame_16x16_sum_act[3] = { 0, 0, 0 };
   3074     WORD32 ai4_curr_frame_16x16_sum_blks[3] = { 0, 0, 0 };
   3075 
   3076     LWORD64 ai8_curr_frame_32x32_sum_act_sqr[3] = { 0, 0, 0 };
   3077     WORD32 ai4_curr_frame_32x32_sum_act[3] = { 0, 0, 0 };
   3078     WORD32 ai4_curr_frame_32x32_sum_blks[3] = { 0, 0, 0 };
   3079 
   3080     (void)i4_temporal_lyr_id;
   3081     (void)i4_enable_noise_detection;
   3082 
   3083     if(i4_is_last_thread == 1)
   3084     {
   3085         WORD32 i4_slice_type = ps_curr_out->s_slice_hdr.i1_slice_type;
   3086         //ps_pre_intra_ctxt->i4_slice_type;
   3087         WORD32 ctb_ctr_blks = ps_pre_intra_ctxt->as_layers[1].i4_num_col_blks;
   3088         WORD32 vert_ctr_blks = ps_pre_intra_ctxt->as_layers[1].i4_num_row_blks;
   3089         ihevce_ed_ctb_l1_t *ps_ed_ctb_pic_l1 = ps_curr_out->ps_ed_ctb_l1;
   3090         WORD32 block_wd = ps_pre_intra_ctxt->as_layers[1].i4_decomp_blk_wd;
   3091         WORD32 inc_ctb = ((block_wd >> 2) * (block_wd >> 2));
   3092         ihevce_ed_blk_t *ps_ed_blk_l1 = ps_curr_out->ps_layer1_buf;
   3093         ihevce_ed_blk_t *ps_ed;
   3094         WORD32 i, j;
   3095         WORD32 i4_avg_noise_satd;
   3096         WORD32 k;
   3097         WORD32 i4_layer_wd = ps_pre_intra_ctxt->as_layers[1].i4_actual_wd;
   3098         WORD32 i4_layer_ht = ps_pre_intra_ctxt->as_layers[1].i4_actual_ht;
   3099 
   3100         /*Calculate min noise threshold */
   3101         /*Min noise threshold is calculted by taking average of lowest 1% satd val in the complete 4x4 frame satds*/
   3102         //ihevce_ed_ctxt_t *ps_ed_ctxt =  ps_pre_intra_ctxt->ps_ed_ctxt;
   3103         WORD32 i4_min_blk = ((MIN_BLKS * (i4_layer_wd >> 1) * (i4_layer_ht >> 1)) / 100);
   3104         WORD32 ai4_noise_thr_hstrgm[MAX_SATD_THRSHLD];
   3105         memset(&ai4_noise_thr_hstrgm[0], 0, (sizeof(WORD32) * MAX_SATD_THRSHLD));
   3106         ASSERT(!(USE_CUR_L0_SATD && USE_CUR_SATD));
   3107         for(vert_ctr = 0; vert_ctr < vert_ctr_blks; vert_ctr++)
   3108         {
   3109             ps_ed = ps_ed_blk_l1 + (vert_ctr * inc_ctb * (ctb_ctr_blks));
   3110             for(ctb_ctr = 0; ctb_ctr < ctb_ctr_blks; ctb_ctr++)
   3111             {
   3112                 /* Populate avg satd to calculate MI and activity factors */
   3113                 for(i = 0; i < 4; i++)
   3114                 {
   3115                     for(j = 0; j < 4; j++)
   3116                     {
   3117                         for(k = 0; k < 4; k++)
   3118                         {
   3119                             if(-1 != (ps_ed + j * 4 + i * 16 + k)->i4_4x4_satd)
   3120                             {
   3121                                 WORD32 i4_satd_lim;
   3122                                 i4_satd_lim = (ps_ed + j * 4 + i * 16 + k)->i4_4x4_satd;
   3123                                 /* Histogram creation for Noise threshold */
   3124                                 if(i4_satd_lim < MAX_SATD_THRSHLD)
   3125                                 {
   3126                                     ai4_noise_thr_hstrgm[i4_satd_lim]++;
   3127                                 }
   3128                             }
   3129                         }
   3130                     }
   3131                 }
   3132                 ps_ed += inc_ctb;
   3133             }
   3134         }
   3135         {
   3136             WORD32 i4_total_blks = 0;
   3137             LWORD64 i8_acc_satd = 0;
   3138             for(i = MIN_SATD_THRSHLD; i < MAX_SATD_THRSHLD; i++)
   3139             {
   3140                 i4_total_blks += ai4_noise_thr_hstrgm[i];
   3141                 i8_acc_satd += (i * ai4_noise_thr_hstrgm[i]);
   3142 
   3143                 if(i4_total_blks > i4_min_blk)
   3144                     break;
   3145             }
   3146             if(i4_total_blks < i4_min_blk)
   3147             {
   3148                 i4_avg_noise_satd = SATD_NOISE_FLOOR_THRESHOLD;
   3149             }
   3150             else
   3151             {
   3152                 i4_avg_noise_satd = (WORD32)(i8_acc_satd + (i4_total_blks >> 1)) / i4_total_blks;
   3153             }
   3154         }
   3155 
   3156         ps_curr_out->i4_avg_noise_thrshld_4x4 = i4_avg_noise_satd;
   3157 
   3158         for(vert_ctr = 0; vert_ctr < vert_ctr_blks; vert_ctr++)
   3159         {
   3160             ihevce_ed_ctb_l1_t *ps_ed_ctb_row_l1 =
   3161                 ps_ed_ctb_pic_l1 + vert_ctr * ps_frm_ctb_prms->i4_num_ctbs_horz;
   3162             ps_ed = ps_ed_blk_l1 + (vert_ctr * inc_ctb * (ctb_ctr_blks));
   3163 
   3164             for(ctb_ctr = 0; ctb_ctr < ctb_ctr_blks; ctb_ctr++)
   3165             {
   3166                 /*sum of (sum of L1_4x4 @ L1_8x8) @ L1_16x16 level */
   3167                 WORD32 ai4_sum_sum_4x4_satd_16x16[4] = { 0, 0, 0, 0 };
   3168                 /*min of (sum of L1_4x4 @ L1_8x8) @ L1_16x16 level */
   3169                 WORD32 ai4_min_sum_4x4_satd_16x16[4] = {
   3170                     MAX_32BIT_VAL, MAX_32BIT_VAL, MAX_32BIT_VAL, MAX_32BIT_VAL
   3171                 };
   3172                 /*min of (min of L1_4x4 @ L1_8x8) @ L1_16x16 level */
   3173                 WORD32 ai4_min_min_4x4_satd_16x16[4] = {
   3174                     MAX_32BIT_VAL, MAX_32BIT_VAL, MAX_32BIT_VAL, MAX_32BIT_VAL
   3175                 };
   3176                 WORD32 i4_sum_4x4_satd, i4_min_4x4_satd;
   3177                 ihevce_ed_ctb_l1_t *ps_ed_ctb_curr_l1 = ps_ed_ctb_row_l1 + ctb_ctr;
   3178 
   3179                 WORD32 is_min_block_uncompensated_in_l32x32 = 0;
   3180 
   3181                 /*min of L1_4x4 @ L1_8x8*/
   3182                 WORD32 ai4_min_satd_ctb[MAX_CTB_SIZE];
   3183                 /*** This 2-D array will contain 4x4 satds sorted in ascending order in sets of 4,16,64 ***/
   3184                 /*** For example : '5 10 2 7 6 12 3 1' array input will return '2 5 7 10 1 3 6 12' if sorted in sets of 4 ***/
   3185                 WORD32 aai4_min_4_16_64_satd[3][MAX_CTB_SIZE];
   3186 
   3187                 /*sum of L1_4x4 @ L1_8x8*/
   3188                 WORD32 ai4_sum_satd_ctb[MAX_CTB_SIZE >> 2];
   3189                 /*** This 2-D array will contain 4x4 satds sorted in ascending order in sets of 4,16***/
   3190                 WORD32 aai4_sum_4_16_satd_ctb[2][MAX_CTB_SIZE];
   3191 
   3192                 /* sum of (sum of L1_4x4 @ L1_8x8) @ L1_16x16 */
   3193                 WORD32 ai4_sum_sum_satd_ctb[(MAX_CTB_SIZE >> 2) >> 2];
   3194                 /*L1_32x32 = L0_64x64
   3195                 so in L1_32x32 there are 64 L1_4x4blocks*/
   3196                 for(i = 0; i < MAX_CTB_SIZE; i++)
   3197                 {
   3198                     ai4_min_satd_ctb[i] = -1;
   3199                 }
   3200                 for(j = 0; j < 3; j++)
   3201                 {
   3202                     for(i = 0; i < MAX_CTB_SIZE; i++)
   3203                     {
   3204                         aai4_min_4_16_64_satd[j][i] = -1;
   3205                     }
   3206                 }
   3207                 /*L1_32x32 = L0_64x64
   3208                 so in L1_32x32 there are 16 L1_8x8blocks*/
   3209                 for(i = 0; i < (MAX_CTB_SIZE >> 2); i++)
   3210                 {
   3211                     ai4_sum_satd_ctb[i] = -1;
   3212                 }
   3213                 for(j = 0; j < 2; j++)
   3214                 {
   3215                     for(i = 0; i < (MAX_CTB_SIZE >> 2); i++)
   3216                     {
   3217                         aai4_sum_4_16_satd_ctb[j][i] = -1;
   3218                     }
   3219                 }
   3220                 /*L1_32x32 = L0_64x64
   3221                 so in L1_32x32 there are 16 L1_16x16blocks*/
   3222                 for(i = 0; i < ((MAX_CTB_SIZE >> 2) >> 2); i++)
   3223                 {
   3224                     ai4_sum_sum_satd_ctb[i] = 0;
   3225                 }
   3226                 /*Populate sum min 4x4 activty */
   3227                 /*loop for L1_32x32 block*/
   3228                 for(i = 0; i < 4; i++)
   3229                 {
   3230                     /*loop for L1_16x16 block*/
   3231                     for(j = 0; j < 4; j++)
   3232                     {
   3233                         WORD32 i4_sum_satd_dumyy = 0;
   3234                         WORD32 i4_num_satd_blks = 0;
   3235                         /* loop for L1_8x8 block*/
   3236                         for(k = 0; k < 4; k++)
   3237                         {
   3238                             WORD32 i4_satd_lim;
   3239                             i4_satd_lim = (ps_ed + j * 4 + i * 16 + k)->i4_4x4_satd;
   3240 
   3241                             /*complete ctb will not have i4_4x4_satd = -1*/
   3242                             if(-1 != i4_satd_lim)
   3243                             {
   3244 #if SUB_NOISE_THRSHLD
   3245                                 i4_satd_lim = i4_satd_lim - i4_avg_noise_satd;
   3246                                 if(i4_satd_lim < 0)
   3247                                 {
   3248                                     i4_satd_lim = 0;
   3249                                 }
   3250 #else
   3251                                 if(i4_satd_lim < i4_avg_noise_satd)
   3252                                 {
   3253                                     i4_satd_lim = i4_avg_noise_satd;
   3254                                 }
   3255 #endif
   3256                                 i4_num_satd_blks++;
   3257                                 /*populate 4x4 data to calculate modulation index */
   3258                                 (ps_ed + j * 4 + i * 16 + k)->i4_4x4_satd = i4_satd_lim;
   3259 
   3260                                 i4_sum_satd_dumyy += i4_satd_lim;
   3261                                 ai4_min_satd_ctb[j * 4 + i * 16 + k] = i4_satd_lim;
   3262                             }
   3263                         }
   3264                         if(i4_num_satd_blks != 0)
   3265                         {
   3266                             /*make the sum of satd always for 4 blocks even it is incomplete ctb */
   3267                             i4_sum_satd_dumyy = i4_sum_satd_dumyy * 4 / i4_num_satd_blks;
   3268                         }
   3269                         else
   3270                         {
   3271                             i4_sum_satd_dumyy = -1;
   3272                         }
   3273                         /*sum of L1_4x4 @ L1_8x8block level*/
   3274                         ai4_sum_satd_ctb[j + i * 4] = i4_sum_satd_dumyy;
   3275                         /*sum of L1_8x8 @ L1_16x16block level*/
   3276                         ai4_sum_sum_satd_ctb[i] += i4_sum_satd_dumyy;
   3277                         /*store sum of 4x4 @ L1_8x8block level*/
   3278                         ps_ed_ctb_curr_l1->i4_sum_4x4_satd[i * 4 + j] = i4_sum_satd_dumyy;
   3279                         /*store min of 4x4 @ L1_8x8block level */
   3280                         //ps_ed_ctb_curr_l1->i4_min_4x4_satd[i * 4 + j] = i4_min_satd_dumyy;
   3281                     }
   3282                 }
   3283                 {
   3284                     WORD32 i4_array_length = sizeof(ai4_min_satd_ctb) / sizeof(WORD32);
   3285 
   3286                     /*** This function will sort 64 elements in array ai4_min_satd_ctb in ascending order to ***/
   3287                     /*** 3 arrays in sets of 4,16,64 into the 2-D array   aai4_min_4_16_64_satd              ***/
   3288                     ihevce_merge_sort(
   3289                         &ai4_min_satd_ctb[0], aai4_min_4_16_64_satd, i4_array_length, 1, 64);
   3290 
   3291                     i4_array_length = sizeof(ai4_sum_satd_ctb) / sizeof(WORD32);
   3292 
   3293                     /*** This function will sort 16 elements in array ai4_sum_satd_ctb in ascending order to ***/
   3294                     /*** 2 arrays in sets of 4,16 into the 2-D array   aai4_sum_4_16_satd_ctb                ***/
   3295                     ihevce_merge_sort(
   3296                         &ai4_sum_satd_ctb[0], aai4_sum_4_16_satd_ctb, i4_array_length, 1, 16);
   3297                 }
   3298 
   3299                 /*Populate avg satd to calculate MI and activity factors*/
   3300                 for(i = 0; i < 4; i++)
   3301                 {
   3302                     WORD32 is_min_block_uncompensated_in_l116x16 = 0;
   3303                     ps_ed_ctb_curr_l1->i4_16x16_satd[i][0] = -1;
   3304                     ps_ed_ctb_curr_l1->i4_16x16_satd[i][1] = -1;
   3305                     ps_ed_ctb_curr_l1->i4_16x16_satd[i][2] = -1;
   3306 
   3307                     for(j = 0; j < 4; j++)
   3308                     {
   3309                         ps_ed_ctb_curr_l1->i4_min_4x4_satd[i * 4 + j] =
   3310                             aai4_min_4_16_64_satd[0][i * 16 + j * 4 + MEDIAN_CU_TU];
   3311                         /*Accumulate the sum of 8*8 activities in the current layer (16*16 CU in L0)*/
   3312                         i4_sum_4x4_satd = ps_ed_ctb_curr_l1->i4_sum_4x4_satd[i * 4 + j];
   3313                         i4_min_4x4_satd = ps_ed_ctb_curr_l1->i4_min_4x4_satd[i * 4 + j];
   3314                         ps_ed_ctb_curr_l1->i4_8x8_satd[i * 4 + j][0] = -1;
   3315                         ps_ed_ctb_curr_l1->i4_8x8_satd[i * 4 + j][1] = -1;
   3316                         ASSERT(-2 != i4_sum_4x4_satd);
   3317 
   3318                         if((-1 != i4_sum_4x4_satd))
   3319                         {
   3320                             WORD32 not_skipped = 1;
   3321 
   3322                             if((i4_slice_type == ISLICE) || (1 == not_skipped))
   3323                             {
   3324                                 is_min_block_uncompensated_in_l116x16 = 1;
   3325                                 is_min_block_uncompensated_in_l32x32 = 1;
   3326 
   3327                                 u8_curr_frame_8x8_sum_act_sqr +=
   3328                                     (i4_sum_4x4_satd * i4_sum_4x4_satd);
   3329 
   3330                                 ai4_curr_frame_8x8_sum_act[0] += i4_sum_4x4_satd;
   3331                                 ai8_curr_frame_8x8_sum_act_sqr[0] +=
   3332                                     (i4_sum_4x4_satd * i4_sum_4x4_satd);
   3333                                 ai4_curr_frame_8x8_sum_blks[0] += 1;
   3334                                 ai4_curr_frame_8x8_sum_act[1] += i4_min_4x4_satd;
   3335                                 ai8_curr_frame_8x8_sum_act_sqr[1] +=
   3336                                     (i4_min_4x4_satd * i4_min_4x4_satd);
   3337                                 ai4_curr_frame_8x8_sum_blks[1] += 1;
   3338                             }
   3339 
   3340                             ps_ed_ctb_curr_l1->i4_8x8_satd[i * 4 + j][0] = i4_sum_4x4_satd;
   3341                             ps_ed_ctb_curr_l1->i4_8x8_satd[i * 4 + j][1] = i4_min_4x4_satd;
   3342                         }
   3343                         else
   3344                         {
   3345                             ai4_sum_sum_4x4_satd_16x16[i] = MAX_32BIT_VAL;
   3346                             ai4_min_sum_4x4_satd_16x16[i] = MAX_32BIT_VAL;
   3347                             ai4_min_min_4x4_satd_16x16[i] = MAX_32BIT_VAL;
   3348                         }
   3349                     }
   3350 
   3351                     //if(1 == is_min_block_comensated_in_l116x16)
   3352                     {
   3353                         ai4_min_sum_4x4_satd_16x16[i] =
   3354                             aai4_sum_4_16_satd_ctb[0][i * 4 + MEDIAN_CU_TU];
   3355                         ai4_min_min_4x4_satd_16x16[i] =
   3356                             aai4_min_4_16_64_satd[1][i * 16 + MEDIAN_CU_TU_BY_2];
   3357 
   3358                         if(ai4_sum_sum_4x4_satd_16x16[i] != MAX_32BIT_VAL)
   3359                         {
   3360                             ai4_sum_sum_4x4_satd_16x16[i] = 0;
   3361                             for(j = 0; j < 4; j++)
   3362                             {
   3363                                 ai4_sum_sum_4x4_satd_16x16[i] +=
   3364                                     ps_ed_ctb_curr_l1->i4_sum_4x4_satd[i * 4 + j];
   3365                             }
   3366                             ps_ed_ctb_curr_l1->i4_16x16_satd[i][0] = ai4_sum_sum_4x4_satd_16x16[i];
   3367                             ps_ed_ctb_curr_l1->i4_16x16_satd[i][1] = ai4_min_sum_4x4_satd_16x16[i];
   3368                             ps_ed_ctb_curr_l1->i4_16x16_satd[i][2] = ai4_min_min_4x4_satd_16x16[i];
   3369                         }
   3370                     }
   3371                     if(1 == is_min_block_uncompensated_in_l116x16)
   3372                     {
   3373                         if(MAX_32BIT_VAL != ai4_sum_sum_4x4_satd_16x16[i])
   3374                         {
   3375                             ai4_curr_frame_16x16_sum_act[0] += ai4_sum_sum_4x4_satd_16x16[i];
   3376                             ai8_curr_frame_16x16_sum_act_sqr[0] +=
   3377                                 (ai4_sum_sum_4x4_satd_16x16[i] * ai4_sum_sum_4x4_satd_16x16[i]);
   3378                             ai4_curr_frame_16x16_sum_blks[0] += 1;
   3379                         }
   3380                         if(MAX_32BIT_VAL != ai4_min_sum_4x4_satd_16x16[i])
   3381                         {
   3382                             ai4_curr_frame_16x16_sum_act[1] += ai4_min_sum_4x4_satd_16x16[i];
   3383                             ai8_curr_frame_16x16_sum_act_sqr[1] +=
   3384                                 (ai4_min_sum_4x4_satd_16x16[i] * ai4_min_sum_4x4_satd_16x16[i]);
   3385                             ai4_curr_frame_16x16_sum_blks[1] += 1;
   3386                             ai4_curr_frame_16x16_sum_act[2] += ai4_min_min_4x4_satd_16x16[i];
   3387                             ai8_curr_frame_16x16_sum_act_sqr[2] +=
   3388                                 (ai4_min_min_4x4_satd_16x16[i] * ai4_min_min_4x4_satd_16x16[i]);
   3389                             ai4_curr_frame_16x16_sum_blks[2] += 1;
   3390                         }
   3391                     }
   3392                 }
   3393                 /*32x32*/
   3394                 {
   3395                     ps_ed_ctb_curr_l1->i4_32x32_satd[0][0] = -1;
   3396                     ps_ed_ctb_curr_l1->i4_32x32_satd[0][1] = -1;
   3397                     ps_ed_ctb_curr_l1->i4_32x32_satd[0][2] = -1;
   3398                     ps_ed_ctb_curr_l1->i4_32x32_satd[0][3] = -1;
   3399 
   3400                     if((MAX_32BIT_VAL != ai4_sum_sum_4x4_satd_16x16[0]) ||
   3401                        (MAX_32BIT_VAL != ai4_sum_sum_4x4_satd_16x16[2]) ||
   3402                        (MAX_32BIT_VAL != ai4_sum_sum_4x4_satd_16x16[1]) ||
   3403                        (MAX_32BIT_VAL != ai4_sum_sum_4x4_satd_16x16[3]))
   3404                     {
   3405                         //if(1 == is_min_block_comensated_in_l32x32)
   3406                         {
   3407                             {
   3408                                 WORD32 aai4_min_sum_sum_4x4_satd_16x16[1][64];
   3409                                 WORD32 i4_array_length =
   3410                                     sizeof(ai4_sum_sum_4x4_satd_16x16) / sizeof(WORD32);
   3411                                 /*** Sort 4 elements in ascending order ***/
   3412                                 ihevce_merge_sort(
   3413                                     &ai4_sum_sum_4x4_satd_16x16[0],
   3414                                     aai4_min_sum_sum_4x4_satd_16x16,
   3415                                     i4_array_length,
   3416                                     1,
   3417                                     4);
   3418 
   3419                                 ps_ed_ctb_curr_l1->i4_32x32_satd[0][0] =
   3420                                     aai4_min_sum_sum_4x4_satd_16x16[0][MEDIAN_CU_TU];
   3421                             }
   3422                             {
   3423                                 ps_ed_ctb_curr_l1->i4_32x32_satd[0][1] =
   3424                                     aai4_sum_4_16_satd_ctb[1][MEDIAN_CU_TU_BY_2];
   3425                             }
   3426                             {
   3427                                 ps_ed_ctb_curr_l1->i4_32x32_satd[0][2] =
   3428                                     aai4_min_4_16_64_satd[2][MEDIAN_CU_TU_BY_4];
   3429                             }
   3430 
   3431                             /*Sum of all 32x32 activity */
   3432                             ps_ed_ctb_curr_l1->i4_32x32_satd[0][3] = 0;
   3433                             for(j = 0; j < 4; j++)
   3434                             {
   3435                                 if(MAX_32BIT_VAL != ai4_sum_sum_4x4_satd_16x16[j])
   3436                                     ps_ed_ctb_curr_l1->i4_32x32_satd[0][3] +=
   3437                                         ai4_sum_sum_4x4_satd_16x16[j];
   3438                             }
   3439 
   3440                             if(1 == is_min_block_uncompensated_in_l32x32)
   3441                             {
   3442                                 /*Accumulate the sum of 32*32 activities in the current layer (64*64 CU in L0)*/
   3443                                 if(MAX_32BIT_VAL != ps_ed_ctb_curr_l1->i4_32x32_satd[0][0])
   3444                                 {
   3445                                     ai4_curr_frame_32x32_sum_act[0] +=
   3446                                         ps_ed_ctb_curr_l1->i4_32x32_satd[0][0];
   3447                                     ai8_curr_frame_32x32_sum_act_sqr[0] +=
   3448                                         (ps_ed_ctb_curr_l1->i4_32x32_satd[0][0] *
   3449                                          ps_ed_ctb_curr_l1->i4_32x32_satd[0][0]);
   3450                                     ai4_curr_frame_32x32_sum_blks[0] += 1;
   3451                                 }
   3452 
   3453                                 if(MAX_32BIT_VAL != ps_ed_ctb_curr_l1->i4_32x32_satd[0][1])
   3454                                 {
   3455                                     ai4_curr_frame_32x32_sum_act[1] +=
   3456                                         ps_ed_ctb_curr_l1->i4_32x32_satd[0][1];
   3457                                     ai8_curr_frame_32x32_sum_act_sqr[1] +=
   3458                                         (ps_ed_ctb_curr_l1->i4_32x32_satd[0][1] *
   3459                                          ps_ed_ctb_curr_l1->i4_32x32_satd[0][1]);
   3460                                     ai4_curr_frame_32x32_sum_blks[1] += 1;
   3461                                 }
   3462 
   3463                                 if(MAX_32BIT_VAL != ps_ed_ctb_curr_l1->i4_32x32_satd[0][2])
   3464                                 {
   3465                                     ai4_curr_frame_32x32_sum_act[2] +=
   3466                                         ps_ed_ctb_curr_l1->i4_32x32_satd[0][2];
   3467                                     ai8_curr_frame_32x32_sum_act_sqr[2] +=
   3468                                         (ps_ed_ctb_curr_l1->i4_32x32_satd[0][2] *
   3469                                          ps_ed_ctb_curr_l1->i4_32x32_satd[0][2]);
   3470                                     ai4_curr_frame_32x32_sum_blks[2] += 1;
   3471                                 }
   3472                             }
   3473                         }
   3474                     }
   3475                 }
   3476                 /*Increment ctb count*/
   3477                 ps_ed += inc_ctb;
   3478             }
   3479         }
   3480 
   3481         /* Spatial Variation and modulation index calculated for the frame */
   3482         {
   3483             for(i4_k = 0; i4_k < 2; i4_k++)
   3484             {
   3485                 /*8x8*/
   3486 #if USE_SQRT_AVG_OF_SATD_SQR
   3487                 ps_curr_out->i8_curr_frame_8x8_sum_act[i4_k] = ai8_curr_frame_8x8_sum_act_sqr[i4_k];
   3488 #else
   3489                 ps_curr_out->i8_curr_frame_8x8_sum_act[i4_k] = ai4_curr_frame_8x8_sum_act[i4_k];
   3490 #endif
   3491                 ps_curr_out->i4_curr_frame_8x8_sum_act_for_strength[i4_k] =
   3492                     ai4_curr_frame_8x8_sum_act[i4_k];
   3493                 ps_curr_out->i4_curr_frame_8x8_num_blks[i4_k] = ai4_curr_frame_8x8_sum_blks[i4_k];
   3494                 ps_curr_out->u8_curr_frame_8x8_sum_act_sqr = u8_curr_frame_8x8_sum_act_sqr;
   3495 
   3496                 /*16x16*/
   3497 #if USE_SQRT_AVG_OF_SATD_SQR
   3498                 ps_curr_out->i8_curr_frame_16x16_sum_act[i4_k] =
   3499                     ai8_curr_frame_16x16_sum_act_sqr[i4_k];
   3500 #else
   3501                 ps_curr_out->i8_curr_frame_16x16_sum_act[i4_k] = ai4_curr_frame_16x16_sum_act[i4_k];
   3502 #endif
   3503                 ps_curr_out->i4_curr_frame_16x16_num_blks[i4_k] =
   3504                     ai4_curr_frame_16x16_sum_blks[i4_k];
   3505 
   3506                 /*32x32*/
   3507 #if USE_SQRT_AVG_OF_SATD_SQR
   3508                 ps_curr_out->i8_curr_frame_32x32_sum_act[i4_k] =
   3509                     ai8_curr_frame_32x32_sum_act_sqr[i4_k];
   3510 #else
   3511                 ps_curr_out->i8_curr_frame_32x32_sum_act[i4_k] = ai4_curr_frame_32x32_sum_act[i4_k];
   3512 #endif
   3513                 ps_curr_out->i4_curr_frame_32x32_num_blks[i4_k] =
   3514                     ai4_curr_frame_32x32_sum_blks[i4_k];
   3515             }
   3516 
   3517             /*16x16*/
   3518 #if USE_SQRT_AVG_OF_SATD_SQR
   3519             ps_curr_out->i8_curr_frame_16x16_sum_act[2] = ai8_curr_frame_16x16_sum_act_sqr[2];
   3520 #else
   3521             ps_curr_out->i8_curr_frame_16x16_sum_act[2] = ai4_curr_frame_16x16_sum_act[2];
   3522 #endif
   3523 
   3524             ps_curr_out->i4_curr_frame_16x16_num_blks[2] = ai4_curr_frame_16x16_sum_blks[2];
   3525 
   3526             /*32x32*/
   3527 #if USE_SQRT_AVG_OF_SATD_SQR
   3528             ps_curr_out->i8_curr_frame_32x32_sum_act[2] = ai8_curr_frame_32x32_sum_act_sqr[2];
   3529 #else
   3530             ps_curr_out->i8_curr_frame_32x32_sum_act[2] = ai4_curr_frame_32x32_sum_act[2];
   3531 #endif
   3532             ps_curr_out->i4_curr_frame_32x32_num_blks[2] = ai4_curr_frame_32x32_sum_blks[2];
   3533         }
   3534     }
   3535 }
   3536 
   3537 /*!
   3538 ******************************************************************************
   3539 * \if Function name : ihevce_decomp_pre_intra_get_frame_satd \endif
   3540 *
   3541 * \brief
   3542 *    Number of memory records are returned for enc_loop module
   3543 *
   3544 *
   3545 * \return
   3546 *    None
   3547 *
   3548 * \author
   3549 *  Ittiam
   3550 *
   3551 *****************************************************************************
   3552 */
   3553 LWORD64 ihevce_decomp_pre_intra_get_frame_satd(void *pv_ctxt, WORD32 *i4_width, WORD32 *i4_hieght)
   3554 {
   3555     ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt =
   3556         (ihevce_decomp_pre_intra_master_ctxt_t *)pv_ctxt;
   3557     WORD32 i4_i;
   3558     LWORD64 i8_tot_satd = 0;
   3559 
   3560     /*accumulate SATD acorss all thread. note that every thread will enter this function,
   3561     hence it must be guranteed that all thread must have completed preintra pass by now*/
   3562     for(i4_i = 0; i4_i < ps_master_ctxt->i4_num_proc_thrds; i4_i++)
   3563     {
   3564         ihevce_decomp_pre_intra_ctxt_t *ps_ctxt =
   3565             ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[i4_i];
   3566 
   3567         //i8_tot_satd += ps_ctxt->as_layers[1].s_early_decision.i8_sum_best_satd;
   3568         i8_tot_satd += ps_ctxt->ps_ed_ctxt->i8_sum_best_satd;
   3569 
   3570         *i4_width = ps_ctxt->as_layers[1].i4_actual_wd;
   3571         *i4_hieght = ps_ctxt->as_layers[1].i4_actual_ht;
   3572     }
   3573 
   3574     return i8_tot_satd;
   3575 }
   3576 
   3577 LWORD64 ihevce_decomp_pre_intra_get_frame_satd_squared(
   3578     void *pv_ctxt, WORD32 *i4_width, WORD32 *i4_hieght)
   3579 {
   3580     ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt =
   3581         (ihevce_decomp_pre_intra_master_ctxt_t *)pv_ctxt;
   3582     WORD32 i4_i;
   3583     LWORD64 i8_tot_satd = 0;
   3584 
   3585     /*accumulate SATD acorss all thread. note that every thread will enter this function,
   3586     hence it must be guranteed that all thread must have completed preintra pass by now*/
   3587     for(i4_i = 0; i4_i < ps_master_ctxt->i4_num_proc_thrds; i4_i++)
   3588     {
   3589         ihevce_decomp_pre_intra_ctxt_t *ps_ctxt =
   3590             ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[i4_i];
   3591 
   3592         //i8_tot_satd += ps_ctxt->as_layers[1].s_early_decision.i8_sum_best_satd;
   3593         i8_tot_satd += (ps_ctxt->ps_ed_ctxt->i8_sum_sq_best_satd);
   3594 
   3595         *i4_width = ps_ctxt->as_layers[1].i4_actual_wd;
   3596         *i4_hieght = ps_ctxt->as_layers[1].i4_actual_ht;
   3597     }
   3598 
   3599     return i8_tot_satd;
   3600 }
   3601