Home | History | Annotate | Download | only in encoder
      1 /******************************************************************************
      2  *
      3  * Copyright (C) 2018 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at:
      8  *
      9  * http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  *****************************************************************************
     18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 */
     20 /**
     21 *******************************************************************************
     22 * @file
     23 *  ihevce_stasino_helpers.c
     24 *
     25 * @brief
     26 *
     27 * @author
     28 *  Ittiam
     29 *
     30 * @par List of Functions:
     31 *
     32 * @remarks
     33 *  None
     34 *
     35 *******************************************************************************
     36 */
     37 
     38 /*****************************************************************************/
     39 /* File Includes                                                             */
     40 /*****************************************************************************/
     41 /* System include files */
     42 #include <stdio.h>
     43 #include <stdlib.h>
     44 #include <assert.h>
     45 #include <string.h>
     46 
     47 /* User include files */
     48 #include "ihevc_typedefs.h"
     49 #include "itt_video_api.h"
     50 #include "ihevce_api.h"
     51 
     52 #include "rc_cntrl_param.h"
     53 #include "rc_frame_info_collector.h"
     54 #include "rc_look_ahead_params.h"
     55 
     56 #include "ihevc_defs.h"
     57 #include "ihevc_structs.h"
     58 #include "ihevc_platform_macros.h"
     59 #include "ihevc_deblk.h"
     60 #include "ihevc_itrans_recon.h"
     61 #include "ihevc_chroma_itrans_recon.h"
     62 #include "ihevc_chroma_intra_pred.h"
     63 #include "ihevc_intra_pred.h"
     64 #include "ihevc_inter_pred.h"
     65 #include "ihevc_mem_fns.h"
     66 #include "ihevc_padding.h"
     67 #include "ihevc_weighted_pred.h"
     68 #include "ihevc_sao.h"
     69 #include "ihevc_resi_trans.h"
     70 #include "ihevc_quant_iquant_ssd.h"
     71 #include "ihevc_cabac_tables.h"
     72 
     73 #include "ihevce_defs.h"
     74 #include "ihevce_lap_enc_structs.h"
     75 #include "ihevce_multi_thrd_structs.h"
     76 #include "ihevce_me_common_defs.h"
     77 #include "ihevce_had_satd.h"
     78 #include "ihevce_error_codes.h"
     79 #include "ihevce_bitstream.h"
     80 #include "ihevce_cabac.h"
     81 #include "ihevce_rdoq_macros.h"
     82 #include "ihevce_function_selector.h"
     83 #include "ihevce_enc_structs.h"
     84 #include "ihevce_entropy_structs.h"
     85 #include "ihevce_cmn_utils_instr_set_router.h"
     86 #include "ihevce_enc_loop_structs.h"
     87 #include "ihevce_stasino_helpers.h"
     88 
     89 /*****************************************************************************/
     90 /* Function Definitions                                                      */
     91 /*****************************************************************************/
     92 
     93 /**
     94 *******************************************************************************
     95 *
     96 * @brief
     97 *  This function calculates the variance of given data set.
     98 *
     99 * @par Description:
    100 *  This function is mainly used to find the variance of the block of pixel values.
    101 *  The block can be rectangular also. Single pass variance calculation
    102 *  implementation.
    103 *
    104 * @param[in] p_input
    105 *  The input buffer to calculate the variance.
    106 *
    107 * @param[out] pi4_mean
    108 *  Pointer ot the mean of the datset
    109 *
    110 * @param[out] pi4_variance
    111 *  Pointer tot he variabce of the data set
    112 *
    113 * @param[in] u1_is_hbd
    114 *  1 if the data is in  high bit depth
    115 *
    116 * @param[in] stride
    117 *  Stride for the input buffer
    118 *
    119 * @param[in] block_height
    120 *  height of the pixel block
    121 *
    122 * @param[in] block_width
    123 *  width of the pixel block
    124 *
    125 * @remarks
    126 *  None
    127 *
    128 *******************************************************************************
    129 */
    130 void ihevce_calc_variance(
    131     void *pv_input,
    132     WORD32 i4_stride,
    133     WORD32 *pi4_mean,
    134     UWORD32 *pu4_variance,
    135     UWORD8 u1_block_height,
    136     UWORD8 u1_block_width,
    137     UWORD8 u1_is_hbd,
    138     UWORD8 u1_disable_normalization)
    139 {
    140     UWORD8 *pui1_buffer;  // pointer for 8 bit usecase
    141     WORD32 i, j;
    142     WORD32 total_elements;
    143 
    144     LWORD64 mean;
    145     ULWORD64 variance;
    146     ULWORD64 sum;
    147     ULWORD64 sq_sum;
    148 
    149     /* intialisation */
    150     total_elements = u1_block_height * u1_block_width;
    151     mean = 0;
    152     variance = 0;
    153     sum = 0;
    154     sq_sum = 0;
    155 
    156     /* handle the case of 8/10 bit depth separately */
    157     if(!u1_is_hbd)
    158     {
    159         pui1_buffer = (UWORD8 *)pv_input;
    160 
    161         /* loop over all the values in the block */
    162         for(i = 0; i < u1_block_height; i++)
    163         {
    164             /* loop over a row in the block */
    165             for(j = 0; j < u1_block_width; j++)
    166             {
    167                 sum += pui1_buffer[i * i4_stride + j];
    168                 sq_sum += (pui1_buffer[i * i4_stride + j] * pui1_buffer[i * i4_stride + j]);
    169             }
    170         }
    171 
    172         if(!u1_disable_normalization)
    173         {
    174             mean = sum / total_elements;
    175             variance =
    176                 ((total_elements * sq_sum) - (sum * sum)) / (total_elements * (total_elements));
    177         }
    178         else
    179         {
    180             mean = sum;
    181             variance = ((total_elements * sq_sum) - (sum * sum));
    182         }
    183     }
    184 
    185     /* copy back the values to the output variables */
    186     *pi4_mean = mean;
    187     *pu4_variance = variance;
    188 }
    189 
    190 /**
    191 *******************************************************************************
    192 *
    193 * @brief
    194 *  This function calcluates the variance of given data set which is WORD16
    195 *
    196 * @par Description:
    197 *  This function is mainly used to find the variance of the block of pixel values.
    198 *  Single pass variance calculation implementation.
    199 *
    200 * @param[in] pv_input
    201 *  The input buffer to calculate the variance.
    202 *
    203 *
    204 * @param[in] stride
    205 *  Stride for the input buffer
    206 *
    207 * @param[out] pi4_mean
    208 *  Pointer ot the mean of the datset
    209 *
    210 * @param[out] pi4_variance
    211 *  Pointer tot he variabce of the data set
    212 *
    213 * @param[in] block_height
    214 *  height of the pixel block
    215 *
    216 * @param[in] block_width
    217 *  width of the pixel block
    218 *
    219 *
    220 * @remarks
    221 *  None
    222 *
    223 *******************************************************************************/
    224 void ihevce_calc_variance_signed(
    225     WORD16 *pv_input,
    226     WORD32 i4_stride,
    227     WORD32 *pi4_mean,
    228     UWORD32 *pu4_variance,
    229     UWORD8 u1_block_height,
    230     UWORD8 u1_block_width)
    231 {
    232     WORD16 *pi2_buffer;  // poinbter for 10 bit use case
    233 
    234     WORD32 i, j;
    235     WORD32 total_elements;
    236 
    237     LWORD64 mean;
    238     LWORD64 variance;
    239     LWORD64 sum;
    240     LWORD64 sq_sum;
    241 
    242     /* intialisation */
    243     total_elements = u1_block_height * u1_block_width;
    244     mean = 0;
    245     variance = 0;
    246     sum = 0;
    247     sq_sum = 0;
    248 
    249     pi2_buffer = pv_input;
    250 
    251     for(i = 0; i < u1_block_height; i++)
    252     {
    253         for(j = 0; j < u1_block_width; j++)
    254         {
    255             sum += pi2_buffer[i * i4_stride + j];
    256             sq_sum += (pi2_buffer[i * i4_stride + j] * pi2_buffer[i * i4_stride + j]);
    257         }
    258     }
    259 
    260     mean = sum;  /// total_elements;
    261     variance = ((total_elements * sq_sum) - (sum * sum));  // / (total_elements * (total_elements) )
    262 
    263     /* copy back the values to the output variables */
    264     *pi4_mean = mean;
    265     *pu4_variance = variance;
    266 }
    267 
    268 /**
    269 *******************************************************************************
    270 *
    271 * @brief
    272 *  This function calculates the variance of a chrominance plane for 420SP data
    273 *
    274 * @par Description:
    275 *  This function is mainly used to find the variance of the block of pixel values.
    276 *  The block can be rectangular also. Single pass variance calculation
    277 *  implementation.
    278 *
    279 * @param[in] p_input
    280 *  The input buffer to calculate the variance.
    281 *
    282 * @param[in] stride
    283 *  Stride for the input buffer
    284 *
    285 * @param[out] pi4_mean
    286 *  Pointer ot the mean of the datset
    287 *
    288 * @param[out] pi4_variance
    289 *  Pointer tot he variabce of the data set
    290 *
    291 * @param[in] block_height
    292 *  height of the pixel block
    293 *
    294 * @param[in] block_width
    295 *  width of the pixel block
    296 *
    297 * @param[in] u1_is_hbd
    298 *  1 if the data is in  high bit depth
    299 *
    300 * @param[in] e_chroma_plane
    301 *  is U or V
    302 *
    303 * @remarks
    304 *  None
    305 *
    306 *******************************************************************************
    307 */
    308 void ihevce_calc_chroma_variance(
    309     void *pv_input,
    310     WORD32 i4_stride,
    311     WORD32 *pi4_mean,
    312     UWORD32 *pu4_variance,
    313     UWORD8 u1_block_height,
    314     UWORD8 u1_block_width,
    315     UWORD8 u1_is_hbd,
    316     CHROMA_PLANE_ID_T e_chroma_plane)
    317 {
    318     UWORD8 *pui1_buffer;  // pointer for 8 bit usecase
    319     WORD32 i, j;
    320     WORD32 total_elements;
    321 
    322     LWORD64 mean;
    323     ULWORD64 variance;
    324     LWORD64 sum;
    325     LWORD64 sq_sum;
    326 
    327     /* intialisation */
    328     total_elements = u1_block_height * u1_block_width;
    329     mean = 0;
    330     variance = 0;
    331     sum = 0;
    332     sq_sum = 0;
    333 
    334     /* handle the case of 8/10 bit depth separately */
    335     if(!u1_is_hbd)
    336     {
    337         pui1_buffer = (UWORD8 *)pv_input;
    338 
    339         pui1_buffer += e_chroma_plane;
    340 
    341         /* loop over all the values in the block */
    342         for(i = 0; i < u1_block_height; i++)
    343         {
    344             /* loop over a row in the block */
    345             for(j = 0; j < u1_block_width; j++)
    346             {
    347                 sum += pui1_buffer[i * i4_stride + j * 2];
    348                 sq_sum += (pui1_buffer[i * i4_stride + j * 2] * pui1_buffer[i * i4_stride + j * 2]);
    349             }
    350         }
    351 
    352         mean = sum / total_elements;
    353         variance = ((total_elements * sq_sum) - (sum * sum)) / (total_elements * (total_elements));
    354     }
    355 
    356     /* copy back the values to the output variables */
    357     *pi4_mean = mean;
    358     *pu4_variance = variance;
    359 }
    360 
    361 LWORD64 ihevce_inject_stim_into_distortion(
    362     void *pv_src,
    363     WORD32 i4_src_stride,
    364     void *pv_pred,
    365     WORD32 i4_pred_stride,
    366     LWORD64 i8_distortion,
    367     WORD32 i4_alpha_stim_multiplier,
    368     UWORD8 u1_blk_size,
    369     UWORD8 u1_is_hbd,
    370     UWORD8 u1_enable_psyRDOPT,
    371     CHROMA_PLANE_ID_T e_chroma_plane)
    372 {
    373     if(!u1_enable_psyRDOPT)
    374     {
    375         UWORD32 u4_src_variance;
    376         UWORD32 u4_pred_variance;
    377         WORD32 i4_mean;
    378         WORD32 i4_noise_term;
    379 
    380         if(NULL_PLANE == e_chroma_plane)
    381         {
    382             ihevce_calc_variance(
    383                 pv_src,
    384                 i4_src_stride,
    385                 &i4_mean,
    386                 &u4_src_variance,
    387                 u1_blk_size,
    388                 u1_blk_size,
    389                 u1_is_hbd,
    390                 0);
    391 
    392             ihevce_calc_variance(
    393                 pv_pred,
    394                 i4_pred_stride,
    395                 &i4_mean,
    396                 &u4_pred_variance,
    397                 u1_blk_size,
    398                 u1_blk_size,
    399                 u1_is_hbd,
    400                 0);
    401         }
    402         else
    403         {
    404             ihevce_calc_chroma_variance(
    405                 pv_src,
    406                 i4_src_stride,
    407                 &i4_mean,
    408                 &u4_src_variance,
    409                 u1_blk_size,
    410                 u1_blk_size,
    411                 u1_is_hbd,
    412                 e_chroma_plane);
    413 
    414             ihevce_calc_chroma_variance(
    415                 pv_pred,
    416                 i4_pred_stride,
    417                 &i4_mean,
    418                 &u4_pred_variance,
    419                 u1_blk_size,
    420                 u1_blk_size,
    421                 u1_is_hbd,
    422                 e_chroma_plane);
    423         }
    424 
    425         i4_noise_term =
    426             ihevce_compute_noise_term(i4_alpha_stim_multiplier, u4_src_variance, u4_pred_variance);
    427 
    428         MULTIPLY_STIM_WITH_DISTORTION(i8_distortion, i4_noise_term, STIM_Q_FORMAT, ALPHA_Q_FORMAT);
    429 
    430         return i8_distortion;
    431     }
    432     else
    433     {
    434         return i8_distortion;
    435     }
    436 }
    437 
    438 UWORD8 ihevce_determine_cu_noise_based_on_8x8Blk_data(
    439     UWORD8 *pu1_is_8x8Blk_noisy, UWORD8 u1_cu_x_pos, UWORD8 u1_cu_y_pos, UWORD8 u1_cu_size)
    440 {
    441     UWORD8 u1_num_noisy_children = 0;
    442     UWORD8 u1_start_index = (u1_cu_x_pos / 8) + u1_cu_y_pos;
    443 
    444     if(8 == u1_cu_size)
    445     {
    446         return pu1_is_8x8Blk_noisy[u1_start_index];
    447     }
    448 
    449     u1_num_noisy_children += ihevce_determine_cu_noise_based_on_8x8Blk_data(
    450         pu1_is_8x8Blk_noisy, u1_cu_x_pos, u1_cu_y_pos, u1_cu_size / 2);
    451 
    452     u1_num_noisy_children += ihevce_determine_cu_noise_based_on_8x8Blk_data(
    453         pu1_is_8x8Blk_noisy, u1_cu_x_pos + (u1_cu_size / 2), u1_cu_y_pos, u1_cu_size / 2);
    454 
    455     u1_num_noisy_children += ihevce_determine_cu_noise_based_on_8x8Blk_data(
    456         pu1_is_8x8Blk_noisy, u1_cu_x_pos, u1_cu_y_pos + (u1_cu_size / 2), u1_cu_size / 2);
    457 
    458     u1_num_noisy_children += ihevce_determine_cu_noise_based_on_8x8Blk_data(
    459         pu1_is_8x8Blk_noisy,
    460         u1_cu_x_pos + (u1_cu_size / 2),
    461         u1_cu_y_pos + (u1_cu_size / 2),
    462         u1_cu_size / 2);
    463 
    464     return (u1_num_noisy_children >= 2);
    465 }
    466 
    467 /*!
    468 ******************************************************************************
    469 * \if Function name : ihevce_psy_rd_cost_croma \endif
    470 *
    471 * \brief
    472 *    Calculates the psyco visual cost for RD opt. This is
    473 *
    474 * \param[in] pui4_source_satd
    475 *   This is the pointer to the array of 8x8 satd of the corresponding source CTB. This is pre calculated.
    476 * \param[in] *pui1_recon
    477 *   This si the pointer to the pred data.
    478 * \param[in] recon_stride
    479 *   This si the pred stride
    480 * \param[in] pic_type
    481 *   Picture type.
    482 * \param[in] layer_id
    483 *   Indicates the temporal layer.
    484 * \param[in] lambda
    485 *   This is the weighting factor for the cost.
    486 * \param[in] is_hbd
    487 *   This is the high bit depth flag which indicates if the bit depth of the pixels is 10 bit or 8 bit.
    488 * \param[in] sub_sampling_type
    489 *   This is the chroma subsampling type. 11 - for 420 and 13 for 422
    490 * \return
    491 *    the cost for the psyRDopt
    492 *
    493 * \author
    494 *  Ittiam
    495 *
    496 *****************************************************************************
    497 */
    498 LWORD64 ihevce_psy_rd_cost_croma(
    499     LWORD64 *pui4_source_satd,
    500     void *p_recon,
    501     WORD32 recon_stride_vert,
    502     WORD32 recond_stride_horz,
    503     WORD32 cu_size_luma,
    504     WORD32 pic_type,
    505     WORD32 layer_id,
    506     WORD32 lambda,
    507     WORD32 start_index,
    508     WORD32 is_hbd,
    509     WORD32 sub_sampling_type,
    510     ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list)
    511 {
    512     /* declare local variables to store the SATD values for the pred  for the current block. */
    513     LWORD64 psy_rd_cost;
    514     UWORD32 lambda_mod;
    515     WORD32 psy_factor;
    516 
    517     /* declare local variables */
    518     WORD32 i;
    519     WORD32 cu_total_size;
    520     WORD32 num_comp_had_blocks;
    521 
    522     UWORD8 *pu1_l0_block;
    523     UWORD8 *pu1_l0_block_prev;
    524     UWORD8 *pu1_recon;
    525     WORD32 ht_offset;
    526     WORD32 wd_offset;
    527     WORD32 cu_ht;
    528     WORD32 cu_wd;
    529 
    530     WORD32 num_horz_blocks;
    531 
    532     WORD16 pi2_residue_had[64];
    533     /* this is used as a buffer with all values equal to 0. This is emulate the case with
    534        pred being zero in HAD fucntion */
    535     UWORD8 ai1_zeros_buffer[64];
    536 
    537     WORD32 had_block_size;
    538     LWORD64 source_satd;  // to hold source for current 8x8 block
    539     LWORD64 recon_satd;  // holds the current recon 8x8 satd
    540 
    541     WORD32 index_for_src_satd;
    542 
    543     (void)recond_stride_horz;
    544     (void)pic_type;
    545     (void)layer_id;
    546     if(!is_hbd)
    547     {
    548         pu1_recon = (UWORD8 *)p_recon;
    549     }
    550 
    551     /**** initialize the variables ****/
    552     had_block_size = 4;
    553 
    554     if(sub_sampling_type == 1)  // 420
    555     {
    556         cu_ht = cu_size_luma / 2;
    557         cu_wd = cu_size_luma / 2;
    558     }
    559     else
    560     {
    561         cu_ht = cu_size_luma;
    562         cu_wd = cu_size_luma / 2;
    563     }
    564 
    565     num_horz_blocks = 2 * cu_wd / had_block_size;  //ctb_width / had_block_size;
    566     ht_offset = -had_block_size;
    567     wd_offset = 0;  //-had_block_size;
    568 
    569     cu_total_size = cu_ht * cu_wd;
    570     num_comp_had_blocks = 2 * cu_total_size / (had_block_size * had_block_size);
    571 
    572     index_for_src_satd = start_index;
    573 
    574     for(i = 0; i < 64; i++)
    575     {
    576         ai1_zeros_buffer[i] = 0;
    577     }
    578 
    579     psy_factor = PSY_STRENGTH_CHROMA;
    580     psy_rd_cost = 0;
    581     lambda_mod = lambda * psy_factor;
    582 
    583     /************************************************************/
    584     /* loop over for every 4x4 blocks in the CU for Cb */
    585     for(i = 0; i < num_comp_had_blocks; i++)
    586     {
    587         if(i % num_horz_blocks == 0)
    588         {
    589             wd_offset = -had_block_size;
    590             ht_offset += had_block_size;
    591         }
    592         wd_offset += had_block_size;
    593 
    594         /* source satd for the current 8x8 block */
    595         source_satd = pui4_source_satd[index_for_src_satd];
    596 
    597         if(i % 2 != 0)
    598         {
    599             if(!is_hbd)
    600             {
    601                 pu1_l0_block = pu1_l0_block_prev + 1;
    602             }
    603         }
    604         else
    605         {
    606             if(!is_hbd)
    607             {
    608                 /* get memory pointers for each of L0 and L1 blocks whose hadamard has to be computed */
    609                 pu1_l0_block = pu1_recon + recon_stride_vert * ht_offset + wd_offset;
    610                 pu1_l0_block_prev = pu1_l0_block;
    611             }
    612         }
    613 
    614         if(had_block_size == 4)
    615         {
    616             if(!is_hbd)
    617             {
    618                 recon_satd = ps_cmn_utils_optimised_function_list->pf_chroma_AC_HAD_4x4_8bit(
    619                     pu1_l0_block,
    620                     recon_stride_vert,
    621                     ai1_zeros_buffer,
    622                     had_block_size,
    623                     pi2_residue_had,
    624                     had_block_size);
    625             }
    626 
    627             /* get the additional cost function based on the absolute SATD diff of source and recon. */
    628             psy_rd_cost += (lambda_mod * llabs(source_satd - recon_satd));
    629 
    630             index_for_src_satd++;
    631 
    632             if((i % num_horz_blocks) == (num_horz_blocks - 1))
    633             {
    634                 index_for_src_satd -= num_horz_blocks;
    635                 index_for_src_satd +=
    636                     (MAX_CU_SIZE / 8); /* Assuming CTB size = 64 and blocksize = 8 */
    637             }
    638 
    639         }  // if had block size ==4
    640     }  // for loop for all 4x4 block in the cu
    641 
    642     psy_rd_cost = psy_rd_cost >> (Q_PSY_STRENGTH_CHROMA + LAMBDA_Q_SHIFT);
    643     /* reutrn the additional cost for the psy RD opt */
    644     return (psy_rd_cost);
    645 }
    646 
    647 /*!
    648 ******************************************************************************
    649 * \if Function name : ihevce_psy_rd_cost \endif
    650 *
    651 * \brief
    652 *    Calculates the psyco visual cost for RD opt. This is
    653 *
    654 * \param[in] pui4_source_satd
    655 *   This is the pointer to the array of 8x8 satd of the corresponding source CTB. This is pre calculated.
    656 * \param[in] *pui1_recon
    657 *   This si the pointer to the pred data.
    658 * \param[in] recon_stride
    659 *   This si the pred stride
    660 * \param[in] pic_type
    661 *   Picture type.
    662 * \param[in] layer_id
    663 *   Indicates the temporal layer.
    664 * \param[in] lambda
    665 *   This is the weighting factor for the cost.
    666 *
    667 * \return
    668 *    the cost for the psyRDopt
    669 *
    670 * \author
    671 *  Ittiam
    672 *
    673 *****************************************************************************
    674 */
    675 LWORD64 ihevce_psy_rd_cost(
    676     LWORD64 *pui4_source_satd,
    677     void *pv_recon,
    678     WORD32 recon_stride_vert,
    679     WORD32 recond_stride_horz,
    680     WORD32 cu_size,
    681     WORD32 pic_type,
    682     WORD32 layer_id,
    683     WORD32 lambda,
    684     WORD32 start_index,
    685     WORD32 is_hbd,
    686     UWORD32 u4_psy_strength,
    687     ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list)
    688 {
    689     /* declare local variables to store the SATD values for the pred  for the current block. */
    690     LWORD64 psy_rd_cost;  // TODO : check if overflow is there.
    691     UWORD32 lambda_mod;
    692     WORD32 psy_factor;
    693 
    694     /* declare local variables */
    695     WORD32 i;
    696     WORD32 cu_total_size;
    697     WORD32 num_comp_had_blocks;
    698 
    699     UWORD8 *pu1_l0_block;
    700     UWORD8 *pu1_recon;
    701 
    702     WORD32 ht_offset;
    703     WORD32 wd_offset;
    704     WORD32 cu_ht;
    705     WORD32 cu_wd;
    706 
    707     WORD32 num_horz_blocks;
    708 
    709     //WORD16 pi2_residue_had[64];
    710     WORD16 pi2_residue_had_zscan[64];
    711     //WORD16 pi2_residue[64];
    712     /* this is used as a buffer with all values equal to 0. This is emulate the case with
    713        pred being zero in HAD fucntion */
    714     UWORD8 ai1_zeros_buffer[64];
    715 
    716     WORD32 had_block_size;
    717     LWORD64 source_satd;  // to hold source for current 8x8 block
    718     LWORD64 recon_satd;  // holds the current recon 8x8 satd
    719 
    720     WORD32 index_for_src_satd;
    721 
    722     (void)recond_stride_horz;
    723     (void)pic_type;
    724     (void)layer_id;
    725     /***** initialize the variables ****/
    726     had_block_size = 8;
    727     cu_ht = cu_size;
    728     cu_wd = cu_size;
    729 
    730     num_horz_blocks = cu_wd / had_block_size;  //ctb_width / had_block_size;
    731 
    732     ht_offset = -had_block_size;
    733     wd_offset = 0 - had_block_size;
    734 
    735     cu_total_size = cu_ht * cu_wd;
    736     num_comp_had_blocks = cu_total_size / (had_block_size * had_block_size);
    737 
    738     index_for_src_satd = start_index;
    739 
    740     for(i = 0; i < 64; i++)
    741     {
    742         ai1_zeros_buffer[i] = 0;
    743     }
    744     psy_factor = u4_psy_strength;  //PSY_STRENGTH;
    745     psy_rd_cost = 0;
    746     lambda_mod = lambda * psy_factor;
    747 
    748     if(!is_hbd)
    749     {
    750         pu1_recon = (UWORD8 *)pv_recon;
    751     }
    752 
    753     /**************************************************************/
    754     /* loop over for every 8x8 blocks in the CU */
    755     for(i = 0; i < num_comp_had_blocks; i++)
    756     {
    757         if(i % num_horz_blocks == 0)
    758         {
    759             wd_offset = -had_block_size;
    760             ht_offset += had_block_size;
    761         }
    762         wd_offset += had_block_size;
    763 
    764         /* source satd for the current 8x8 block */
    765         source_satd = pui4_source_satd[index_for_src_satd];
    766 
    767         if(had_block_size == 8)
    768         {
    769             //WORD32 index;
    770             //WORD32 u4_satd;
    771             //WORD32 dst_strd = 8;
    772             //WORD32 i4_frm_qstep = 0;
    773             //WORD32 early_cbf;
    774             if(!is_hbd)
    775             {
    776                 /* get memory pointers for each of L0 and L1 blocks whose hadamard has to be computed */
    777                 pu1_l0_block = pu1_recon + recon_stride_vert * ht_offset + wd_offset;
    778 
    779                 recon_satd = ps_cmn_utils_optimised_function_list->pf_AC_HAD_8x8_8bit(
    780                     pu1_l0_block,
    781                     recon_stride_vert,
    782                     ai1_zeros_buffer,
    783                     had_block_size,
    784                     pi2_residue_had_zscan,
    785                     had_block_size);
    786             }
    787 
    788             /* get the additional cost function based on the absolute SATD diff of source and recon. */
    789             psy_rd_cost += (lambda_mod * llabs(source_satd - recon_satd));
    790 
    791             index_for_src_satd++;
    792             if((i % num_horz_blocks) == (num_horz_blocks - 1))
    793             {
    794                 index_for_src_satd -= num_horz_blocks;
    795                 index_for_src_satd +=
    796                     (MAX_CU_SIZE / 8); /* Assuming CTB size = 64 and blocksize = 8 */
    797             }
    798         }  // if
    799     }  // for loop
    800     psy_rd_cost = psy_rd_cost >> (Q_PSY_STRENGTH + LAMBDA_Q_SHIFT);
    801 
    802     /* reutrn the additional cost for the psy RD opt */
    803     return (psy_rd_cost);
    804 }
    805 
    806 unsigned long ihevce_calc_stim_injected_variance(
    807     ULWORD64 *pu8_sigmaX,
    808     ULWORD64 *pu8_sigmaXSquared,
    809     ULWORD64 *u8_var,
    810     WORD32 i4_inv_wpred_wt,
    811     WORD32 i4_inv_wt_shift_val,
    812     WORD32 i4_wpred_log_wdc,
    813     WORD32 i4_part_id)
    814 {
    815     ULWORD64 u8_X_Square, u8_temp_var;
    816     WORD32 i4_bits_req;
    817 
    818     const WORD32 i4_default_src_wt = ((1 << 15) + (WGHT_DEFAULT >> 1)) / WGHT_DEFAULT;
    819 
    820     u8_X_Square = (pu8_sigmaX[i4_part_id] * pu8_sigmaX[i4_part_id]);
    821     u8_temp_var = pu8_sigmaXSquared[i4_part_id] - u8_X_Square;
    822 
    823     if(i4_inv_wpred_wt != i4_default_src_wt)
    824     {
    825         i4_inv_wpred_wt = i4_inv_wpred_wt >> i4_inv_wt_shift_val;
    826 
    827         u8_temp_var = SHR_NEG(
    828             (u8_temp_var * i4_inv_wpred_wt * i4_inv_wpred_wt),
    829             (30 - (2 * i4_inv_wt_shift_val) - i4_wpred_log_wdc * 2));
    830     }
    831 
    832     GETRANGE64(i4_bits_req, u8_temp_var);
    833 
    834     if(i4_bits_req > 27)
    835     {
    836         *u8_var = u8_temp_var >> (i4_bits_req - 27);
    837         return (i4_bits_req - 27);
    838     }
    839     else
    840     {
    841         *u8_var = u8_temp_var;
    842         return 0;
    843     }
    844 }
    845 
    846 unsigned long ihevce_calc_variance_for_diff_weights(
    847     ULWORD64 *pu8_sigmaX,
    848     ULWORD64 *pu8_sigmaXSquared,
    849     ULWORD64 *u8_var,
    850     WORD32 *pi4_inv_wt,
    851     WORD32 *pi4_inv_wt_shift_val,
    852     pu_result_t *ps_result,
    853     WORD32 i4_wpred_log_wdc,
    854     PART_ID_T *pe_part_id,
    855     UWORD8 u1_cu_size,
    856     UWORD8 u1_num_parts,
    857     UWORD8 u1_is_for_src)
    858 {
    859     WORD32 i4_k;
    860     UWORD32 u4_wd, u4_ht;
    861     UWORD8 u1_num_base_blks;
    862     UWORD32 u4_num_pixels_in_part;
    863     UWORD8 u1_index;
    864     WORD32 i4_bits_req;
    865 
    866     UWORD8 u1_base_blk_size = 4;
    867     UWORD32 u4_tot_num_pixels = u1_cu_size * u1_cu_size;
    868     ULWORD64 u8_temp_sigmaX[MAX_NUM_INTER_PARTS] = { 0, 0 };
    869     ULWORD64 u8_temp_sigmaXsquared[MAX_NUM_INTER_PARTS] = { 0, 0 };
    870     ULWORD64 u8_z;
    871 
    872     const WORD32 i4_default_src_wt = ((1 << 15) + (WGHT_DEFAULT >> 1)) / WGHT_DEFAULT;
    873 
    874     for(i4_k = 0; i4_k < u1_num_parts; i4_k++)
    875     {
    876         u4_wd = ps_result[i4_k].pu.b4_wd + 1;
    877         u4_ht = ps_result[i4_k].pu.b4_ht + 1;
    878         u1_num_base_blks = u4_wd * u4_ht;
    879         u4_num_pixels_in_part = u1_num_base_blks * u1_base_blk_size * u1_base_blk_size;
    880 
    881         if(u1_is_for_src)
    882         {
    883             u1_index = pe_part_id[i4_k];
    884         }
    885         else
    886         {
    887             u1_index = i4_k;
    888         }
    889 
    890         u8_temp_sigmaXsquared[i4_k] = pu8_sigmaXSquared[u1_index] / u4_num_pixels_in_part;
    891         u8_temp_sigmaX[i4_k] = pu8_sigmaX[u1_index];
    892 
    893         if(u1_is_for_src)
    894         {
    895             if(pi4_inv_wt[i4_k] != i4_default_src_wt)
    896             {
    897                 pi4_inv_wt[i4_k] = pi4_inv_wt[i4_k] >> pi4_inv_wt_shift_val[i4_k];
    898                 u8_temp_sigmaX[i4_k] = SHR_NEG(
    899                     (u8_temp_sigmaX[i4_k] * pi4_inv_wt[i4_k]),
    900                     (15 - pi4_inv_wt_shift_val[i4_k] - i4_wpred_log_wdc));
    901                 u8_temp_sigmaXsquared[i4_k] = SHR_NEG(
    902                     (u8_temp_sigmaXsquared[i4_k] * pi4_inv_wt[i4_k] * pi4_inv_wt[i4_k]),
    903                     (30 - (2 * pi4_inv_wt_shift_val[i4_k]) - i4_wpred_log_wdc * 2));
    904             }
    905         }
    906     }
    907 
    908     u8_z = (u4_tot_num_pixels * (u8_temp_sigmaXsquared[0] + u8_temp_sigmaXsquared[1])) -
    909            ((u8_temp_sigmaX[0] + u8_temp_sigmaX[1]) * (u8_temp_sigmaX[0] + u8_temp_sigmaX[1]));
    910 
    911     GETRANGE64(i4_bits_req, u8_z);
    912 
    913     if(i4_bits_req > 27)
    914     {
    915         *u8_var = u8_z >> (i4_bits_req - 27);
    916         return (i4_bits_req - 27);
    917     }
    918     else
    919     {
    920         *u8_var = u8_z;
    921         return 0;
    922     }
    923 }
    924