Home | History | Annotate | Download | only in encoder
      1 /******************************************************************************
      2  *
      3  * Copyright (C) 2018 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at:
      8  *
      9  * http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  *****************************************************************************
     18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 */
     20 
     21 /*!
     22 ***************************************************************************
     23 * \file hme_err_compute.c
     24 *
     25 * \brief
     26 *    SAD / SATD routines for error computation
     27 *
     28 * Detailed_description : Contains various types of SAD/SATD routines for
     29 *   error computation between a given input and reference ptr. The SAD
     30 *   routines can evaluate for either a single point or a grid, and can
     31 *   evaluate with either partial updates or no partial updates. Partial
     32 *   updates means evaluating sub block SADs, e.g. 4 4x4 subblock SAD in
     33 *   addition to the main 8x8 block SAD.
     34 *
     35 * \date
     36 *    22/9/2012
     37 *
     38 * \author  Ittiam
     39 ***************************************************************************
     40 */
     41 
     42 /*****************************************************************************/
     43 /* File Includes                                                             */
     44 /*****************************************************************************/
     45 /* System include files */
     46 #include <stdio.h>
     47 #include <string.h>
     48 #include <stdlib.h>
     49 #include <assert.h>
     50 #include <stdarg.h>
     51 #include <math.h>
     52 #include <limits.h>
     53 
     54 /* User include files */
     55 #include "ihevc_typedefs.h"
     56 #include "itt_video_api.h"
     57 #include "ihevce_api.h"
     58 
     59 #include "rc_cntrl_param.h"
     60 #include "rc_frame_info_collector.h"
     61 #include "rc_look_ahead_params.h"
     62 
     63 #include "ihevc_defs.h"
     64 #include "ihevc_structs.h"
     65 #include "ihevc_platform_macros.h"
     66 #include "ihevc_deblk.h"
     67 #include "ihevc_itrans_recon.h"
     68 #include "ihevc_chroma_itrans_recon.h"
     69 #include "ihevc_chroma_intra_pred.h"
     70 #include "ihevc_intra_pred.h"
     71 #include "ihevc_inter_pred.h"
     72 #include "ihevc_mem_fns.h"
     73 #include "ihevc_padding.h"
     74 #include "ihevc_weighted_pred.h"
     75 #include "ihevc_sao.h"
     76 #include "ihevc_resi_trans.h"
     77 #include "ihevc_quant_iquant_ssd.h"
     78 #include "ihevc_cabac_tables.h"
     79 
     80 #include "ihevce_defs.h"
     81 #include "ihevce_lap_enc_structs.h"
     82 #include "ihevce_multi_thrd_structs.h"
     83 #include "ihevce_multi_thrd_funcs.h"
     84 #include "ihevce_me_common_defs.h"
     85 #include "ihevce_had_satd.h"
     86 #include "ihevce_error_codes.h"
     87 #include "ihevce_bitstream.h"
     88 #include "ihevce_cabac.h"
     89 #include "ihevce_rdoq_macros.h"
     90 #include "ihevce_function_selector.h"
     91 #include "ihevce_enc_structs.h"
     92 #include "ihevce_entropy_structs.h"
     93 #include "ihevce_cmn_utils_instr_set_router.h"
     94 #include "ihevce_enc_loop_structs.h"
     95 #include "ihevce_bs_compute_ctb.h"
     96 #include "ihevce_global_tables.h"
     97 #include "ihevce_dep_mngr_interface.h"
     98 #include "hme_datatype.h"
     99 #include "hme_interface.h"
    100 #include "hme_common_defs.h"
    101 #include "hme_defs.h"
    102 #include "ihevce_me_instr_set_router.h"
    103 #include "hme_globals.h"
    104 #include "hme_utils.h"
    105 #include "hme_coarse.h"
    106 #include "hme_refine.h"
    107 #include "hme_err_compute.h"
    108 #include "hme_common_utils.h"
    109 #include "hme_search_algo.h"
    110 #include "ihevce_stasino_helpers.h"
    111 
    112 /******************************************************************************
    113 *                         MACRO DEFINITIONS
    114 ******************************************************************************/
    115 
    116 /*****************************************************************************/
    117 /* Theoretically, various types of SAD functions are needed for              */
    118 /* reasons of optimality. SADs that are to be evaluated at a single pt can be*/
    119 /* more optimal than SADs that are to be evaluated for a grid of 3x3. The    */
    120 /* SADs to be evaluated at a grid are classified as separate functions, since*/
    121 /* evaluating them on a single function call helps reuse inputs for a small  */
    122 /* grid of 3x3. Also, if no partial updates are required, there are 3 basic  */
    123 /* functions, width 4K (K = odd number), width 8K (K = odd number) and width */
    124 /* 16K, K any number. For partial updates, it is assumed that the block size */
    125 /* is square (8x8, 16x16, 32x32, 64x64) and further differentiation is done  */
    126 /* based on the basic evaluation unit. E.g. if 16x16 blk size requires part  */
    127 /* update on AMP partitions, then basic SAD unit is 4x4, if it doesn't, then */
    128 /* basic SAD unit is 8x8.                                                    */
    129 /*****************************************************************************/
    130 
    /* Result-update aliases. All eight names below currently expand to the same */
    /* routine, hme_update_results_grid_pu_bestn, so there is no specialised     */
    /* variant per alias at present. The suffixes presumably stand for           */
    /* point/grid (PT/GRID), partial/no partial update (PU/NPU) and best-1 /     */
    /* best-N result tracking -- TODO confirm against the callers.               */
    131 #define UPD_RES_PT_NPU_BEST1 hme_update_results_grid_pu_bestn
    132 #define UPD_RES_PT_NPU_BESTN hme_update_results_grid_pu_bestn
    133 #define UPD_RES_PT_PU_BEST1 hme_update_results_grid_pu_bestn
    134 #define UPD_RES_PT_PU_BESTN hme_update_results_grid_pu_bestn
    135 #define UPD_RES_GRID_NPU_BEST1 hme_update_results_grid_pu_bestn
    136 #define UPD_RES_GRID_NPU_BESTN hme_update_results_grid_pu_bestn
    137 #define UPD_RES_GRID_PU_BEST1 hme_update_results_grid_pu_bestn
    138 #define UPD_RES_GRID_PU_BESTN hme_update_results_grid_pu_bestn
    139 
    140 /*******************************************************************************
    141 *                         FUNCTION DEFINITIONS
    142 *******************************************************************************/
    143 S32 hme_cmp_nodes(search_node_t *ps_best_node1, search_node_t *ps_best_node2)
    144 {
    145     if((ps_best_node1->s_mv.i2_mvx == ps_best_node2->s_mv.i2_mvx) &&
    146        (ps_best_node1->s_mv.i2_mvy == ps_best_node2->s_mv.i2_mvy) &&
    147        (ps_best_node1->i1_ref_idx == ps_best_node2->i1_ref_idx))
    148     {
    149         return 0;
    150     }
    151     return -1;
    152 }
    153 
    154 void compute_4x4_sads_for_16x16_blk(
    155     grid_ctxt_t *ps_grid, /* Grid ctxt */
    156     UWORD8 *pu1_cur_ptr, /* Pointer to top-left of current block */
    157     WORD32 cur_buf_stride, /* Buffer stride of current buffer */
    158     UWORD16 **
    159         u2_part_sads, /* 2D Array containing SADs for all 17 partitions. As many rows as partitions. SADs in a row correspond to each of the candidates */
    160     cand_t *ps_cand, /* Return the list of candidates evaluated */
    161     WORD32 *num_cands /* Number of candidates that were processed */
    162 )
    163 {
    164     WORD32 a, b, c, d, i;
    165     WORD16 grd_sz_y = (ps_grid->grd_sz_y_x & 0xFFFF0000) >> 16;
    166     WORD16 grd_sz_x = (ps_grid->grd_sz_y_x & 0xFFFF);
    167     //WORD32 offset_x[9] = {-grd_sz_x, 0, grd_sz_x, -grd_sz_x, 0, grd_sz_x, grd_sz_x, 0, -grd_sz_x};
    168     //WORD32 offset_y[9] = {-grd_sz_y, -grd_sz_y, -grd_sz_y, 0, 0, 0, grd_sz_y, grd_sz_y, grd_sz_y};
    169     /* Assumes the following order: C, L, T, R, B, TL, TR, BL, BR */
    170     WORD32 offset_x[9] = { 0, -grd_sz_x, 0, grd_sz_x, 0, -grd_sz_x, grd_sz_x, -grd_sz_x, grd_sz_x };
    171     WORD32 offset_y[9] = { 0, 0, -grd_sz_y, 0, grd_sz_y, -grd_sz_y, -grd_sz_y, grd_sz_y, grd_sz_y };
    172     WORD32 ref_buf_stride = ps_grid->ref_buf_stride;
    173     WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
    174     WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
    175     cand_t *cand0 = ps_cand;
    176     UWORD16 au2_4x4_sad[NUM_4X4];
    177 
    178     *num_cands = 0;
    179 
    180     /* Loop to fill up the cand_t array and to calculate num_cands */
    181     for(i = 0; i < ps_grid->num_grids; i++)
    182     {
    183         WORD32 j;
    184         WORD32 mask = ps_grid->pi4_grd_mask[i];
    185         UWORD8 *pu1_ref_ptr_center = ps_grid->ppu1_ref_ptr[i];
    186         WORD32 mv_x = ps_grid->p_mv[i].i2_mv_x;
    187         WORD32 mv_y = (ps_grid->p_mv[i].i2_mv_y);
    188 
    189         for(j = 0; j < NUM_CANDIDATES_IN_GRID; j++, mask >>= 1)
    190         {
    191             if(mask & 1)
    192             {
    193                 *num_cands = *num_cands + 1;
    194                 cand0->grid_ix = i;
    195                 cand0->ref_idx = ps_grid->p_ref_idx[i];
    196                 cand0->pu1_ref_ptr =
    197                     pu1_ref_ptr_center + offset_x[j] + ref_buf_stride * offset_y[j];
    198                 cand0->mv.i2_mv_x = (S16)(mv_x) + offset_x[j];
    199                 cand0->mv.i2_mv_y = (S16)(mv_y) + offset_y[j];
    200                 cand0++;
    201             }
    202         }
    203     }
    204 
    205     /* Loop to compute the SAD's */
    206     for(a = 0; a < *num_cands; a++)
    207     {
    208         cand_t *cand = ps_cand + a;
    209         memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
    210         for(b = 0; b < NUM_4X4; b++)
    211         {
    212             WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
    213             WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
    214 
    215             for(c = 0; c < NUM_ROWS_IN_4X4; c++)
    216             {
    217                 WORD32 z_cur = (cur_buf_stride)*c + t1;
    218                 WORD32 z_ref = (ref_buf_stride)*c + t2;
    219                 for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
    220                 {
    221                     au2_4x4_sad[b] += (UWORD16)ABS(
    222                         (((S32)cand->pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
    223                 }
    224             }
    225         }
    226 
    227         u2_part_sads[PART_ID_NxN_TL][a] =
    228             (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
    229         u2_part_sads[PART_ID_NxN_TR][a] =
    230             (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
    231         u2_part_sads[PART_ID_NxN_BL][a] =
    232             (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
    233         u2_part_sads[PART_ID_NxN_BR][a] =
    234             (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
    235         u2_part_sads[PART_ID_Nx2N_L][a] =
    236             u2_part_sads[PART_ID_NxN_TL][a] + u2_part_sads[PART_ID_NxN_BL][a];
    237         u2_part_sads[PART_ID_Nx2N_R][a] =
    238             u2_part_sads[PART_ID_NxN_TR][a] + u2_part_sads[PART_ID_NxN_BR][a];
    239         u2_part_sads[PART_ID_2NxN_T][a] =
    240             u2_part_sads[PART_ID_NxN_TR][a] + u2_part_sads[PART_ID_NxN_TL][a];
    241         u2_part_sads[PART_ID_2NxN_B][a] =
    242             u2_part_sads[PART_ID_NxN_BR][a] + u2_part_sads[PART_ID_NxN_BL][a];
    243         u2_part_sads[PART_ID_nLx2N_L][a] =
    244             (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
    245         u2_part_sads[PART_ID_nRx2N_R][a] =
    246             (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
    247         u2_part_sads[PART_ID_2NxnU_T][a] =
    248             (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
    249         u2_part_sads[PART_ID_2NxnD_B][a] =
    250             (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
    251         u2_part_sads[PART_ID_2Nx2N][a] =
    252             u2_part_sads[PART_ID_2NxN_T][a] + u2_part_sads[PART_ID_2NxN_B][a];
    253         u2_part_sads[PART_ID_2NxnU_B][a] =
    254             u2_part_sads[PART_ID_2Nx2N][a] - u2_part_sads[PART_ID_2NxnU_T][a];
    255         u2_part_sads[PART_ID_2NxnD_T][a] =
    256             u2_part_sads[PART_ID_2Nx2N][a] - u2_part_sads[PART_ID_2NxnD_B][a];
    257         u2_part_sads[PART_ID_nRx2N_L][a] =
    258             u2_part_sads[PART_ID_2Nx2N][a] - u2_part_sads[PART_ID_nRx2N_R][a];
    259         u2_part_sads[PART_ID_nLx2N_R][a] =
    260             u2_part_sads[PART_ID_2Nx2N][a] - u2_part_sads[PART_ID_nLx2N_L][a];
    261     }
    262 }
    263 
    264 /**
    265 ********************************************************************************
    266 *  @fn     compute_part_sads_for_MxM_blk(grid_ctxt_t *ps_grid,
    267 *                                       UWORD8      *pu1_cur_ptr,
    268 *                                       WORD32      cur_buf_stride,
    269 *                                       WORD32     **pi4_part_sads,
    270 *                                       cand_t      *ps_cand,
    271 *                                       WORD32      *num_cands
    272 *
    273 *  @brief  Computes partial SADs and updates partition results for an MxM blk
    274 *          and does so for several grids of points. This can be used for
    275 *          32x32/64x64 blks with 17 partition updates
    276 *
    277 *
    278 *  @param[in]  ps_grid : Pointer to grid ctxt that has multiple grid of max
    279 *                        9 pts per grid
    280 *
    281 *  @param[in]  pu1_cur_ptr : Top left of input buffer
    282 *
    283 *  @param[in]  pi4_part_sads : array of pointers, each entry pointing to
    284 *                             results to be updated for a given partition
    285 *
    286 *  @return   The ps_search_results structure has the best result updated for
    287 *            the 2Nx2N partition alone.
    288 
    289 ********************************************************************************
    290 */
    291 void compute_part_sads_for_MxM_blk(
    292     grid_ctxt_t *ps_grid,
    293     UWORD8 *pu1_cur_ptr,
    294     WORD32 cur_buf_stride,
    295     WORD32 **pp_part_sads,
    296     cand_t *ps_cand,
    297     WORD32 *num_cands,
    298     CU_SIZE_T e_cu_size)
    299 {
    300     WORD32 a, b, c, d, i;
    301     WORD16 grd_sz_y = (ps_grid->grd_sz_y_x & 0xFFFF0000) >> 16;
    302     WORD16 grd_sz_x = (ps_grid->grd_sz_y_x & 0xFFFF);
    303 
    304     /* Assumes the following order: C, L, T, R, B, TL, TR, BL, BR */
    305     WORD32 offset_x[9] = { 0, -grd_sz_x, 0, grd_sz_x, 0, -grd_sz_x, grd_sz_x, -grd_sz_x, grd_sz_x };
    306     WORD32 offset_y[9] = { 0, 0, -grd_sz_y, 0, grd_sz_y, -grd_sz_y, -grd_sz_y, grd_sz_y, grd_sz_y };
    307     WORD32 shift = (WORD32)e_cu_size;
    308 
    309     WORD32 ref_buf_stride = ps_grid->ref_buf_stride;
    310     WORD32 cur_buf_stride_lsN = (cur_buf_stride << (1 + shift));
    311     WORD32 ref_buf_stride_lsN = (ref_buf_stride << (1 + shift));
    312     /* Num rows and pixels per row: 8 for CU_32x32 and 16 for CU_64x64 */
    313     WORD32 num_rows_in_nxn = 2 << shift;
    314     WORD32 num_pixels_in_row = 2 << shift;
    315     cand_t *cand0 = ps_cand;
    316     /* for a 2Nx2N partition we evaluate nxn SADs, where n = N/2. This is */
    317     /* needed for AMP cases.                                              */
    318     WORD32 a_nxn_sad[NUM_4X4];
    319     *num_cands = 0;
    320 
    321     /* Loop to fill up the cand_t array and to calculate num_cands */
    322     for(i = 0; i < ps_grid->num_grids; i++)
    323     {
    324         WORD32 j;
    325         WORD32 mask = ps_grid->pi4_grd_mask[i];
    326         UWORD8 *pu1_ref_ptr_center = ps_grid->ppu1_ref_ptr[i];
    327         WORD32 mv_x = ps_grid->p_mv[i].i2_mv_x;
    328         WORD32 mv_y = (ps_grid->p_mv[i].i2_mv_y);
    329 
    330         for(j = 0; j < NUM_CANDIDATES_IN_GRID; j++, mask >>= 1)
    331         {
    332             if(mask & 1)
    333             {
    334                 *num_cands = *num_cands + 1;
    335                 cand0->grid_ix = i;
    336                 cand0->ref_idx = ps_grid->p_ref_idx[i];
    337                 cand0->pu1_ref_ptr =
    338                     pu1_ref_ptr_center + offset_x[j] + ref_buf_stride * offset_y[j];
    339                 cand0->mv.i2_mv_x = (S16)(mv_x) + offset_x[j];
    340                 cand0->mv.i2_mv_y = (S16)(mv_y) + offset_y[j];
    341                 cand0++;
    342             }
    343         }
    344     }
    345 
    346     /* Loop to compute the SAD's */
    347     for(a = 0; a < *num_cands; a++)
    348     {
    349         cand_t *cand = ps_cand + a;
    350         memset(&a_nxn_sad[0], 0, NUM_4X4 * sizeof(WORD32));
    351         for(b = 0; b < NUM_4X4; b++)
    352         {
    353             WORD32 t1 = (b % 4) * num_pixels_in_row + (b >> 2) * cur_buf_stride_lsN;
    354             WORD32 t2 = (b % 4) * num_pixels_in_row + (b >> 2) * ref_buf_stride_lsN;
    355 
    356             for(c = 0; c < num_rows_in_nxn; c++)
    357             {
    358                 WORD32 z_cur = (cur_buf_stride)*c + t1;
    359                 WORD32 z_ref = (ref_buf_stride)*c + t2;
    360                 for(d = 0; d < num_pixels_in_row; d++)
    361                 {
    362                     a_nxn_sad[b] += (WORD32)ABS(
    363                         (((WORD32)cand->pu1_ref_ptr[(z_ref + d)]) -
    364                          ((WORD32)pu1_cur_ptr[(z_cur + d)])));
    365                 }
    366             }
    367         }
    368 
    369         pp_part_sads[PART_ID_NxN_TL][a] =
    370             (a_nxn_sad[0] + a_nxn_sad[1] + a_nxn_sad[4] + a_nxn_sad[5]);
    371         pp_part_sads[PART_ID_NxN_TR][a] =
    372             (a_nxn_sad[2] + a_nxn_sad[3] + a_nxn_sad[6] + a_nxn_sad[7]);
    373         pp_part_sads[PART_ID_NxN_BL][a] =
    374             (a_nxn_sad[8] + a_nxn_sad[9] + a_nxn_sad[12] + a_nxn_sad[13]);
    375         pp_part_sads[PART_ID_NxN_BR][a] =
    376             (a_nxn_sad[10] + a_nxn_sad[11] + a_nxn_sad[14] + a_nxn_sad[15]);
    377         pp_part_sads[PART_ID_Nx2N_L][a] =
    378             pp_part_sads[PART_ID_NxN_TL][a] + pp_part_sads[PART_ID_NxN_BL][a];
    379         pp_part_sads[PART_ID_Nx2N_R][a] =
    380             pp_part_sads[PART_ID_NxN_TR][a] + pp_part_sads[PART_ID_NxN_BR][a];
    381         pp_part_sads[PART_ID_2NxN_T][a] =
    382             pp_part_sads[PART_ID_NxN_TR][a] + pp_part_sads[PART_ID_NxN_TL][a];
    383         pp_part_sads[PART_ID_2NxN_B][a] =
    384             pp_part_sads[PART_ID_NxN_BR][a] + pp_part_sads[PART_ID_NxN_BL][a];
    385         pp_part_sads[PART_ID_nLx2N_L][a] =
    386             (a_nxn_sad[8] + a_nxn_sad[0] + a_nxn_sad[12] + a_nxn_sad[4]);
    387         pp_part_sads[PART_ID_nRx2N_R][a] =
    388             (a_nxn_sad[3] + a_nxn_sad[7] + a_nxn_sad[15] + a_nxn_sad[11]);
    389         pp_part_sads[PART_ID_2NxnU_T][a] =
    390             (a_nxn_sad[1] + a_nxn_sad[0] + a_nxn_sad[2] + a_nxn_sad[3]);
    391         pp_part_sads[PART_ID_2NxnD_B][a] =
    392             (a_nxn_sad[15] + a_nxn_sad[14] + a_nxn_sad[12] + a_nxn_sad[13]);
    393         pp_part_sads[PART_ID_2Nx2N][a] =
    394             pp_part_sads[PART_ID_2NxN_T][a] + pp_part_sads[PART_ID_2NxN_B][a];
    395         pp_part_sads[PART_ID_2NxnU_B][a] =
    396             pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_2NxnU_T][a];
    397         pp_part_sads[PART_ID_2NxnD_T][a] =
    398             pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_2NxnD_B][a];
    399         pp_part_sads[PART_ID_nRx2N_L][a] =
    400             pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_nRx2N_R][a];
    401         pp_part_sads[PART_ID_nLx2N_R][a] =
    402             pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_nLx2N_L][a];
    403     }
    404 }
    405 
    406 void hme_evalsad_grid_pu_16x16(err_prms_t *ps_prms)
    407 {
    408     grid_ctxt_t s_grid;
    409     cand_t as_candt[9];
    410     U16 au2_sad_grid[TOT_NUM_PARTS * 9];
    411     U16 *apu2_sad_grid[TOT_NUM_PARTS];
    412     hme_mv_t s_mv = { 0, 0 };
    413     S32 i4_ref_idx = 0, i;
    414     S32 num_candts = 0;
    415     s_grid.num_grids = 1;
    416     s_grid.ref_buf_stride = ps_prms->i4_ref_stride;
    417     s_grid.grd_sz_y_x = ((ps_prms->i4_step << 16) | ps_prms->i4_step);
    418     s_grid.ppu1_ref_ptr = &ps_prms->pu1_ref;
    419     s_grid.pi4_grd_mask = &ps_prms->i4_grid_mask;
    420     s_grid.p_mv = &s_mv;
    421     s_grid.p_ref_idx = &i4_ref_idx;
    422     for(i = 0; i < 9; i++)
    423     {
    424         if(s_grid.pi4_grd_mask[0] & (1 << i))
    425             num_candts++;
    426     }
    427 
    428     for(i = 0; i < TOT_NUM_PARTS; i++)
    429         apu2_sad_grid[i] = &au2_sad_grid[i * num_candts];
    430 
    431     compute_4x4_sads_for_16x16_blk(
    432         &s_grid, ps_prms->pu1_inp, ps_prms->i4_inp_stride, apu2_sad_grid, as_candt, &num_candts);
    433     for(i = 0; i < TOT_NUM_PARTS * num_candts; i++)
    434     {
    435         ps_prms->pi4_sad_grid[i] = au2_sad_grid[i];
    436     }
    437 }
    438 
    439 void hme_evalsad_grid_npu_MxN(err_prms_t *ps_prms)
    440 {
    441     U08 *pu1_inp_base, *pu1_ref_c;
    442     S32 *pi4_sad = ps_prms->pi4_sad_grid;
    443     S32 i, grid_count = 0;
    444     S32 step = ps_prms->i4_step;
    445     S32 x_off = step, y_off = step * ps_prms->i4_ref_stride;
    446 
    447     ASSERT((ps_prms->i4_part_mask & (ps_prms->i4_part_mask - 1)) == 0);
    448 
    449     //assert(ps_prms->i4_blk_ht <= 8);
    450     //assert(ps_prms->i4_blk_wd <= 8);
    451     for(i = 0; i < 9; i++)
    452     {
    453         if(ps_prms->i4_grid_mask & (1 << i))
    454             grid_count++;
    455     }
    456     pi4_sad += (ps_prms->pi4_valid_part_ids[0] * grid_count);
    457 
    458     pu1_inp_base = ps_prms->pu1_inp;
    459     pu1_ref_c = ps_prms->pu1_ref;
    460     for(i = 0; i < 9; i++)
    461     {
    462         S32 sad = 0, j, k;
    463         U08 *pu1_inp, *pu1_ref;
    464 
    465         if(!(ps_prms->i4_grid_mask & (1 << i)))
    466             continue;
    467         pu1_ref = pu1_ref_c + x_off * gai1_grid_id_to_x[i];
    468         pu1_ref += y_off * gai1_grid_id_to_y[i];
    469         pu1_inp = pu1_inp_base;
    470 
    471         for(j = 0; j < ps_prms->i4_blk_ht; j++)
    472         {
    473             for(k = 0; k < ps_prms->i4_blk_wd; k++)
    474             {
    475                 sad += (ABS((pu1_inp[k] - pu1_ref[k])));
    476             }
    477             pu1_inp += ps_prms->i4_inp_stride;
    478             pu1_ref += ps_prms->i4_ref_stride;
    479         }
    480         *pi4_sad++ = sad;
    481     }
    482 }
    483 
    484 WORD32 hme_evalsad_pt_npu_MxN_8bit_compute(
    485     WORD32 ht,
    486     WORD32 wd,
    487     UWORD8 *pu1_inp,
    488     UWORD8 *pu1_ref,
    489     WORD32 i4_inp_stride,
    490     WORD32 i4_ref_stride)
    491 {
    492     WORD32 i, j;
    493     WORD32 sad = 0;
    494     for(i = 0; i < ht; i++)
    495     {
    496         for(j = 0; j < wd; j++)
    497         {
    498             sad += (ABS(((S32)pu1_inp[j] - (S32)pu1_ref[j])));
    499         }
    500         pu1_inp += i4_inp_stride;
    501         pu1_ref += i4_ref_stride;
    502     }
    503     return sad;
    504 }
    505 
    506 void hme_evalsad_pt_npu_MxN_8bit(err_prms_t *ps_prms)
    507 {
    508     S32 wd, ht;
    509     U08 *pu1_inp, *pu1_ref;
    510 
    511     wd = ps_prms->i4_blk_wd;
    512     ht = ps_prms->i4_blk_ht;
    513 
    514     pu1_inp = ps_prms->pu1_inp;
    515     pu1_ref = ps_prms->pu1_ref;
    516 
    517     ps_prms->pi4_sad_grid[0] = hme_evalsad_pt_npu_MxN_8bit_compute(
    518         ht, wd, pu1_inp, pu1_ref, ps_prms->i4_inp_stride, ps_prms->i4_ref_stride);
    519 }
    520 
    521 void compute_satd_8bit(err_prms_t *ps_prms)
    522 {
    523     U08 *pu1_origin;
    524     S32 src_strd;
    525     U08 *pu1_pred_buf;
    526     S32 dst_strd;
    527     S32 wd, ht;
    528     U32 u4_sad = 0;
    529     WORD32 x, y;
    530     U08 *u1_pi0, *u1_pi1;
    531 
    532     pu1_origin = ps_prms->pu1_inp;
    533     pu1_pred_buf = ps_prms->pu1_ref;
    534     src_strd = ps_prms->i4_inp_stride;
    535     dst_strd = ps_prms->i4_ref_stride;
    536     wd = ps_prms->i4_blk_wd;
    537     ht = ps_prms->i4_blk_ht;
    538 
    539     u1_pi0 = pu1_origin;
    540     u1_pi1 = pu1_pred_buf;
    541 
    542     /* Follows the following logic:
    543     For block sizes less than or equal to 16X16, the basic transform size is 4x4
    544     For block sizes greater than or equal to 32x32, the basic transform size is 8x8 */
    545     if((wd > 0x10) || (ht > 0x10))
    546     {
    547         for(y = 0; y < ht; y += 8)
    548         {
    549             for(x = 0; x < wd; x += 8)
    550             {
    551                 u4_sad += ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
    552                     &u1_pi0[x], src_strd, &u1_pi1[x], dst_strd, NULL, 1);
    553             }
    554             u1_pi0 += src_strd * 8;
    555             u1_pi1 += dst_strd * 8;
    556         }
    557     }
    558     else
    559     {
    560         for(y = 0; y < ht; y += 4)
    561         {
    562             for(x = 0; x < wd; x += 4)
    563             {
    564                 u4_sad += ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_4x4_8bit(
    565                     &u1_pi0[x], src_strd, &u1_pi1[x], dst_strd, NULL, 1);
    566             }
    567             u1_pi0 += src_strd * 4;
    568             u1_pi1 += dst_strd * 4;
    569         }
    570     }
    571 
    572     ps_prms->pi4_sad_grid[0] = (S32)u4_sad;
    573 }
    574 
    575 void hme_init_pred_part(
    576     pred_ctxt_t *ps_pred_ctxt,
    577     search_node_t *ps_tl,
    578     search_node_t *ps_t,
    579     search_node_t *ps_tr,
    580     search_node_t *ps_l,
    581     search_node_t *ps_bl,
    582     search_node_t *ps_coloc,
    583     search_node_t *ps_zeromv,
    584     search_node_t **pps_proj_coloc,
    585     PART_ID_T e_part_id)
    586 {
    587     pred_candt_nodes_t *ps_candt_nodes;
    588 
    589     ps_candt_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
    590 
    591     ps_candt_nodes->ps_tl = ps_tl;
    592     ps_candt_nodes->ps_tr = ps_tr;
    593     ps_candt_nodes->ps_t = ps_t;
    594     ps_candt_nodes->ps_l = ps_l;
    595     ps_candt_nodes->ps_bl = ps_bl;
    596     ps_candt_nodes->ps_coloc = ps_coloc;
    597     ps_candt_nodes->ps_zeromv = ps_zeromv;
    598     ps_candt_nodes->pps_proj_coloc = pps_proj_coloc;
    599 }
    600 
    601 void hme_init_pred_ctxt_no_encode(
    602     pred_ctxt_t *ps_pred_ctxt,
    603     search_results_t *ps_search_results,
    604     search_node_t *ps_top_candts,
    605     search_node_t *ps_left_candts,
    606     search_node_t **pps_proj_coloc_candts,
    607     search_node_t *ps_coloc_candts,
    608     search_node_t *ps_zeromv_candt,
    609     S32 pred_lx,
    610     S32 lambda,
    611     S32 lambda_q_shift,
    612     U08 **ppu1_ref_bits_tlu,
    613     S16 *pi2_ref_scf)
    614 {
    615     search_node_t *ps_invalid, *ps_l, *ps_t, *ps_tl, *ps_tr, *ps_bl;
    616     search_node_t *ps_coloc;
    617     PART_ID_T e_part_id;
    618 
    619     /* Assume that resolution is subpel to begin with */
    620     ps_pred_ctxt->mv_pel = 0;  // FPEL
    621 
    622     /* lambda and pred_lx (PRED_L0/PRED_L1) */
    623     ps_pred_ctxt->lambda = lambda;
    624     ps_pred_ctxt->lambda_q_shift = lambda_q_shift;
    625     ps_pred_ctxt->pred_lx = pred_lx;
    626     ps_pred_ctxt->ppu1_ref_bits_tlu = ppu1_ref_bits_tlu;
    627     ps_pred_ctxt->pi2_ref_scf = pi2_ref_scf;
    628     ps_pred_ctxt->proj_used = 0;
    629 
    630     /* Bottom left should not be valid */
    631     ASSERT(ps_left_candts[2].u1_is_avail == 0);
    632     ps_invalid = &ps_left_candts[2];
    633 
    634     /*************************************************************************/
    635     /* for the case of no encode, the idea is to set up cants as follows     */
    636     /*                                                                       */
    637     /*    ____ ______________                                                */
    638     /*   | TL | T  | T1 | TR |                                               */
    639     /*   |____|____|____|____|                                               */
    640     /*   | L  | b0 | b1 |                                                    */
    641     /*   |____|____|____|                                                    */
    642     /*   | L1 | b2 | b3 |                                                    */
    643     /*   |____|____|____|                                                    */
    644     /*   | BL |                                                              */
    645     /*   |____|                                                              */
    646     /*                                                                       */
    647     /*  If use_4x4 is 0, then b0,b1,b2,b3 are single 8x8 blk. then T=T1      */
    648     /* and L=L1. topleft, top and topright are TL,T,TR respectively          */
    649     /* Left and bottom left is L and BL respectively.                        */
    650     /* If use_4x4 is 1: then the above holds true only for PARTID = 0 (8x8)  */
    651     /*  For the 4 subblocks (partids 4-7)                                    */
    652     /*                                                                       */
    653     /*  Block   Left   Top   Top Left   Top Right   Bottom Left             */
    654     /*    b0    L      T      TL          T1          L1                     */
    655     /*    b1    b0     T1     T           TR          BL(invalid)            */
    656     /*    b2    L1     b0     L0          b1          BL (invalid)           */
    657     /*    b3    b2     b1     b0          BL(inv)     BL (inv)               */
    658     /*                                                                       */
    659     /* Note : For block b1, bottom left pts to b2, which is not yet ready    */
    660     /*  hence it is kept invalid and made to pt to BL. For block b3 top rt   */
    661     /* is invalid and hence made to pt to BL which is invalid.               */
    662     /* BL is invalid since it lies in a bottom left 8x8 blk and not yet ready*/
    663     /*************************************************************************/
    664 
    665     /* ps_coloc always points to a fixe candt (global) */
    666     /* TODO : replace incoming ps_coloc from global to geniune coloc */
    667     ps_coloc = ps_coloc_candts;
    668 
    669     /* INITIALIZATION OF 8x8 BLK */
    670     ps_tl = ps_top_candts;
    671     ps_t = ps_tl + 2;
    672     ps_tr = ps_t + 1;
    673     ps_l = ps_left_candts + 1;
    674     ps_bl = ps_invalid;
    675     e_part_id = PART_ID_2Nx2N;
    676     hme_init_pred_part(
    677         ps_pred_ctxt,
    678         ps_tl,
    679         ps_t,
    680         ps_tr,
    681         ps_l,
    682         ps_bl,
    683         ps_coloc,
    684         ps_zeromv_candt,
    685         pps_proj_coloc_candts,
    686         e_part_id);
    687 
    688     /* INITIALIZATION OF 4x4 TL BLK */
    689     e_part_id = PART_ID_NxN_TL;
    690     ps_tl = ps_top_candts;
    691     ps_t = ps_tl + 1;
    692     ps_tr = ps_t + 1;
    693     ps_l = ps_left_candts;
    694     ps_bl = ps_l + 1;
    695     hme_init_pred_part(
    696         ps_pred_ctxt,
    697         ps_tl,
    698         ps_t,
    699         ps_tr,
    700         ps_l,
    701         ps_bl,
    702         ps_coloc,
    703         ps_zeromv_candt,
    704         pps_proj_coloc_candts,
    705         e_part_id);
    706 
    707     /* INITIALIZATION OF 4x4 TR BLK */
    708     e_part_id = PART_ID_NxN_TR;
    709     ps_tl = ps_top_candts + 1;
    710     ps_t = ps_tl + 1;
    711     ps_tr = ps_t + 1;
    712     ps_l = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
    713     ps_bl = ps_invalid;
    714     hme_init_pred_part(
    715         ps_pred_ctxt,
    716         ps_tl,
    717         ps_t,
    718         ps_tr,
    719         ps_l,
    720         ps_bl,
    721         ps_coloc,
    722         ps_zeromv_candt,
    723         pps_proj_coloc_candts,
    724         e_part_id);
    725 
    726     /* INITIALIZATION OF 4x4 BL BLK */
    727     e_part_id = PART_ID_NxN_BL;
    728     ps_tl = ps_left_candts;
    729     ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
    730     ps_tr = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
    731     ps_l = ps_left_candts + 1;
    732     ps_bl = ps_invalid;  //invalid
    733     hme_init_pred_part(
    734         ps_pred_ctxt,
    735         ps_tl,
    736         ps_t,
    737         ps_tr,
    738         ps_l,
    739         ps_bl,
    740         ps_coloc,
    741         ps_zeromv_candt,
    742         pps_proj_coloc_candts,
    743         e_part_id);
    744 
    745     /* INITIALIZATION OF 4x4 BR BLK */
    746     e_part_id = PART_ID_NxN_BR;
    747     ps_tl = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
    748     ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
    749     ps_tr = ps_invalid;  // invalid
    750     ps_l = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_BL];
    751     ps_bl = ps_invalid;  // invalid
    752     hme_init_pred_part(
    753         ps_pred_ctxt,
    754         ps_tl,
    755         ps_t,
    756         ps_tr,
    757         ps_l,
    758         ps_bl,
    759         ps_coloc,
    760         ps_zeromv_candt,
    761         pps_proj_coloc_candts,
    762         e_part_id);
    763 }
    764 
    765 void hme_init_pred_ctxt_encode(
    766     pred_ctxt_t *ps_pred_ctxt,
    767     search_results_t *ps_search_results,
    768     search_node_t *ps_coloc_candts,
    769     search_node_t *ps_zeromv_candt,
    770     mv_grid_t *ps_mv_grid,
    771     S32 pred_lx,
    772     S32 lambda,
    773     S32 lambda_q_shift,
    774     U08 **ppu1_ref_bits_tlu,
    775     S16 *pi2_ref_scf)
    776 {
    777     search_node_t *ps_invalid, *ps_l, *ps_t, *ps_tl, *ps_tr, *ps_bl;
    778     search_node_t *ps_coloc;
    779     search_node_t *ps_grid_cu_base;
    780     CU_SIZE_T e_cu_size = ps_search_results->e_cu_size;
    781 
    782     /* Part Start, Part sizes in 4x4 units */
    783     S32 part_wd, part_ht, part_start_x, part_start_y;
    784 
    785     /* Partition type, number of partitions in type */
    786     S32 part_id;
    787 
    788     /* Coordinates of the CU in 4x4 units */
    789     S32 cu_start_x, cu_start_y;
    790     S32 shift = e_cu_size;
    791 
    792     /* top right and bot left validity at CU level */
    793     S32 cu_tr_valid, cu_bl_valid;
    794     /* strideo f the grid */
    795     S32 grid_stride = ps_mv_grid->i4_stride;
    796 
    797     ps_pred_ctxt->lambda = lambda;
    798     ps_pred_ctxt->lambda_q_shift = lambda_q_shift;
    799     ps_pred_ctxt->pred_lx = pred_lx;
    800     ps_pred_ctxt->mv_pel = 0;
    801     ps_pred_ctxt->ppu1_ref_bits_tlu = ppu1_ref_bits_tlu;
    802     ps_pred_ctxt->pi2_ref_scf = pi2_ref_scf;
    803     ps_pred_ctxt->proj_used = 1;
    804 
    805     cu_start_x = ps_search_results->u1_x_off >> 2;
    806     cu_start_y = ps_search_results->u1_y_off >> 2;
    807 
    808     /* Coloc always points to fixed global candt */
    809     ps_coloc = ps_coloc_candts;
    810 
    811     /* Go to base of the CU in the MV Grid */
    812     ps_grid_cu_base = &ps_mv_grid->as_node[0];
    813     ps_grid_cu_base += (ps_mv_grid->i4_start_offset + cu_start_x);
    814     ps_grid_cu_base += (grid_stride * cu_start_y);
    815 
    816     /* points to the real bottom left of the grid, will never be valid */
    817     ps_invalid = &ps_mv_grid->as_node[0];
    818     ps_invalid += (grid_stride * 17);
    819 
    820     {
    821         S32 shift = 1 + e_cu_size;
    822         cu_tr_valid = gau1_cu_tr_valid[cu_start_y >> shift][cu_start_x >> shift];
    823         cu_bl_valid = gau1_cu_bl_valid[cu_start_y >> shift][cu_start_x >> shift];
    824     }
    825 
    826     /*************************************************************************/
    827     /* for the case of    encode, the idea is to set up cants as follows     */
    828     /*                                                                       */
    829     /*    ____ ______________ ____ ____                                      */
    830     /*   | T0 | T1 | T2 | T3 | T4 | T5 |                                     */
    831     /*   |____|____|____|____|____|____|                                     */
    832     /*   | L1 |    |              |                                          */
    833     /*   |____|    |              |                                          */
    834     /*   | L2 | p0 |     p1       |                                          */
    835     /*   |____|    |              |                                          */
    836     /*   | L3 |    |              |                                          */
    837     /*   |____|    |              |                                          */
    838     /*   | L4 | L' |              |                                          */
    839     /*   |____|____|______________|                                          */
    840     /*   | BL |                                                              */
    841     /*   |____|                                                              */
    842     /*  The example is shown with 16x16 CU, though it can be generalized     */
    843     /*  This CU has 2 partitions, cu_wd = 4. also p_wd, p_ht are partition   */
    844     /*  width and ht in 4x4 units.                                           */
    845     /*  For a given CU, derive the top left, top and bottom left and top rt  */
    846     /*  pts. Left and top are assumed to be valid.                           */
    847     /*  IF there aretwo partitions in the CU (like p0 and p1) and vertical,  */
    848     /*  then for first partition, left, top, top left and top right valid    */
    849     /*  Bottom left is valid. store these validity flags. Also store the     */
    850     /*  grid offsets of the partitions w.r.t. CU start in units of 4x4.For p0*/
    851     /*  Left grid offset = -1, 3. Top Grd offset = -1, 0.                    */
    852     /*  Top left grid offset = -1, -1. Top right = 1, -1. BL = -1, 4.        */
    853     /*  For p1, validity flags are left, top, top left, top right, valid.    */
    854     /*  BL is invalid. Grid offsets are: Left = dont care. T = 1, -1 (T2)    */
    855     /*  TR = 4, -1 (T5). TL = 0, -1 (T1). BL = don't care.                   */
    856     /*  For p1, set the left pred candt to the best search result of p0.     */
    857     /*************************************************************************/
    858 
    859     /* Loop over all partitions, and identify the 5 neighbours */
    860     for(part_id = 0; part_id < TOT_NUM_PARTS; part_id++)
    861     {
    862         part_attr_t *ps_part_attr = &gas_part_attr_in_cu[part_id];
    863         S32 tr_valid, bl_valid, is_vert;
    864         search_node_t *ps_grid_pu_base;
    865         PART_TYPE_T e_part_type;
    866         PART_ID_T first_part;
    867         S32 part_num;
    868 
    869         e_part_type = ge_part_id_to_part_type[part_id];
    870         first_part = ge_part_type_to_part_id[e_part_type][0];
    871         is_vert = gau1_is_vert_part[e_part_type];
    872         part_num = gau1_part_id_to_part_num[part_id];
    873         tr_valid = gau1_partid_tr_valid[part_id] & cu_tr_valid;
    874         bl_valid = gau1_partid_bl_valid[part_id] & cu_bl_valid;
    875 
    876         part_start_x = (ps_part_attr->u1_x_start << shift) >> 2;
    877         part_start_y = (ps_part_attr->u1_y_start << shift) >> 2;
    878         part_wd = (ps_part_attr->u1_x_count << shift) >> 2;
    879         part_ht = (ps_part_attr->u1_y_count << shift) >> 2;
    880 
    881         /* go to top left of part */
    882         ps_grid_pu_base = ps_grid_cu_base + part_start_x;
    883         ps_grid_pu_base += (part_start_y * grid_stride);
    884 
    885         ps_tl = ps_grid_pu_base - 1 - grid_stride;
    886         ps_t = ps_grid_pu_base - grid_stride + part_wd - 1;
    887         ps_l = ps_grid_pu_base - 1 + ((part_ht - 1) * grid_stride);
    888         ps_tr = ps_t + 1;
    889         ps_bl = ps_l + grid_stride;
    890 
    891         if(!tr_valid)
    892             ps_tr = ps_invalid;
    893         if(!bl_valid)
    894             ps_bl = ps_invalid;
    895 
    896         if(part_num == 1)
    897         {
    898             /* for cases of two partitions 2nd part has 1st part as candt */
    899             /* if vertical type, left candt of 2nd part is 1st part.      */
    900             /* if horz type, top candt of 2nd part is 1st part.           */
    901             if(is_vert)
    902             {
    903                 ps_l = ps_search_results->aps_part_results[pred_lx][first_part];
    904             }
    905             else
    906             {
    907                 ps_t = ps_search_results->aps_part_results[pred_lx][first_part];
    908             }
    909         }
    910         if(part_num == 2)
    911         {
    912             /* only possible for NxN_BL */
    913             ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
    914             ps_tr = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
    915         }
    916         if(part_num == 3)
    917         {
    918             /* only possible for NxN_BR */
    919             ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
    920             ps_tl = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
    921             ps_l = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_BL];
    922         }
    923         hme_init_pred_part(
    924             ps_pred_ctxt,
    925             ps_tl,
    926             ps_t,
    927             ps_tr,
    928             ps_l,
    929             ps_bl,
    930             ps_coloc,
    931             ps_zeromv_candt,
    932             NULL,
    933             (PART_ID_T)part_id);
    934     }
    935 }
    936 
    937 /**
    938 ********************************************************************************
    939 *  @fn     compute_mv_cost_explicit(search_node_t *ps_node,
    940 *                   pred_ctxt_t *ps_pred_ctxt,
    941 *                   PART_ID_T e_part_id)
    942 *
    943 *  @brief  MV cost for explicit search in layers not encoded
    944 *
    945 *  @param[in]  ps_node: search node having mv and ref id for which to eval cost
    946 *
    947 *  @param[in]  ps_pred_ctxt : mv pred context
    948 *
    949 *  @param[in]  e_part_id : Partition id.
    950 *
    951 *  @return   Cost value
    952 
    953 ********************************************************************************
    954 */
    955 S32 compute_mv_cost_explicit(
    956     search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
    957 {
    958 #define RETURN_FIXED_COST 0
    959     search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
    960     pred_candt_nodes_t *ps_pred_nodes;
    961     S32 inp_shift = 2 - inp_mv_pel;
    962     S32 pred_shift = 2 - ps_pred_ctxt->mv_pel;
    963     S32 mv_p_x, mv_p_y;
    964     S16 mvdx1, mvdx2, mvdy1, mvdy2;
    965     S32 cost, ref_bits;
    966 
    967     /*************************************************************************/
    968     /* Logic for cost computation for explicit search. For such a search,    */
    969     /* it is guaranteed that all predictor candts have same ref id. The only */
    970     /* probable issue is with the availability which needs checking. This fxn*/
    971     /* does not suffer the need to scale predictor candts due to diff ref id */
    972     /*************************************************************************/
    973 
    974     /* Hack: currently we always assume 2Nx2N. */
    975     /* TODO: get rid of this hack and return cost tuned to each partition */
    976     ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
    977     ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_node->i1_ref_idx];
    978 
    979     /*************************************************************************/
    980     /* Priority to bottom left availability. Else we go to left. If both are */
    981     /* not available, then a remains null                                    */
    982     /*************************************************************************/
    983     if(ps_pred_nodes->ps_tl->u1_is_avail)
    984         ps_pred_node_a = ps_pred_nodes->ps_tl;
    985     else if(ps_pred_nodes->ps_l->u1_is_avail)
    986         ps_pred_node_a = ps_pred_nodes->ps_l;
    987 
    988     /*************************************************************************/
    989     /* For encoder, top left may not be really needed unless we use slices,  */
    990     /* and even then in ME it may not be relevant. So we only consider T or  */
    991     /* TR, as, if both T and TR are not available, TL also will not be       */
    992     /*************************************************************************/
    993     if(ps_pred_nodes->ps_tr->u1_is_avail)
    994         ps_pred_node_b = ps_pred_nodes->ps_tr;
    995     else if(ps_pred_nodes->ps_t->u1_is_avail)
    996         ps_pred_node_b = ps_pred_nodes->ps_t;
    997 
    998     if(ps_pred_node_a == NULL)
    999     {
   1000         ps_pred_node_a = ps_pred_nodes->ps_coloc;
   1001         if(ps_pred_node_b == NULL)
   1002             ps_pred_node_b = ps_pred_nodes->ps_zeromv;
   1003     }
   1004     else if(ps_pred_node_b == NULL)
   1005         ps_pred_node_b = ps_pred_nodes->ps_coloc;
   1006     else if(0 == hme_cmp_nodes(ps_pred_node_a, ps_pred_node_b))
   1007     {
   1008         ps_pred_node_b = ps_pred_nodes->ps_coloc;
   1009     }
   1010 
   1011     mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
   1012     mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
   1013     COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
   1014     mvdx1 = ABS(mvdx1);
   1015     mvdy1 = ABS(mvdy1);
   1016 
   1017     mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
   1018     mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
   1019     COMPUTE_DIFF_MV(mvdx2, mvdy2, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
   1020     mvdx2 = ABS(mvdx2);
   1021     mvdy2 = ABS(mvdy2);
   1022 
   1023     if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
   1024     {
   1025         cost =
   1026             hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
   1027     }
   1028     else
   1029     {
   1030         cost =
   1031             hme_get_range(mvdx2) + hme_get_range(mvdy2) + (mvdx2 > 0) + (mvdy2 > 0) + ref_bits + 2;
   1032     }
   1033     {
   1034         S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
   1035         return ((cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift);
   1036     }
   1037 }
   1038 /**
   1039 ********************************************************************************
   1040 *  @fn     compute_mv_cost_coarse(search_node_t *ps_node,
   1041 *                   pred_ctxt_t *ps_pred_ctxt,
   1042 *                   PART_ID_T e_part_id)
   1043 *
   1044 *  @brief  MV cost for coarse explicit search in coarsest layer
   1045 *
   1046 *  @param[in]  ps_node: search node having mv and ref id for which to eval cost
   1047 *
   1048 *  @param[in]  ps_pred_ctxt : mv pred context
   1049 *
   1050 *  @param[in]  e_part_id : Partition id.
   1051 *
   1052 *  @return   Cost value
   1053 
   1054 ********************************************************************************
   1055 */
   1056 S32 compute_mv_cost_coarse(
   1057     search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
   1058 {
   1059     ARG_NOT_USED(e_part_id);
   1060 
   1061     return (compute_mv_cost_explicit(ps_node, ps_pred_ctxt, PART_ID_2Nx2N, inp_mv_pel));
   1062 }
   1063 
   1064 /**
   1065 ********************************************************************************
   1066 *  @fn     compute_mv_cost_coarse_high_speed(search_node_t *ps_node,
   1067 *                                            pred_ctxt_t *ps_pred_ctxt,
   1068 *                                            PART_ID_T e_part_id)
   1069 *
   1070 *  @brief  MV cost for coarse explicit search in coarsest layer
   1071 *
   1072 *  @param[in]  ps_node: search node having mv and ref id for which to eval cost
   1073 *
   1074 *  @param[in]  ps_pred_ctxt : mv pred context
   1075 *
   1076 *  @param[in]  e_part_id : Partition id.
   1077 *
   1078 *  @return   Cost value
   1079 
   1080 ********************************************************************************
   1081 */
   1082 S32 compute_mv_cost_coarse_high_speed(
   1083     search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
   1084 {
   1085     S32 rnd, mvx, mvy, i4_search_idx;
   1086     S32 cost;
   1087 
   1088     mvx = ps_node->s_mv.i2_mvx;
   1089     mvy = ps_node->s_mv.i2_mvy;
   1090     i4_search_idx = ps_node->i1_ref_idx;
   1091 
   1092     cost = (2 * hme_get_range(ABS(mvx)) - 1) + (2 * hme_get_range(ABS(mvy)) - 1) + i4_search_idx;
   1093     cost += (mvx != 0) ? 1 : 0;
   1094     cost += (mvy != 0) ? 1 : 0;
   1095     rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
   1096     cost = (cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift;
   1097     return cost;
   1098 }
   1099 
   1100 /**
   1101 ********************************************************************************
   1102 *  @fn     compute_mv_cost_explicit_refine(search_node_t *ps_node,
   1103 *                                          pred_ctxt_t *ps_pred_ctxt,
   1104 *                                          PART_ID_T e_part_id)
   1105 *
   1106 *  @brief  MV cost for explicit search in layers not encoded. Always returns
   1107 *          cost of the projected colocated candidate
   1108 *
   1109 *  @param[in]  ps_node: search node having mv and ref id for which to eval cost
   1110 *
   1111 *  @param[in]  ps_pred_ctxt : mv pred context
   1112 *
   1113 *  @param[in]  e_part_id : Partition id.
   1114 *
   1115 *  @return   Cost value
   1116 
   1117 ********************************************************************************
   1118 */
   1119 S32 compute_mv_cost_explicit_refine(
   1120     search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
   1121 {
   1122     search_node_t *ps_pred_node_a = NULL;
   1123     pred_candt_nodes_t *ps_pred_nodes;
   1124     S32 inp_shift = 2 - inp_mv_pel;
   1125     S32 pred_shift = 2 - ps_pred_ctxt->mv_pel;
   1126     S32 mv_p_x, mv_p_y;
   1127     S16 mvdx1, mvdy1;
   1128     S32 cost, ref_bits;
   1129 
   1130     ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
   1131     ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_node->i1_ref_idx];
   1132 
   1133     ps_pred_node_a = ps_pred_nodes->pps_proj_coloc[0];
   1134 
   1135     mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
   1136     mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
   1137     COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
   1138     mvdx1 = ABS(mvdx1);
   1139     mvdy1 = ABS(mvdy1);
   1140 
   1141     cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
   1142 
   1143     {
   1144         S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
   1145         return ((cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift);
   1146     }
   1147 }
   1148 
   1149 /**
   1150 ********************************************************************************
   1151 *  @fn     compute_mv_cost_refine(search_node_t *ps_node,
   1152 *                   pred_ctxt_t *ps_pred_ctxt,
   1153 *                   PART_ID_T e_part_id)
   1154 *
   1155 *  @brief  MV cost for coarse explicit search in coarsest layer
   1156 *
   1157 *  @param[in]  ps_node: search node having mv and ref id for which to eval cost
   1158 *
   1159 *  @param[in]  ps_pred_ctxt : mv pred context
   1160 *
   1161 *  @param[in]  e_part_id : Partition id.
   1162 *
   1163 *  @return   Cost value
   1164 
   1165 ********************************************************************************
   1166 */
   1167 S32 compute_mv_cost_refine(
   1168     search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
   1169 {
   1170     return (compute_mv_cost_explicit_refine(ps_node, ps_pred_ctxt, e_part_id, inp_mv_pel));
   1171 }
   1172 
   1173 S32 compute_mv_cost_implicit(
   1174     search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
   1175 {
   1176     search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
   1177     pred_candt_nodes_t *ps_pred_nodes;
   1178     S08 i1_ref_idx;
   1179     S08 i1_ref_tl = -1, i1_ref_tr = -1, i1_ref_t = -1;
   1180     S08 i1_ref_bl = -1, i1_ref_l = -1;
   1181     S32 inp_shift = 2 - inp_mv_pel;
   1182     S32 pred_shift; /* = 2 - ps_pred_ctxt->mv_pel;*/
   1183     S32 ref_bits, cost;
   1184     S32 mv_p_x, mv_p_y;
   1185     S16 mvdx1, mvdx2, mvdy1, mvdy2;
   1186 
   1187     //return 0;
   1188     i1_ref_idx = ps_node->i1_ref_idx;
   1189 
   1190     /*************************************************************************/
   1191     /* Logic for cost computation for explicit search. For such a search,    */
   1192     /* it is guaranteed that all predictor candts have same ref id. The only */
   1193     /* probable issue is with the availability which needs checking. This fxn*/
   1194     /* does not suffer the need to scale predictor candts due to diff ref id */
   1195     /*************************************************************************/
   1196 
   1197     ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
   1198     ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][i1_ref_idx];
   1199 
   1200     /*************************************************************************/
   1201     /* Priority to bottom left availability. Else we go to left. If both are */
   1202     /* not available, then a remains null                                    */
   1203     /*************************************************************************/
   1204     if(ps_pred_nodes->ps_bl->u1_is_avail)
   1205         i1_ref_bl = ps_pred_nodes->ps_bl->i1_ref_idx;
   1206     if(ps_pred_nodes->ps_l->u1_is_avail)
   1207         i1_ref_l = ps_pred_nodes->ps_l->i1_ref_idx;
   1208     if(i1_ref_bl == i1_ref_idx)
   1209         ps_pred_node_a = ps_pred_nodes->ps_bl;
   1210     else if(i1_ref_l == i1_ref_idx)
   1211         ps_pred_node_a = ps_pred_nodes->ps_l;
   1212     if(ps_pred_node_a == NULL)
   1213     {
   1214         if(i1_ref_bl != -1)
   1215             ps_pred_node_a = ps_pred_nodes->ps_bl;
   1216         else if(i1_ref_l != -1)
   1217             ps_pred_node_a = ps_pred_nodes->ps_l;
   1218     }
   1219 
   1220     /*************************************************************************/
   1221     /* For encoder, top left may not be really needed unless we use slices,  */
   1222     /* and even then in ME it may not be relevant. So we only consider T or  */
   1223     /* TR, as, if both T and TR are not available, TL also will not be       */
   1224     /*************************************************************************/
   1225     if(ps_pred_nodes->ps_tr->u1_is_avail)
   1226         i1_ref_tr = ps_pred_nodes->ps_tr->i1_ref_idx;
   1227     if(ps_pred_nodes->ps_t->u1_is_avail)
   1228         i1_ref_t = ps_pred_nodes->ps_t->i1_ref_idx;
   1229     if(ps_pred_nodes->ps_tl->u1_is_avail)
   1230         i1_ref_tl = ps_pred_nodes->ps_tl->i1_ref_idx;
   1231     if(i1_ref_tr == i1_ref_idx)
   1232         ps_pred_node_b = ps_pred_nodes->ps_tr;
   1233     else if(i1_ref_t == i1_ref_idx)
   1234         ps_pred_node_b = ps_pred_nodes->ps_t;
   1235     else if(i1_ref_tl == i1_ref_idx)
   1236         ps_pred_node_b = ps_pred_nodes->ps_tl;
   1237 
   1238     if(ps_pred_node_b == NULL)
   1239     {
   1240         if(i1_ref_tr != -1)
   1241             ps_pred_node_b = ps_pred_nodes->ps_tr;
   1242         else if(i1_ref_t != -1)
   1243             ps_pred_node_b = ps_pred_nodes->ps_t;
   1244         else if(i1_ref_tl != -1)
   1245             ps_pred_node_b = ps_pred_nodes->ps_tl;
   1246     }
   1247     if(ps_pred_node_a == NULL)
   1248     {
   1249         ps_pred_node_a = ps_pred_nodes->ps_coloc;
   1250         if(ps_pred_node_b == NULL)
   1251             ps_pred_node_b = ps_pred_nodes->ps_zeromv;
   1252     }
   1253     else if(ps_pred_node_b == NULL)
   1254         ps_pred_node_b = ps_pred_nodes->ps_coloc;
   1255     else if(0 == hme_cmp_nodes(ps_pred_node_a, ps_pred_node_b))
   1256     {
   1257         ps_pred_node_b = ps_pred_nodes->ps_coloc;
   1258     }
   1259 
   1260     if(ps_pred_node_a->i1_ref_idx != i1_ref_idx)
   1261     {
   1262         SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_a, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
   1263     }
   1264     else
   1265     {
   1266         mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
   1267         mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
   1268     }
   1269     pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
   1270     COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
   1271     mvdx1 = ABS(mvdx1);
   1272     mvdy1 = ABS(mvdy1);
   1273 
   1274     if(ps_pred_node_b->i1_ref_idx != i1_ref_idx)
   1275     {
   1276         SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_b, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
   1277     }
   1278     else
   1279     {
   1280         mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
   1281         mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
   1282     }
   1283     pred_shift = ps_pred_node_b->u1_subpel_done ? 0 : 2;
   1284     COMPUTE_DIFF_MV(mvdx2, mvdy2, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
   1285     mvdx2 = ABS(mvdx2);
   1286     mvdy2 = ABS(mvdy2);
   1287 
   1288     if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
   1289     {
   1290         cost = 2 * hme_get_range(mvdx1) + 2 * hme_get_range(mvdy1) + 2 * (mvdx1 > 0) +
   1291                2 * (mvdy1 > 0) + ref_bits + 2;
   1292     }
   1293     else
   1294     {
   1295         cost = 2 * hme_get_range(mvdx2) + 2 * hme_get_range(mvdy2) + 2 * (mvdx2 > 0) +
   1296                2 * (mvdy2 > 0) + ref_bits + 2;
   1297     }
   1298     {
   1299         /* Part bits in Q1, so evaluate cost as ((mv_cost<<1) + partbitsQ1 + rnd)>>(q+1)*/
   1300         S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift);
   1301         S32 tot_cost = (cost * ps_pred_ctxt->lambda) << 1;
   1302 
   1303         tot_cost += (gau1_bits_for_part_id_q1[e_part_id] * ps_pred_ctxt->lambda);
   1304         return ((tot_cost + rnd) >> (ps_pred_ctxt->lambda_q_shift + 1));
   1305     }
   1306 }
   1307 
   1308 S32 compute_mv_cost_implicit_high_speed(
   1309     search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
   1310 {
   1311     search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
   1312     pred_candt_nodes_t *ps_pred_nodes;
   1313     S08 i1_ref_idx;
   1314     S08 i1_ref_tr = -1;
   1315     S08 i1_ref_l = -1;
   1316     S32 inp_shift = 2 - inp_mv_pel;
   1317     S32 pred_shift; /* = 2 - ps_pred_ctxt->mv_pel; */
   1318     S32 ref_bits, cost;
   1319     S32 mv_p_x, mv_p_y;
   1320     S16 mvdx1, mvdx2, mvdy1, mvdy2;
   1321 
   1322     i1_ref_idx = ps_node->i1_ref_idx;
   1323 
   1324     ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
   1325     ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][i1_ref_idx];
   1326 
   1327     /*************************************************************************/
   1328     /* Priority to bottom left availability. Else we go to left. If both are */
   1329     /* not available, then a remains null                                    */
   1330     /*************************************************************************/
   1331     if(ps_pred_nodes->ps_l->u1_is_avail)
   1332     {
   1333         i1_ref_l = ps_pred_nodes->ps_l->i1_ref_idx;
   1334         ps_pred_node_a = ps_pred_nodes->ps_l;
   1335     }
   1336 
   1337     /*************************************************************************/
   1338     /* For encoder, top left may not be really needed unless we use slices,  */
   1339     /* and even then in ME it may not be relevant. So we only consider T or  */
   1340     /* TR, as, if both T and TR are not available, TL also will not be       */
   1341     /*************************************************************************/
   1342 
   1343     if((!(ps_pred_ctxt->proj_used) && (ps_pred_nodes->ps_tr->u1_is_avail)))
   1344     {
   1345         i1_ref_tr = ps_pred_nodes->ps_tr->i1_ref_idx;
   1346         ps_pred_node_b = ps_pred_nodes->ps_tr;
   1347     }
   1348     else
   1349     {
   1350         ps_pred_node_b = ps_pred_nodes->ps_coloc;
   1351     }
   1352 
   1353     if(ps_pred_node_a == NULL)
   1354     {
   1355         ps_pred_node_a = ps_pred_nodes->ps_coloc;
   1356 
   1357         if(ps_pred_node_b == ps_pred_nodes->ps_coloc)
   1358             ps_pred_node_b = ps_pred_nodes->ps_zeromv;
   1359     }
   1360 
   1361     if(ps_pred_node_a->i1_ref_idx != i1_ref_idx)
   1362     {
   1363         SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_a, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
   1364     }
   1365     else
   1366     {
   1367         mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
   1368         mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
   1369     }
   1370 
   1371     pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
   1372     COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
   1373     mvdx1 = ABS(mvdx1);
   1374     mvdy1 = ABS(mvdy1);
   1375 
   1376     if(ps_pred_node_b->i1_ref_idx != i1_ref_idx)
   1377     {
   1378         SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_b, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
   1379     }
   1380     else
   1381     {
   1382         mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
   1383         mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
   1384     }
   1385 
   1386     pred_shift = ps_pred_node_b->u1_subpel_done ? 0 : 2;
   1387     COMPUTE_DIFF_MV(mvdx2, mvdy2, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
   1388     mvdx2 = ABS(mvdx2);
   1389     mvdy2 = ABS(mvdy2);
   1390 
   1391     if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
   1392     {
   1393         cost =
   1394             hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
   1395     }
   1396     else
   1397     {
   1398         cost =
   1399             hme_get_range(mvdx2) + hme_get_range(mvdy2) + (mvdx2 > 0) + (mvdy2 > 0) + ref_bits + 2;
   1400     }
   1401     {
   1402         /* Part bits in Q1, so evaluate cost as ((mv_cost<<1) + partbitsQ1 + rnd)>>(q+1)*/
   1403         S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
   1404         S32 tot_cost = (cost * ps_pred_ctxt->lambda);
   1405 
   1406         return ((tot_cost + rnd) >> (ps_pred_ctxt->lambda_q_shift));
   1407     }
   1408 }
   1409 
   1410 S32 compute_mv_cost_implicit_high_speed_modified(
   1411     search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
   1412 {
   1413     search_node_t *ps_pred_node_a = NULL;
   1414     pred_candt_nodes_t *ps_pred_nodes;
   1415     S32 inp_shift = 2 - inp_mv_pel;
   1416     S32 pred_shift; /* = 2 - ps_pred_ctxt->mv_pel; */
   1417     S32 mv_p_x, mv_p_y;
   1418     S16 mvdx1, mvdy1;
   1419     S32 cost, ref_bits;
   1420 
   1421     ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
   1422     ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_node->i1_ref_idx];
   1423 
   1424     ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
   1425 
   1426     mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
   1427     mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
   1428     pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
   1429     COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
   1430     mvdx1 = ABS(mvdx1);
   1431     mvdy1 = ABS(mvdy1);
   1432 
   1433     cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
   1434 
   1435     {
   1436         S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
   1437         return ((cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift);
   1438     }
   1439 }
   1440 
   1441 void hme_update_results_grid_pu_bestn_xtreme_speed(result_upd_prms_t *ps_result_prms)
   1442 {
   1443     /*The function modified with assumption that only 2NxN_B and Nx2N_R is modified */
   1444 
   1445     search_node_t s_search_node_grid;
   1446     const search_node_t *ps_search_node_base;
   1447     search_node_t *ps_search_node_grid, *ps_best_node;
   1448     S32 i4_min_cost = (MAX_32BIT_VAL), i4_search_idx;
   1449     S32 num_results, i4_unique_id = -1, i4_grid_pt;
   1450     search_results_t *ps_search_results;
   1451     S32 *pi4_valid_part_ids;
   1452     S32 i4_step = ps_result_prms->i4_step;
   1453     S32 i4_grid_mask, i, i4_min_id;
   1454     S32 i4_tot_cost, i4_mv_cost, i4_sad, id;
   1455     S32 *pi4_sad_grid = ps_result_prms->pi4_sad_grid;
   1456     S32 grid_count = 0;
   1457     S32 pred_lx;
   1458 
   1459     i4_min_id = (S32)PT_C;
   1460     i4_min_cost = MAX_32BIT_VAL;
   1461     ps_search_node_grid = &s_search_node_grid;
   1462     ps_search_node_base = ps_result_prms->ps_search_node_base;
   1463     *ps_search_node_grid = *ps_search_node_base;
   1464     pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
   1465     ps_search_results = ps_result_prms->ps_search_results;
   1466     num_results = (S32)ps_search_results->u1_num_results_per_part;
   1467     i4_grid_mask = ps_result_prms->i4_grid_mask;
   1468 
   1469     for(i = 0; i < 9; i++)
   1470     {
   1471         if(i4_grid_mask & (1 << i))
   1472             grid_count++;
   1473     }
   1474 
   1475     /* Some basic assumptions: only single pt, only part updates */
   1476     /* and more than 1 best result to be computed.               */
   1477     //ASSERT(ps_result_prms->i4_grid_mask != 1);
   1478     //ASSERT(ps_result_prms->i4_part_mask != ENABLE_2Nx2N);
   1479     //ASSERT(ps_search_results->num_results > 1);
   1480 
   1481     i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
   1482     pred_lx = 1 - ps_search_results->pu1_is_past[i4_search_idx];
   1483 
   1484     /*************************************************************************/
   1485     /* Supposing we do hte result update for a unique partid, we can */
   1486     /* store the best pt id in the grid and also min cost is return */
   1487     /* param. This will be useful for early exit cases.             */
   1488     /* TODO : once we have separate fxn for unique part+grid, we can */
   1489     /* do away with this code here                                   */
   1490     /*************************************************************************/
   1491     //if (pi4_valid_part_ids[1] == -1)
   1492     i4_unique_id = pi4_valid_part_ids[0];
   1493 
   1494     /* pi4_valid_part_ids contains all the valid ids. We loop through */
   1495     /* this till we encounter -1. This is easier than having to       */
   1496     /* figure out part by part, besides, active part decision is      */
   1497     /* usually fixed for a given duration of search, e.g. entire fpel */
   1498     /* refinement for a blk/cu will use fixed valid part mask         */
   1499     id = pi4_valid_part_ids[0];
   1500 
   1501     /*****************************************************************/
   1502     /* points to the best search results corresponding to this       */
   1503     /* specific part type.                                           */
   1504     /*****************************************************************/
   1505     ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
   1506 
   1507     /*************************************************************************/
   1508     /* Outer loop runs through all active pts in the grid                    */
   1509     /*************************************************************************/
   1510     for(i4_grid_pt = 0; i4_grid_pt < (S32)NUM_GRID_PTS; i4_grid_pt++)
   1511     {
   1512         if(!(i4_grid_mask & (1 << i4_grid_pt)))
   1513             continue;
   1514 
   1515         /* For the pt in the grid, update mvx and y depending on */
   1516         /* location of pt. Updates are in FPEL units.            */
   1517         ps_search_node_grid->s_mv.i2_mvx = ps_search_node_base->s_mv.i2_mvx;
   1518         ps_search_node_grid->s_mv.i2_mvy = ps_search_node_base->s_mv.i2_mvy;
   1519         ps_search_node_grid->s_mv.i2_mvx += (S16)(i4_step * gai1_grid_id_to_x[i4_grid_pt]);
   1520         ps_search_node_grid->s_mv.i2_mvy += (S16)(i4_step * gai1_grid_id_to_y[i4_grid_pt]);
   1521 
   1522         {
   1523             /* evaluate mv cost and totalcost for this part for this given mv*/
   1524             i4_mv_cost = compute_mv_cost_coarse_high_speed(
   1525                 ps_search_node_grid,
   1526                 &ps_search_results->as_pred_ctxt[pred_lx],
   1527                 (PART_ID_T)id,
   1528                 MV_RES_FPEL);
   1529 
   1530             i4_sad = pi4_sad_grid[grid_count * id];
   1531             i4_tot_cost = i4_sad + i4_mv_cost;
   1532 
   1533             ASSERT(i4_unique_id == id);
   1534             ASSERT(num_results == 1);
   1535 
   1536             /*****************************************************************/
   1537             /* We do not labor through the results if the total cost worse   */
   1538             /* than the last of the results.                                 */
   1539             /*****************************************************************/
   1540             if(i4_tot_cost < ps_best_node[num_results - 1].i4_tot_cost)
   1541             {
   1542                 i4_min_id = i4_grid_pt;
   1543                 ps_result_prms->i4_min_cost = i4_tot_cost;
   1544 
   1545                 ps_best_node[0] = *ps_search_node_grid;
   1546                 ps_best_node[0].i4_sad = i4_sad;
   1547                 ps_best_node[0].i4_mv_cost = i4_mv_cost;
   1548                 ps_best_node[0].i4_tot_cost = i4_tot_cost;
   1549             }
   1550         }
   1551         pi4_sad_grid++;
   1552     }
   1553     ps_result_prms->i4_min_id = i4_min_id;
   1554 }
   1555 
   1556 void hme_update_results_grid_pu_bestn(result_upd_prms_t *ps_result_prms)
   1557 {
   1558     search_node_t s_search_node_grid;
   1559     const search_node_t *ps_search_node_base;
   1560     search_node_t *ps_search_node_grid, *ps_best_node;
   1561     S32 i4_min_cost = (MAX_32BIT_VAL), i4_search_idx;
   1562     S32 num_results, i4_unique_id = -1, i4_grid_pt;
   1563     search_results_t *ps_search_results;
   1564     S32 *pi4_valid_part_ids;
   1565     S32 i4_step = ps_result_prms->i4_step;
   1566     S32 i4_grid_mask, i4_count, i, i4_min_id;
   1567     S32 i4_tot_cost, i4_mv_cost, i4_sad, id;
   1568     S32 *pi4_sad_grid = ps_result_prms->pi4_sad_grid;
   1569     S32 grid_count = 0;
   1570     S32 pred_lx;
   1571 
   1572     i4_min_id = (S32)PT_C;
   1573     i4_min_cost = MAX_32BIT_VAL;
   1574     ps_search_node_grid = &s_search_node_grid;
   1575     ps_search_node_base = ps_result_prms->ps_search_node_base;
   1576     *ps_search_node_grid = *ps_search_node_base;
   1577     pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
   1578     ps_search_results = ps_result_prms->ps_search_results;
   1579     num_results = (S32)ps_search_results->u1_num_results_per_part;
   1580     i4_grid_mask = ps_result_prms->i4_grid_mask;
   1581 
   1582     for(i = 0; i < 9; i++)
   1583     {
   1584         if(i4_grid_mask & (1 << i))
   1585         {
   1586             grid_count++;
   1587         }
   1588     }
   1589 
   1590     i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
   1591     pred_lx = 1 - ps_search_results->pu1_is_past[i4_search_idx];
   1592 
   1593     i4_unique_id = pi4_valid_part_ids[0];
   1594 
   1595     /*************************************************************************/
   1596     /* Outer loop runs through all active pts in the grid                    */
   1597     /*************************************************************************/
   1598     for(i4_grid_pt = 0; i4_grid_pt < (S32)NUM_GRID_PTS; i4_grid_pt++)
   1599     {
   1600         if(!(i4_grid_mask & (1 << i4_grid_pt)))
   1601         {
   1602             continue;
   1603         }
   1604 
   1605         /* For the pt in the grid, update mvx and y depending on */
   1606         /* location of pt. Updates are in FPEL units.            */
   1607         ps_search_node_grid->s_mv.i2_mvx = ps_search_node_base->s_mv.i2_mvx;
   1608         ps_search_node_grid->s_mv.i2_mvy = ps_search_node_base->s_mv.i2_mvy;
   1609         ps_search_node_grid->s_mv.i2_mvx += (S16)(i4_step * gai1_grid_id_to_x[i4_grid_pt]);
   1610         ps_search_node_grid->s_mv.i2_mvy += (S16)(i4_step * gai1_grid_id_to_y[i4_grid_pt]);
   1611 
   1612         i4_count = 0;
   1613 
   1614         while((id = pi4_valid_part_ids[i4_count]) >= 0)
   1615         {
   1616             /*****************************************************************/
   1617             /* points to the best search results corresponding to this       */
   1618             /* specific part type.                                           */
   1619             /*****************************************************************/
   1620             ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
   1621 
   1622             /* evaluate mv cost and totalcost for this part for this given mv*/
   1623             i4_mv_cost = ps_result_prms->pf_mv_cost_compute(
   1624                 ps_search_node_grid,
   1625                 &ps_search_results->as_pred_ctxt[pred_lx],
   1626                 (PART_ID_T)id,
   1627                 MV_RES_FPEL);
   1628 
   1629             i4_sad = pi4_sad_grid[grid_count * id];
   1630             i4_tot_cost = i4_sad + i4_mv_cost;
   1631 
   1632             if(i4_unique_id == id)
   1633             {
   1634                 if(i4_tot_cost < ps_result_prms->i4_min_cost)
   1635                 {
   1636                     i4_min_id = i4_grid_pt;
   1637                     ps_result_prms->i4_min_cost = i4_tot_cost;
   1638                 }
   1639             }
   1640 
   1641             if(i4_tot_cost < ps_best_node[num_results - 1].i4_tot_cost)
   1642             {
   1643                 for(i = 0; i < num_results - 1; i++)
   1644                 {
   1645                     if(i4_tot_cost < ps_best_node[i].i4_tot_cost)
   1646                     {
   1647                         memmove(
   1648                             ps_best_node + i + 1,
   1649                             ps_best_node + i,
   1650                             sizeof(search_node_t) * (num_results - 1 - i));
   1651                         break;
   1652                     }
   1653                     else if(i4_tot_cost == ps_best_node[i].i4_tot_cost)
   1654                     {
   1655                         if(0 == hme_cmp_nodes(ps_search_node_grid, ps_best_node + i))
   1656                             break;
   1657                     }
   1658                 }
   1659                 ps_best_node[i] = *ps_search_node_grid;
   1660                 ps_best_node[i].i4_sad = i4_sad;
   1661                 ps_best_node[i].i4_mv_cost = i4_mv_cost;
   1662                 ps_best_node[i].i4_tot_cost = i4_tot_cost;
   1663             }
   1664             i4_count++;
   1665         }
   1666         pi4_sad_grid++;
   1667     }
   1668     ps_result_prms->i4_min_id = i4_min_id;
   1669 }
   1670 
   1671 /**
   1672 ********************************************************************************
   1673 *  @fn     hme_update_results_grid_pu_bestn_no_encode(result_upd_prms_t *ps_result_prms)
   1674 *
   1675 *  @brief  Updates results for the case where 1 best result is to be updated
   1676 *          for a given pt, for several parts
   1677 *          Note : The function is replicated for CLIPing the cost to 16bit to make
   1678 *                  bit match with SIMD version
   1679 *
   1680 *  @param[in]  result_upd_prms_t : Contains the input parameters to this fxn
   1681 *
   1682 *  @return   The result_upd_prms_t structure is updated for all the active
   1683 *            parts in case the current candt has results for any given part
   1684 *             that is the best result for that part
   1685 ********************************************************************************
   1686 */
   1687 void hme_update_results_grid_pu_bestn_no_encode(result_upd_prms_t *ps_result_prms)
   1688 {
   1689     search_node_t s_search_node_grid;
   1690     const search_node_t *ps_search_node_base;
   1691     search_node_t *ps_search_node_grid, *ps_best_node;
   1692     S32 i4_min_cost = (MAX_32BIT_VAL), i4_search_idx;
   1693     S32 num_results, i4_unique_id = -1, i4_grid_pt;
   1694     search_results_t *ps_search_results;
   1695     S32 *pi4_valid_part_ids;
   1696     S32 i4_step = ps_result_prms->i4_step;
   1697     S32 i4_grid_mask, i4_count, i, i4_min_id;
   1698     S32 i4_tot_cost, i4_mv_cost, i4_sad, id;
   1699     S32 *pi4_sad_grid = ps_result_prms->pi4_sad_grid;
   1700     S32 grid_count = 0;
   1701     S32 pred_lx;
   1702 
   1703     i4_min_id = (S32)PT_C;
   1704     i4_min_cost = MAX_32BIT_VAL;
   1705     ps_search_node_grid = &s_search_node_grid;
   1706     ps_search_node_base = ps_result_prms->ps_search_node_base;
   1707     *ps_search_node_grid = *ps_search_node_base;
   1708     pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
   1709     ps_search_results = ps_result_prms->ps_search_results;
   1710     num_results = (S32)ps_search_results->u1_num_results_per_part;
   1711     i4_grid_mask = ps_result_prms->i4_grid_mask;
   1712 
   1713     for(i = 0; i < 9; i++)
   1714     {
   1715         if(i4_grid_mask & (1 << i))
   1716             grid_count++;
   1717     }
   1718 
   1719     /* Some basic assumptions: only single pt, only part updates */
   1720     /* and more than 1 best result to be computed.               */
   1721 
   1722     i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
   1723     pred_lx = 1 - ps_search_results->pu1_is_past[i4_search_idx];
   1724 
   1725     /*************************************************************************/
   1726     /* Supposing we do hte result update for a unique partid, we can */
   1727     /* store the best pt id in the grid and also min cost is return */
   1728     /* param. This will be useful for early exit cases.             */
   1729     /* TODO : once we have separate fxn for unique part+grid, we can */
   1730     /* do away with this code here                                   */
   1731     /*************************************************************************/
   1732     //if (pi4_valid_part_ids[1] == -1)
   1733     i4_unique_id = pi4_valid_part_ids[0];
   1734 
   1735     /*************************************************************************/
   1736     /* Outer loop runs through all active pts in the grid                    */
   1737     /*************************************************************************/
   1738     for(i4_grid_pt = 0; i4_grid_pt < (S32)NUM_GRID_PTS; i4_grid_pt++)
   1739     {
   1740         if(!(i4_grid_mask & (1 << i4_grid_pt)))
   1741             continue;
   1742 
   1743         /* For the pt in the grid, update mvx and y depending on */
   1744         /* location of pt. Updates are in FPEL units.            */
   1745         ps_search_node_grid->s_mv.i2_mvx = ps_search_node_base->s_mv.i2_mvx;
   1746         ps_search_node_grid->s_mv.i2_mvy = ps_search_node_base->s_mv.i2_mvy;
   1747         ps_search_node_grid->s_mv.i2_mvx += (S16)(i4_step * gai1_grid_id_to_x[i4_grid_pt]);
   1748         ps_search_node_grid->s_mv.i2_mvy += (S16)(i4_step * gai1_grid_id_to_y[i4_grid_pt]);
   1749 
   1750         i4_count = 0;
   1751 
   1752         /* pi4_valid_part_ids contains all the valid ids. We loop through */
   1753         /* this till we encounter -1. This is easier than having to       */
   1754         /* figure out part by part, besides, active part decision is      */
   1755         /* usually fixed for a given duration of search, e.g. entire fpel */
   1756         /* refinement for a blk/cu will use fixed valid part mask         */
   1757 
   1758         while((id = pi4_valid_part_ids[i4_count]) >= 0)
   1759         {
   1760             //ps_search_node_grid->e_part_type = (PART_TYPE_T)id;
   1761 
   1762             /*****************************************************************/
   1763             /* points to the best search results corresponding to this       */
   1764             /* specific part type.                                           */
   1765             /*****************************************************************/
   1766             ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
   1767 
   1768             /* evaluate mv cost and totalcost for this part for this given mv*/
   1769             i4_mv_cost = ps_result_prms->pf_mv_cost_compute(
   1770                 ps_search_node_grid,
   1771                 &ps_search_results->as_pred_ctxt[pred_lx],
   1772                 (PART_ID_T)id,
   1773                 MV_RES_FPEL);
   1774 
   1775             i4_sad = pi4_sad_grid[grid_count * id];
   1776 
   1777             /* Clipping to 16 bit to bit match with SIMD version */
   1778             i4_mv_cost = CLIP_S16(i4_mv_cost);
   1779             i4_sad = CLIP_S16(i4_sad);
   1780 
   1781             i4_tot_cost = i4_sad + i4_mv_cost;
   1782             /* Clipping to 16 bit to bit match with SIMD version */
   1783             i4_tot_cost = CLIP_S16(i4_tot_cost);
   1784 
   1785             if(i4_unique_id == id)
   1786             {
   1787                 if(i4_tot_cost < ps_result_prms->i4_min_cost)
   1788                 {
   1789                     i4_min_id = i4_grid_pt;
   1790                     ps_result_prms->i4_min_cost = i4_tot_cost;
   1791                 }
   1792             }
   1793 
   1794             /*****************************************************************/
   1795             /* We do not labor through the results if the total cost worse   */
   1796             /* than the last of the results.                                 */
   1797             /*****************************************************************/
   1798             if(i4_tot_cost < ps_best_node[num_results - 1].i4_tot_cost)
   1799             {
   1800                 S32 eq_cost = 0;
   1801                 /*************************************************************/
   1802                 /* Identify where the current result isto be placed.Basically*/
   1803                 /* find the node which has cost just higher thannodeundertest*/
   1804                 /*************************************************************/
   1805                 for(i = 0; i < num_results - 1; i++)
   1806                 {
   1807                     if(i4_tot_cost < ps_best_node[i].i4_tot_cost)
   1808                     {
   1809                         memmove(
   1810                             ps_best_node + i + 1,
   1811                             ps_best_node + i,
   1812                             sizeof(search_node_t) * (num_results - 1 - i));
   1813                         break;
   1814                     }
   1815                     else if(i4_tot_cost == ps_best_node[i].i4_tot_cost)
   1816                     {
   1817                         //if (0 == hme_cmp_nodes(ps_search_node_grid, ps_best_node+i))
   1818                         //  break;
   1819                         /* When cost is same we comp. the nodes and if it's same skip. */
   1820                         /* We don't want to add this code to intrinsic. So we are      */
   1821                         /* commenting it. The quality impact was minor when we did the */
   1822                         /* regression.                                                 */
   1823                         eq_cost = 1;
   1824                     }
   1825                 }
   1826                 if(!eq_cost)
   1827                 {
   1828                     ps_best_node[i] = *ps_search_node_grid;
   1829                     ps_best_node[i].i4_sad = i4_sad;
   1830                     ps_best_node[i].i4_mv_cost = i4_mv_cost;
   1831                     ps_best_node[i].i4_tot_cost = i4_tot_cost;
   1832                 }
   1833             }
   1834             i4_count++;
   1835         }
   1836         pi4_sad_grid++;
   1837     }
   1838     ps_result_prms->i4_min_id = i4_min_id;
   1839 }
   1840 
   1841 /**
   1842 ********************************************************************************
   1843 *  @fn     hme_update_results_pt_npu_best1(result_upd_prms_t *ps_result_prms)
   1844 *
   1845 *  @brief  Updates results for the case where 1 best result is to be updated
   1846 *          for a given pt, for several parts
   1847 *
   1848 *  @param[in]  ps_result_prms. Contains the input parameters to this fxn
   1849 *              ::ps_pred_info : contains cost fxn ptr and predictor info
   1850 *              ::pi4_sad : 17x9 SAD Grid, this case, only 1st 17 entries valid
   1851 *              ::ps_search_results: Search results structure
   1852 *              ::i1_ref_id : Reference index
   1853 *              ::i4_grid_mask: Dont Care for this fxn
   1854 *              ::pi4_valid_part_ids : valid part ids
   1855 *              ::ps_search_node_base: Contains the centre pt candt info.
   1856 *
   1857 *  @return   The ps_search_results structure is updated for all the active
   1858 *            parts in case the current candt has results for any given part
   1859 *             that is the best result for that part
   1860 ********************************************************************************
   1861 */
   1862 
   1863 void hme_update_results_pt_pu_best1_subpel_hs(
   1864     err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
   1865 {
   1866     search_node_t *ps_search_node_base, *ps_best_node;
   1867     search_results_t *ps_search_results;
   1868     S32 id, i4_search_idx = ps_result_prms->u1_pred_lx;
   1869     S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
   1870     S32 num_results, i;
   1871     S32 *pi4_valid_part_ids;
   1872 
   1873     pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
   1874     /* Some basic assumptions: only single pt, only part updates */
   1875     /* and more than 1 best result to be computed.               */
   1876     ASSERT(ps_result_prms->i4_grid_mask == 1);
   1877 
   1878     ps_search_results = ps_result_prms->ps_search_results;
   1879     num_results = (S32)ps_search_results->u1_num_results_per_part;
   1880 
   1881     /* Compute mv cost, total cost */
   1882     ps_search_node_base = (search_node_t *)ps_result_prms->ps_search_node_base;
   1883 
   1884     while((id = pi4_valid_part_ids[i4_count]) >= 0)
   1885     {
   1886         S32 update_required = 1;
   1887 
   1888         ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
   1889         /* Use a pre-computed cost instead of freshly evaluating subpel cost */
   1890         i4_mv_cost = ps_best_node->i4_mv_cost;
   1891         i4_sad = ps_result_prms->pi4_sad_grid[id];
   1892         i4_tot_cost = i4_sad + i4_mv_cost;
   1893 
   1894         /* We do not labor through the results if the total cost is worse than   */
   1895         /* the last of the results.                                              */
   1896         if(i4_tot_cost < ps_best_node[num_results - 1].i4_tot_cost)
   1897         {
   1898             /* Identify where the current result is to be placed. Basically find  */
   1899             /* the node which has cost just higher than node under test           */
   1900             for(i = 0; i < num_results - 1; i++)
   1901             {
   1902                 if(ps_best_node[i].i1_ref_idx != -1)
   1903                 {
   1904                     if(i4_tot_cost < ps_best_node[i].i4_tot_cost)
   1905                     {
   1906                         memmove(
   1907                             ps_best_node + i + 1,
   1908                             ps_best_node + i,
   1909                             sizeof(search_node_t) * (num_results - 1 - i));
   1910                         break;
   1911                     }
   1912                     else if(i4_tot_cost == ps_best_node[i].i4_tot_cost)
   1913                     {
   1914                         update_required = 0;
   1915                         break;
   1916                     }
   1917                 }
   1918                 else
   1919                 {
   1920                     break;
   1921                 }
   1922             }
   1923 
   1924             if(update_required)
   1925             {
   1926                 /* Update when either ref_idx or mv's are different */
   1927                 ps_best_node[i] = *ps_search_node_base;
   1928                 ps_best_node[i].i4_sad = i4_sad;
   1929                 ps_best_node[i].i4_mv_cost = i4_mv_cost;
   1930                 ps_best_node[i].i4_tot_cost = i4_tot_cost;
   1931             }
   1932         }
   1933         i4_count++;
   1934     }
   1935 }
   1936 
   1937 void hme_update_results_pt_pu_best1_subpel_hs_1(
   1938     err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
   1939 {
   1940     search_node_t *ps_search_node_base, *ps_best_node;
   1941     search_results_t *ps_search_results;
   1942     S32 id, i4_search_idx = ps_result_prms->u1_pred_lx;
   1943     S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
   1944     S32 num_results;
   1945     S32 *pi4_valid_part_ids;
   1946 
   1947     pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
   1948     /* Some basic assumptions: only single pt, only part updates */
   1949     /* and more than 1 best result to be computed.               */
   1950     ASSERT(ps_result_prms->i4_grid_mask == 1);
   1951 
   1952     ps_search_results = ps_result_prms->ps_search_results;
   1953     num_results = (S32)ps_search_results->u1_num_results_per_part;
   1954 
   1955     /* Compute mv cost, total cost */
   1956     ps_search_node_base = (search_node_t *)ps_result_prms->ps_search_node_base;
   1957 
   1958     while((id = pi4_valid_part_ids[i4_count]) >= 0)
   1959     {
   1960         S32 update_required = 0;
   1961 
   1962         ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
   1963         /* Use a pre-computed cost instead of freshly evaluating subpel cost */
   1964         i4_mv_cost = ps_best_node->i4_mv_cost;
   1965         i4_sad = ps_result_prms->pi4_sad_grid[id];
   1966         i4_tot_cost = i4_sad + i4_mv_cost;
   1967 
   1968         /* We do not labor through the results if the total cost is worse than   */
   1969         /* the last of the results.                                              */
   1970         if(i4_tot_cost < ps_best_node[1].i4_tot_cost)
   1971         {
   1972             S32 sdi_value = 0;
   1973 
   1974             update_required = 2;
   1975             /* Identify where the current result is to be placed. Basically find  */
   1976             /* the node which has cost just higher than node under test           */
   1977             {
   1978                 if(i4_tot_cost < ps_best_node[0].i4_tot_cost)
   1979                 {
   1980                     update_required = 1;
   1981                     sdi_value = ps_best_node[0].i4_sad - i4_sad;
   1982                 }
   1983                 else if(
   1984                     (ps_result_prms->i2_mv_x == ps_best_node[0].s_mv.i2_mvx) &&
   1985                     (ps_result_prms->i2_mv_y == ps_best_node[0].s_mv.i2_mvy) &&
   1986                     (ps_best_node[0].i1_ref_idx == ps_result_prms->i1_ref_idx))
   1987                 {
   1988                     update_required = 0;
   1989                 }
   1990             }
   1991             if(update_required == 2)
   1992             {
   1993                 subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
   1994 
   1995                 ps_subpel_refine_ctxt->i2_tot_cost[1][i4_count] = i4_tot_cost;
   1996                 ps_subpel_refine_ctxt->i2_mv_cost[1][i4_count] = i4_mv_cost;
   1997                 ps_subpel_refine_ctxt->i2_mv_x[1][i4_count] = ps_result_prms->i2_mv_x;
   1998                 ps_subpel_refine_ctxt->i2_mv_y[1][i4_count] = ps_result_prms->i2_mv_y;
   1999                 ps_subpel_refine_ctxt->i2_ref_idx[1][i4_count] = ps_result_prms->i1_ref_idx;
   2000             }
   2001             else if(update_required == 1)
   2002             {
   2003                 subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
   2004 
   2005                 ps_subpel_refine_ctxt->i2_tot_cost[1][i4_count] =
   2006                     ps_subpel_refine_ctxt->i2_tot_cost[0][i4_count];
   2007                 ps_subpel_refine_ctxt->i2_mv_cost[1][i4_count] =
   2008                     ps_subpel_refine_ctxt->i2_mv_cost[0][i4_count];
   2009                 ps_subpel_refine_ctxt->i2_mv_x[1][i4_count] =
   2010                     ps_subpel_refine_ctxt->i2_mv_x[0][i4_count];
   2011                 ps_subpel_refine_ctxt->i2_mv_y[1][i4_count] =
   2012                     ps_subpel_refine_ctxt->i2_mv_y[0][i4_count];
   2013                 ps_subpel_refine_ctxt->i2_ref_idx[1][i4_count] =
   2014                     ps_subpel_refine_ctxt->i2_ref_idx[0][i4_count];
   2015 
   2016                 ps_subpel_refine_ctxt->i2_tot_cost[0][i4_count] = i4_tot_cost;
   2017                 ps_subpel_refine_ctxt->i2_mv_cost[0][i4_count] = i4_mv_cost;
   2018                 ps_subpel_refine_ctxt->i2_mv_x[0][i4_count] = ps_result_prms->i2_mv_x;
   2019                 ps_subpel_refine_ctxt->i2_mv_y[0][i4_count] = ps_result_prms->i2_mv_y;
   2020                 ps_subpel_refine_ctxt->i2_ref_idx[0][i4_count] = ps_result_prms->i1_ref_idx;
   2021             }
   2022         }
   2023         i4_count++;
   2024     }
   2025 }
   2026 
   2027 /**
   2028 ******************************************************************************
   2029 *  @brief Gives a result fxn ptr for a index [x] where x is as:
   2030 *         0 : single pt, no partial updates, 1 best result
   2031 *         1 : single pt, no partial updates, N best results
   2032 *         2 : single pt,    partial updates, 1 best result
   2033 *         3 : single pt,    partial updates, N best results
   2034 *         0 : grid     , no partial updates, 1 best result
   2035 *         1 : grid     , no partial updates, N best results
   2036 *         2 : grid     ,    partial updates, 1 best result
   2037 *         3 : grid     ,    partial updates, N best results
   2038 ******************************************************************************
   2039 */
   2040 
   2041 static PF_RESULT_FXN_T g_pf_result_fxn[8] = { UPD_RES_PT_NPU_BEST1,   UPD_RES_PT_NPU_BESTN,
   2042                                               UPD_RES_PT_PU_BEST1,    UPD_RES_PT_PU_BESTN,
   2043                                               UPD_RES_GRID_NPU_BEST1, UPD_RES_GRID_NPU_BESTN,
   2044                                               UPD_RES_GRID_PU_BEST1,  UPD_RES_GRID_PU_BESTN };
   2045 
   2046 /**
   2047 ********************************************************************************
   2048 *  @fn     hme_get_result_fxn(i4_grid_mask, i4_part_mask, i4_num_results)
   2049 *
   2050 *  @brief  Obtains the suitable result function that evaluates COST and also
   2051 *           computes one or more best results for point/grid, single part or
   2052 *           more than one part.
   2053 *
   2054 *  @param[in]  i4_grid_mask : Mask containing which of 9 grid pts active
   2055 *
   2056 *  @param[in]  i4_part_mask : Mask containing which of the 17 parts active
   2057 *
   2058 *  @param[in]  i4_num_results: Number of active results
   2059 *
   2060 *  @return   Pointer to the appropriate result update function
   2061 ********************************************************************************
   2062 */
   2063 PF_RESULT_FXN_T hme_get_result_fxn(S32 i4_grid_mask, S32 i4_part_mask, S32 i4_num_results)
   2064 {
   2065     S32 i4_is_grid = (i4_grid_mask != 1);
   2066     S32 i4_is_pu = ((i4_part_mask & (i4_part_mask - 1)) != 0);
   2067     S32 i4_res_gt1 = (i4_num_results > 1);
   2068     S32 id;
   2069 
   2070     id = (i4_is_grid << 2) + (i4_is_pu << 1) + i4_res_gt1;
   2071 
   2072     return (g_pf_result_fxn[id]);
   2073 }
   2074 
   2075 void hme_calc_sad_and_2_best_results(
   2076     hme_search_prms_t *ps_search_prms,
   2077     wgt_pred_ctxt_t *ps_wt_inp_prms,
   2078     err_prms_t *ps_err_prms,
   2079     result_upd_prms_t *ps_result_prms,
   2080     U08 **ppu1_ref,
   2081     S32 i4_ref_stride)
   2082 {
   2083     S32 i4_candt;
   2084     S32 i4_inp_off;
   2085     S32 i4_ref_offset;
   2086     S32 i4_num_nodes;
   2087 
   2088     S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
   2089     S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
   2090     WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
   2091     WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
   2092     WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
   2093 
   2094     mv_refine_ctxt_t *ps_mv_refine_ctxt;
   2095     search_node_t *ps_search_node;
   2096 
   2097     ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
   2098     i4_num_nodes = ps_search_prms->i4_num_search_nodes;
   2099     i4_inp_off = ps_search_prms->i4_cu_x_off;
   2100     i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
   2101     i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
   2102     ps_search_node = ps_search_prms->ps_search_nodes;
   2103 
   2104     for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
   2105     {
   2106         /**********************************************************************/
   2107         /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID   */
   2108         /**********************************************************************/
   2109         {
   2110             WORD32 b, c, d;
   2111             UWORD8 *pu1_cur_ptr;
   2112             UWORD8 *pu1_ref_ptr;
   2113             UWORD16 au2_4x4_sad[NUM_4X4];
   2114 
   2115             if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
   2116             {
   2117                 continue;
   2118             }
   2119 
   2120             ps_err_prms->pu1_inp =
   2121                 ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
   2122             ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
   2123             ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
   2124             ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
   2125 
   2126             pu1_cur_ptr = ps_err_prms->pu1_inp;
   2127             pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
   2128 
   2129             /* Loop to compute the SAD's */
   2130             {
   2131                 memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
   2132                 for(b = 0; b < NUM_4X4; b++)
   2133                 {
   2134                     WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
   2135                     WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
   2136 
   2137                     for(c = 0; c < NUM_ROWS_IN_4X4; c++)
   2138                     {
   2139                         WORD32 z_cur = (cur_buf_stride)*c + t1;
   2140                         WORD32 z_ref = (ref_buf_stride)*c + t2;
   2141                         for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
   2142                         {
   2143                             au2_4x4_sad[b] += (UWORD16)ABS((
   2144                                 ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
   2145                         }
   2146                     }
   2147                 }
   2148 
   2149                 pi4_sad_grid[PART_ID_NxN_TL] =
   2150                     (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
   2151                 pi4_sad_grid[PART_ID_NxN_TR] =
   2152                     (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
   2153                 pi4_sad_grid[PART_ID_NxN_BL] =
   2154                     (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
   2155                 pi4_sad_grid[PART_ID_NxN_BR] =
   2156                     (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
   2157                 pi4_sad_grid[PART_ID_Nx2N_L] =
   2158                     pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
   2159                 pi4_sad_grid[PART_ID_Nx2N_R] =
   2160                     pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
   2161                 pi4_sad_grid[PART_ID_2NxN_T] =
   2162                     pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
   2163                 pi4_sad_grid[PART_ID_2NxN_B] =
   2164                     pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
   2165                 pi4_sad_grid[PART_ID_nLx2N_L] =
   2166                     (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
   2167                 pi4_sad_grid[PART_ID_nRx2N_R] =
   2168                     (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
   2169                 pi4_sad_grid[PART_ID_2NxnU_T] =
   2170                     (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
   2171                 pi4_sad_grid[PART_ID_2NxnD_B] =
   2172                     (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
   2173                 pi4_sad_grid[PART_ID_2Nx2N] =
   2174                     pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
   2175                 pi4_sad_grid[PART_ID_2NxnU_B] =
   2176                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
   2177                 pi4_sad_grid[PART_ID_2NxnD_T] =
   2178                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
   2179                 pi4_sad_grid[PART_ID_nRx2N_L] =
   2180                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
   2181                 pi4_sad_grid[PART_ID_nLx2N_R] =
   2182                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
   2183             }
   2184         }
   2185 
   2186         {
   2187             S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
   2188             S32 *pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
   2189             S32 best_node_cost;
   2190             S32 second_best_node_cost;
   2191 
   2192             {
   2193                 S16 mvdx1, mvdy1;
   2194                 S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
   2195                 search_results_t *ps_search_results = ps_result_prms->ps_search_results;
   2196                 S32 pred_lx = i4_search_idx;
   2197 
   2198                 pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
   2199                 pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
   2200                 search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
   2201 
   2202                 S32 inp_shift = 2;
   2203                 S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
   2204                 S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
   2205                 S32 lambda = ps_pred_ctxt->lambda;
   2206                 S32 rnd = 1 << (lambda_q_shift - 1);
   2207                 S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
   2208                 S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
   2209                 S32 ref_bits =
   2210                     ps_pred_ctxt
   2211                         ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
   2212 
   2213                 COMPUTE_DIFF_MV(
   2214                     mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
   2215 
   2216                 mvdx1 = ABS(mvdx1);
   2217                 mvdy1 = ABS(mvdy1);
   2218 
   2219                 i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
   2220                              (mvdy1 > 0) + ref_bits + 2;
   2221 
   2222                 i4_mv_cost *= lambda;
   2223                 i4_mv_cost += rnd;
   2224                 i4_mv_cost >>= lambda_q_shift;
   2225 
   2226                 i4_mv_cost = CLIP_U16(i4_mv_cost);
   2227             }
   2228 
   2229             /*For each valid partition, update the refine_prm structure to reflect the best and second
   2230             best candidates for that partition*/
   2231 
   2232             for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
   2233             {
   2234                 S32 update_required = 0;
   2235                 S32 part_id = pi4_valid_part_ids[i4_count];
   2236                 S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
   2237 
   2238                 /*Calculate total cost*/
   2239                 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
   2240                 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
   2241 
   2242                 /*****************************************************************/
   2243                 /* We do not labor through the results if the total cost worse   */
   2244                 /* than the last of the results.                                 */
   2245                 /*****************************************************************/
   2246                 best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_tot_cost[0][index]);
   2247                 second_best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_tot_cost[1][index]);
   2248 
   2249                 if(i4_tot_cost < second_best_node_cost)
   2250                 {
   2251                     update_required = 2;
   2252 
   2253                     /*************************************************************/
   2254                     /* Identify where the current result isto be placed.Basically*/
   2255                     /* find the node which has cost just higher thannodeundertest*/
   2256                     /*************************************************************/
   2257                     if(i4_tot_cost < best_node_cost)
   2258                     {
   2259                         update_required = 1;
   2260                     }
   2261                     else if(i4_tot_cost == best_node_cost)
   2262                     {
   2263                         update_required = 0;
   2264                     }
   2265 
   2266                     if(update_required == 2)
   2267                     {
   2268                         ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
   2269                         ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
   2270                         ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
   2271                         ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
   2272                         ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
   2273                     }
   2274                     else if(update_required == 1)
   2275                     {
   2276                         ps_mv_refine_ctxt->i2_tot_cost[1][index] =
   2277                             ps_mv_refine_ctxt->i2_tot_cost[0][index];
   2278                         ps_mv_refine_ctxt->i2_mv_cost[1][index] =
   2279                             ps_mv_refine_ctxt->i2_mv_cost[0][index];
   2280                         ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_mv_refine_ctxt->i2_mv_x[0][index];
   2281                         ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_mv_refine_ctxt->i2_mv_y[0][index];
   2282                         ps_mv_refine_ctxt->i2_ref_idx[1][index] =
   2283                             ps_mv_refine_ctxt->i2_ref_idx[0][index];
   2284 
   2285                         ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
   2286                         ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
   2287                         ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
   2288                         ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
   2289                         ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
   2290                     }
   2291                 }
   2292             }
   2293         }
   2294         ps_search_node++;
   2295     }
   2296 
   2297     {
   2298         WORD32 i4_i;
   2299         WORD32 part_id;
   2300         search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
   2301         for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
   2302         {
   2303             part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
   2304             if(ps_mv_refine_ctxt->i2_tot_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
   2305             {
   2306                 ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
   2307                 ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
   2308                 ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
   2309 
   2310                 ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
   2311             }
   2312             if(ps_mv_refine_ctxt->i2_tot_cost[1][part_id] >= MAX_SIGNED_16BIT_VAL)
   2313             {
   2314                 ASSERT(ps_mv_refine_ctxt->i2_mv_cost[1][part_id] == MAX_SIGNED_16BIT_VAL);
   2315                 ASSERT(ps_mv_refine_ctxt->i2_mv_x[1][part_id] == 0);
   2316                 ASSERT(ps_mv_refine_ctxt->i2_mv_y[1][part_id] == 0);
   2317 
   2318                 ps_mv_refine_ctxt->i2_ref_idx[1][part_id] = ps_search_node->i1_ref_idx;
   2319             }
   2320         }
   2321     }
   2322 }
   2323 
   2324 void hme_calc_sad_and_2_best_results_subpel(
   2325     err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
   2326 {
   2327     S32 i4_candt;
   2328     S32 i4_num_nodes;
   2329 
   2330     S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
   2331     S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
   2332     WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
   2333     WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
   2334     WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
   2335 
   2336     mv_refine_ctxt_t *ps_subpel_refine_ctxt;
   2337     ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
   2338     i4_num_nodes = 1;
   2339 
   2340     /* Run through each of the candts in a loop */
   2341     for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
   2342     {
   2343         /**********************************************************************/
   2344         /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID   */
   2345         /**********************************************************************/
   2346         {
   2347             WORD32 b, c, d;
   2348             UWORD8 *pu1_cur_ptr;
   2349             UWORD8 *pu1_ref_ptr;
   2350             UWORD16 au2_4x4_sad[NUM_4X4];
   2351 
   2352             pu1_cur_ptr = ps_err_prms->pu1_inp;
   2353             pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
   2354 
   2355             /* Loop to compute the SAD's */
   2356             {
   2357                 memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
   2358                 for(b = 0; b < NUM_4X4; b++)
   2359                 {
   2360                     WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
   2361                     WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
   2362 
   2363                     for(c = 0; c < NUM_ROWS_IN_4X4; c++)
   2364                     {
   2365                         WORD32 z_cur = (cur_buf_stride)*c + t1;
   2366                         WORD32 z_ref = (ref_buf_stride)*c + t2;
   2367                         for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
   2368                         {
   2369                             au2_4x4_sad[b] += (UWORD16)ABS((
   2370                                 ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
   2371                         }
   2372                     }
   2373                 }
   2374 
   2375                 pi4_sad_grid[PART_ID_NxN_TL] =
   2376                     (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
   2377                 pi4_sad_grid[PART_ID_NxN_TR] =
   2378                     (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
   2379                 pi4_sad_grid[PART_ID_NxN_BL] =
   2380                     (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
   2381                 pi4_sad_grid[PART_ID_NxN_BR] =
   2382                     (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
   2383                 pi4_sad_grid[PART_ID_Nx2N_L] =
   2384                     pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
   2385                 pi4_sad_grid[PART_ID_Nx2N_R] =
   2386                     pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
   2387                 pi4_sad_grid[PART_ID_2NxN_T] =
   2388                     pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
   2389                 pi4_sad_grid[PART_ID_2NxN_B] =
   2390                     pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
   2391                 pi4_sad_grid[PART_ID_nLx2N_L] =
   2392                     (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
   2393                 pi4_sad_grid[PART_ID_nRx2N_R] =
   2394                     (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
   2395                 pi4_sad_grid[PART_ID_2NxnU_T] =
   2396                     (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
   2397                 pi4_sad_grid[PART_ID_2NxnD_B] =
   2398                     (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
   2399                 pi4_sad_grid[PART_ID_2Nx2N] =
   2400                     pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
   2401                 pi4_sad_grid[PART_ID_2NxnU_B] =
   2402                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
   2403                 pi4_sad_grid[PART_ID_2NxnD_T] =
   2404                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
   2405                 pi4_sad_grid[PART_ID_nRx2N_L] =
   2406                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
   2407                 pi4_sad_grid[PART_ID_nLx2N_R] =
   2408                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
   2409             }
   2410         }
   2411         /**********************************************************************/
   2412         /* CALL THE FUNCTION THAT COMPUTES UPDATES THE BEST RESULTS           */
   2413         /**********************************************************************/
   2414         {
   2415             S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
   2416             S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0];
   2417             S32 best_node_cost;
   2418             S32 second_best_node_cost;
   2419 
   2420             /*For each valid partition, update the refine_prm structure to reflect the best and second
   2421             best candidates for that partition*/
   2422 
   2423             for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++)
   2424             {
   2425                 S32 update_required = 0;
   2426                 S32 part_id = pi4_valid_part_ids[i4_count];
   2427                 S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
   2428 
   2429                 /* Use a pre-computed cost instead of freshly evaluating subpel cost */
   2430                 i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
   2431 
   2432                 /*Calculate total cost*/
   2433                 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
   2434                 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
   2435 
   2436                 /*****************************************************************/
   2437                 /* We do not labor through the results if the total cost worse   */
   2438                 /* than the last of the results.                                 */
   2439                 /*****************************************************************/
   2440                 best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]);
   2441                 second_best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[1][index]);
   2442 
   2443                 if(i4_tot_cost < second_best_node_cost)
   2444                 {
   2445                     update_required = 2;
   2446 
   2447                     /*************************************************************/
   2448                     /* Identify where the current result isto be placed.Basically*/
   2449                     /* find the node which has cost just higher thannodeundertest*/
   2450                     /*************************************************************/
   2451                     if(i4_tot_cost < best_node_cost)
   2452                     {
   2453                         update_required = 1;
   2454                     }
   2455                     else if(i4_tot_cost == ps_subpel_refine_ctxt->i2_tot_cost[0][index])
   2456                     {
   2457                         update_required = 0;
   2458                     }
   2459                     if(update_required == 2)
   2460                     {
   2461                         ps_subpel_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
   2462                         ps_subpel_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
   2463                         ps_subpel_refine_ctxt->i2_mv_x[1][index] = ps_result_prms->i2_mv_x;
   2464                         ps_subpel_refine_ctxt->i2_mv_y[1][index] = ps_result_prms->i2_mv_y;
   2465                         ps_subpel_refine_ctxt->i2_ref_idx[1][index] = ps_result_prms->i1_ref_idx;
   2466                     }
   2467                     else if(update_required == 1)
   2468                     {
   2469                         ps_subpel_refine_ctxt->i2_tot_cost[1][index] =
   2470                             ps_subpel_refine_ctxt->i2_tot_cost[0][index];
   2471                         ps_subpel_refine_ctxt->i2_mv_cost[1][index] =
   2472                             ps_subpel_refine_ctxt->i2_mv_cost[0][index];
   2473                         ps_subpel_refine_ctxt->i2_mv_x[1][index] =
   2474                             ps_subpel_refine_ctxt->i2_mv_x[0][index];
   2475                         ps_subpel_refine_ctxt->i2_mv_y[1][index] =
   2476                             ps_subpel_refine_ctxt->i2_mv_y[0][index];
   2477                         ps_subpel_refine_ctxt->i2_ref_idx[1][index] =
   2478                             ps_subpel_refine_ctxt->i2_ref_idx[0][index];
   2479 
   2480                         ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
   2481                         ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
   2482                         ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x;
   2483                         ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y;
   2484                         ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx;
   2485                     }
   2486                 }
   2487             }
   2488         }
   2489     }
   2490 
   2491     {
   2492         WORD32 i4_count = 0;
   2493         for(i4_count = 0; i4_count < TOT_NUM_PARTS; i4_count++)
   2494         {
   2495             WORD32 j;
   2496             for(j = 0; j < 2; j++)
   2497             {
   2498                 if(ps_subpel_refine_ctxt->i2_tot_cost[j][i4_count] >= MAX_SIGNED_16BIT_VAL)
   2499                 {
   2500                     ps_subpel_refine_ctxt->ai2_fullpel_satd[j][i4_count] = MAX_SIGNED_16BIT_VAL;
   2501                 }
   2502             }
   2503         }
   2504     }
   2505 }
   2506 
   2507 void hme_calc_stim_injected_sad_and_2_best_results(
   2508     hme_search_prms_t *ps_search_prms,
   2509     wgt_pred_ctxt_t *ps_wt_inp_prms,
   2510     err_prms_t *ps_err_prms,
   2511     result_upd_prms_t *ps_result_prms,
   2512     U08 **ppu1_ref,
   2513     S32 i4_ref_stride)
   2514 {
   2515     mv_refine_ctxt_t *ps_mv_refine_ctxt;
   2516     search_node_t *ps_search_node;
   2517 
   2518     S32 i4_candt;
   2519     S32 i4_count;
   2520     S32 i4_inp_off;
   2521     S32 i4_ref_offset;
   2522     S32 i4_num_nodes;
   2523     ULWORD64 *au8_final_src_sigmaX, *au8_final_src_sigmaXSquared, au8_final_ref_sigmaX[17],
   2524         au8_final_ref_sigmaXSquared[17];
   2525     UWORD32 au4_4x4_ref_sigmaX[NUM_4X4], au4_4x4_ref_sigmaXSquared[NUM_4X4];
   2526     S32 *pi4_valid_part_ids;
   2527 
   2528     S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
   2529     S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
   2530     WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
   2531     WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
   2532     WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
   2533 
   2534     ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
   2535     i4_num_nodes = ps_search_prms->i4_num_search_nodes;
   2536     i4_inp_off = ps_search_prms->i4_cu_x_off;
   2537     i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
   2538     i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
   2539     ps_search_node = ps_search_prms->ps_search_nodes;
   2540     pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
   2541 
   2542     /* Set local pointer to point to partition level sigma values calculated in hme_refine */
   2543     au8_final_src_sigmaX = ps_search_prms->pu8_part_src_sigmaX;
   2544     au8_final_src_sigmaXSquared = ps_search_prms->pu8_part_src_sigmaXSquared;
   2545 
   2546     for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
   2547     {
   2548         {
   2549             WORD32 b, c, d;
   2550             UWORD8 *pu1_cur_ptr;
   2551             UWORD8 *pu1_ref_ptr;
   2552             UWORD16 au2_4x4_sad[NUM_4X4];
   2553 
   2554             if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
   2555             {
   2556                 continue;
   2557             }
   2558 
   2559             ps_err_prms->pu1_inp =
   2560                 ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
   2561             ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
   2562             ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
   2563             ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
   2564 
   2565             pu1_cur_ptr = ps_err_prms->pu1_inp;
   2566             pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
   2567 
   2568             /* Loop to compute the SAD's */
   2569             {
   2570                 memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
   2571                 for(b = 0; b < NUM_4X4; b++)
   2572                 {
   2573                     WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
   2574                     WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
   2575 
   2576                     for(c = 0; c < NUM_ROWS_IN_4X4; c++)
   2577                     {
   2578                         WORD32 z_cur = (cur_buf_stride)*c + t1;
   2579                         WORD32 z_ref = (ref_buf_stride)*c + t2;
   2580                         for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
   2581                         {
   2582                             au2_4x4_sad[b] += (UWORD16)ABS((
   2583                                 ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
   2584                         }
   2585                     }
   2586                 }
   2587 
   2588                 /* Compute sigmaX and sigmaX_Squared at 4x4 level for ref from ref_ptr */
   2589                 hme_compute_sigmaX_and_sigmaXSquared(
   2590                     pu1_ref_ptr,
   2591                     ref_buf_stride,
   2592                     au4_4x4_ref_sigmaX,
   2593                     au4_4x4_ref_sigmaXSquared,
   2594                     4,
   2595                     4,
   2596                     16,
   2597                     16,
   2598                     1,
   2599                     4);
   2600 
   2601                 pi4_sad_grid[PART_ID_NxN_TL] =
   2602                     (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
   2603                 pi4_sad_grid[PART_ID_NxN_TR] =
   2604                     (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
   2605                 pi4_sad_grid[PART_ID_NxN_BL] =
   2606                     (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
   2607                 pi4_sad_grid[PART_ID_NxN_BR] =
   2608                     (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
   2609                 pi4_sad_grid[PART_ID_Nx2N_L] =
   2610                     pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
   2611                 pi4_sad_grid[PART_ID_Nx2N_R] =
   2612                     pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
   2613                 pi4_sad_grid[PART_ID_2NxN_T] =
   2614                     pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
   2615                 pi4_sad_grid[PART_ID_2NxN_B] =
   2616                     pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
   2617                 pi4_sad_grid[PART_ID_nLx2N_L] =
   2618                     (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
   2619                 pi4_sad_grid[PART_ID_nRx2N_R] =
   2620                     (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
   2621                 pi4_sad_grid[PART_ID_2NxnU_T] =
   2622                     (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
   2623                 pi4_sad_grid[PART_ID_2NxnD_B] =
   2624                     (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
   2625                 pi4_sad_grid[PART_ID_2Nx2N] =
   2626                     pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
   2627                 pi4_sad_grid[PART_ID_2NxnU_B] =
   2628                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
   2629                 pi4_sad_grid[PART_ID_2NxnD_T] =
   2630                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
   2631                 pi4_sad_grid[PART_ID_nRx2N_L] =
   2632                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
   2633                 pi4_sad_grid[PART_ID_nLx2N_R] =
   2634                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
   2635             }
   2636         }
   2637 
   2638         {
   2639             S32 i4_sad, i4_mv_cost, i4_tot_cost;
   2640             S32 best_node_cost;
   2641             S32 second_best_node_cost;
   2642             ULWORD64 u8_temp_var, u8_temp_var1;
   2643             ULWORD64 u8_ref_X_Square, u8_pure_dist, u8_src_var, u8_ref_var;
   2644 
   2645             {
   2646                 S16 mvdx1, mvdy1;
   2647                 S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
   2648                 search_results_t *ps_search_results = ps_result_prms->ps_search_results;
   2649                 S32 pred_lx = i4_search_idx;
   2650 
   2651                 pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
   2652                 pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
   2653                 search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
   2654 
   2655                 S32 inp_shift = 2;
   2656                 S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
   2657                 S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
   2658                 S32 lambda = ps_pred_ctxt->lambda;
   2659                 S32 rnd = 1 << (lambda_q_shift - 1);
   2660                 S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
   2661                 S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
   2662                 S32 ref_bits =
   2663                     ps_pred_ctxt
   2664                         ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
   2665 
   2666                 COMPUTE_DIFF_MV(
   2667                     mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
   2668 
   2669                 mvdx1 = ABS(mvdx1);
   2670                 mvdy1 = ABS(mvdy1);
   2671 
   2672                 i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
   2673                              (mvdy1 > 0) + ref_bits + 2;
   2674 
   2675                 i4_mv_cost *= lambda;
   2676                 i4_mv_cost += rnd;
   2677                 i4_mv_cost >>= lambda_q_shift;
   2678 
   2679                 i4_mv_cost = CLIP_U16(i4_mv_cost);
   2680             }
   2681 
   2682             for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
   2683             {
   2684                 S32 i4_stim_injected_sad;
   2685                 S32 i4_stim_injected_cost;
   2686                 S32 i4_noise_term;
   2687                 unsigned long u4_shift_val;
   2688                 S32 i4_bits_req;
   2689 
   2690                 S32 update_required = 0;
   2691                 S32 part_id = pi4_valid_part_ids[i4_count];
   2692                 S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
   2693 
   2694                 WORD32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
   2695 
   2696                 S32 i4_inv_wt = ps_wt_inp_prms->a_inv_wpred_wt[ps_search_node->i1_ref_idx];
   2697 
   2698                 if(ps_search_prms->i4_alpha_stim_multiplier)
   2699                 {
   2700                     /* Compute ref sigmaX and sigmaX_Squared values for valid partitions from previously computed ref 4x4 level values */
   2701                     hme_compute_final_sigma_of_pu_from_base_blocks(
   2702                         au4_4x4_ref_sigmaX,
   2703                         au4_4x4_ref_sigmaXSquared,
   2704                         au8_final_ref_sigmaX,
   2705                         au8_final_ref_sigmaXSquared,
   2706                         16,
   2707                         4,
   2708                         part_id,
   2709                         4);
   2710 
   2711                     u8_ref_X_Square =
   2712                         (au8_final_ref_sigmaX[part_id] * au8_final_ref_sigmaX[part_id]);
   2713                     u8_ref_var = (au8_final_ref_sigmaXSquared[part_id] - u8_ref_X_Square);
   2714 
   2715                     /* Multiply un-normalized src_var with inv_wt if its not same as default wt */
   2716                     /* and shift the resulting src_var if its more than 27 bits to avoid overflow */
   2717                     /* The amount by which it is shifted is passed on to u4_shift_val and applied equally on ref_var */
   2718                     u4_shift_val = ihevce_calc_stim_injected_variance(
   2719                         au8_final_src_sigmaX,
   2720                         au8_final_src_sigmaXSquared,
   2721                         &u8_src_var,
   2722                         i4_inv_wt,
   2723                         ps_wt_inp_prms->ai4_shift_val[ps_search_node->i1_ref_idx],
   2724                         ps_wt_inp_prms->wpred_log_wdc,
   2725                         part_id);
   2726 
   2727                     u8_ref_var = u8_ref_var >> u4_shift_val;
   2728 
   2729                     /* Do the same check on ref_var to avoid overflow and apply similar shift on src_var */
   2730                     GETRANGE64(i4_bits_req, u8_ref_var);
   2731 
   2732                     if(i4_bits_req > 27)
   2733                     {
   2734                         u8_ref_var = u8_ref_var >> (i4_bits_req - 27);
   2735                         u8_src_var = u8_src_var >> (i4_bits_req - 27);
   2736                     }
   2737 
   2738                     if(u8_src_var == u8_ref_var)
   2739                     {
   2740                         u8_temp_var = (1 << STIM_Q_FORMAT);
   2741                     }
   2742                     else
   2743                     {
   2744                         u8_temp_var = (2 * u8_src_var * u8_ref_var);
   2745                         u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
   2746                         u8_temp_var1 = (u8_src_var * u8_src_var) + (u8_ref_var * u8_ref_var);
   2747                         u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
   2748                         u8_temp_var = (u8_temp_var / u8_temp_var1);
   2749                     }
   2750 
   2751                     i4_noise_term = (UWORD32)u8_temp_var;
   2752 
   2753                     ASSERT(i4_noise_term >= 0);
   2754 
   2755                     i4_noise_term *= ps_search_prms->i4_alpha_stim_multiplier;
   2756                 }
   2757                 else
   2758                 {
   2759                     i4_noise_term = 0;
   2760                 }
   2761                 u8_pure_dist = pi4_sad_grid[part_id];
   2762                 u8_pure_dist *= ((1 << (i4_q_level)) - (i4_noise_term));
   2763                 u8_pure_dist += (1 << ((i4_q_level)-1));
   2764                 i4_stim_injected_sad = (UWORD32)(u8_pure_dist >> (i4_q_level));
   2765 
   2766                 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
   2767                 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
   2768                 i4_stim_injected_sad = CLIP3(i4_stim_injected_sad, 0, 0x7fff);
   2769                 i4_stim_injected_cost = CLIP_S16(i4_stim_injected_sad + i4_mv_cost);
   2770 
   2771                 best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_stim_injected_cost[0][index]);
   2772                 second_best_node_cost =
   2773                     CLIP_S16(ps_mv_refine_ctxt->i2_stim_injected_cost[1][index]);
   2774 
   2775                 if(i4_stim_injected_cost < second_best_node_cost)
   2776                 {
   2777                     update_required = 2;
   2778 
   2779                     if(i4_stim_injected_cost < best_node_cost)
   2780                     {
   2781                         update_required = 1;
   2782                     }
   2783                     else if(i4_stim_injected_cost == best_node_cost)
   2784                     {
   2785                         update_required = 0;
   2786                     }
   2787 
   2788                     if(update_required == 2)
   2789                     {
   2790                         ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
   2791                         ps_mv_refine_ctxt->i2_stim_injected_cost[1][index] = i4_stim_injected_cost;
   2792                         ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
   2793                         ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
   2794                         ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
   2795                         ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
   2796                     }
   2797                     else if(update_required == 1)
   2798                     {
   2799                         ps_mv_refine_ctxt->i2_tot_cost[1][index] =
   2800                             ps_mv_refine_ctxt->i2_tot_cost[0][index];
   2801                         ps_mv_refine_ctxt->i2_stim_injected_cost[1][index] =
   2802                             ps_mv_refine_ctxt->i2_stim_injected_cost[0][index];
   2803                         ps_mv_refine_ctxt->i2_mv_cost[1][index] =
   2804                             ps_mv_refine_ctxt->i2_mv_cost[0][index];
   2805                         ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_mv_refine_ctxt->i2_mv_x[0][index];
   2806                         ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_mv_refine_ctxt->i2_mv_y[0][index];
   2807                         ps_mv_refine_ctxt->i2_ref_idx[1][index] =
   2808                             ps_mv_refine_ctxt->i2_ref_idx[0][index];
   2809 
   2810                         ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
   2811                         ps_mv_refine_ctxt->i2_stim_injected_cost[0][index] = i4_stim_injected_cost;
   2812                         ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
   2813                         ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
   2814                         ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
   2815                         ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
   2816                     }
   2817                 }
   2818             }
   2819         }
   2820 
   2821         ps_search_node++;
   2822     }
   2823 
   2824     {
   2825         WORD32 i4_i;
   2826         WORD32 part_id;
   2827         search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
   2828         for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
   2829         {
   2830             part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
   2831             if(ps_mv_refine_ctxt->i2_stim_injected_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
   2832             {
   2833                 ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
   2834                 ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
   2835                 ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
   2836 
   2837                 ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
   2838             }
   2839             if(ps_mv_refine_ctxt->i2_stim_injected_cost[1][part_id] >= MAX_SIGNED_16BIT_VAL)
   2840             {
   2841                 ASSERT(ps_mv_refine_ctxt->i2_mv_cost[1][part_id] == MAX_SIGNED_16BIT_VAL);
   2842                 ASSERT(ps_mv_refine_ctxt->i2_mv_x[1][part_id] == 0);
   2843                 ASSERT(ps_mv_refine_ctxt->i2_mv_y[1][part_id] == 0);
   2844 
   2845                 ps_mv_refine_ctxt->i2_ref_idx[1][part_id] = ps_search_node->i1_ref_idx;
   2846             }
   2847         }
   2848     }
   2849 }
   2850 
   2851 void hme_calc_sad_and_1_best_result(
   2852     hme_search_prms_t *ps_search_prms,
   2853     wgt_pred_ctxt_t *ps_wt_inp_prms,
   2854     err_prms_t *ps_err_prms,
   2855     result_upd_prms_t *ps_result_prms,
   2856     U08 **ppu1_ref,
   2857     S32 i4_ref_stride)
   2858 {
   2859     S32 i4_candt;
   2860     S32 i4_inp_off;
   2861     S32 i4_ref_offset;
   2862     S32 i4_num_nodes;
   2863 
   2864     S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
   2865     S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
   2866     WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
   2867     WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
   2868     WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
   2869 
   2870     mv_refine_ctxt_t *ps_mv_refine_ctxt;
   2871     search_node_t *ps_search_node;
   2872 
   2873     ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
   2874     i4_num_nodes = ps_search_prms->i4_num_search_nodes;
   2875     i4_inp_off = ps_search_prms->i4_cu_x_off;
   2876     i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
   2877     i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
   2878     ps_search_node = ps_search_prms->ps_search_nodes;
   2879 
   2880     for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
   2881     {
   2882         /**********************************************************************/
   2883         /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID   */
   2884         /**********************************************************************/
   2885         {
   2886             WORD32 b, c, d;
   2887             UWORD8 *pu1_cur_ptr;
   2888             UWORD8 *pu1_ref_ptr;
   2889             UWORD16 au2_4x4_sad[NUM_4X4];
   2890 
   2891             if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
   2892             {
   2893                 continue;
   2894             }
   2895 
   2896             ps_err_prms->pu1_inp =
   2897                 ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
   2898             ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
   2899             ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
   2900             ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
   2901 
   2902             pu1_cur_ptr = ps_err_prms->pu1_inp;
   2903             pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
   2904 
   2905             /* Loop to compute the SAD's */
   2906             {
   2907                 memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
   2908                 for(b = 0; b < NUM_4X4; b++)
   2909                 {
   2910                     WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
   2911                     WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
   2912 
   2913                     for(c = 0; c < NUM_ROWS_IN_4X4; c++)
   2914                     {
   2915                         WORD32 z_cur = (cur_buf_stride)*c + t1;
   2916                         WORD32 z_ref = (ref_buf_stride)*c + t2;
   2917                         for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
   2918                         {
   2919                             au2_4x4_sad[b] += (UWORD16)ABS((
   2920                                 ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
   2921                         }
   2922                     }
   2923                 }
   2924 
   2925                 pi4_sad_grid[PART_ID_NxN_TL] =
   2926                     (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
   2927                 pi4_sad_grid[PART_ID_NxN_TR] =
   2928                     (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
   2929                 pi4_sad_grid[PART_ID_NxN_BL] =
   2930                     (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
   2931                 pi4_sad_grid[PART_ID_NxN_BR] =
   2932                     (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
   2933                 pi4_sad_grid[PART_ID_Nx2N_L] =
   2934                     pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
   2935                 pi4_sad_grid[PART_ID_Nx2N_R] =
   2936                     pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
   2937                 pi4_sad_grid[PART_ID_2NxN_T] =
   2938                     pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
   2939                 pi4_sad_grid[PART_ID_2NxN_B] =
   2940                     pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
   2941                 pi4_sad_grid[PART_ID_nLx2N_L] =
   2942                     (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
   2943                 pi4_sad_grid[PART_ID_nRx2N_R] =
   2944                     (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
   2945                 pi4_sad_grid[PART_ID_2NxnU_T] =
   2946                     (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
   2947                 pi4_sad_grid[PART_ID_2NxnD_B] =
   2948                     (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
   2949                 pi4_sad_grid[PART_ID_2Nx2N] =
   2950                     pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
   2951                 pi4_sad_grid[PART_ID_2NxnU_B] =
   2952                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
   2953                 pi4_sad_grid[PART_ID_2NxnD_T] =
   2954                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
   2955                 pi4_sad_grid[PART_ID_nRx2N_L] =
   2956                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
   2957                 pi4_sad_grid[PART_ID_nLx2N_R] =
   2958                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
   2959             }
   2960         }
   2961 
   2962         {
   2963             S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
   2964             S32 *pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
   2965             S32 best_node_cost;
   2966             S32 second_best_node_cost;
   2967 
   2968             {
   2969                 S16 mvdx1, mvdy1;
   2970                 S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
   2971                 search_results_t *ps_search_results = ps_result_prms->ps_search_results;
   2972                 S32 pred_lx = i4_search_idx;
   2973 
   2974                 pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
   2975                 pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
   2976                 search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
   2977 
   2978                 S32 inp_shift = 2;
   2979                 S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
   2980                 S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
   2981                 S32 lambda = ps_pred_ctxt->lambda;
   2982                 S32 rnd = 1 << (lambda_q_shift - 1);
   2983                 S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
   2984                 S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
   2985                 S32 ref_bits =
   2986                     ps_pred_ctxt
   2987                         ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
   2988 
   2989                 COMPUTE_DIFF_MV(
   2990                     mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
   2991 
   2992                 mvdx1 = ABS(mvdx1);
   2993                 mvdy1 = ABS(mvdy1);
   2994 
   2995                 i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
   2996                              (mvdy1 > 0) + ref_bits + 2;
   2997 
   2998                 i4_mv_cost *= lambda;
   2999                 i4_mv_cost += rnd;
   3000                 i4_mv_cost >>= lambda_q_shift;
   3001 
   3002                 i4_mv_cost = CLIP_U16(i4_mv_cost);
   3003             }
   3004 
   3005             /*For each valid partition, update the refine_prm structure to reflect the best and second
   3006             best candidates for that partition*/
   3007 
   3008             for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
   3009             {
   3010                 S32 update_required = 0;
   3011                 S32 part_id = pi4_valid_part_ids[i4_count];
   3012                 S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
   3013 
   3014                 /*Calculate total cost*/
   3015                 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
   3016                 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
   3017 
   3018                 /*****************************************************************/
   3019                 /* We do not labor through the results if the total cost worse   */
   3020                 /* than the last of the results.                                 */
   3021                 /*****************************************************************/
   3022                 best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_tot_cost[0][index]);
   3023                 second_best_node_cost = SHRT_MAX;
   3024 
   3025                 if(i4_tot_cost < second_best_node_cost)
   3026                 {
   3027                     update_required = 0;
   3028 
   3029                     /*************************************************************/
   3030                     /* Identify where the current result isto be placed.Basically*/
   3031                     /* find the node which has cost just higher thannodeundertest*/
   3032                     /*************************************************************/
   3033                     if(i4_tot_cost < best_node_cost)
   3034                     {
   3035                         update_required = 1;
   3036                     }
   3037                     else if(i4_tot_cost == best_node_cost)
   3038                     {
   3039                         update_required = 0;
   3040                     }
   3041 
   3042                     if(update_required == 2)
   3043                     {
   3044                         ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
   3045                         ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
   3046                         ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
   3047                         ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
   3048                         ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
   3049                     }
   3050                     else if(update_required == 1)
   3051                     {
   3052                         ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
   3053                         ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
   3054                         ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
   3055                         ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
   3056                         ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
   3057                     }
   3058                 }
   3059             }
   3060         }
   3061         ps_search_node++;
   3062     }
   3063 
   3064     {
   3065         WORD32 i4_i;
   3066         WORD32 part_id;
   3067         search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
   3068         for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
   3069         {
   3070             part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
   3071             if(ps_mv_refine_ctxt->i2_tot_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
   3072             {
   3073                 ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
   3074                 ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
   3075                 ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
   3076 
   3077                 ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
   3078             }
   3079         }
   3080     }
   3081 }
   3082 
   3083 void hme_calc_stim_injected_sad_and_1_best_result(
   3084     hme_search_prms_t *ps_search_prms,
   3085     wgt_pred_ctxt_t *ps_wt_inp_prms,
   3086     err_prms_t *ps_err_prms,
   3087     result_upd_prms_t *ps_result_prms,
   3088     U08 **ppu1_ref,
   3089     S32 i4_ref_stride)
   3090 {
   3091     mv_refine_ctxt_t *ps_mv_refine_ctxt;
   3092     search_node_t *ps_search_node;
   3093 
   3094     S32 i4_candt;
   3095     S32 i4_count;
   3096     S32 i4_inp_off;
   3097     S32 i4_ref_offset;
   3098     S32 i4_num_nodes;
   3099     ULWORD64 *au8_final_src_sigmaX, *au8_final_src_sigmaXSquared, au8_final_ref_sigmaX[17],
   3100         au8_final_ref_sigmaXSquared[17];
   3101     UWORD32 au4_4x4_ref_sigmaX[NUM_4X4], au4_4x4_ref_sigmaXSquared[NUM_4X4];
   3102     S32 *pi4_valid_part_ids;
   3103 
   3104     S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
   3105     S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
   3106     WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
   3107     WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
   3108     WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
   3109 
   3110     ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
   3111     i4_num_nodes = ps_search_prms->i4_num_search_nodes;
   3112     i4_inp_off = ps_search_prms->i4_cu_x_off;
   3113     i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
   3114     i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
   3115     ps_search_node = ps_search_prms->ps_search_nodes;
   3116     pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
   3117 
   3118     /* Set local pointer to point to partition level sigma values calculated in hme_refine */
   3119     au8_final_src_sigmaX = ps_search_prms->pu8_part_src_sigmaX;
   3120     au8_final_src_sigmaXSquared = ps_search_prms->pu8_part_src_sigmaXSquared;
   3121 
   3122     for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
   3123     {
   3124         {
   3125             WORD32 b, c, d;
   3126             UWORD8 *pu1_cur_ptr;
   3127             UWORD8 *pu1_ref_ptr;
   3128             UWORD16 au2_4x4_sad[NUM_4X4];
   3129 
   3130             if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
   3131             {
   3132                 continue;
   3133             }
   3134 
   3135             ps_err_prms->pu1_inp =
   3136                 ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
   3137             ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
   3138             ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
   3139             ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
   3140 
   3141             pu1_cur_ptr = ps_err_prms->pu1_inp;
   3142             pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
   3143 
   3144             /* Loop to compute the SAD's */
   3145             {
   3146                 memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
   3147                 for(b = 0; b < NUM_4X4; b++)
   3148                 {
   3149                     WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
   3150                     WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
   3151 
   3152                     for(c = 0; c < NUM_ROWS_IN_4X4; c++)
   3153                     {
   3154                         WORD32 z_cur = (cur_buf_stride)*c + t1;
   3155                         WORD32 z_ref = (ref_buf_stride)*c + t2;
   3156                         for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
   3157                         {
   3158                             au2_4x4_sad[b] += (UWORD16)ABS((
   3159                                 ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
   3160                         }
   3161                     }
   3162                 }
   3163 
   3164                 /* Compute sigmaX and sigmaX_Squared at 4x4 level for ref from ref_ptr */
   3165                 hme_compute_sigmaX_and_sigmaXSquared(
   3166                     pu1_ref_ptr,
   3167                     ref_buf_stride,
   3168                     au4_4x4_ref_sigmaX,
   3169                     au4_4x4_ref_sigmaXSquared,
   3170                     4,
   3171                     4,
   3172                     16,
   3173                     16,
   3174                     1,
   3175                     4);
   3176 
   3177                 pi4_sad_grid[PART_ID_NxN_TL] =
   3178                     (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
   3179                 pi4_sad_grid[PART_ID_NxN_TR] =
   3180                     (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
   3181                 pi4_sad_grid[PART_ID_NxN_BL] =
   3182                     (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
   3183                 pi4_sad_grid[PART_ID_NxN_BR] =
   3184                     (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
   3185                 pi4_sad_grid[PART_ID_Nx2N_L] =
   3186                     pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
   3187                 pi4_sad_grid[PART_ID_Nx2N_R] =
   3188                     pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
   3189                 pi4_sad_grid[PART_ID_2NxN_T] =
   3190                     pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
   3191                 pi4_sad_grid[PART_ID_2NxN_B] =
   3192                     pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
   3193                 pi4_sad_grid[PART_ID_nLx2N_L] =
   3194                     (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
   3195                 pi4_sad_grid[PART_ID_nRx2N_R] =
   3196                     (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
   3197                 pi4_sad_grid[PART_ID_2NxnU_T] =
   3198                     (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
   3199                 pi4_sad_grid[PART_ID_2NxnD_B] =
   3200                     (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
   3201                 pi4_sad_grid[PART_ID_2Nx2N] =
   3202                     pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
   3203                 pi4_sad_grid[PART_ID_2NxnU_B] =
   3204                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
   3205                 pi4_sad_grid[PART_ID_2NxnD_T] =
   3206                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
   3207                 pi4_sad_grid[PART_ID_nRx2N_L] =
   3208                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
   3209                 pi4_sad_grid[PART_ID_nLx2N_R] =
   3210                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
   3211             }
   3212         }
   3213 
   3214         {
   3215             S32 i4_sad, i4_mv_cost, i4_tot_cost;
   3216             S32 best_node_cost;
   3217             S32 second_best_node_cost;
   3218             ULWORD64 u8_temp_var, u8_temp_var1;
   3219             ULWORD64 u8_ref_X_Square, u8_pure_dist, u8_src_var, u8_ref_var;
   3220 
   3221             {
   3222                 S16 mvdx1, mvdy1;
   3223                 S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
   3224                 search_results_t *ps_search_results = ps_result_prms->ps_search_results;
   3225                 S32 pred_lx = i4_search_idx;
   3226 
   3227                 pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
   3228                 pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
   3229                 search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
   3230 
   3231                 S32 inp_shift = 2;
   3232                 S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
   3233                 S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
   3234                 S32 lambda = ps_pred_ctxt->lambda;
   3235                 S32 rnd = 1 << (lambda_q_shift - 1);
   3236                 S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
   3237                 S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
   3238                 S32 ref_bits =
   3239                     ps_pred_ctxt
   3240                         ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
   3241 
   3242                 COMPUTE_DIFF_MV(
   3243                     mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
   3244 
   3245                 mvdx1 = ABS(mvdx1);
   3246                 mvdy1 = ABS(mvdy1);
   3247 
   3248                 i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
   3249                              (mvdy1 > 0) + ref_bits + 2;
   3250 
   3251                 i4_mv_cost *= lambda;
   3252                 i4_mv_cost += rnd;
   3253                 i4_mv_cost >>= lambda_q_shift;
   3254 
   3255                 i4_mv_cost = CLIP_U16(i4_mv_cost);
   3256             }
   3257 
   3258             for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
   3259             {
   3260                 S32 i4_stim_injected_sad;
   3261                 S32 i4_stim_injected_cost;
   3262                 S32 i4_noise_term;
   3263                 unsigned long u4_shift_val;
   3264                 S32 i4_bits_req;
   3265 
   3266                 S32 update_required = 0;
   3267                 S32 part_id = pi4_valid_part_ids[i4_count];
   3268                 S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
   3269 
   3270                 WORD32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
   3271 
   3272                 S32 i4_inv_wt = ps_wt_inp_prms->a_inv_wpred_wt[ps_search_node->i1_ref_idx];
   3273 
   3274                 if(ps_search_prms->i4_alpha_stim_multiplier)
   3275                 {
   3276                     /* Compute ref sigmaX and sigmaX_Squared values for valid partitions from previously computed ref 4x4 level values */
   3277                     hme_compute_final_sigma_of_pu_from_base_blocks(
   3278                         au4_4x4_ref_sigmaX,
   3279                         au4_4x4_ref_sigmaXSquared,
   3280                         au8_final_ref_sigmaX,
   3281                         au8_final_ref_sigmaXSquared,
   3282                         16,
   3283                         4,
   3284                         part_id,
   3285                         4);
   3286 
   3287                     u8_ref_X_Square =
   3288                         (au8_final_ref_sigmaX[part_id] * au8_final_ref_sigmaX[part_id]);
   3289                     u8_ref_var = (au8_final_ref_sigmaXSquared[part_id] - u8_ref_X_Square);
   3290 
   3291                     /* Multiply un-normalized src_var with inv_wt if its not same as default wt */
   3292                     /* and shift the resulting src_var if its more than 27 bits to avoid overflow */
   3293                     /* The amount by which it is shifted is passed on to u4_shift_val and applied equally on ref_var */
   3294                     u4_shift_val = ihevce_calc_stim_injected_variance(
   3295                         au8_final_src_sigmaX,
   3296                         au8_final_src_sigmaXSquared,
   3297                         &u8_src_var,
   3298                         i4_inv_wt,
   3299                         ps_wt_inp_prms->ai4_shift_val[ps_search_node->i1_ref_idx],
   3300                         ps_wt_inp_prms->wpred_log_wdc,
   3301                         part_id);
   3302 
   3303                     u8_ref_var = u8_ref_var >> u4_shift_val;
   3304 
   3305                     /* Do the same check on ref_var to avoid overflow and apply similar shift on src_var */
   3306                     GETRANGE64(i4_bits_req, u8_ref_var);
   3307 
   3308                     if(i4_bits_req > 27)
   3309                     {
   3310                         u8_ref_var = u8_ref_var >> (i4_bits_req - 27);
   3311                         u8_src_var = u8_src_var >> (i4_bits_req - 27);
   3312                     }
   3313 
   3314                     if(u8_src_var == u8_ref_var)
   3315                     {
   3316                         u8_temp_var = (1 << STIM_Q_FORMAT);
   3317                     }
   3318                     else
   3319                     {
   3320                         u8_temp_var = (2 * u8_src_var * u8_ref_var);
   3321                         u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
   3322                         u8_temp_var1 = (u8_src_var * u8_src_var) + (u8_ref_var * u8_ref_var);
   3323                         u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
   3324                         u8_temp_var = (u8_temp_var / u8_temp_var1);
   3325                     }
   3326 
   3327                     i4_noise_term = (UWORD32)u8_temp_var;
   3328 
   3329                     ASSERT(i4_noise_term >= 0);
   3330 
   3331                     i4_noise_term *= ps_search_prms->i4_alpha_stim_multiplier;
   3332                 }
   3333                 else
   3334                 {
   3335                     i4_noise_term = 0;
   3336                 }
   3337                 u8_pure_dist = pi4_sad_grid[part_id];
   3338                 u8_pure_dist *= ((1 << (i4_q_level)) - (i4_noise_term));
   3339                 u8_pure_dist += (1 << ((i4_q_level)-1));
   3340                 i4_stim_injected_sad = (UWORD32)(u8_pure_dist >> (i4_q_level));
   3341 
   3342                 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
   3343                 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
   3344                 i4_stim_injected_sad = CLIP3(i4_stim_injected_sad, 0, 0x7fff);
   3345                 i4_stim_injected_cost = CLIP_S16(i4_stim_injected_sad + i4_mv_cost);
   3346 
   3347                 best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_stim_injected_cost[0][index]);
   3348                 second_best_node_cost = SHRT_MAX;
   3349 
   3350                 if(i4_stim_injected_cost < second_best_node_cost)
   3351                 {
   3352                     update_required = 0;
   3353 
   3354                     if(i4_stim_injected_cost < best_node_cost)
   3355                     {
   3356                         update_required = 1;
   3357                     }
   3358                     else if(i4_stim_injected_cost == best_node_cost)
   3359                     {
   3360                         update_required = 0;
   3361                     }
   3362 
   3363                     if(update_required == 2)
   3364                     {
   3365                         ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
   3366                         ps_mv_refine_ctxt->i2_stim_injected_cost[1][index] = i4_stim_injected_cost;
   3367                         ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
   3368                         ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
   3369                         ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
   3370                         ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
   3371                     }
   3372                     else if(update_required == 1)
   3373                     {
   3374                         ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
   3375                         ps_mv_refine_ctxt->i2_stim_injected_cost[0][index] = i4_stim_injected_cost;
   3376                         ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
   3377                         ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
   3378                         ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
   3379                         ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
   3380                     }
   3381                 }
   3382             }
   3383         }
   3384 
   3385         ps_search_node++;
   3386     }
   3387 
   3388     {
   3389         WORD32 i4_i;
   3390         WORD32 part_id;
   3391         search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
   3392         for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
   3393         {
   3394             part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
   3395             if(ps_mv_refine_ctxt->i2_stim_injected_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
   3396             {
   3397                 ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
   3398                 ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
   3399                 ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
   3400 
   3401                 ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
   3402             }
   3403         }
   3404     }
   3405 }
   3406 
   3407 void hme_calc_sad_and_1_best_result_subpel(
   3408     err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
   3409 {
   3410     S32 i4_candt;
   3411     S32 i4_num_nodes;
   3412 
   3413     S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
   3414 
   3415     S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
   3416     WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
   3417     WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
   3418     WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
   3419 
   3420     mv_refine_ctxt_t *ps_subpel_refine_ctxt;
   3421     ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
   3422     i4_num_nodes = 1;
   3423 
   3424     /* Run through each of the candts in a loop */
   3425     for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
   3426     {
   3427         /**********************************************************************/
   3428         /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID   */
   3429         /**********************************************************************/
   3430         {
   3431             WORD32 b, c, d;
   3432             UWORD8 *pu1_cur_ptr;
   3433             UWORD8 *pu1_ref_ptr;
   3434             UWORD16 au2_4x4_sad[NUM_4X4];
   3435 
   3436             pu1_cur_ptr = ps_err_prms->pu1_inp;
   3437             pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
   3438 
   3439             /* Loop to compute the SAD's */
   3440             {
   3441                 memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
   3442                 for(b = 0; b < NUM_4X4; b++)
   3443                 {
   3444                     WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
   3445                     WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
   3446 
   3447                     for(c = 0; c < NUM_ROWS_IN_4X4; c++)
   3448                     {
   3449                         WORD32 z_cur = (cur_buf_stride)*c + t1;
   3450                         WORD32 z_ref = (ref_buf_stride)*c + t2;
   3451                         for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
   3452                         {
   3453                             au2_4x4_sad[b] += (UWORD16)ABS((
   3454                                 ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
   3455                         }
   3456                     }
   3457                 }
   3458 
   3459                 pi4_sad_grid[PART_ID_NxN_TL] =
   3460                     (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
   3461                 pi4_sad_grid[PART_ID_NxN_TR] =
   3462                     (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
   3463                 pi4_sad_grid[PART_ID_NxN_BL] =
   3464                     (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
   3465                 pi4_sad_grid[PART_ID_NxN_BR] =
   3466                     (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
   3467                 pi4_sad_grid[PART_ID_Nx2N_L] =
   3468                     pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
   3469                 pi4_sad_grid[PART_ID_Nx2N_R] =
   3470                     pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
   3471                 pi4_sad_grid[PART_ID_2NxN_T] =
   3472                     pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
   3473                 pi4_sad_grid[PART_ID_2NxN_B] =
   3474                     pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
   3475                 pi4_sad_grid[PART_ID_nLx2N_L] =
   3476                     (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
   3477                 pi4_sad_grid[PART_ID_nRx2N_R] =
   3478                     (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
   3479                 pi4_sad_grid[PART_ID_2NxnU_T] =
   3480                     (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
   3481                 pi4_sad_grid[PART_ID_2NxnD_B] =
   3482                     (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
   3483                 pi4_sad_grid[PART_ID_2Nx2N] =
   3484                     pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
   3485                 pi4_sad_grid[PART_ID_2NxnU_B] =
   3486                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
   3487                 pi4_sad_grid[PART_ID_2NxnD_T] =
   3488                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
   3489                 pi4_sad_grid[PART_ID_nRx2N_L] =
   3490                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
   3491                 pi4_sad_grid[PART_ID_nLx2N_R] =
   3492                     pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
   3493             }
   3494         }
   3495         /**********************************************************************/
   3496         /* CALL THE FUNCTION THAT COMPUTES UPDATES THE BEST RESULTS           */
   3497         /**********************************************************************/
   3498         {
   3499             S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
   3500             S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0];
   3501             S32 best_node_cost;
   3502             S32 second_best_node_cost;
   3503 
   3504             /*For each valid partition, update the refine_prm structure to reflect the best and second
   3505             best candidates for that partition*/
   3506 
   3507             for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++)
   3508             {
   3509                 S32 update_required = 0;
   3510                 S32 part_id = pi4_valid_part_ids[i4_count];
   3511                 S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
   3512 
   3513                 /* Use a pre-computed cost instead of freshly evaluating subpel cost */
   3514                 i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
   3515 
   3516                 /*Calculate total cost*/
   3517                 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
   3518                 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
   3519 
   3520                 /*****************************************************************/
   3521                 /* We do not labor through the results if the total cost worse   */
   3522                 /* than the last of the results.                                 */
   3523                 /*****************************************************************/
   3524                 best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]);
   3525                 second_best_node_cost = SHRT_MAX;
   3526 
   3527                 if(i4_tot_cost < second_best_node_cost)
   3528                 {
   3529                     update_required = 0;
   3530 
   3531                     /*************************************************************/
   3532                     /* Identify where the current result isto be placed.Basically*/
   3533                     /* find the node which has cost just higher thannodeundertest*/
   3534                     /*************************************************************/
   3535                     if(i4_tot_cost < best_node_cost)
   3536                     {
   3537                         update_required = 1;
   3538                     }
   3539                     else if(i4_tot_cost == ps_subpel_refine_ctxt->i2_tot_cost[0][index])
   3540                     {
   3541                         update_required = 0;
   3542                     }
   3543                     if(update_required == 2)
   3544                     {
   3545                         ps_subpel_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
   3546                         ps_subpel_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
   3547                         ps_subpel_refine_ctxt->i2_mv_x[1][index] = ps_result_prms->i2_mv_x;
   3548                         ps_subpel_refine_ctxt->i2_mv_y[1][index] = ps_result_prms->i2_mv_y;
   3549                         ps_subpel_refine_ctxt->i2_ref_idx[1][index] = ps_result_prms->i1_ref_idx;
   3550                     }
   3551                     else if(update_required == 1)
   3552                     {
   3553                         ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
   3554                         ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
   3555                         ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x;
   3556                         ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y;
   3557                         ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx;
   3558                     }
   3559                 }
   3560             }
   3561         }
   3562     }
   3563 
   3564     {
   3565         WORD32 i4_count = 0;
   3566         for(i4_count = 0; i4_count < TOT_NUM_PARTS; i4_count++)
   3567         {
   3568             if(ps_subpel_refine_ctxt->i2_tot_cost[0][i4_count] >= MAX_SIGNED_16BIT_VAL)
   3569             {
   3570                 ps_subpel_refine_ctxt->ai2_fullpel_satd[0][i4_count] = MAX_SIGNED_16BIT_VAL;
   3571             }
   3572         }
   3573     }
   3574 }
   3575 
   3576 /**
   3577 ********************************************************************************
   3578 *  @fn     hme_calc_pt_sad_and_result_explicit(hme_search_prms_t *ps_search_prms,
   3579 *                                              wgt_pred_ctxt_t *ps_wt_inp_prms,
   3580 *                                              err_prms_t *ps_err_prms,
   3581 *                                              result_upd_prms_t *ps_result_prms,
   3582 *                                              U08 **ppu1_ref,
   3583 *                                              S32 i4_ref_stride)
   3584 *
   3585 *  @brief   Run thorugh the provided candidates and compute the point SAD and
   3586 *           cost and update the results in the order
   3587 *
   3588 *  @param[in]  ps_search_prms
   3589 *  @param[in]  ps_wt_inp_prms
   3590 *  @param[in]  ps_err_prms
   3591 *  @param[out] ps_result_prms
   3592 *  @param[in]  ppu1_ref
   3593 *  @param[in]  i4_ref_stride
   3594 *
   3595 *  @return   None
   3596 ********************************************************************************
   3597 */
   3598 
   3599 void hme_calc_pt_sad_and_result_explicit(
   3600     hme_search_prms_t *ps_search_prms,
   3601     wgt_pred_ctxt_t *ps_wt_inp_prms,
   3602     err_prms_t *ps_err_prms,
   3603     result_upd_prms_t *ps_result_prms,
   3604     U08 **ppu1_ref,
   3605     S32 i4_ref_stride)
   3606 {
   3607     WORD32 i4_grid_mask, i4_part_mask, i4_num_results, i4_candt, i4_num_nodes;
   3608     WORD32 i4_inp_stride, i4_inp_off, i4_ref_offset;
   3609 
   3610     search_node_t *ps_search_node;
   3611     BLK_SIZE_T e_blk_size;
   3612     PF_SAD_FXN_T pf_sad_fxn;
   3613     PF_RESULT_FXN_T pf_hme_result_fxn;
   3614 
   3615     i4_grid_mask = 0x1; /* Point SAD */
   3616 
   3617     /* Get the parameters required */
   3618     i4_part_mask = ps_search_prms->i4_part_mask;
   3619     e_blk_size = ps_search_prms->e_blk_size;
   3620     i4_num_results = (S32)ps_search_prms->ps_search_results->u1_num_results_per_part;
   3621     i4_num_nodes = ps_search_prms->i4_num_search_nodes;
   3622     ps_search_node = ps_search_prms->ps_search_nodes;
   3623 
   3624     i4_inp_stride = ps_search_prms->i4_inp_stride;
   3625     /* Move to the location of the search blk in inp buffer */
   3626     i4_inp_off = ps_search_prms->i4_cu_x_off;
   3627     i4_inp_off += ps_search_prms->i4_cu_y_off * i4_inp_stride;
   3628     i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
   3629 
   3630     pf_sad_fxn = hme_get_sad_fxn(e_blk_size, i4_grid_mask, i4_part_mask);
   3631     /**********************************************************************/
   3632     /* we have a sparsely populated SAD grid of size 9x17.                */
   3633     /* the id of the results in the grid is shown                         */
   3634     /*     5   2   6                                                      */
   3635     /*     1   0   3                                                      */
   3636     /*     7   4   8                                                      */
   3637     /* The motivation for choosing a grid like this is that               */
   3638     /* in case of no refinement, the central location is                  */
   3639     /* the first entry in the grid                                        */
   3640     /* Also for diamond, the 4 entries get considered first               */
   3641     /* This is consistent with the diamond notation used in               */
   3642     /* subpel refinement. To Check                                        */
   3643     /* Update the results for the given search candt                      */
   3644     /* returns the cost of the 2Nx2N partition                            */
   3645     /**********************************************************************/
   3646 
   3647     /* Get the modified update result fun. with CLIP16 of cost to match   */
   3648     /* with SIMD */
   3649     pf_hme_result_fxn = hme_update_results_grid_pu_bestn_no_encode;
   3650 
   3651     for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
   3652     {
   3653         if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
   3654             continue;
   3655 
   3656         /* initialize minimum cost for this candidate. As we search around */
   3657         /* this candidate, this is used to check early exit, when in any   */
   3658         /* given iteration, the center pt of the grid is lowest value      */
   3659         ps_result_prms->i4_min_cost = MAX_32BIT_VAL;
   3660 
   3661         ps_err_prms->pu1_inp = ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
   3662         ps_err_prms->i4_grid_mask = i4_grid_mask;
   3663 
   3664         ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
   3665         ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
   3666         ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
   3667 
   3668         /**********************************************************************/
   3669         /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID   */
   3670         /**********************************************************************/
   3671         pf_sad_fxn(ps_err_prms);
   3672 
   3673         /**********************************************************************/
   3674         /* CALL THE FUNCTION THAT COMPUTES UPDATES THE BEST RESULTS           */
   3675         /**********************************************************************/
   3676         ps_result_prms->i4_grid_mask = i4_grid_mask;
   3677         ps_result_prms->ps_search_node_base = ps_search_node;
   3678         pf_hme_result_fxn(ps_result_prms);
   3679 
   3680         ps_search_node++;
   3681     }
   3682 }
   3683 
   3684 /**
   3685 ********************************************************************************
   3686 *  @fn     hme_set_mvp_node(search_results_t *ps_search_results,
   3687 *                           search_node_t *ps_candt_prj_coloc,
   3688 *                           S08 i1_ref_idx)
   3689 *
   3690 *  @brief   Set node used for motion vector predictor computation
   3691 *           Either TR or L is compared to projected colocated and
   3692 *           closest is decided as MVP
   3693 *
   3694 *  @param[in]  ps_search_results
   3695 *
   3696 *  @param[in]  ps_candt_prj_coloc
   3697 *
   3698 *  @param[in]  i1_ref_idx
   3699 *
   3700 *  @return   None
   3701 ********************************************************************************
   3702 */
   3703 void hme_set_mvp_node(
   3704     search_results_t *ps_search_results,
   3705     search_node_t *ps_candt_prj_coloc,
   3706     U08 u1_pred_lx,
   3707     U08 u1_default_ref_id)
   3708 {
   3709     S32 i;
   3710     pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[u1_pred_lx];
   3711     pred_candt_nodes_t *ps_pred_nodes = ps_pred_ctxt->as_pred_nodes;
   3712     search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
   3713 
   3714     S32 inp_shift = 2;
   3715     S32 pred_shift;
   3716     S32 ref_bits;
   3717     S32 mv_p_x, mv_p_y;
   3718     S16 mvdx1, mvdx2, mvdy1, mvdy2;
   3719 
   3720     ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[u1_pred_lx][u1_default_ref_id];
   3721 
   3722     /*************************************************************************/
   3723     /* Priority to bottom left availability. Else we go to left. If both are */
   3724     /* not available, then a remains null                                    */
   3725     /*************************************************************************/
   3726     if(ps_pred_nodes->ps_l->u1_is_avail)
   3727     {
   3728         ps_pred_node_a = ps_pred_nodes->ps_l;
   3729     }
   3730 
   3731     if((!(ps_pred_ctxt->proj_used) && (ps_pred_nodes->ps_tr->u1_is_avail)))
   3732     {
   3733         ps_pred_node_b = ps_pred_nodes->ps_tr;
   3734     }
   3735     else
   3736     {
   3737         ps_pred_node_b = ps_pred_nodes->ps_coloc;
   3738         ps_pred_node_b->s_mv = ps_pred_node_b->ps_mv[0];
   3739     }
   3740 
   3741     if(ps_pred_node_a == NULL)
   3742     {
   3743         ps_pred_node_a = ps_pred_nodes->ps_coloc;
   3744         ps_pred_node_a->s_mv = ps_pred_node_a->ps_mv[0];
   3745 
   3746         if(ps_pred_node_b == ps_pred_nodes->ps_coloc)
   3747         {
   3748             ps_pred_node_b = ps_pred_nodes->ps_zeromv;
   3749             ps_pred_node_b->s_mv = ps_pred_node_b->ps_mv[0];
   3750         }
   3751     }
   3752 
   3753     if(ps_pred_node_a->i1_ref_idx != u1_default_ref_id)
   3754     {
   3755         SCALE_FOR_POC_DELTA(
   3756             mv_p_x, mv_p_y, ps_pred_node_a, u1_default_ref_id, ps_pred_ctxt->pi2_ref_scf);
   3757     }
   3758     else
   3759     {
   3760         mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
   3761         mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
   3762     }
   3763     pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
   3764     COMPUTE_MV_DIFFERENCE(mvdx1, mvdy1, ps_candt_prj_coloc, mv_p_x, mv_p_y, inp_shift, pred_shift);
   3765     mvdx1 = ABS(mvdx1);
   3766     mvdy1 = ABS(mvdy1);
   3767 
   3768     if(ps_pred_node_b->i1_ref_idx != u1_default_ref_id)
   3769     {
   3770         SCALE_FOR_POC_DELTA(
   3771             mv_p_x, mv_p_y, ps_pred_node_b, u1_default_ref_id, ps_pred_ctxt->pi2_ref_scf);
   3772     }
   3773     else
   3774     {
   3775         mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
   3776         mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
   3777     }
   3778     pred_shift = ps_pred_node_b->u1_subpel_done ? 0 : 2;
   3779     COMPUTE_MV_DIFFERENCE(mvdx2, mvdy2, ps_candt_prj_coloc, mv_p_x, mv_p_y, inp_shift, pred_shift);
   3780     mvdx2 = ABS(mvdx2);
   3781     mvdy2 = ABS(mvdy2);
   3782 
   3783     if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
   3784     {
   3785         for(i = 0; i < TOT_NUM_PARTS; i++)
   3786         {
   3787             ps_pred_nodes[i].ps_mvp_node = ps_pred_node_a;
   3788         }
   3789     }
   3790     else
   3791     {
   3792         for(i = 0; i < TOT_NUM_PARTS; i++)
   3793         {
   3794             ps_pred_nodes[i].ps_mvp_node = ps_pred_node_b;
   3795         }
   3796     }
   3797 }
   3798