Home | History | Annotate | Download | only in encoder
      1 /******************************************************************************
      2  *
      3  * Copyright (C) 2018 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at:
      8  *
      9  * http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  *****************************************************************************
     18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 */
     20 
     21 /**
     22 ******************************************************************************
     23 * @file hme_subpel.c
     24 *
     25 * @brief
     26 *    Subpel refinement modules for ME algo
     27 *
     28 * @author
     29 *    Ittiam
     30 *
     31 *
     32 * List of Functions
     33 * hme_qpel_interp_avg()
     34 * hme_subpel_refine_ctblist_bck()
     35 * hme_subpel_refine_ctblist_fwd()
     36 * hme_refine_bidirect()
     37 * hme_subpel_refinement()
     38 * hme_subpel_refine_ctb_fwd()
     39 * hme_subpel_refine_ctb_bck()
     40 * hme_create_bck_inp()
     41 * hme_subpel_refine_search_node()
     42 ******************************************************************************
     43 */
     44 
     45 /*****************************************************************************/
     46 /* File Includes                                                             */
     47 /*****************************************************************************/
     48 /* System include files */
     49 #include <stdio.h>
     50 #include <string.h>
     51 #include <stdlib.h>
     52 #include <assert.h>
     53 #include <stdarg.h>
     54 #include <math.h>
     55 #include <limits.h>
     56 
     57 /* User include files */
     58 #include "ihevc_typedefs.h"
     59 #include "itt_video_api.h"
     60 #include "ihevce_api.h"
     61 
     62 #include "rc_cntrl_param.h"
     63 #include "rc_frame_info_collector.h"
     64 #include "rc_look_ahead_params.h"
     65 
     66 #include "ihevc_defs.h"
     67 #include "ihevc_structs.h"
     68 #include "ihevc_platform_macros.h"
     69 #include "ihevc_deblk.h"
     70 #include "ihevc_itrans_recon.h"
     71 #include "ihevc_chroma_itrans_recon.h"
     72 #include "ihevc_chroma_intra_pred.h"
     73 #include "ihevc_intra_pred.h"
     74 #include "ihevc_inter_pred.h"
     75 #include "ihevc_mem_fns.h"
     76 #include "ihevc_padding.h"
     77 #include "ihevc_weighted_pred.h"
     78 #include "ihevc_sao.h"
     79 #include "ihevc_resi_trans.h"
     80 #include "ihevc_quant_iquant_ssd.h"
     81 #include "ihevc_cabac_tables.h"
     82 
     83 #include "ihevce_defs.h"
     84 #include "ihevce_lap_enc_structs.h"
     85 #include "ihevce_multi_thrd_structs.h"
     86 #include "ihevce_multi_thrd_funcs.h"
     87 #include "ihevce_me_common_defs.h"
     88 #include "ihevce_had_satd.h"
     89 #include "ihevce_error_codes.h"
     90 #include "ihevce_bitstream.h"
     91 #include "ihevce_cabac.h"
     92 #include "ihevce_rdoq_macros.h"
     93 #include "ihevce_function_selector.h"
     94 #include "ihevce_enc_structs.h"
     95 #include "ihevce_entropy_structs.h"
     96 #include "ihevce_cmn_utils_instr_set_router.h"
     97 #include "ihevce_enc_loop_structs.h"
     98 #include "ihevce_bs_compute_ctb.h"
     99 #include "ihevce_global_tables.h"
    100 #include "ihevce_dep_mngr_interface.h"
    101 #include "hme_datatype.h"
    102 #include "hme_interface.h"
    103 #include "hme_common_defs.h"
    104 #include "hme_defs.h"
    105 #include "ihevce_me_instr_set_router.h"
    106 #include "hme_globals.h"
    107 #include "hme_utils.h"
    108 #include "hme_coarse.h"
    109 #include "hme_fullpel.h"
    110 #include "hme_subpel.h"
    111 #include "hme_refine.h"
    112 #include "hme_err_compute.h"
    113 #include "hme_common_utils.h"
    114 #include "hme_search_algo.h"
    115 #include "ihevce_stasino_helpers.h"
    116 #include "ihevce_common_utils.h"
    117 
    118 /*****************************************************************************/
    119 /* Function Definitions                                                      */
    120 /*****************************************************************************/
    121 void hme_qpel_interp_avg(interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, S32 i4_buf_id)
    122 {
    123     U08 *pu1_src1, *pu1_src2, *pu1_dst;
    124     qpel_input_buf_cfg_t *ps_inp_cfg;
    125     S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
    126 
    127     /*************************************************************************/
    128     /* For a given QPEL pt, we need to determine the 2 source pts that are   */
    129     /* needed to do the QPEL averaging. The logic to do this is as follows   */
    130     /* i4_mv_x and i4_mv_y are the motion vectors in QPEL units that are     */
    131     /* pointing to the pt of interest. Obviously, they are w.r.t. the 0,0    */
    132     /* pt of th reference blk that is colocated to the inp blk.              */
    133     /*    A j E k B                                                          */
    134     /*    l m n o p                                                          */
    135     /*    F q G r H                                                          */
    136     /*    s t u v w                                                          */
    137     /*    C x I y D                                                          */
    138     /* In above diagram, A. B, C, D are full pts at offsets (0,0),(1,0),(0,1)*/
    139     /* and (1,1) respectively in the fpel buffer (id = 0)                    */
    140     /* E and I are hxfy pts in offsets (0,0),(0,1) respectively in hxfy buf  */
    141     /* F and H are fxhy pts in offsets (0,0),(1,0) respectively in fxhy buf  */
    142     /* G is hxhy pt in offset 0,0 in hxhy buf                                */
    143     /* All above offsets are computed w.r.t. motion displaced pt in          */
    144     /* respective bufs. This means that A corresponds to (i4_mv_x >> 2) and  */
    145     /* (i4_mv_y >> 2) in fxfy buf. Ditto with E, F and G                     */
    146     /* fxfy buf is buf id 0, hxfy is buf id 1, fxhy is buf id 2, hxhy is 3   */
    147     /* If we consider pt v to be derived. v has a fractional comp of 3, 3    */
    148     /* v is avg of H and I. So the table look up of v should give following  */
    149     /* buf 1 (H) : offset = (1, 0) buf id = 2.                               */
    150     /* buf 2 (I) : offset = 0 , 1) buf id = 1.                               */
    151     /* NOTE: For pts that are fxfy/hxfy/fxhy/hxhy, bufid 1 will be -1.       */
    152     /*************************************************************************/
    153     i4_mv_x_frac = i4_mv_x & 3;
    154     i4_mv_y_frac = i4_mv_y & 3;
    155 
    156     i4_offset = (i4_mv_x >> 2) + (i4_mv_y >> 2) * ps_prms->i4_ref_stride;
    157 
    158     /* Derive the descriptor that has all offset and size info */
    159     ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
    160 
    161     if(ps_inp_cfg->i1_buf_id1 == ps_inp_cfg->i1_buf_id2)
    162     {
    163         /* This is case for fxfy/hxfy/fxhy/hxhy */
    164         ps_prms->pu1_final_out = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
    165         ps_prms->pu1_final_out += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
    166         ps_prms->pu1_final_out += (ps_inp_cfg->i1_buf_yoff1 * ps_prms->i4_ref_stride);
    167         ps_prms->i4_final_out_stride = ps_prms->i4_ref_stride;
    168 
    169         return;
    170     }
    171 
    172     pu1_src1 = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
    173     pu1_src1 += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
    174     pu1_src1 += (ps_inp_cfg->i1_buf_yoff1 * ps_prms->i4_ref_stride);
    175 
    176     pu1_src2 = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id2];
    177     pu1_src2 += ps_inp_cfg->i1_buf_xoff2 + i4_offset;
    178     pu1_src2 += (ps_inp_cfg->i1_buf_yoff2 * ps_prms->i4_ref_stride);
    179 
    180     pu1_dst = ps_prms->apu1_interp_out[i4_buf_id];
    181     hevc_avg_2d(
    182         pu1_src1,
    183         pu1_src2,
    184         ps_prms->i4_ref_stride,
    185         ps_prms->i4_ref_stride,
    186         ps_prms->i4_blk_wd,
    187         ps_prms->i4_blk_ht,
    188         pu1_dst,
    189         ps_prms->i4_out_stride);
    190     ps_prms->pu1_final_out = pu1_dst;
    191     ps_prms->i4_final_out_stride = ps_prms->i4_out_stride;
    192 }
    193 
    194 static __inline void hme_qpel_interp_avg_2pt_vert_no_reuse(
    195     interp_prms_t *ps_prms,
    196     S32 i4_mv_x,
    197     S32 i4_mv_y,
    198     U08 **ppu1_final,
    199     S32 *pi4_final_stride,
    200     FT_QPEL_INTERP_AVG_1PT *pf_qpel_interp_avg_1pt)
    201 {
    202     pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x, i4_mv_y + 1, 3, ppu1_final, pi4_final_stride);
    203 
    204     pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x, i4_mv_y - 1, 1, ppu1_final, pi4_final_stride);
    205 }
    206 
    207 static __inline void hme_qpel_interp_avg_2pt_horz_no_reuse(
    208     interp_prms_t *ps_prms,
    209     S32 i4_mv_x,
    210     S32 i4_mv_y,
    211     U08 **ppu1_final,
    212     S32 *pi4_final_stride,
    213     FT_QPEL_INTERP_AVG_1PT *pf_qpel_interp_avg_1pt)
    214 {
    215     pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x + 1, i4_mv_y, 2, ppu1_final, pi4_final_stride);
    216 
    217     pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x - 1, i4_mv_y, 0, ppu1_final, pi4_final_stride);
    218 }
    219 
    220 /********************************************************************************
    221 *  @fn     hme_qpel_interp_comprehensive
    222 *
    223 *  @brief  Interpolates 2 qpel points by hpel averaging
    224 *
    225 *  @param[in,out]  ps_prms: Both input buffer ptrs and location of output
    226 *
    227 *  @param[in]  i4_mv_x : x component of motion vector in QPEL units
    228 *
    229 *  @param[in]  i4_mv_y : y component of motion vector in QPEL units
    230 *
    231 *  @param[in]  i4_grid_mask : mask which determines qpels to be computed
    232 *
    233 *  @param[out]  ppu1_final : storage for final buffer pointers
    234 *
    235 *  @param[out]  pi4_final_stride : storage for final buffer strides
    236 *
    237 *  @return None
    238 ********************************************************************************
    239 */
    240 static __inline void hme_qpel_interp_comprehensive(
    241     interp_prms_t *ps_prms,
    242     U08 **ppu1_final,
    243     S32 *pi4_final_stride,
    244     S32 i4_mv_x,
    245     S32 i4_mv_y,
    246     S32 i4_grid_mask,
    247     ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
    248 {
    249     S32 pt_select_for_TB, pt_select_for_LR;
    250     S32 dx, dy, dydx;
    251     S32 vert_func_selector, horz_func_selector;
    252 
    253     S32 i4_ref_stride = ps_prms->i4_ref_stride;
    254 
    255     pt_select_for_TB =
    256         ((i4_grid_mask & (1 << PT_B)) >> PT_B) + ((i4_grid_mask & (1 << PT_T)) >> (PT_T - 1));
    257 
    258     pt_select_for_LR =
    259         ((i4_grid_mask & (1 << PT_R)) >> PT_R) + ((i4_grid_mask & (1 << PT_L)) >> (PT_L - 1));
    260 
    261     dx = (i4_mv_x & 3);
    262     dy = (i4_mv_y & 3);
    263     dydx = (dx + (dy << 2));
    264 
    265     vert_func_selector = gai4_select_qpel_function_vert[pt_select_for_TB][dydx];
    266     horz_func_selector = gai4_select_qpel_function_horz[pt_select_for_LR][dydx];
    267 
    268     /* case descriptions */
    269     /* Let T = (gridmask & T) & B = (gridmask & B) */
    270     /* & hp = pt is an hpel or an fpel */
    271     /* & r = reuse possible */
    272     /* 0 => T || B = 0 */
    273     /* 1 => (!T) && (B) && hp */
    274     /* 2 => (T) && (!B) && hp */
    275     /* 3 => (!T) && (B) && !hp */
    276     /* 4 => (T) && (!B) && !hp */
    277     /* 5 => (T) && (B) && !hp && r */
    278     /* 6 => (T) && (B) && !hp && !r */
    279     /* 7 => (T) && (B) && hp */
    280 
    281     switch(vert_func_selector)
    282     {
    283     case 0:
    284     {
    285         break;
    286     }
    287     case 1:
    288     {
    289         S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
    290         qpel_input_buf_cfg_t *ps_inp_cfg;
    291         S32 i4_mvyp1 = (i4_mv_y + 1);
    292 
    293         i4_mv_x_frac = dx;
    294         i4_mv_y_frac = i4_mvyp1 & 3;
    295 
    296         i4_offset = (i4_mv_x >> 2) + (i4_mvyp1 >> 2) * i4_ref_stride;
    297 
    298         /* Derive the descriptor that has all offset and size info */
    299         ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
    300 
    301         ppu1_final[3] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
    302         ppu1_final[3] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
    303         ppu1_final[3] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
    304         pi4_final_stride[3] = i4_ref_stride;
    305 
    306         break;
    307     }
    308     case 2:
    309     {
    310         S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
    311         qpel_input_buf_cfg_t *ps_inp_cfg;
    312         S32 i4_mvym1 = (i4_mv_y - 1);
    313 
    314         i4_mv_x_frac = dx;
    315         i4_mv_y_frac = i4_mvym1 & 3;
    316 
    317         i4_offset = (i4_mv_x >> 2) + (i4_mvym1 >> 2) * i4_ref_stride;
    318 
    319         /* Derive the descriptor that has all offset and size info */
    320         ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
    321 
    322         ppu1_final[1] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
    323         ppu1_final[1] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
    324         ppu1_final[1] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
    325         pi4_final_stride[1] = i4_ref_stride;
    326 
    327         break;
    328     }
    329     case 3:
    330     {
    331         ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
    332             ps_prms, i4_mv_x, i4_mv_y + 1, 3, ppu1_final, pi4_final_stride);
    333 
    334         break;
    335     }
    336     case 4:
    337     {
    338         ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
    339             ps_prms, i4_mv_x, i4_mv_y - 1, 1, ppu1_final, pi4_final_stride);
    340 
    341         break;
    342     }
    343     case 5:
    344     {
    345         ps_me_optimised_function_list->pf_qpel_interp_avg_2pt_vert_with_reuse(
    346             ps_prms, i4_mv_x, i4_mv_y, ppu1_final, pi4_final_stride);
    347         break;
    348     }
    349     case 6:
    350     {
    351         hme_qpel_interp_avg_2pt_vert_no_reuse(
    352             ps_prms,
    353             i4_mv_x,
    354             i4_mv_y,
    355             ppu1_final,
    356             pi4_final_stride,
    357             ps_me_optimised_function_list->pf_qpel_interp_avg_1pt);
    358         break;
    359     }
    360     case 7:
    361     {
    362         S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
    363         qpel_input_buf_cfg_t *ps_inp_cfg;
    364 
    365         S32 i4_mvyp1 = (i4_mv_y + 1);
    366         S32 i4_mvym1 = (i4_mv_y - 1);
    367 
    368         i4_mv_x_frac = dx;
    369         i4_mv_y_frac = i4_mvyp1 & 3;
    370 
    371         i4_offset = (i4_mv_x >> 2) + (i4_mvyp1 >> 2) * i4_ref_stride;
    372 
    373         /* Derive the descriptor that has all offset and size info */
    374         ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
    375 
    376         ppu1_final[3] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
    377         ppu1_final[3] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
    378         ppu1_final[3] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
    379         pi4_final_stride[3] = i4_ref_stride;
    380 
    381         i4_mv_y_frac = i4_mvym1 & 3;
    382 
    383         i4_offset = (i4_mv_x >> 2) + (i4_mvym1 >> 2) * i4_ref_stride;
    384 
    385         /* Derive the descriptor that has all offset and size info */
    386         ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
    387 
    388         ppu1_final[1] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
    389         ppu1_final[1] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
    390         ppu1_final[1] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
    391         pi4_final_stride[1] = i4_ref_stride;
    392 
    393         break;
    394     }
    395     }
    396 
    397     /* case descriptions */
    398     /* Let L = (gridmask & L) & R = (gridmask & R) */
    399     /* & hp = pt is an hpel or an fpel */
    400     /* & r = reuse possible */
    401     /* 0 => L || R = 0 */
    402     /* 1 => (!L) && (R) && hp */
    403     /* 2 => (L) && (!R) && hp */
    404     /* 3 => (!L) && (R) && !hp */
    405     /* 4 => (L) && (!R) && !hp */
    406     /* 5 => (L) && (R) && !hp && r */
    407     /* 6 => (L) && (R) && !hp && !r */
    408     /* 7 => (L) && (R) && hp */
    409 
    410     switch(horz_func_selector)
    411     {
    412     case 0:
    413     {
    414         break;
    415     }
    416     case 1:
    417     {
    418         S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
    419         qpel_input_buf_cfg_t *ps_inp_cfg;
    420         S32 i4_mvxp1 = (i4_mv_x + 1);
    421 
    422         i4_mv_x_frac = i4_mvxp1 & 3;
    423         i4_mv_y_frac = dy;
    424 
    425         i4_offset = (i4_mvxp1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
    426 
    427         /* Derive the descriptor that has all offset and size info */
    428         ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
    429 
    430         ppu1_final[2] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
    431         ppu1_final[2] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
    432         ppu1_final[2] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
    433         pi4_final_stride[2] = i4_ref_stride;
    434 
    435         break;
    436     }
    437     case 2:
    438     {
    439         S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
    440         qpel_input_buf_cfg_t *ps_inp_cfg;
    441         S32 i4_mvxm1 = (i4_mv_x - 1);
    442 
    443         i4_mv_x_frac = i4_mvxm1 & 3;
    444         i4_mv_y_frac = dy;
    445 
    446         i4_offset = (i4_mvxm1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
    447 
    448         /* Derive the descriptor that has all offset and size info */
    449         ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
    450 
    451         ppu1_final[0] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
    452         ppu1_final[0] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
    453         ppu1_final[0] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
    454         pi4_final_stride[0] = i4_ref_stride;
    455 
    456         break;
    457     }
    458     case 3:
    459     {
    460         ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
    461             ps_prms, i4_mv_x + 1, i4_mv_y, 2, ppu1_final, pi4_final_stride);
    462 
    463         break;
    464     }
    465     case 4:
    466     {
    467         ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
    468             ps_prms, i4_mv_x - 1, i4_mv_y, 0, ppu1_final, pi4_final_stride);
    469 
    470         break;
    471     }
    472     case 5:
    473     {
    474         ps_me_optimised_function_list->pf_qpel_interp_avg_2pt_horz_with_reuse(
    475             ps_prms, i4_mv_x, i4_mv_y, ppu1_final, pi4_final_stride);
    476         break;
    477     }
    478     case 6:
    479     {
    480         hme_qpel_interp_avg_2pt_horz_no_reuse(
    481             ps_prms,
    482             i4_mv_x,
    483             i4_mv_y,
    484             ppu1_final,
    485             pi4_final_stride,
    486             ps_me_optimised_function_list->pf_qpel_interp_avg_1pt);
    487         break;
    488     }
    489     case 7:
    490     {
    491         S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
    492         qpel_input_buf_cfg_t *ps_inp_cfg;
    493 
    494         S32 i4_mvxp1 = (i4_mv_x + 1);
    495         S32 i4_mvxm1 = (i4_mv_x - 1);
    496 
    497         i4_mv_x_frac = i4_mvxp1 & 3;
    498         i4_mv_y_frac = dy;
    499 
    500         i4_offset = (i4_mvxp1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
    501 
    502         /* Derive the descriptor that has all offset and size info */
    503         ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
    504 
    505         ppu1_final[2] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
    506         ppu1_final[2] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
    507         ppu1_final[2] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
    508         pi4_final_stride[2] = i4_ref_stride;
    509 
    510         i4_mv_x_frac = i4_mvxm1 & 3;
    511 
    512         i4_offset = (i4_mvxm1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
    513 
    514         /* Derive the descriptor that has all offset and size info */
    515         ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
    516 
    517         ppu1_final[0] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
    518         ppu1_final[0] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
    519         ppu1_final[0] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
    520         pi4_final_stride[0] = i4_ref_stride;
    521 
    522         break;
    523     }
    524     }
    525 }
    526 
    527 /**
    528 ********************************************************************************
    529 *  @fn     S32 hme_compute_pred_and_evaluate_bi(hme_subpel_prms_t *ps_prms,
    530 *                                   search_results_t *ps_search_results,
    531 *                                   layer_ctxt_t *ps_curr_layer,
    532 *                                   U08 **ppu1_pred)
    533 *
    534 *
    535 *  @brief  Evaluates the best bipred cost as avg(P0, P1) where P0 and P1 are
    536 *          best L0 and L1 bufs respectively for the entire CU
    537 *
    538 *  @param[in]  ps_prms: subpel prms input to this function
    539 *
    540 *  @param[in] ps_curr_layer: points to the current layer ctxt
    541 *
    542 *  @return The best BI cost of best uni cost, whichever better
    543 ********************************************************************************
    544 */
    545 void hme_compute_pred_and_evaluate_bi(
    546     inter_cu_results_t *ps_cu_results,
    547     inter_pu_results_t *ps_pu_results,
    548     inter_ctb_prms_t *ps_inter_ctb_prms,
    549     part_type_results_t *ps_part_type_result,
    550     ULWORD64 *pu8_winning_pred_sigmaXSquare,
    551     ULWORD64 *pu8_winning_pred_sigmaX,
    552     ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list,
    553     ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
    554 {
    555     /* Idx0 - Uni winner */
    556     /* Idx1 - Uni runner-up */
    557     /* Idx2 - Bi winner */
    558     hme_pred_buf_info_t as_pred_buf_data[3][NUM_INTER_PU_PARTS];
    559     err_prms_t s_err_prms;
    560     interp_prms_t s_interp_prms;
    561 
    562     PF_SAD_FXN_T pf_err_compute;
    563 
    564     S32 i, j;
    565     S32 x_off, y_off, x_pic, y_pic;
    566     S32 i4_sad_grid;
    567     U08 e_cu_size;
    568     S32 i4_part_type;
    569     U08 u1_cu_size;
    570     S32 shift;
    571     S32 x_part, y_part, num_parts;
    572     S32 inp_stride, ref_stride;
    573     U08 au1_pred_buf_array_indixes[3];
    574     S32 cur_iter_best_cost;
    575     S32 uni_cost, bi_cost, best_cost, tot_cost;
    576     /* Idx0 - Uni winner */
    577     /* Idx1 - Bi winner */
    578     ULWORD64 au8_sigmaX[2][NUM_INTER_PU_PARTS];
    579     ULWORD64 au8_sigmaXSquared[2][NUM_INTER_PU_PARTS];
    580 #if USE_NOISE_TERM_DURING_BICAND_SEARCH
    581     S32 i4_noise_term;
    582 #endif
    583 
    584     interp_prms_t *ps_interp_prms = &s_interp_prms;
    585 
    586     S32 best_cand_in_opp_dir_idx = 0;
    587     S32 is_best_cand_an_intra = 0;
    588     U08 u1_is_cu_noisy = ps_inter_ctb_prms->u1_is_cu_noisy;
    589 #if USE_NOISE_TERM_DURING_BICAND_SEARCH
    590     const S32 i4_default_src_wt = ((1 << 15) + (WGHT_DEFAULT >> 1)) / WGHT_DEFAULT;
    591 #endif
    592     tot_cost = 0;
    593 
    594     /* Start of the CU w.r.t. CTB */
    595     x_off = ps_cu_results->u1_x_off;
    596     y_off = ps_cu_results->u1_y_off;
    597 
    598     inp_stride = ps_inter_ctb_prms->i4_inp_stride;
    599     ref_stride = ps_inter_ctb_prms->i4_rec_stride;
    600 
    601     ps_interp_prms->i4_ref_stride = ref_stride;
    602 
    603     /* Start of the CU w.r.t. Pic 0,0 */
    604     x_pic = x_off + ps_inter_ctb_prms->i4_ctb_x_off;
    605     y_pic = y_off + ps_inter_ctb_prms->i4_ctb_y_off;
    606 
    607     u1_cu_size = ps_cu_results->u1_cu_size;
    608     e_cu_size = u1_cu_size;
    609     shift = (S32)e_cu_size;
    610     i4_part_type = ps_part_type_result->u1_part_type;
    611     num_parts = gau1_num_parts_in_part_type[i4_part_type];
    612 
    613     for(i = 0; i < 3; i++)
    614     {
    615         hme_init_pred_buf_info(
    616             &as_pred_buf_data[i],
    617             &ps_inter_ctb_prms->s_pred_buf_mngr,
    618             (ps_part_type_result->as_pu_results->pu.b4_wd + 1) << 2,
    619             (ps_part_type_result->as_pu_results->pu.b4_ht + 1) << 2,
    620             (PART_TYPE_T)i4_part_type);
    621 
    622         au1_pred_buf_array_indixes[i] = as_pred_buf_data[i][0].u1_pred_buf_array_id;
    623     }
    624 
    625     for(j = 0; j < num_parts; j++)
    626     {
    627         UWORD8 *apu1_hpel_ref[2][4];
    628         PART_ID_T e_part_id;
    629         BLK_SIZE_T e_blk_size;
    630         WORD8 i1_ref_idx;
    631         UWORD8 pred_dir;
    632         WORD32 ref_offset, inp_offset, wd, ht;
    633         pu_result_t *ps_pu_node1, *ps_pu_node2, *ps_pu_result;
    634         mv_t *aps_mv[2];
    635         UWORD8 num_active_ref_opp;
    636         UWORD8 num_results_per_part;
    637         WORD32 luma_weight_ref1, luma_offset_ref1;
    638         WORD32 luma_weight_ref2, luma_offset_ref2;
    639         WORD32 pu_node2_found = 0;
    640 
    641         e_part_id = ge_part_type_to_part_id[i4_part_type][j];
    642         e_blk_size = ge_part_id_to_blk_size[e_cu_size][e_part_id];
    643 
    644         x_part = gas_part_attr_in_cu[e_part_id].u1_x_start << shift;
    645         y_part = gas_part_attr_in_cu[e_part_id].u1_y_start << shift;
    646 
    647         ref_offset = (x_part + x_pic) + (y_pic + y_part) * ref_stride;
    648         inp_offset = (x_part + y_part * inp_stride) + ps_cu_results->i4_inp_offset;
    649 
    650         pred_dir = ps_part_type_result->as_pu_results[j].pu.b2_pred_mode;
    651 
    652         ps_pu_node1 = &(ps_part_type_result->as_pu_results[j]);
    653 
    654         if(PRED_L0 == pred_dir)
    655         {
    656             i1_ref_idx = ps_pu_node1->pu.mv.i1_l0_ref_idx;
    657             aps_mv[0] = &(ps_pu_node1->pu.mv.s_l0_mv);
    658 
    659             num_active_ref_opp =
    660                 ps_inter_ctb_prms->u1_num_active_ref_l1 * (ps_inter_ctb_prms->i4_bidir_enabled);
    661             num_results_per_part = ps_pu_results->u1_num_results_per_part_l0[e_part_id];
    662 
    663             ps_pu_result = ps_pu_results->aps_pu_results[PRED_L0][e_part_id];
    664 
    665             ASSERT(i1_ref_idx >= 0);
    666 
    667             apu1_hpel_ref[0][0] =
    668                 (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->s_yuv_buf_desc.pv_y_buf) +
    669                 ref_offset;
    670             apu1_hpel_ref[0][1] =
    671                 ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
    672                 ref_offset;
    673             apu1_hpel_ref[0][2] =
    674                 ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
    675                 ref_offset;
    676             apu1_hpel_ref[0][3] =
    677                 ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
    678                 ref_offset;
    679 
    680             luma_weight_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
    681                                    ->s_weight_offset.i2_luma_weight;
    682             luma_offset_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
    683                                    ->s_weight_offset.i2_luma_offset;
    684         }
    685         else
    686         {
    687             i1_ref_idx = ps_pu_node1->pu.mv.i1_l1_ref_idx;
    688             aps_mv[0] = &(ps_pu_node1->pu.mv.s_l1_mv);
    689 
    690             ASSERT(i1_ref_idx >= 0);
    691 
    692             num_active_ref_opp =
    693                 ps_inter_ctb_prms->u1_num_active_ref_l0 * (ps_inter_ctb_prms->i4_bidir_enabled);
    694             num_results_per_part = ps_pu_results->u1_num_results_per_part_l1[e_part_id];
    695 
    696             ps_pu_result = ps_pu_results->aps_pu_results[PRED_L1][e_part_id];
    697 
    698             apu1_hpel_ref[0][0] =
    699                 (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->s_yuv_buf_desc.pv_y_buf) +
    700                 ref_offset;
    701             apu1_hpel_ref[0][1] =
    702                 ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
    703                 ref_offset;
    704             apu1_hpel_ref[0][2] =
    705                 ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
    706                 ref_offset;
    707             apu1_hpel_ref[0][3] =
    708                 ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
    709                 ref_offset;
    710 
    711             luma_weight_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
    712                                    ->s_weight_offset.i2_luma_weight;
    713             luma_offset_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
    714                                    ->s_weight_offset.i2_luma_offset;
    715         }
    716 
    717         if(aps_mv[0]->i2_mvx == INTRA_MV)
    718         {
    719             uni_cost = ps_pu_node1->i4_tot_cost;
    720             cur_iter_best_cost = ps_pu_node1->i4_tot_cost;
    721             best_cost = MIN(uni_cost, cur_iter_best_cost);
    722             tot_cost += best_cost;
    723             continue;
    724         }
    725 
    726         ps_interp_prms->i4_blk_wd = wd = gau1_blk_size_to_wd[e_blk_size];
    727         ps_interp_prms->i4_blk_ht = ht = gau1_blk_size_to_ht[e_blk_size];
    728         ps_interp_prms->i4_out_stride = MAX_CU_SIZE;
    729 
    730         if(num_active_ref_opp)
    731         {
    732             if(PRED_L0 == pred_dir)
    733             {
    734                 if(ps_pu_results->u1_num_results_per_part_l1[e_part_id])
    735                 {
    736                     ps_pu_node2 = ps_pu_results->aps_pu_results[1][e_part_id];
    737                     pu_node2_found = 1;
    738                 }
    739             }
    740             else
    741             {
    742                 if(ps_pu_results->u1_num_results_per_part_l0[e_part_id])
    743                 {
    744                     ps_pu_node2 = ps_pu_results->aps_pu_results[0][e_part_id];
    745                     pu_node2_found = 1;
    746                 }
    747             }
    748         }
    749 
    750         if(!pu_node2_found)
    751         {
    752             bi_cost = INT_MAX >> 1;
    753 
    754             s_interp_prms.apu1_interp_out[0] = as_pred_buf_data[0][j].pu1_pred;
    755             ps_interp_prms->ppu1_ref = &apu1_hpel_ref[0][0];
    756 
    757             ps_me_optimised_function_list->pf_qpel_interp_avg_generic(
    758                 ps_interp_prms, aps_mv[0]->i2_mvx, aps_mv[0]->i2_mvy, 0);
    759 
    760             if(ps_interp_prms->pu1_final_out != s_interp_prms.apu1_interp_out[0])
    761             {
    762                 as_pred_buf_data[0][j].u1_pred_buf_array_id = UCHAR_MAX;
    763                 as_pred_buf_data[0][j].pu1_pred = ps_interp_prms->pu1_final_out;
    764                 as_pred_buf_data[0][j].i4_pred_stride = ps_interp_prms->i4_final_out_stride;
    765             }
    766 
    767             if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
    768             {
    769                 hme_compute_sigmaX_and_sigmaXSquared(
    770                     as_pred_buf_data[0][j].pu1_pred,
    771                     as_pred_buf_data[0][j].i4_pred_stride,
    772                     &au8_sigmaX[0][j],
    773                     &au8_sigmaXSquared[0][j],
    774                     ps_interp_prms->i4_blk_wd,
    775                     ps_interp_prms->i4_blk_ht,
    776                     ps_interp_prms->i4_blk_wd,
    777                     ps_interp_prms->i4_blk_ht,
    778                     0,
    779                     1);
    780             }
    781         }
    782         else
    783         {
    784             i = 0;
    785             bi_cost = MAX_32BIT_VAL;
    786             is_best_cand_an_intra = 0;
    787             best_cand_in_opp_dir_idx = 0;
    788 
    789             pred_dir = ps_pu_node2[i].pu.b2_pred_mode;
    790 
    791             if(PRED_L0 == pred_dir)
    792             {
    793                 i1_ref_idx = ps_pu_node2[i].pu.mv.i1_l0_ref_idx;
    794                 aps_mv[1] = &(ps_pu_node2[i].pu.mv.s_l0_mv);
    795 
    796                 ASSERT(i1_ref_idx >= 0);
    797 
    798                 apu1_hpel_ref[1][0] =
    799                     (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
    800                                    ->s_yuv_buf_desc.pv_y_buf) +
    801                     ref_offset;  //>ppu1_list_rec_fxfy[0][i1_ref_idx] + ref_offset;
    802                 apu1_hpel_ref[1][1] =
    803                     ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
    804                     ref_offset;
    805                 apu1_hpel_ref[1][2] =
    806                     ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
    807                     ref_offset;
    808                 apu1_hpel_ref[1][3] =
    809                     ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
    810                     ref_offset;
    811 
    812                 luma_weight_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
    813                                        ->s_weight_offset.i2_luma_weight;
    814                 luma_offset_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
    815                                        ->s_weight_offset.i2_luma_offset;
    816             }
    817             else
    818             {
    819                 i1_ref_idx = ps_pu_node2[i].pu.mv.i1_l1_ref_idx;
    820                 aps_mv[1] = &(ps_pu_node2[i].pu.mv.s_l1_mv);
    821 
    822                 ASSERT(i1_ref_idx >= 0);
    823 
    824                 apu1_hpel_ref[1][0] =
    825                     (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
    826                                    ->s_yuv_buf_desc.pv_y_buf) +
    827                     ref_offset;  //>ppu1_list_rec_fxfy[0][i1_ref_idx] + ref_offset;
    828                 apu1_hpel_ref[1][1] =
    829                     ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
    830                     ref_offset;
    831                 apu1_hpel_ref[1][2] =
    832                     ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
    833                     ref_offset;
    834                 apu1_hpel_ref[1][3] =
    835                     ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
    836                     ref_offset;
    837 
    838                 luma_weight_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
    839                                        ->s_weight_offset.i2_luma_weight;
    840                 luma_offset_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
    841                                        ->s_weight_offset.i2_luma_offset;
    842             }
    843 
    844             if(aps_mv[1]->i2_mvx == INTRA_MV)
    845             {
    846                 uni_cost = ps_pu_node1->i4_tot_cost;
    847                 cur_iter_best_cost = ps_pu_node2[i].i4_tot_cost;
    848 
    849                 if(cur_iter_best_cost < bi_cost)
    850                 {
    851                     bi_cost = cur_iter_best_cost;
    852                     best_cand_in_opp_dir_idx = i;
    853                     is_best_cand_an_intra = 1;
    854                 }
    855 
    856                 best_cost = MIN(uni_cost, bi_cost);
    857                 tot_cost += best_cost;
    858                 continue;
    859             }
    860 
    861             s_interp_prms.apu1_interp_out[0] = as_pred_buf_data[0][j].pu1_pred;
    862             ps_interp_prms->ppu1_ref = &apu1_hpel_ref[0][0];
    863 
    864             ps_me_optimised_function_list->pf_qpel_interp_avg_generic(
    865                 ps_interp_prms, aps_mv[0]->i2_mvx, aps_mv[0]->i2_mvy, 0);
    866 
    867             if(ps_interp_prms->pu1_final_out != s_interp_prms.apu1_interp_out[0])
    868             {
    869                 as_pred_buf_data[0][j].u1_pred_buf_array_id = UCHAR_MAX;
    870                 as_pred_buf_data[0][j].pu1_pred = ps_interp_prms->pu1_final_out;
    871                 as_pred_buf_data[0][j].i4_pred_stride = ps_interp_prms->i4_final_out_stride;
    872             }
    873 
    874             if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
    875             {
    876                 hme_compute_sigmaX_and_sigmaXSquared(
    877                     as_pred_buf_data[0][j].pu1_pred,
    878                     as_pred_buf_data[0][j].i4_pred_stride,
    879                     &au8_sigmaX[0][j],
    880                     &au8_sigmaXSquared[0][j],
    881                     ps_interp_prms->i4_blk_wd,
    882                     ps_interp_prms->i4_blk_ht,
    883                     ps_interp_prms->i4_blk_wd,
    884                     ps_interp_prms->i4_blk_ht,
    885                     0,
    886                     1);
    887             }
    888 
    889             s_interp_prms.apu1_interp_out[0] = as_pred_buf_data[1][j].pu1_pred;
    890             ps_interp_prms->ppu1_ref = &apu1_hpel_ref[1][0];
    891 
    892             ps_me_optimised_function_list->pf_qpel_interp_avg_generic(
    893                 ps_interp_prms, aps_mv[1]->i2_mvx, aps_mv[1]->i2_mvy, 0);
    894 
    895             if(ps_interp_prms->pu1_final_out != s_interp_prms.apu1_interp_out[0])
    896             {
    897                 as_pred_buf_data[1][j].u1_pred_buf_array_id = UCHAR_MAX;
    898                 as_pred_buf_data[1][j].pu1_pred = ps_interp_prms->pu1_final_out;
    899                 as_pred_buf_data[1][j].i4_pred_stride = ps_interp_prms->i4_final_out_stride;
    900             }
    901 
    902             ps_cmn_utils_optimised_function_list->pf_wt_avg_2d(
    903                 as_pred_buf_data[0][j].pu1_pred,
    904                 as_pred_buf_data[1][j].pu1_pred,
    905                 as_pred_buf_data[0][j].i4_pred_stride,
    906                 as_pred_buf_data[1][j].i4_pred_stride,
    907                 wd,
    908                 ht,
    909                 as_pred_buf_data[2][j].pu1_pred,
    910                 as_pred_buf_data[2][j].i4_pred_stride,
    911                 luma_weight_ref1,
    912                 luma_weight_ref2,
    913                 luma_offset_ref1,
    914                 luma_offset_ref2,
    915                 ps_inter_ctb_prms->wpred_log_wdc);
    916 
    917             if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
    918             {
    919                 hme_compute_sigmaX_and_sigmaXSquared(
    920                     as_pred_buf_data[2][j].pu1_pred,
    921                     as_pred_buf_data[2][j].i4_pred_stride,
    922                     &au8_sigmaX[1][j],
    923                     &au8_sigmaXSquared[1][j],
    924                     ps_interp_prms->i4_blk_wd,
    925                     ps_interp_prms->i4_blk_ht,
    926                     ps_interp_prms->i4_blk_wd,
    927                     ps_interp_prms->i4_blk_ht,
    928                     0,
    929                     1);
    930             }
    931 
    932             s_err_prms.pu1_inp = (U08 *)ps_inter_ctb_prms->pu1_non_wt_inp + inp_offset;
    933             s_err_prms.i4_inp_stride = inp_stride;
    934             s_err_prms.i4_ref_stride = as_pred_buf_data[2][j].i4_pred_stride;
    935             s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
    936             s_err_prms.i4_grid_mask = 1;
    937             s_err_prms.pi4_sad_grid = &i4_sad_grid;
    938             s_err_prms.i4_blk_wd = wd;
    939             s_err_prms.i4_blk_ht = ht;
    940             s_err_prms.pu1_ref = as_pred_buf_data[2][j].pu1_pred;
    941             s_err_prms.ps_cmn_utils_optimised_function_list = ps_cmn_utils_optimised_function_list;
    942 
    943             if(ps_inter_ctb_prms->u1_use_satd)
    944             {
    945                 pf_err_compute = compute_satd_8bit;
    946             }
    947             else
    948             {
    949                 pf_err_compute = ps_me_optimised_function_list->pf_evalsad_pt_npu_mxn_8bit;
    950             }
    951 
    952             pf_err_compute(&s_err_prms);
    953 
    954 #if USE_NOISE_TERM_DURING_BICAND_SEARCH
    955             if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
    956             {
    957                 unsigned long u4_shift_val;
    958                 ULWORD64 u8_src_variance, u8_pred_variance, u8_pred_sigmaSquareX;
    959                 ULWORD64 u8_temp_var, u8_temp_var1;
    960                 S32 i4_bits_req;
    961 
    962                 S32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
    963 
    964                 u8_pred_sigmaSquareX = (au8_sigmaX[1][j] * au8_sigmaX[1][j]);
    965                 u8_pred_variance = au8_sigmaXSquared[1][j] - u8_pred_sigmaSquareX;
    966 
    967                 if(e_cu_size == CU_8x8)
    968                 {
    969                     PART_ID_T e_part_id =
    970                         (PART_ID_T)((PART_ID_NxN_TL) + (x_off & 1) + ((y_off & 1) << 1));
    971 
    972                     u4_shift_val = ihevce_calc_stim_injected_variance(
    973                         ps_inter_ctb_prms->pu8_part_src_sigmaX,
    974                         ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
    975                         &u8_src_variance,
    976                         i4_default_src_wt,
    977                         0,
    978                         ps_inter_ctb_prms->wpred_log_wdc,
    979                         e_part_id);
    980                 }
    981                 else
    982                 {
    983                     u4_shift_val = ihevce_calc_stim_injected_variance(
    984                         ps_inter_ctb_prms->pu8_part_src_sigmaX,
    985                         ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
    986                         &u8_src_variance,
    987                         i4_default_src_wt,
    988                         0,
    989                         ps_inter_ctb_prms->wpred_log_wdc,
    990                         e_part_id);
    991                 }
    992 
    993                 u8_pred_variance = u8_pred_variance >> u4_shift_val;
    994 
    995                 GETRANGE64(i4_bits_req, u8_pred_variance);
    996 
    997                 if(i4_bits_req > 27)
    998                 {
    999                     u8_pred_variance = u8_pred_variance >> (i4_bits_req - 27);
   1000                     u8_src_variance = u8_src_variance >> (i4_bits_req - 27);
   1001                 }
   1002 
   1003                 if(u8_src_variance == u8_pred_variance)
   1004                 {
   1005                     u8_temp_var = (1 << STIM_Q_FORMAT);
   1006                 }
   1007                 else
   1008                 {
   1009                     u8_temp_var = (2 * u8_src_variance * u8_pred_variance);
   1010                     u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
   1011                     u8_temp_var1 =
   1012                         (u8_src_variance * u8_src_variance) + (u8_pred_variance * u8_pred_variance);
   1013                     u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
   1014                     u8_temp_var = (u8_temp_var / u8_temp_var1);
   1015                 }
   1016 
   1017                 i4_noise_term = (UWORD32)u8_temp_var;
   1018 
   1019                 i4_noise_term *= ps_inter_ctb_prms->i4_alpha_stim_multiplier;
   1020 
   1021                 ASSERT(i4_noise_term >= 0);
   1022 
   1023                 u8_temp_var = i4_sad_grid;
   1024                 u8_temp_var *= ((1 << (i4_q_level)) - (i4_noise_term));
   1025                 u8_temp_var += (1 << ((i4_q_level)-1));
   1026                 i4_sad_grid = (UWORD32)(u8_temp_var >> (i4_q_level));
   1027             }
   1028 #endif
   1029 
   1030             cur_iter_best_cost = i4_sad_grid;
   1031             cur_iter_best_cost += ps_pu_node1->i4_mv_cost;
   1032             cur_iter_best_cost += ps_pu_node2[i].i4_mv_cost;
   1033 
   1034             if(cur_iter_best_cost < bi_cost)
   1035             {
   1036                 bi_cost = cur_iter_best_cost;
   1037                 best_cand_in_opp_dir_idx = i;
   1038                 is_best_cand_an_intra = 0;
   1039             }
   1040         }
   1041 
   1042         uni_cost = ps_pu_node1->i4_tot_cost;
   1043 
   1044 #if USE_NOISE_TERM_DURING_BICAND_SEARCH
   1045         if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
   1046         {
   1047             unsigned long u4_shift_val;
   1048             ULWORD64 u8_src_variance, u8_pred_variance, u8_pred_sigmaSquareX;
   1049             ULWORD64 u8_temp_var, u8_temp_var1;
   1050             S32 i4_bits_req;
   1051 
   1052             S32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
   1053 
   1054             S08 i1_ref_idx =
   1055                 (PRED_L0 == ps_pu_node1->pu.b2_pred_mode)
   1056                     ? ps_inter_ctb_prms->pi1_past_list[ps_pu_node1->pu.mv.i1_l0_ref_idx]
   1057                     : ps_inter_ctb_prms->pi1_future_list[ps_pu_node1->pu.mv.i1_l1_ref_idx];
   1058             S32 i4_sad = ps_pu_node1->i4_tot_cost - ps_pu_node1->i4_mv_cost;
   1059 
   1060             u8_pred_sigmaSquareX = (au8_sigmaX[0][j] * au8_sigmaX[0][j]);
   1061             u8_pred_variance = au8_sigmaXSquared[0][j] - u8_pred_sigmaSquareX;
   1062 
   1063             if(e_cu_size == CU_8x8)
   1064             {
   1065                 PART_ID_T e_part_id =
   1066                     (PART_ID_T)((PART_ID_NxN_TL) + (x_off & 1) + ((y_off & 1) << 1));
   1067 
   1068                 u4_shift_val = ihevce_calc_stim_injected_variance(
   1069                     ps_inter_ctb_prms->pu8_part_src_sigmaX,
   1070                     ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
   1071                     &u8_src_variance,
   1072                     ps_inter_ctb_prms->pi4_inv_wt[i1_ref_idx],
   1073                     ps_inter_ctb_prms->pi4_inv_wt_shift_val[i1_ref_idx],
   1074                     ps_inter_ctb_prms->wpred_log_wdc,
   1075                     e_part_id);
   1076             }
   1077             else
   1078             {
   1079                 u4_shift_val = ihevce_calc_stim_injected_variance(
   1080                     ps_inter_ctb_prms->pu8_part_src_sigmaX,
   1081                     ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
   1082                     &u8_src_variance,
   1083                     ps_inter_ctb_prms->pi4_inv_wt[i1_ref_idx],
   1084                     ps_inter_ctb_prms->pi4_inv_wt_shift_val[i1_ref_idx],
   1085                     ps_inter_ctb_prms->wpred_log_wdc,
   1086                     e_part_id);
   1087             }
   1088 
   1089             u8_pred_variance = u8_pred_variance >> (u4_shift_val);
   1090 
   1091             GETRANGE64(i4_bits_req, u8_pred_variance);
   1092 
   1093             if(i4_bits_req > 27)
   1094             {
   1095                 u8_pred_variance = u8_pred_variance >> (i4_bits_req - 27);
   1096                 u8_src_variance = u8_src_variance >> (i4_bits_req - 27);
   1097             }
   1098 
   1099             if(u8_src_variance == u8_pred_variance)
   1100             {
   1101                 u8_temp_var = (1 << STIM_Q_FORMAT);
   1102             }
   1103             else
   1104             {
   1105                 u8_temp_var = (2 * u8_src_variance * u8_pred_variance);
   1106                 u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
   1107                 u8_temp_var1 =
   1108                     (u8_src_variance * u8_src_variance) + (u8_pred_variance * u8_pred_variance);
   1109                 u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
   1110                 u8_temp_var = (u8_temp_var / u8_temp_var1);
   1111             }
   1112 
   1113             i4_noise_term = (UWORD32)u8_temp_var;
   1114 
   1115             i4_noise_term *= ps_inter_ctb_prms->i4_alpha_stim_multiplier;
   1116 
   1117             ASSERT(i4_noise_term >= 0);
   1118 
   1119             u8_temp_var = i4_sad;
   1120             u8_temp_var *= ((1 << (i4_q_level)) - (i4_noise_term));
   1121             u8_temp_var += (1 << ((i4_q_level)-1));
   1122             i4_sad = (UWORD32)(u8_temp_var >> (i4_q_level));
   1123 
   1124             uni_cost = i4_sad + ps_pu_node1->i4_mv_cost;
   1125 
   1126             pu8_winning_pred_sigmaX[j] = au8_sigmaX[0][j];
   1127             pu8_winning_pred_sigmaXSquare[j] = au8_sigmaXSquared[0][j];
   1128         }
   1129 #endif
   1130 
   1131         if((bi_cost < uni_cost) && (!is_best_cand_an_intra))
   1132         {
   1133             if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
   1134             {
   1135                 pu8_winning_pred_sigmaX[j] = au8_sigmaX[1][j];
   1136                 pu8_winning_pred_sigmaXSquare[j] = au8_sigmaXSquared[1][j];
   1137             }
   1138 
   1139             if(PRED_L0 == ps_pu_node1->pu.b2_pred_mode)
   1140             {
   1141                 ps_pu_node1->pu.b2_pred_mode = PRED_BI;
   1142 
   1143                 if(PRED_L0 == ps_pu_node2[best_cand_in_opp_dir_idx].pu.b2_pred_mode)
   1144                 {
   1145                     ps_pu_node1->pu.mv.i1_l1_ref_idx =
   1146                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l0_ref_idx;
   1147                     ps_pu_node1->pu.mv.s_l1_mv.i2_mvx =
   1148                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvx;
   1149                     ps_pu_node1->pu.mv.s_l1_mv.i2_mvy =
   1150                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvy;
   1151                 }
   1152                 else
   1153                 {
   1154                     ps_pu_node1->pu.mv.i1_l1_ref_idx =
   1155                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l1_ref_idx;
   1156                     ps_pu_node1->pu.mv.s_l1_mv.i2_mvx =
   1157                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvx;
   1158                     ps_pu_node1->pu.mv.s_l1_mv.i2_mvy =
   1159                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvy;
   1160                 }
   1161             }
   1162             else
   1163             {
   1164                 ps_pu_node1->pu.b2_pred_mode = PRED_BI;
   1165 
   1166                 if(PRED_L0 == ps_pu_node2[best_cand_in_opp_dir_idx].pu.b2_pred_mode)
   1167                 {
   1168                     ps_pu_node1->pu.mv.i1_l0_ref_idx =
   1169                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l0_ref_idx;
   1170                     ps_pu_node1->pu.mv.s_l0_mv.i2_mvx =
   1171                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvx;
   1172                     ps_pu_node1->pu.mv.s_l0_mv.i2_mvy =
   1173                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvy;
   1174                 }
   1175                 else
   1176                 {
   1177                     ps_pu_node1->pu.mv.i1_l0_ref_idx =
   1178                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l1_ref_idx;
   1179                     ps_pu_node1->pu.mv.s_l0_mv.i2_mvx =
   1180                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvx;
   1181                     ps_pu_node1->pu.mv.s_l0_mv.i2_mvy =
   1182                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvy;
   1183                 }
   1184             }
   1185 
   1186             ps_part_type_result->as_pu_results[j].i4_tot_cost = bi_cost;
   1187         }
   1188 
   1189         best_cost = MIN(uni_cost, bi_cost);
   1190         tot_cost += best_cost;
   1191     }
   1192 
   1193     hme_debrief_bipred_eval(
   1194         ps_part_type_result,
   1195         as_pred_buf_data,
   1196         &ps_inter_ctb_prms->s_pred_buf_mngr,
   1197         au1_pred_buf_array_indixes,
   1198         ps_cmn_utils_optimised_function_list);
   1199 
   1200     ps_part_type_result->i4_tot_cost = tot_cost;
   1201 }
   1202 
   1203 WORD32 hme_evalsatd_pt_pu_8x8_tu_rec(
   1204     err_prms_t *ps_prms,
   1205     WORD32 lambda,
   1206     WORD32 lambda_q_shift,
   1207     WORD32 i4_frm_qstep,
   1208     me_func_selector_t *ps_func_selector)
   1209 {
   1210     S32 ai4_satd_4x4[4]; /* num 4x4s in a 8x8 */
   1211     S32 i4_satd_8x8;
   1212     S16 *pi2_had_out;
   1213     S32 i4_tu_split_flag = 0;
   1214     S32 i4_tu_early_cbf = 0;
   1215 
   1216     S32 i4_early_cbf = 1;
   1217     //  S32 i4_i, i4_k;
   1218     S32 i4_total_satd_cost = 0;
   1219     S32 best_cost_tu_split;
   1220 
   1221     /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
   1222     S32 *api4_satd_pu[HAD_32x32 + 1];
   1223     S32 *api4_tu_split[HAD_32x32 + 1];
   1224     S32 *api4_tu_early_cbf[HAD_32x32 + 1];
   1225 
   1226     S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
   1227     S32 *pi4_tu_split = ps_prms->pi4_tu_split_flags;
   1228     S32 *pi4_early_cbf = ps_prms->pi4_tu_early_cbf;
   1229 
   1230     U08 *pu1_inp = ps_prms->pu1_inp;
   1231     U08 *pu1_ref = ps_prms->pu1_ref;
   1232 
   1233     S32 inp_stride = ps_prms->i4_inp_stride;
   1234     S32 ref_stride = ps_prms->i4_ref_stride;
   1235 
   1236     /* Initialize tu_split_cost to "0" */
   1237     ps_prms->i4_tu_split_cost = 0;
   1238     pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem;
   1239 
   1240     api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
   1241     api4_satd_pu[HAD_8x8] = &i4_satd_8x8;
   1242     api4_satd_pu[HAD_16x16] = NULL;
   1243     api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
   1244 
   1245     api4_tu_split[HAD_4x4] = NULL;
   1246     api4_tu_split[HAD_8x8] = &i4_tu_split_flag;
   1247     api4_tu_split[HAD_16x16] = NULL;
   1248     api4_tu_split[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
   1249 
   1250     api4_tu_early_cbf[HAD_4x4] = NULL;
   1251     api4_tu_early_cbf[HAD_8x8] = &i4_tu_early_cbf;
   1252     api4_tu_early_cbf[HAD_16x16] = NULL;
   1253     api4_tu_early_cbf[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
   1254 
   1255     /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */
   1256 
   1257     /* Return value is merge of both best_stad_cost and tu_split_flags */
   1258     best_cost_tu_split = ps_func_selector->pf_had_8x8_using_4_4x4_r(
   1259         pu1_inp,
   1260         inp_stride,
   1261         pu1_ref,
   1262         ref_stride,
   1263         pi2_had_out,
   1264         8,
   1265         api4_satd_pu,
   1266         api4_tu_split,
   1267         api4_tu_early_cbf,
   1268         0,
   1269         2,
   1270         0,
   1271         0,
   1272         i4_frm_qstep,
   1273         0,
   1274         ps_prms->u1_max_tr_depth,
   1275         ps_prms->u1_max_tr_size,
   1276         &(ps_prms->i4_tu_split_cost),
   1277         NULL);
   1278 
   1279     /* For SATD computation following TU size are assumed for a 8x8 CU */
   1280     /* 8 for 2Nx2N, 4 for Nx2N,2NxN                                    */
   1281 
   1282     i4_total_satd_cost = best_cost_tu_split >> 2;
   1283 
   1284     /* Second last bit has the tu pslit flag */
   1285     i4_tu_split_flag = (best_cost_tu_split & 0x3) >> 1;
   1286 
   1287     /* Last bit corrsponds to the Early CBF flag */
   1288     i4_early_cbf = (best_cost_tu_split & 0x1);
   1289 
   1290     /* Update 8x8 SATDs */
   1291     pi4_sad_grid[PART_ID_2Nx2N] = i4_satd_8x8;
   1292     pi4_tu_split[PART_ID_2Nx2N] = i4_tu_split_flag;
   1293     pi4_early_cbf[PART_ID_2Nx2N] = i4_early_cbf;
   1294 
   1295     return i4_total_satd_cost;
   1296 }
   1297 //#endif
   1298 /**
   1299 ********************************************************************************
   1300 *  @fn     S32 hme_evalsatd_update_1_best_result_pt_pu_16x16
   1301 *
   1302 *  @brief  Evaluates the SATD with partial updates for all the best partitions
   1303 *          of a 16x16 CU based on recursive Hadamard 16x16, 8x8 and 4x4 satds
   1304 *
   1305 *  @param[inout]  ps_prms: error prms containg current and ref ptr, strides,
   1306 *                 pointer to sad grid of each partitions
   1307 *
   1308 *  @return     None
   1309 ********************************************************************************
   1310 */
   1311 
   1312 void hme_evalsatd_update_2_best_results_pt_pu_16x16(
   1313     err_prms_t *ps_prms, result_upd_prms_t *ps_result_prms)
   1314 {
   1315     S32 ai4_satd_4x4[16]; /* num 4x4s in a 16x16 */
   1316     S32 ai4_satd_8x8[4]; /* num 8x8s in a 16x16 */
   1317     S32 i4_satd_16x16; /* 16x16 satd cost     */
   1318     S32 i;
   1319     S16 ai2_8x8_had[256];
   1320     S16 *pi2_y0;
   1321     U08 *pu1_src, *pu1_pred;
   1322     S32 pos_x_y_4x4_0, pos_x_y_4x4 = 0;
   1323     S32 *ppi4_hsad;
   1324 
   1325     /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
   1326     S32 *api4_satd_pu[HAD_32x32 + 1];
   1327     S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
   1328 
   1329     U08 *pu1_inp = ps_prms->pu1_inp;
   1330     U08 *pu1_ref = ps_prms->pu1_ref;
   1331 
   1332     S32 inp_stride = ps_prms->i4_inp_stride;
   1333     S32 ref_stride = ps_prms->i4_ref_stride;
   1334 
   1335     api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
   1336     api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
   1337     api4_satd_pu[HAD_16x16] = &i4_satd_16x16;
   1338     api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
   1339 
   1340     ppi4_hsad = api4_satd_pu[HAD_16x16];
   1341 
   1342     /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */
   1343     for(i = 0; i < 4; i++)
   1344     {
   1345         pu1_src = pu1_inp + (i & 0x01) * 8 + (i >> 1) * inp_stride * 8;
   1346         pu1_pred = pu1_ref + (i & 0x01) * 8 + (i >> 1) * ref_stride * 8;
   1347         pi2_y0 = ai2_8x8_had + (i & 0x01) * 8 + (i >> 1) * 16 * 8;
   1348         pos_x_y_4x4_0 = pos_x_y_4x4 + (i & 0x01) * 2 + (i >> 1) * (2 << 16);
   1349 
   1350         ihevce_had_8x8_using_4_4x4(
   1351             pu1_src, inp_stride, pu1_pred, ref_stride, pi2_y0, 16, api4_satd_pu, pos_x_y_4x4_0, 4);
   1352     }
   1353 
   1354     /* For SATD computation following TU size are assumed for a 16x16 CU */
   1355     /* 16 for 2Nx2N, 8 for NxN/Nx2N,2NxN and mix of 4 and 8 for AMPs     */
   1356 
   1357     /* Update 8x8 SATDs */
   1358     /* Modified to cost calculation using only 4x4 SATD */
   1359 
   1360     //  ai4_satd_8x8[0] = ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5];
   1361     //  ai4_satd_8x8[1] = ai4_satd_4x4[2] + ai4_satd_4x4[3] + ai4_satd_4x4[6] + ai4_satd_4x4[7];
   1362     //  ai4_satd_8x8[2] = ai4_satd_4x4[8] + ai4_satd_4x4[9] + ai4_satd_4x4[12] + ai4_satd_4x4[13];
   1363     //  ai4_satd_8x8[3] = ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15];
   1364 
   1365     /* Update 16x16 SATDs */
   1366     pi4_sad_grid[PART_ID_2Nx2N] =
   1367         ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3];
   1368 
   1369     pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_8x8[0];
   1370     pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_8x8[1];
   1371     pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_8x8[2];
   1372     pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_8x8[3];
   1373 
   1374     /* Update 8x16 / 16x8 SATDs */
   1375     pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_8x8[0] + ai4_satd_8x8[2];
   1376     pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[3];
   1377     pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_8x8[0] + ai4_satd_8x8[1];
   1378     pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_8x8[2] + ai4_satd_8x8[3];
   1379 
   1380     /* Update AMP SATDs 16x12,16x4, 12x16,4x16  */
   1381     pi4_sad_grid[PART_ID_nLx2N_L] =
   1382         ai4_satd_4x4[0] + ai4_satd_4x4[4] + ai4_satd_4x4[8] + ai4_satd_4x4[12];
   1383 
   1384     pi4_sad_grid[PART_ID_nLx2N_R] = ai4_satd_4x4[1] + ai4_satd_4x4[5] + ai4_satd_4x4[9] +
   1385                                     ai4_satd_4x4[13] + pi4_sad_grid[PART_ID_Nx2N_R];
   1386 
   1387     pi4_sad_grid[PART_ID_nRx2N_L] = ai4_satd_4x4[2] + ai4_satd_4x4[6] + ai4_satd_4x4[10] +
   1388                                     ai4_satd_4x4[14] + pi4_sad_grid[PART_ID_Nx2N_L];
   1389 
   1390     pi4_sad_grid[PART_ID_nRx2N_R] =
   1391         ai4_satd_4x4[3] + ai4_satd_4x4[7] + ai4_satd_4x4[11] + ai4_satd_4x4[15];
   1392 
   1393     pi4_sad_grid[PART_ID_2NxnU_T] =
   1394         ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[2] + ai4_satd_4x4[3];
   1395 
   1396     pi4_sad_grid[PART_ID_2NxnU_B] = ai4_satd_4x4[4] + ai4_satd_4x4[5] + ai4_satd_4x4[6] +
   1397                                     ai4_satd_4x4[7] + pi4_sad_grid[PART_ID_2NxN_B];
   1398 
   1399     pi4_sad_grid[PART_ID_2NxnD_T] = ai4_satd_4x4[8] + ai4_satd_4x4[9] + ai4_satd_4x4[10] +
   1400                                     ai4_satd_4x4[11] + pi4_sad_grid[PART_ID_2NxN_T];
   1401 
   1402     pi4_sad_grid[PART_ID_2NxnD_B] =
   1403         ai4_satd_4x4[12] + ai4_satd_4x4[13] + ai4_satd_4x4[14] + ai4_satd_4x4[15];
   1404 
   1405     /* Call the update results function */
   1406     {
   1407         S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
   1408         mv_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
   1409         S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0];
   1410         S32 best_node_cost;
   1411         S32 second_best_node_cost;
   1412 
   1413         /*For each valid partition, update the refine_prm structure to reflect the best and second
   1414         best candidates for that partition*/
   1415 
   1416         for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++)
   1417         {
   1418             S32 update_required = 0;
   1419             S32 part_id = pi4_valid_part_ids[i4_count];
   1420             S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
   1421 
   1422             /* Use a pre-computed cost instead of freshly evaluating subpel cost */
   1423             i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
   1424 
   1425             /*Calculate total cost*/
   1426             i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
   1427             i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
   1428 
   1429             /*****************************************************************/
   1430             /* We do not labor through the results if the total cost worse   */
   1431             /* than the last of the results.                                 */
   1432             /*****************************************************************/
   1433             best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]);
   1434             second_best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[1][index]);
   1435 
   1436             if(i4_tot_cost < second_best_node_cost)
   1437             {
   1438                 update_required = 2;
   1439 
   1440                 /*************************************************************/
   1441                 /* Identify where the current result isto be placed.Basically*/
   1442                 /* find the node which has cost just higher thannodeundertest*/
   1443                 /*************************************************************/
   1444                 if(i4_tot_cost < best_node_cost)
   1445                 {
   1446                     update_required = 1;
   1447                 }
   1448                 else if(i4_tot_cost == ps_subpel_refine_ctxt->i2_tot_cost[0][index])
   1449                 {
   1450                     update_required = 0;
   1451                 }
   1452                 if(update_required == 2)
   1453                 {
   1454                     ps_subpel_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
   1455                     ps_subpel_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
   1456                     ps_subpel_refine_ctxt->i2_mv_x[1][index] = ps_result_prms->i2_mv_x;
   1457                     ps_subpel_refine_ctxt->i2_mv_y[1][index] = ps_result_prms->i2_mv_y;
   1458                     ps_subpel_refine_ctxt->i2_ref_idx[1][index] = ps_result_prms->i1_ref_idx;
   1459                 }
   1460                 else if(update_required == 1)
   1461                 {
   1462                     ps_subpel_refine_ctxt->i2_tot_cost[1][index] =
   1463                         ps_subpel_refine_ctxt->i2_tot_cost[0][index];
   1464                     ps_subpel_refine_ctxt->i2_mv_cost[1][index] =
   1465                         ps_subpel_refine_ctxt->i2_mv_cost[0][index];
   1466                     ps_subpel_refine_ctxt->i2_mv_x[1][index] =
   1467                         ps_subpel_refine_ctxt->i2_mv_x[0][index];
   1468                     ps_subpel_refine_ctxt->i2_mv_y[1][index] =
   1469                         ps_subpel_refine_ctxt->i2_mv_y[0][index];
   1470                     ps_subpel_refine_ctxt->i2_ref_idx[1][index] =
   1471                         ps_subpel_refine_ctxt->i2_ref_idx[0][index];
   1472 
   1473                     ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
   1474                     ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
   1475                     ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x;
   1476                     ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y;
   1477                     ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx;
   1478                 }
   1479             }
   1480         }
   1481     }
   1482 }
   1483 
   1484 //#if COMPUTE_16x16_R == C
   1485 void hme_evalsatd_update_1_best_result_pt_pu_16x16(
   1486     err_prms_t *ps_prms, result_upd_prms_t *ps_result_prms)
   1487 {
   1488     S32 ai4_satd_4x4[16]; /* num 4x4s in a 16x16 */
   1489     S32 ai4_satd_8x8[4]; /* num 8x8s in a 16x16 */
   1490     S32 i4_satd_16x16; /* 16x16 satd cost     */
   1491     S32 i;
   1492     S16 ai2_8x8_had[256];
   1493     S16 *pi2_y0;
   1494     U08 *pu1_src, *pu1_pred;
   1495     S32 pos_x_y_4x4_0, pos_x_y_4x4 = 0;
   1496     S32 *ppi4_hsad;
   1497 
   1498     /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
   1499     S32 *api4_satd_pu[HAD_32x32 + 1];
   1500     S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
   1501 
   1502     U08 *pu1_inp = ps_prms->pu1_inp;
   1503     U08 *pu1_ref = ps_prms->pu1_ref;
   1504 
   1505     S32 inp_stride = ps_prms->i4_inp_stride;
   1506     S32 ref_stride = ps_prms->i4_ref_stride;
   1507 
   1508     api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
   1509     api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
   1510     api4_satd_pu[HAD_16x16] = &i4_satd_16x16;
   1511     api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
   1512 
   1513     ppi4_hsad = api4_satd_pu[HAD_16x16];
   1514 
   1515     /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */
   1516     for(i = 0; i < 4; i++)
   1517     {
   1518         pu1_src = pu1_inp + (i & 0x01) * 8 + (i >> 1) * inp_stride * 8;
   1519         pu1_pred = pu1_ref + (i & 0x01) * 8 + (i >> 1) * ref_stride * 8;
   1520         pi2_y0 = ai2_8x8_had + (i & 0x01) * 8 + (i >> 1) * 16 * 8;
   1521         pos_x_y_4x4_0 = pos_x_y_4x4 + (i & 0x01) * 2 + (i >> 1) * (2 << 16);
   1522 
   1523         ihevce_had_8x8_using_4_4x4(
   1524             pu1_src, inp_stride, pu1_pred, ref_stride, pi2_y0, 16, api4_satd_pu, pos_x_y_4x4_0, 4);
   1525     }
   1526 
   1527     /* For SATD computation following TU size are assumed for a 16x16 CU */
   1528     /* 16 for 2Nx2N, 8 for NxN/Nx2N,2NxN and mix of 4 and 8 for AMPs     */
   1529 
   1530     /* Update 8x8 SATDs */
   1531     /* Modified to cost calculation using only 4x4 SATD */
   1532 
   1533     //  ai4_satd_8x8[0] = ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5];
   1534     //  ai4_satd_8x8[1] = ai4_satd_4x4[2] + ai4_satd_4x4[3] + ai4_satd_4x4[6] + ai4_satd_4x4[7];
   1535     //  ai4_satd_8x8[2] = ai4_satd_4x4[8] + ai4_satd_4x4[9] + ai4_satd_4x4[12] + ai4_satd_4x4[13];
   1536     //  ai4_satd_8x8[3] = ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15];
   1537 
   1538     /* Update 16x16 SATDs */
   1539     pi4_sad_grid[PART_ID_2Nx2N] =
   1540         ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3];
   1541 
   1542     pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_8x8[0];
   1543     pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_8x8[1];
   1544     pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_8x8[2];
   1545     pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_8x8[3];
   1546 
   1547     /* Update 8x16 / 16x8 SATDs */
   1548     pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_8x8[0] + ai4_satd_8x8[2];
   1549     pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[3];
   1550     pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_8x8[0] + ai4_satd_8x8[1];
   1551     pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_8x8[2] + ai4_satd_8x8[3];
   1552 
   1553     /* Update AMP SATDs 16x12,16x4, 12x16,4x16  */
   1554     pi4_sad_grid[PART_ID_nLx2N_L] =
   1555         ai4_satd_4x4[0] + ai4_satd_4x4[2] + ai4_satd_4x4[8] + ai4_satd_4x4[10];
   1556     pi4_sad_grid[PART_ID_nRx2N_R] =
   1557         ai4_satd_4x4[5] + ai4_satd_4x4[7] + ai4_satd_4x4[13] + ai4_satd_4x4[15];
   1558     pi4_sad_grid[PART_ID_2NxnU_T] =
   1559         ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5];
   1560     pi4_sad_grid[PART_ID_2NxnD_B] =
   1561         ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15];
   1562 
   1563     pi4_sad_grid[PART_ID_nLx2N_R] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
   1564     pi4_sad_grid[PART_ID_nRx2N_L] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
   1565     pi4_sad_grid[PART_ID_2NxnU_B] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
   1566     pi4_sad_grid[PART_ID_2NxnD_T] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
   1567 
   1568     /* Call the update results function */
   1569     {
   1570         S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
   1571         mv_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
   1572         S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0];
   1573         S32 best_node_cost;
   1574         S32 second_best_node_cost;
   1575 
   1576         /*For each valid partition, update the refine_prm structure to reflect the best and second
   1577         best candidates for that partition*/
   1578 
   1579         for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++)
   1580         {
   1581             S32 update_required = 0;
   1582             S32 part_id = pi4_valid_part_ids[i4_count];
   1583             S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
   1584 
   1585             /* Use a pre-computed cost instead of freshly evaluating subpel cost */
   1586             i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
   1587 
   1588             /*Calculate total cost*/
   1589             i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
   1590             i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
   1591 
   1592             /*****************************************************************/
   1593             /* We do not labor through the results if the total cost worse   */
   1594             /* than the last of the results.                                 */
   1595             /*****************************************************************/
   1596             best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]);
   1597             second_best_node_cost = SHRT_MAX;
   1598 
   1599             if(i4_tot_cost < second_best_node_cost)
   1600             {
   1601                 update_required = 0;
   1602 
   1603                 /*************************************************************/
   1604                 /* Identify where the current result isto be placed.Basically*/
   1605                 /* find the node which has cost just higher thannodeundertest*/
   1606                 /*************************************************************/
   1607                 if(i4_tot_cost < best_node_cost)
   1608                 {
   1609                     update_required = 1;
   1610                 }
   1611                 else if(i4_tot_cost == ps_subpel_refine_ctxt->i2_tot_cost[0][index])
   1612                 {
   1613                     update_required = 0;
   1614                 }
   1615                 if(update_required == 2)
   1616                 {
   1617                     ps_subpel_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
   1618                     ps_subpel_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
   1619                     ps_subpel_refine_ctxt->i2_mv_x[1][index] = ps_result_prms->i2_mv_x;
   1620                     ps_subpel_refine_ctxt->i2_mv_y[1][index] = ps_result_prms->i2_mv_y;
   1621                     ps_subpel_refine_ctxt->i2_ref_idx[1][index] = ps_result_prms->i1_ref_idx;
   1622                 }
   1623                 else if(update_required == 1)
   1624                 {
   1625                     ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
   1626                     ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
   1627                     ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x;
   1628                     ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y;
   1629                     ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx;
   1630                 }
   1631             }
   1632         }
   1633     }
   1634 }
   1635 
   1636 WORD32 hme_evalsatd_pt_pu_16x16_tu_rec(
   1637     err_prms_t *ps_prms,
   1638     WORD32 lambda,
   1639     WORD32 lambda_q_shift,
   1640     WORD32 i4_frm_qstep,
   1641     me_func_selector_t *ps_func_selector)
   1642 {
   1643     S32 ai4_satd_4x4[16]; /* num 4x4s in a 16x16 */
   1644     S32 ai4_satd_8x8[4]; /* num 8x8s in a 16x16 */
   1645     S32 ai4_tu_split_8x8[16];
   1646     S32 i4_satd_16x16; /* 16x16 satd cost     */
   1647 
   1648     S32 ai4_tu_early_cbf_8x8[16];
   1649 
   1650     //S16 ai2_had_out[256];
   1651     S16 *pi2_had_out;
   1652     S32 tu_split_flag = 0;
   1653     S32 early_cbf_flag = 0;
   1654     S32 total_satd_cost = 0;
   1655 
   1656     /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
   1657     S32 *api4_satd_pu[HAD_32x32 + 1];
   1658     S32 *api4_tu_split[HAD_32x32 + 1];
   1659     S32 *api4_tu_early_cbf[HAD_32x32 + 1];
   1660 
   1661     U08 *pu1_inp = ps_prms->pu1_inp;
   1662     U08 *pu1_ref = ps_prms->pu1_ref;
   1663 
   1664     S32 inp_stride = ps_prms->i4_inp_stride;
   1665     S32 ref_stride = ps_prms->i4_ref_stride;
   1666 
   1667     /* Initialize tu_split_cost to "0" */
   1668     ps_prms->i4_tu_split_cost = 0;
   1669 
   1670     pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem;
   1671 
   1672     api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
   1673     api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
   1674     api4_satd_pu[HAD_16x16] = &i4_satd_16x16;
   1675     api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
   1676 
   1677     api4_tu_split[HAD_4x4] = NULL;
   1678     api4_tu_split[HAD_8x8] = &ai4_tu_split_8x8[0];
   1679     api4_tu_split[HAD_16x16] = &tu_split_flag;
   1680     api4_tu_split[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
   1681 
   1682     api4_tu_early_cbf[HAD_4x4] = NULL;
   1683     api4_tu_early_cbf[HAD_8x8] = &ai4_tu_early_cbf_8x8[0];
   1684     api4_tu_early_cbf[HAD_16x16] = &early_cbf_flag;
   1685     api4_tu_early_cbf[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */
   1686 
   1687     /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */
   1688     ps_func_selector->pf_had_16x16_r(
   1689         pu1_inp,
   1690         inp_stride,
   1691         pu1_ref,
   1692         ref_stride,
   1693         pi2_had_out,
   1694         16,
   1695         api4_satd_pu,
   1696         api4_tu_split,
   1697         api4_tu_early_cbf,
   1698         0,
   1699         4,
   1700         lambda,
   1701         lambda_q_shift,
   1702         i4_frm_qstep,
   1703         0,
   1704         ps_prms->u1_max_tr_depth,
   1705         ps_prms->u1_max_tr_size,
   1706         &(ps_prms->i4_tu_split_cost),
   1707         NULL);
   1708 
   1709     total_satd_cost = i4_satd_16x16;
   1710 
   1711     ps_prms->pi4_tu_split_flags[0] = tu_split_flag;
   1712 
   1713     ps_prms->pi4_tu_early_cbf[0] = early_cbf_flag;
   1714 
   1715     return total_satd_cost;
   1716 }
   1717 
   1718 /**
   1719 ********************************************************************************
   1720 *  @fn     S32 hme_evalsatd_pt_pu_32x32
   1721 *
   1722 *  @brief  Evaluates the SATD with partial updates for all the best partitions
   1723 *          of a 32x32 CU based on recursive Hadamard 16x16, 8x8 and 4x4 satds
   1724 *
   1725 *  @param[inout]  ps_prms: error prms containg current and ref ptr, strides,
   1726 *                 pointer to sad grid of each partitions
   1727 *
   1728 *  @return     None
   1729 ********************************************************************************
   1730 */
   1731 void hme_evalsatd_pt_pu_32x32(err_prms_t *ps_prms)
   1732 {
   1733     //S32 ai4_satd_4x4[64];   /* num 4x4s in a 32x32 */
   1734     S32 ai4_satd_8x8[16]; /* num 8x8s in a 32x32 */
   1735     S32 ai4_satd_16x16[4]; /* num 16x16 in a 32x32 */
   1736     S32 i4_satd_32x32;
   1737     //    S16 ai2_had_out[32*32];
   1738     U08 *pu1_src;
   1739     U08 *pu1_pred;
   1740     S32 i;
   1741 
   1742     /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
   1743     S32 *api4_satd_pu[HAD_32x32 + 1];
   1744     S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
   1745 
   1746     U08 *pu1_inp = ps_prms->pu1_inp;
   1747     U08 *pu1_ref = ps_prms->pu1_ref;
   1748 
   1749     S32 inp_stride = ps_prms->i4_inp_stride;
   1750     S32 ref_stride = ps_prms->i4_ref_stride;
   1751 
   1752     //api4_satd_pu[HAD_4x4]   = &ai4_satd_4x4[0];
   1753     api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
   1754     api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[0];
   1755     api4_satd_pu[HAD_32x32] = &i4_satd_32x32;
   1756 
   1757     /* 32x32 SATD is calculates as the sum of the 4 8x8's in the block */
   1758     for(i = 0; i < 16; i++)
   1759     {
   1760         pu1_src = pu1_inp + ((i & 0x3) << 3) + ((i >> 2) * inp_stride * 8);
   1761 
   1762         pu1_pred = pu1_ref + ((i & 0x3) << 3) + ((i >> 2) * ref_stride * 8);
   1763 
   1764         ai4_satd_8x8[i] = ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
   1765             pu1_src, inp_stride, pu1_pred, ref_stride, NULL, 1);
   1766     }
   1767 
   1768     /* Modified to cost calculation using only 8x8 SATD for 32x32*/
   1769     ai4_satd_16x16[0] = ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[4] + ai4_satd_8x8[5];
   1770     ai4_satd_16x16[1] = ai4_satd_8x8[2] + ai4_satd_8x8[3] + ai4_satd_8x8[6] + ai4_satd_8x8[7];
   1771     ai4_satd_16x16[2] = ai4_satd_8x8[8] + ai4_satd_8x8[9] + ai4_satd_8x8[12] + ai4_satd_8x8[13];
   1772     ai4_satd_16x16[3] = ai4_satd_8x8[10] + ai4_satd_8x8[11] + ai4_satd_8x8[14] + ai4_satd_8x8[15];
   1773 
   1774     /* Update 32x32 SATD */
   1775     pi4_sad_grid[PART_ID_2Nx2N] =
   1776         ai4_satd_16x16[0] + ai4_satd_16x16[1] + ai4_satd_16x16[2] + ai4_satd_16x16[3];
   1777 
   1778     /* Update 16x16 SATDs */
   1779     pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_16x16[0];
   1780     pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_16x16[1];
   1781     pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_16x16[2];
   1782     pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_16x16[3];
   1783 
   1784     /* Update 16x32 / 32x16 SATDs */
   1785     pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_16x16[0] + ai4_satd_16x16[2];
   1786     pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_16x16[1] + ai4_satd_16x16[3];
   1787     pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_16x16[0] + ai4_satd_16x16[1];
   1788     pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_16x16[2] + ai4_satd_16x16[3];
   1789 
   1790     /* Update AMP SATDs 32x24,32x8, 24x32,8x32  */
   1791     pi4_sad_grid[PART_ID_nLx2N_L] =
   1792         ai4_satd_8x8[0] + ai4_satd_8x8[4] + ai4_satd_8x8[8] + ai4_satd_8x8[12];
   1793 
   1794     pi4_sad_grid[PART_ID_nLx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[5] + ai4_satd_8x8[9] +
   1795                                     ai4_satd_8x8[13] + pi4_sad_grid[PART_ID_Nx2N_R];
   1796 
   1797     pi4_sad_grid[PART_ID_nRx2N_L] = ai4_satd_8x8[2] + ai4_satd_8x8[6] + ai4_satd_8x8[10] +
   1798                                     ai4_satd_8x8[14] + pi4_sad_grid[PART_ID_Nx2N_L];
   1799 
   1800     pi4_sad_grid[PART_ID_nRx2N_R] =
   1801         ai4_satd_8x8[3] + ai4_satd_8x8[7] + ai4_satd_8x8[11] + ai4_satd_8x8[15];
   1802 
   1803     pi4_sad_grid[PART_ID_2NxnU_T] =
   1804         ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3];
   1805 
   1806     pi4_sad_grid[PART_ID_2NxnU_B] = ai4_satd_8x8[4] + ai4_satd_8x8[5] + ai4_satd_8x8[6] +
   1807                                     ai4_satd_8x8[7] + pi4_sad_grid[PART_ID_2NxN_B];
   1808 
   1809     pi4_sad_grid[PART_ID_2NxnD_T] = ai4_satd_8x8[8] + ai4_satd_8x8[9] + ai4_satd_8x8[10] +
   1810                                     ai4_satd_8x8[11] + pi4_sad_grid[PART_ID_2NxN_T];
   1811 
   1812     pi4_sad_grid[PART_ID_2NxnD_B] =
   1813         ai4_satd_8x8[12] + ai4_satd_8x8[13] + ai4_satd_8x8[14] + ai4_satd_8x8[15];
   1814 }
   1815 
   1816 WORD32 hme_evalsatd_pt_pu_32x32_tu_rec(
   1817     err_prms_t *ps_prms,
   1818     WORD32 lambda,
   1819     WORD32 lambda_q_shift,
   1820     WORD32 i4_frm_qstep,
   1821     me_func_selector_t *ps_func_selector)
   1822 {
   1823     S32 ai4_satd_4x4[64]; /* num 4x4s in a 32x32 */
   1824     S32 ai4_satd_8x8[16]; /* num 8x8s in a 32x32 */
   1825     S32 ai4_tu_split_8x8[16];
   1826     S32 ai4_satd_16x16[4]; /* num 16x16 in a 32x32 */
   1827     S32 ai4_tu_split_16x16[4];
   1828     S32 i4_satd_32x32;
   1829 
   1830     S32 ai4_tu_early_cbf_8x8[16];
   1831     S32 ai4_tu_early_cbf_16x16[4];
   1832     S32 early_cbf_flag;
   1833 
   1834     S16 *pi2_had_out;
   1835 
   1836     /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
   1837     S32 *api4_satd_pu[HAD_32x32 + 1];
   1838     S32 *api4_tu_split[HAD_32x32 + 1];
   1839     S32 *api4_tu_early_cbf[HAD_32x32 + 1];
   1840 
   1841     S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
   1842     S32 *pi4_tu_split_flag = ps_prms->pi4_tu_split_flags;
   1843     S32 *pi4_tu_early_cbf = ps_prms->pi4_tu_early_cbf;
   1844 
   1845     S32 tu_split_flag = 0;
   1846     S32 total_satd_cost = 0;
   1847 
   1848     U08 *pu1_inp = ps_prms->pu1_inp;
   1849     U08 *pu1_ref = ps_prms->pu1_ref;
   1850 
   1851     S32 inp_stride = ps_prms->i4_inp_stride;
   1852     S32 ref_stride = ps_prms->i4_ref_stride;
   1853 
   1854     /* Initialize tu_split_cost to "0" */
   1855     ps_prms->i4_tu_split_cost = 0;
   1856 
   1857     pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem;
   1858 
   1859     api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
   1860     api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
   1861     api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[0];
   1862     api4_satd_pu[HAD_32x32] = &i4_satd_32x32;
   1863 
   1864     api4_tu_split[HAD_4x4] = NULL;
   1865     api4_tu_split[HAD_8x8] = &ai4_tu_split_8x8[0];
   1866     api4_tu_split[HAD_16x16] = &ai4_tu_split_16x16[0];
   1867     api4_tu_split[HAD_32x32] = &tu_split_flag;
   1868 
   1869     api4_tu_early_cbf[HAD_4x4] = NULL;
   1870     api4_tu_early_cbf[HAD_8x8] = &ai4_tu_early_cbf_8x8[0];
   1871     api4_tu_early_cbf[HAD_16x16] = &ai4_tu_early_cbf_16x16[0];
   1872     api4_tu_early_cbf[HAD_32x32] = &early_cbf_flag;
   1873 
   1874     /* Call recursive 32x32 HAD module; updates satds for 4x4, 8x8, 16x16 and 32x32 */
   1875     ihevce_had_32x32_r(
   1876         pu1_inp,
   1877         inp_stride,
   1878         pu1_ref,
   1879         ref_stride,
   1880         pi2_had_out,
   1881         32,
   1882         api4_satd_pu,
   1883         api4_tu_split,
   1884         api4_tu_early_cbf,
   1885         0,
   1886         8,
   1887         lambda,
   1888         lambda_q_shift,
   1889         i4_frm_qstep,
   1890         0,
   1891         ps_prms->u1_max_tr_depth,
   1892         ps_prms->u1_max_tr_size,
   1893         &(ps_prms->i4_tu_split_cost),
   1894         ps_func_selector);
   1895 
   1896     total_satd_cost = i4_satd_32x32;
   1897 
   1898     /*The structure of the TU_SPLIT flag for the current 32x32 is as follows
   1899     TL_16x16 - 5bits (4 for child and LSBit for 16x16 split)
   1900     TR_16x16 - 5bits (4 for child and LSBit for 16x16 split)
   1901     BL_16x16 - 5bits (4 for child and LSBit for 16x16 split)
   1902     BR_16x16 - 5bits (4 for child and LSBit for 16x16 split)
   1903     32x32_split - 1bit (LSBit)
   1904 
   1905     TU_SPLIT : (TL_16x16)_(TR_16x16)_(BL_16x16)_(BR_16x16)_32x32_split (21bits)*/
   1906 
   1907     pi4_sad_grid[PART_ID_2Nx2N] = total_satd_cost;
   1908     pi4_tu_split_flag[PART_ID_2Nx2N] = tu_split_flag;
   1909     pi4_tu_early_cbf[PART_ID_2Nx2N] = early_cbf_flag;
   1910 
   1911     return total_satd_cost;
   1912 }
   1913 
   1914 /**
   1915 ********************************************************************************
   1916 *  @fn     S32 hme_evalsatd_pt_pu_64x64
   1917 *
   1918 *  @brief  Evaluates the SATD with partial updates for all the best partitions
   1919 *          of a 64x64 CU based on accumulated Hadamard 32x32 and 16x16 satds
   1920 *
   1921 *           Note : 64x64 SATD does not do hadamard Transform using 32x32 hadamard
   1922 *                  outputs but directly uses four 32x32 SATD and 16 16x16 SATDS as
   1923 *                  TU size of 64 is not supported in HEVC
   1924 *
   1925 *  @param[inout]  ps_prms: error prms containg current and ref ptr, strides,
   1926 *                 pointer to sad grid of each partitions
   1927 *
   1928 *  @return     None
   1929 ********************************************************************************
   1930 */
   1931 
   1932 void hme_evalsatd_pt_pu_64x64(err_prms_t *ps_prms)
   1933 {
   1934     //S32 ai4_satd_4x4[4][64];   /* num 4x4s in a 32x32 * num 32x32 in 64x64 */
   1935     S32 ai4_satd_8x8[4][16]; /* num 8x8s in a 32x32 * num 32x32 in 64x64 */
   1936     S32 ai4_satd_16x16[4][4]; /* num 16x16 in a 32x32* num 32x32 in 64x64 */
   1937     S32 ai4_satd_32x32[4]; /* num 32x32 in 64x64 */
   1938     //    S16 ai2_had_out[32*32];
   1939     S32 i, j;
   1940 
   1941     //  S32 ai4_tu_split_8x8[4][16];
   1942     //  S32 ai4_tu_split_16x16[4][4];
   1943     //  S32 ai4_tu_split_32x32[4];
   1944 
   1945     /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
   1946     S32 *api4_satd_pu[HAD_32x32 + 1];
   1947     //  S32 *api4_tu_split[HAD_32x32 + 1];
   1948 
   1949     S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
   1950 
   1951     U08 *pu1_inp = ps_prms->pu1_inp;
   1952     U08 *pu1_ref = ps_prms->pu1_ref;
   1953     U08 *pu1_src;
   1954     U08 *pu1_pred;
   1955 
   1956     S32 inp_stride = ps_prms->i4_inp_stride;
   1957     S32 ref_stride = ps_prms->i4_ref_stride;
   1958 
   1959     for(i = 0; i < 4; i++)
   1960     {
   1961         S32 blkx = (i & 0x1);
   1962         S32 blky = (i >> 1);
   1963         U08 *pu1_pi0, *pu1_pi1;
   1964 
   1965         //api4_satd_pu[HAD_4x4]   = &ai4_satd_4x4[i][0];
   1966         api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[i][0];
   1967         api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[i][0];
   1968         api4_satd_pu[HAD_32x32] = &ai4_satd_32x32[i];
   1969 
   1970         pu1_pi0 = pu1_inp + (blkx * 32) + (blky * 32 * inp_stride);
   1971         pu1_pi1 = pu1_ref + (blkx * 32) + (blky * 32 * ref_stride);
   1972 
   1973         /* 64x64 SATD is calculates as the sum of the 4 16x16's in the block */
   1974         for(j = 0; j < 16; j++)
   1975         {
   1976             pu1_src = pu1_pi0 + ((j & 0x3) << 3) + ((j >> 2) * inp_stride * 8);
   1977 
   1978             pu1_pred = pu1_pi1 + ((j & 0x3) << 3) + ((j >> 2) * ref_stride * 8);
   1979 
   1980             ai4_satd_8x8[i][j] = ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
   1981                 pu1_src, inp_stride, pu1_pred, ref_stride, NULL, 1);
   1982         }
   1983 
   1984         /* Modified to cost calculation using only 8x8 SATD for 32x32*/
   1985         ai4_satd_16x16[i][0] =
   1986             ai4_satd_8x8[i][0] + ai4_satd_8x8[i][1] + ai4_satd_8x8[i][4] + ai4_satd_8x8[i][5];
   1987         ai4_satd_16x16[i][1] =
   1988             ai4_satd_8x8[i][2] + ai4_satd_8x8[i][3] + ai4_satd_8x8[i][6] + ai4_satd_8x8[i][7];
   1989         ai4_satd_16x16[i][2] =
   1990             ai4_satd_8x8[i][8] + ai4_satd_8x8[i][9] + ai4_satd_8x8[i][12] + ai4_satd_8x8[i][13];
   1991         ai4_satd_16x16[i][3] =
   1992             ai4_satd_8x8[i][10] + ai4_satd_8x8[i][11] + ai4_satd_8x8[i][14] + ai4_satd_8x8[i][15];
   1993     }
   1994 
   1995     /* Modified to cost calculation using only 8x8 SATD for 32x32*/
   1996 
   1997     ai4_satd_32x32[0] =
   1998         ai4_satd_16x16[0][0] + ai4_satd_16x16[0][1] + ai4_satd_16x16[0][2] + ai4_satd_16x16[0][3];
   1999     ai4_satd_32x32[1] =
   2000         ai4_satd_16x16[1][0] + ai4_satd_16x16[1][1] + ai4_satd_16x16[1][2] + ai4_satd_16x16[1][3];
   2001     ai4_satd_32x32[2] =
   2002         ai4_satd_16x16[2][0] + ai4_satd_16x16[2][1] + ai4_satd_16x16[2][2] + ai4_satd_16x16[2][3];
   2003     ai4_satd_32x32[3] =
   2004         ai4_satd_16x16[3][0] + ai4_satd_16x16[3][1] + ai4_satd_16x16[3][2] + ai4_satd_16x16[3][3];
   2005 
   2006     /* Update 64x64 SATDs */
   2007     pi4_sad_grid[PART_ID_2Nx2N] =
   2008         ai4_satd_32x32[0] + ai4_satd_32x32[1] + ai4_satd_32x32[2] + ai4_satd_32x32[3];
   2009 
   2010     /* Update 32x32 SATDs */
   2011     pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_32x32[0];
   2012     pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_32x32[1];
   2013     pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_32x32[2];
   2014     pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_32x32[3];
   2015 
   2016     /* Update 32x64 / 64x32 SATDs */
   2017     pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_32x32[0] + ai4_satd_32x32[2];
   2018     pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_32x32[1] + ai4_satd_32x32[3];
   2019     pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_32x32[0] + ai4_satd_32x32[1];
   2020     pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_32x32[2] + ai4_satd_32x32[3];
   2021 
   2022     /* Update AMP SATDs 64x48,64x16, 48x64,16x64  */
   2023     pi4_sad_grid[PART_ID_nLx2N_L] =
   2024         ai4_satd_16x16[0][0] + ai4_satd_16x16[0][2] + ai4_satd_16x16[2][0] + ai4_satd_16x16[2][2];
   2025 
   2026     pi4_sad_grid[PART_ID_nLx2N_R] = ai4_satd_16x16[0][1] + ai4_satd_16x16[0][3] +
   2027                                     ai4_satd_16x16[2][1] + ai4_satd_16x16[2][3] +
   2028                                     pi4_sad_grid[PART_ID_Nx2N_R];
   2029 
   2030     pi4_sad_grid[PART_ID_nRx2N_L] = ai4_satd_16x16[1][0] + ai4_satd_16x16[1][2] +
   2031                                     ai4_satd_16x16[3][0] + ai4_satd_16x16[3][2] +
   2032                                     pi4_sad_grid[PART_ID_Nx2N_L];
   2033 
   2034     pi4_sad_grid[PART_ID_nRx2N_R] =
   2035         ai4_satd_16x16[1][1] + ai4_satd_16x16[1][3] + ai4_satd_16x16[3][1] + ai4_satd_16x16[3][3];
   2036 
   2037     pi4_sad_grid[PART_ID_2NxnU_T] =
   2038         ai4_satd_16x16[0][0] + ai4_satd_16x16[0][1] + ai4_satd_16x16[1][0] + ai4_satd_16x16[1][1];
   2039 
   2040     pi4_sad_grid[PART_ID_2NxnU_B] = ai4_satd_16x16[0][2] + ai4_satd_16x16[0][3] +
   2041                                     ai4_satd_16x16[1][2] + ai4_satd_16x16[1][3] +
   2042                                     pi4_sad_grid[PART_ID_2NxN_B];
   2043 
   2044     pi4_sad_grid[PART_ID_2NxnD_T] = ai4_satd_16x16[2][0] + ai4_satd_16x16[2][1] +
   2045                                     ai4_satd_16x16[3][0] + ai4_satd_16x16[3][1] +
   2046                                     pi4_sad_grid[PART_ID_2NxN_T];
   2047 
   2048     pi4_sad_grid[PART_ID_2NxnD_B] =
   2049         ai4_satd_16x16[2][2] + ai4_satd_16x16[2][3] + ai4_satd_16x16[3][2] + ai4_satd_16x16[3][3];
   2050 }
   2051 
   2052 WORD32 hme_evalsatd_pt_pu_64x64_tu_rec(
   2053     err_prms_t *ps_prms,
   2054     WORD32 lambda,
   2055     WORD32 lambda_q_shift,
   2056     WORD32 i4_frm_qstep,
   2057     me_func_selector_t *ps_func_selector)
   2058 {
   2059     S32 ai4_satd_4x4[64]; /* num 4x4s in a 32x32 * num 32x32 in 64x64 */
   2060     S32 ai4_satd_8x8[16]; /* num 8x8s in a 32x32 * num 32x32 in 64x64 */
   2061     S32 ai4_satd_16x16[4]; /* num 16x16 in a 32x32* num 32x32 in 64x64 */
   2062     S32 ai4_satd_32x32[4]; /* num 32x32 in 64x64 */
   2063 
   2064     S32 ai4_tu_split_8x8[16];
   2065     S32 ai4_tu_split_16x16[4];
   2066 
   2067     S32 ai4_tu_early_cbf_8x8[16];
   2068     S32 ai4_tu_early_cbf_16x16[4];
   2069 
   2070     S16 *pi2_had_out;
   2071     S32 i;
   2072 
   2073     /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
   2074     S32 *api4_satd_pu[HAD_32x32 + 1];
   2075     S32 *api4_tu_split[HAD_32x32 + 1];
   2076     S32 *api4_tu_early_cbf[HAD_32x32 + 1];
   2077 
   2078     S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
   2079 
   2080     S32 tu_split_flag = 0;
   2081     S32 total_satd_cost = 0;
   2082 
   2083     U08 *pu1_inp = ps_prms->pu1_inp;
   2084     U08 *pu1_ref = ps_prms->pu1_ref;
   2085 
   2086     S32 inp_stride = ps_prms->i4_inp_stride;
   2087     S32 ref_stride = ps_prms->i4_ref_stride;
   2088 
   2089     /* Initialize tu_split_cost to "0" */
   2090     ps_prms->i4_tu_split_cost = 0;
   2091 
   2092     pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem;
   2093 
   2094     for(i = 0; i < 4; i++)
   2095     {
   2096         S32 blkx = (i & 0x1);
   2097         S32 blky = (i >> 1);
   2098         U08 *pu1_pi0, *pu1_pi1;
   2099         tu_split_flag = 0;
   2100 
   2101         api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
   2102         api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
   2103         api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[0];
   2104         api4_satd_pu[HAD_32x32] = &ai4_satd_32x32[i];
   2105 
   2106         api4_tu_split[HAD_4x4] = NULL;
   2107         api4_tu_split[HAD_8x8] = &ai4_tu_split_8x8[0];
   2108         api4_tu_split[HAD_16x16] = &ai4_tu_split_16x16[0];
   2109         api4_tu_split[HAD_32x32] = &ps_prms->pi4_tu_split_flags[i];
   2110 
   2111         api4_tu_early_cbf[HAD_4x4] = NULL;
   2112         api4_tu_early_cbf[HAD_8x8] = &ai4_tu_early_cbf_8x8[0];
   2113         api4_tu_early_cbf[HAD_16x16] = &ai4_tu_early_cbf_16x16[0];
   2114         api4_tu_early_cbf[HAD_32x32] = &ps_prms->pi4_tu_early_cbf[i];
   2115 
   2116         pu1_pi0 = pu1_inp + (blkx * 32) + (blky * 32 * inp_stride);
   2117         pu1_pi1 = pu1_ref + (blkx * 32) + (blky * 32 * ref_stride);
   2118 
   2119         /* Call recursive 32x32 HAD module; updates satds for 4x4, 8x8, 16x16 and 32x32 */
   2120         ihevce_had_32x32_r(
   2121             pu1_pi0,
   2122             inp_stride,
   2123             pu1_pi1,
   2124             ref_stride,
   2125             pi2_had_out,
   2126             32,
   2127             api4_satd_pu,
   2128             api4_tu_split,
   2129             api4_tu_early_cbf,
   2130             0,
   2131             8,
   2132             lambda,
   2133             lambda_q_shift,
   2134             i4_frm_qstep,
   2135             1,
   2136             ps_prms->u1_max_tr_depth,
   2137             ps_prms->u1_max_tr_size,
   2138             &(ps_prms->i4_tu_split_cost),
   2139             ps_func_selector);
   2140     }
   2141 
   2142     total_satd_cost = ai4_satd_32x32[0] + ai4_satd_32x32[1] + ai4_satd_32x32[2] + ai4_satd_32x32[3];
   2143 
   2144     /* Update 64x64 SATDs */
   2145     pi4_sad_grid[PART_ID_2Nx2N] =
   2146         ai4_satd_32x32[0] + ai4_satd_32x32[1] + ai4_satd_32x32[2] + ai4_satd_32x32[3];
   2147 
   2148     return total_satd_cost;
   2149 }
   2150 
   2151 /**
   2152 ********************************************************************************
   2153 *  @fn     void hme_subpel_refine_search_node(search_node_t *ps_search_node,
   2154 *                                   hme_subpel_prms_t *ps_prms,
   2155 *                                   layer_ctxt_t *ps_curr_layer,
   2156 *                                   BLK_SIZE_T e_blk_size,
   2157 *                                   S32 x_off,
   2158 *                                   S32 y_off)
   2159 *
   2160 *  @brief  Refines a given partition within a CU
   2161 *
   2162 *  @param[in,out]  ps_search_node: supplies starting mv and also ref id.
   2163 *                   updated with the accurate subpel mv
   2164 *
   2165 *  @param[in]  ps_prms: subpel prms input to this function
   2166 *
   2167 *  @param[in]  ps_curr_layer : layer context
   2168 *
   2169 *  @param[in]  e_blk_size : Block size enumeration
   2170 *
   2171 *  @param[in]  x_off : x offset of the partition w.r.t. pic start
   2172 *
   2173 *  @param[in]  y_off : y offset of the partition w.r.t. pic start
   2174 *
   2175 *  @return None
   2176 ********************************************************************************
   2177 */
   2178 
   2179 static __inline PF_SAD_RESULT_FXN_T hme_get_calc_sad_and_result_subpel_fxn(
   2180     me_func_selector_t *ps_func_selector,
   2181     ihevce_me_optimised_function_list_t *ps_me_optimised_function_list,
   2182     S32 i4_part_mask,
   2183     U08 u1_use_satd,
   2184     U08 u1_num_parts,
   2185     U08 u1_num_results)
   2186 {
   2187     PF_SAD_RESULT_FXN_T pf_err_compute;
   2188 
   2189     ASSERT((1 == u1_num_results) || (2 == u1_num_results));
   2190 
   2191     if(1 == u1_num_results)
   2192     {
   2193         if(u1_use_satd)
   2194         {
   2195             if(u1_num_parts == 1)
   2196             {
   2197                 pf_err_compute =
   2198                     ps_func_selector->pf_evalsatd_update_1_best_result_pt_pu_16x16_num_part_eq_1;
   2199             }
   2200             else if((u1_num_parts > 1) && (u1_num_parts <= 8))
   2201             {
   2202                 pf_err_compute =
   2203                     ps_func_selector->pf_evalsatd_update_1_best_result_pt_pu_16x16_num_part_lt_9;
   2204             }
   2205             else
   2206             {
   2207                 pf_err_compute =
   2208                     ps_func_selector->pf_evalsatd_update_1_best_result_pt_pu_16x16_num_part_lt_17;
   2209             }
   2210         }
   2211         else
   2212         {
   2213             if(u1_num_parts == 1)
   2214             {
   2215                 pf_err_compute = ps_me_optimised_function_list
   2216                                      ->pf_calc_sad_and_1_best_result_subpel_num_part_eq_1;
   2217             }
   2218             else if(((i4_part_mask & ENABLE_SQUARE_PARTS) != 0) && (u1_num_parts == 5))
   2219             {
   2220                 pf_err_compute =
   2221                     ps_me_optimised_function_list->pf_calc_sad_and_1_best_result_subpel_square_parts;
   2222             }
   2223             else if((u1_num_parts > 1) && (u1_num_parts <= 8))
   2224             {
   2225                 pf_err_compute = ps_me_optimised_function_list
   2226                                      ->pf_calc_sad_and_1_best_result_subpel_num_part_lt_9;
   2227             }
   2228             else
   2229             {
   2230                 pf_err_compute = ps_me_optimised_function_list
   2231                                      ->pf_calc_sad_and_1_best_result_subpel_num_part_lt_17;
   2232             }
   2233         }
   2234     }
   2235     else
   2236     {
   2237         if(u1_use_satd)
   2238         {
   2239             if(u1_num_parts == 1)
   2240             {
   2241                 pf_err_compute =
   2242                     ps_func_selector->pf_evalsatd_update_2_best_results_pt_pu_16x16_num_part_eq_1;
   2243             }
   2244             else if((u1_num_parts > 1) && (u1_num_parts <= 8))
   2245             {
   2246                 pf_err_compute =
   2247                     ps_func_selector->pf_evalsatd_update_2_best_results_pt_pu_16x16_num_part_lt_9;
   2248             }
   2249             else
   2250             {
   2251                 pf_err_compute =
   2252                     ps_func_selector->pf_evalsatd_update_2_best_results_pt_pu_16x16_num_part_lt_17;
   2253             }
   2254         }
   2255         else
   2256         {
   2257             if(u1_num_parts == 1)
   2258             {
   2259                 pf_err_compute = ps_me_optimised_function_list
   2260                                      ->pf_calc_sad_and_2_best_results_subpel_num_part_eq_1;
   2261             }
   2262             else if(((i4_part_mask & ENABLE_SQUARE_PARTS) != 0) && (u1_num_parts == 5))
   2263             {
   2264                 pf_err_compute = ps_me_optimised_function_list
   2265                                      ->pf_calc_sad_and_2_best_results_subpel_square_parts;
   2266             }
   2267             else if((u1_num_parts > 1) && (u1_num_parts <= 8))
   2268             {
   2269                 pf_err_compute = ps_me_optimised_function_list
   2270                                      ->pf_calc_sad_and_2_best_results_subpel_num_part_lt_9;
   2271             }
   2272             else
   2273             {
   2274                 pf_err_compute = ps_me_optimised_function_list
   2275                                      ->pf_calc_sad_and_2_best_results_subpel_num_part_lt_17;
   2276             }
   2277         }
   2278     }
   2279 
   2280     return pf_err_compute;
   2281 }
   2282 
   2283 #if DIAMOND_GRID == 1
   2284 S32 hme_subpel_refine_search_node_high_speed(
   2285     search_node_t *ps_search_node,
   2286     hme_subpel_prms_t *ps_prms,
   2287     layer_ctxt_t *ps_curr_layer,
   2288     BLK_SIZE_T e_blk_size,
   2289     S32 x_off,
   2290     S32 y_off,
   2291     search_results_t *ps_search_results,
   2292     S32 pred_lx,
   2293     S32 i4_part_mask,
   2294     S32 *pi4_valid_part_ids,
   2295     S32 search_idx,
   2296     subpel_dedup_enabler_t *ps_dedup_enabler,
   2297     me_func_selector_t *ps_func_selector,
   2298     ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
   2299 {
   2300     S32 i4_num_hpel_refine, i4_num_qpel_refine;
   2301     S32 i4_offset, i4_grid_mask;
   2302     S08 i1_ref_idx;
   2303     S32 i4_blk_wd, i4_blk_ht;
   2304     S32 i4_ref_stride, i4_i;
   2305     pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
   2306     result_upd_prms_t s_result_prms;
   2307     search_node_t s_temp_search_node;
   2308 
   2309     /*************************************************************************/
   2310     /* Tracks current MV with the fractional component.                      */
   2311     /*************************************************************************/
   2312     S32 i4_mv_x, i4_mv_y;
   2313     S32 i4_frac_x, i4_frac_y;
   2314 
   2315     /*************************************************************************/
   2316     /* Function pointer for SAD/SATD, array and prms structure to pass to    */
   2317     /* This function                                                         */
   2318     /*************************************************************************/
   2319     PF_SAD_RESULT_FXN_T pf_err_compute;
   2320 
   2321     S32 ai4_sad_grid[17], i4_tot_cost;
   2322     err_prms_t s_err_prms;
   2323 
   2324     /*************************************************************************/
   2325     /* Allowed MV RANGE                                                      */
   2326     /*************************************************************************/
   2327     range_prms_t *ps_range_prms;
   2328 
   2329     /*************************************************************************/
   2330     /* stores min id in grid with associated min cost.                       */
   2331     /*************************************************************************/
   2332     S32 i4_min_cost, i4_min_sad;
   2333     GRID_PT_T e_min_id;
   2334 
   2335     PF_INTERP_FXN_T pf_qpel_interp;
   2336     /*************************************************************************/
   2337     /* For hpel and qpel we move in diamonds and hence each point in the     */
   2338     /* diamond will belong to a completely different plane. To simplify the  */
   2339     /* look up of the ref ptr, we declare a 2x2 array of ref ptrs for the    */
   2340     /* hpel planes which are interpolated during recon.                      */
   2341     /*************************************************************************/
   2342     U08 *apu1_hpel_ref[4], *pu1_ref;
   2343 
   2344     interp_prms_t s_interp_prms;
   2345 
   2346     /*************************************************************************/
   2347     /* Maintains the minimum id of interpolated buffers, and the pointer that*/
   2348     /* points to the corresponding predicted buf with its stride.            */
   2349     /* Note that the pointer cannot be derived just from the id, since the   */
   2350     /* pointer may also point to the hpel buffer (in case we request interp  */
   2351     /* of a hpel pt, which already exists in the recon hpel planes)          */
   2352     /*************************************************************************/
   2353     U08 *pu1_final_out;
   2354     S32 i4_final_out_stride;
   2355     S32 part_id;
   2356     S32 check_for_duplicate = 0;
   2357 
   2358     subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_prms->ps_subpel_refine_ctxt;
   2359 
   2360     S32 mvx_qpel;
   2361     S32 mvy_qpel;
   2362 
   2363     pf_err_compute = hme_get_calc_sad_and_result_subpel_fxn(
   2364         ps_func_selector,
   2365         ps_me_optimised_function_list,
   2366         i4_part_mask,
   2367         ps_prms->i4_use_satd,
   2368         ps_subpel_refine_ctxt->i4_num_valid_parts,
   2369         ps_search_results->u1_num_results_per_part);
   2370 
   2371     i4_num_hpel_refine = ps_prms->i4_num_steps_hpel_refine;
   2372     i4_num_qpel_refine = ps_prms->i4_num_steps_qpel_refine;
   2373 
   2374     /* Prediction contet should now deal with qpel units */
   2375     HME_SET_MVPRED_RES(ps_pred_ctxt, MV_RES_QPEL);
   2376 
   2377     /* Buffer allocation for subpel */
   2378     /* Current design is that there may be many partitions and different mvs */
   2379     /* that attempt subpel refinemnt. While there is possibility of overlap, the */
   2380     /* hashing to detect and avoid overlap may be very complex. So, currently,   */
   2381     /* the only thing done is to store the eventual predicted buffer with every  */
    /* ctb node that holds the result of the best subpel search */
   2383 
   2384     /* Compute the base pointer for input, interpolated buffers */
   2385     /* The base pointers point as follows: */
   2386     /* fx fy : 0, 0 :: fx, hy : 0, 0.5, hx, fy: 0.5, 0, hx, fy: 0.5, 0.5 */
   2387     /* To these, we need to add the offset of the current node */
   2388     i4_ref_stride = ps_curr_layer->i4_rec_stride;
   2389     i4_offset = x_off + (y_off * i4_ref_stride);
   2390     i1_ref_idx = ps_search_node->i1_ref_idx;
   2391 
   2392     apu1_hpel_ref[0] = ps_curr_layer->ppu1_list_rec_fxfy[i1_ref_idx] + i4_offset;
   2393     apu1_hpel_ref[1] = ps_curr_layer->ppu1_list_rec_hxfy[i1_ref_idx] + i4_offset;
   2394     apu1_hpel_ref[2] = ps_curr_layer->ppu1_list_rec_fxhy[i1_ref_idx] + i4_offset;
   2395     apu1_hpel_ref[3] = ps_curr_layer->ppu1_list_rec_hxhy[i1_ref_idx] + i4_offset;
   2396 
   2397     /* Initialize result params used for partition update */
   2398     s_result_prms.pf_mv_cost_compute = NULL;
   2399     s_result_prms.ps_search_results = ps_search_results;
   2400     s_result_prms.pi4_valid_part_ids = pi4_valid_part_ids;
   2401     s_result_prms.i1_ref_idx = ps_search_node->i1_ref_idx;
   2402     s_result_prms.u1_pred_lx = search_idx;
   2403     s_result_prms.i4_part_mask = i4_part_mask;
   2404     s_result_prms.ps_search_node_base = ps_search_node;
   2405     s_result_prms.pi4_sad_grid = &ai4_sad_grid[0];
   2406     s_result_prms.i4_grid_mask = 1;
   2407     s_result_prms.ps_search_node = &s_temp_search_node;
   2408     s_temp_search_node.i1_ref_idx = ps_search_node->i1_ref_idx;
   2409 
   2410     /* convert to hpel units */
   2411     i4_mv_x = ps_search_node->s_mv.i2_mvx >> 1;
   2412     i4_mv_y = ps_search_node->s_mv.i2_mvy >> 1;
   2413 
   2414     /* for first pt, we compute at all locations in the grid, 4 + 1 centre */
   2415     ps_range_prms = ps_prms->aps_mv_range_qpel[i1_ref_idx];
   2416     i4_grid_mask = (GRID_DIAMOND_ENABLE_ALL);
   2417     i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
   2418 
   2419     i4_min_cost = MAX_32BIT_VAL;
   2420     i4_min_sad = MAX_32BIT_VAL;
   2421 
   2422     /*************************************************************************/
   2423     /* Prepare the input params to SAD/SATD function. Note that input is     */
    /* passed from the calling function since it may be I (normal subpel     */
   2425     /* refinement) or 2I - P0 in case of bidirect subpel refinement.         */
   2426     /* Both cases are handled here.                                          */
   2427     /*************************************************************************/
   2428     s_err_prms.pu1_inp = (U08 *)ps_prms->pv_inp;
   2429     s_err_prms.i4_inp_stride = ps_prms->i4_inp_stride;
   2430     s_err_prms.i4_ref_stride = i4_ref_stride;
   2431     s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
   2432     s_err_prms.i4_grid_mask = 1;
   2433     s_err_prms.pi4_sad_grid = &ai4_sad_grid[0];
   2434     s_err_prms.i4_blk_wd = i4_blk_wd = gau1_blk_size_to_wd[e_blk_size];
   2435     s_err_prms.i4_blk_ht = i4_blk_ht = gau1_blk_size_to_ht[e_blk_size];
   2436 
   2437     s_result_prms.ps_subpel_refine_ctxt = ps_subpel_refine_ctxt;
   2438 
   2439     part_id = ps_search_node->u1_part_id;
   2440     for(i4_i = 0; i4_i < i4_num_hpel_refine; i4_i++)
   2441     {
   2442         e_min_id = PT_C;
   2443 
   2444         mvx_qpel = i4_mv_x << 1;
   2445         mvy_qpel = i4_mv_y << 1;
   2446 
   2447         /* Central pt */
   2448         if(i4_grid_mask & BIT_EN(PT_C))
   2449         {
   2450             //ps_search_node->i2_mv_x = (S16)i4_mv_x;
   2451             //ps_search_node->i2_mv_x = (S16)i4_mv_y;
   2452             /* central pt is i4_mv_x, i4_mv_y */
   2453             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
   2454                 ps_dedup_enabler, 1, mvx_qpel, mvy_qpel, check_for_duplicate);
   2455 
   2456             i4_frac_x = i4_mv_x & 1;
   2457             i4_frac_y = i4_mv_y & 1;
   2458             pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
   2459             s_err_prms.pu1_ref = pu1_ref + (i4_mv_x >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
   2460 
   2461             /* Update the mv's with the current candt motion vectors */
   2462             s_result_prms.i2_mv_x = mvx_qpel;
   2463             s_result_prms.i2_mv_y = mvy_qpel;
   2464             s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
   2465             s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
   2466 
   2467             pf_err_compute(&s_err_prms, &s_result_prms);
   2468 
   2469             i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
   2470             if(i4_tot_cost < i4_min_cost)
   2471             {
   2472                 i4_min_cost = i4_tot_cost;
   2473                 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
   2474                 e_min_id = PT_C;
   2475                 pu1_final_out = s_err_prms.pu1_ref;
   2476             }
   2477         }
   2478 
   2479         /* left pt */
   2480         if(i4_grid_mask & BIT_EN(PT_L))
   2481         {
   2482             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
   2483                 ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel, check_for_duplicate);
   2484 
   2485             if(!check_for_duplicate)
   2486             {
   2487                 /* search node mv is stored in qpel units */
   2488                 ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x - 1) << 1);
   2489                 ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
   2490                 /* central pt is i4_mv_x - 1, i4_mv_y */
   2491                 i4_frac_x = (i4_mv_x - 1) & 1;  // same as (x-1)&1
   2492                 i4_frac_y = i4_mv_y & 1;
   2493                 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
   2494                 s_err_prms.pu1_ref =
   2495                     pu1_ref + ((i4_mv_x - 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
   2496 
   2497                 /* Update the mv's with the current candt motion vectors */
   2498                 s_result_prms.i2_mv_x = mvx_qpel - 2;
   2499                 s_result_prms.i2_mv_y = mvy_qpel;
   2500                 s_temp_search_node.s_mv.i2_mvx = mvx_qpel - 2;
   2501                 s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
   2502 
   2503                 pf_err_compute(&s_err_prms, &s_result_prms);
   2504                 //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
   2505                 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
   2506                 if(i4_tot_cost < i4_min_cost)
   2507                 {
   2508                     i4_min_cost = i4_tot_cost;
   2509                     i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
   2510                     e_min_id = PT_L;
   2511                     pu1_final_out = s_err_prms.pu1_ref;
   2512                 }
   2513             }
   2514         }
   2515         /* top pt */
   2516         if(i4_grid_mask & BIT_EN(PT_T))
   2517         {
   2518             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
   2519                 ps_dedup_enabler, 1, mvx_qpel, mvy_qpel - 2, check_for_duplicate);
   2520 
   2521             if(!check_for_duplicate)
   2522             {
   2523                 /* search node mv is stored in qpel units */
   2524                 ps_search_node->s_mv.i2_mvx = (S16)(i4_mv_x << 1);
   2525                 ps_search_node->s_mv.i2_mvy = (S16)((i4_mv_y - 1) << 1);
   2526                 /* top pt is i4_mv_x, i4_mv_y - 1 */
   2527                 i4_frac_x = i4_mv_x & 1;
   2528                 i4_frac_y = (i4_mv_y - 1) & 1;
   2529                 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
   2530                 s_err_prms.pu1_ref =
   2531                     pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y - 1) >> 1) * i4_ref_stride);
   2532 
   2533                 /* Update the mv's with the current candt motion vectors */
   2534                 s_result_prms.i2_mv_x = mvx_qpel;
   2535                 s_result_prms.i2_mv_y = mvy_qpel - 2;
   2536                 s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
   2537                 s_temp_search_node.s_mv.i2_mvy = mvy_qpel - 2;
   2538 
   2539                 pf_err_compute(&s_err_prms, &s_result_prms);
   2540                 //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
   2541                 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
   2542                 if(i4_tot_cost < i4_min_cost)
   2543                 {
   2544                     i4_min_cost = i4_tot_cost;
   2545                     i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
   2546                     e_min_id = PT_T;
   2547                     pu1_final_out = s_err_prms.pu1_ref;
   2548                 }
   2549             }
   2550         }
   2551         /* right pt */
   2552         if(i4_grid_mask & BIT_EN(PT_R))
   2553         {
   2554             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
   2555                 ps_dedup_enabler, num_unique_nodes, mvx_qpel + 2, mvy_qpel, check_for_duplicate);
   2556             if(!check_for_duplicate)
   2557             {
   2558                 /* search node mv is stored in qpel units */
   2559                 ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x + 1) << 1);
   2560                 ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
   2561                 /* right pt is i4_mv_x + 1, i4_mv_y */
   2562                 i4_frac_x = (i4_mv_x + 1) & 1;
   2563                 i4_frac_y = i4_mv_y & 1;
   2564 
   2565                 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
   2566                 s_err_prms.pu1_ref =
   2567                     pu1_ref + ((i4_mv_x + 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
   2568 
   2569                 /* Update the mv's with the current candt motion vectors */
   2570                 s_result_prms.i2_mv_x = mvx_qpel + 2;
   2571                 s_result_prms.i2_mv_y = mvy_qpel;
   2572                 s_temp_search_node.s_mv.i2_mvx = mvx_qpel + 2;
   2573                 s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
   2574 
   2575                 pf_err_compute(&s_err_prms, &s_result_prms);
   2576                 //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
   2577                 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
   2578                 if(i4_tot_cost < i4_min_cost)
   2579                 {
   2580                     i4_min_cost = i4_tot_cost;
   2581                     i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
   2582                     e_min_id = PT_R;
   2583                     pu1_final_out = s_err_prms.pu1_ref;
   2584                 }
   2585             }
   2586         }
   2587         /* bottom pt */
   2588         if(i4_grid_mask & BIT_EN(PT_B))
   2589         {
   2590             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
   2591                 ps_dedup_enabler, num_unique_nodes, mvx_qpel, mvy_qpel + 2, check_for_duplicate);
   2592             if(!check_for_duplicate)
   2593             {
   2594                 /* search node mv is stored in qpel units */
   2595                 ps_search_node->s_mv.i2_mvx = ((S16)i4_mv_x << 1);
   2596                 ps_search_node->s_mv.i2_mvy = ((S16)(i4_mv_y + 1) << 1);
   2597                 i4_frac_x = i4_mv_x & 1;
   2598                 i4_frac_y = (i4_mv_y + 1) & 1;
   2599                 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
   2600                 s_err_prms.pu1_ref =
   2601                     pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y + 1) >> 1) * i4_ref_stride);
   2602 
   2603                 /* Update the mv's with the current candt motion vectors */
   2604                 s_result_prms.i2_mv_x = mvx_qpel;
   2605                 s_result_prms.i2_mv_y = mvy_qpel + 2;
   2606                 s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
   2607                 s_temp_search_node.s_mv.i2_mvy = mvy_qpel + 2;
   2608 
   2609                 pf_err_compute(&s_err_prms, &s_result_prms);
   2610                 //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
   2611                 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
   2612                 if(i4_tot_cost < i4_min_cost)
   2613                 {
   2614                     i4_min_cost = i4_tot_cost;
   2615                     i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
   2616                     e_min_id = PT_B;
   2617                     pu1_final_out = s_err_prms.pu1_ref;
   2618                 }
   2619             }
   2620         }
   2621         /* Early exit in case of central point */
   2622         if(e_min_id == PT_C)
   2623             break;
   2624 
   2625         /*********************************************************************/
   2626         /* Depending on the best result location, we may be able to skip     */
   2627         /* atleast two pts, centre pt and one more pt. E.g. if right pt is   */
   2628         /* the best result, the next iteration need not do centre, left pts  */
   2629         /*********************************************************************/
   2630         i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
   2631         i4_mv_x += gai1_grid_id_to_x[e_min_id];
   2632         i4_mv_y += gai1_grid_id_to_y[e_min_id];
   2633         ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
   2634         ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
   2635         i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
   2636     }
   2637 
   2638     /* Convert to QPEL units */
   2639     i4_mv_x <<= 1;
   2640     i4_mv_y <<= 1;
   2641 
   2642     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
   2643     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
   2644 
   2645     /* Exact interpolation or averaging chosen here */
   2646     pf_qpel_interp = ps_prms->pf_qpel_interp;
   2647 
   2648     /* Next QPEL ME */
   2649     /* In this case, we have option of doing exact QPEL interpolation or avg */
   2650     /*************************************************************************/
   2651     /*        x                                                              */
   2652     /*    A b C d                                                            */
   2653     /*    e f g h                                                            */
   2654     /*    I j K l                                                            */
   2655     /*    m n o p                                                            */
   2656     /*    Q r S t                                                            */
   2657     /*                                                                       */
   2658     /*    Approximate QPEL logic                                             */
   2659     /*    b = avg(A,C) f = avg(I,C), g= avg(C,K) j=avg(I,K)                  */
   2660     /*    for any given pt, we can get all the information required about    */
   2661     /*    the surrounding 4 pts. For example, given point C (0.5, 0)         */
   2662     /*     surrounding pts info:                                             */
   2663     /*     b : qpel offset: 1, 0, generated by averaging. buffer1: fpel buf  */
   2664     /*           buffer 2: hxfy, offsets for both are 0, 0                   */
   2665     /*    similarly for other pts the info can be gotten                     */
   2666     /*************************************************************************/
   2667     i4_grid_mask = GRID_DIAMOND_ENABLE_ALL ^ (BIT_EN(PT_C));
   2668     i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
   2669 
   2670     /*************************************************************************/
   2671     /* One time preparation of non changing interpolation params. These      */
   2672     /* include a set of ping pong result buf ptrs, input buf ptrs and some   */
   2673     /* working memory (not used though in case of averaging).                */
   2674     /*************************************************************************/
   2675     s_interp_prms.ppu1_ref = &apu1_hpel_ref[0];
   2676     s_interp_prms.i4_ref_stride = i4_ref_stride;
   2677     s_interp_prms.i4_blk_wd = i4_blk_wd;
   2678     s_interp_prms.i4_blk_ht = i4_blk_ht;
   2679 
   2680     i4_final_out_stride = i4_ref_stride;
   2681 
   2682     {
   2683         U08 *pu1_mem;
   2684         /*********************************************************************/
   2685         /* Allocation of working memory for interpolated buffers. We maintain*/
   2686         /* an intermediate working buffer, and 2 ping pong interpolated out  */
   2687         /* buffers, purpose of ping pong explained later below               */
   2688         /*********************************************************************/
   2689         pu1_mem = ps_prms->pu1_wkg_mem;
   2690         s_interp_prms.pu1_wkg_mem = pu1_mem;
   2691 
   2692         //pu1_mem += (INTERP_INTERMED_BUF_SIZE);
   2693         s_interp_prms.apu1_interp_out[0] = pu1_mem;
   2694 
   2695         pu1_mem += (INTERP_OUT_BUF_SIZE);
   2696         s_interp_prms.apu1_interp_out[1] = pu1_mem;
   2697 
   2698         pu1_mem += (INTERP_OUT_BUF_SIZE);
   2699         s_interp_prms.apu1_interp_out[2] = pu1_mem;
   2700 
   2701         pu1_mem += (INTERP_OUT_BUF_SIZE);
   2702         s_interp_prms.apu1_interp_out[3] = pu1_mem;
   2703 
   2704         pu1_mem += (INTERP_OUT_BUF_SIZE);
   2705         s_interp_prms.apu1_interp_out[4] = pu1_mem;
   2706 
   2707         /*********************************************************************/
   2708         /* Stride of interpolated output is just a function of blk width of  */
   2709         /* this partition and hence remains constant for this partition      */
   2710         /*********************************************************************/
   2711         s_interp_prms.i4_out_stride = (i4_blk_wd);
   2712     }
   2713 
   2714     {
   2715         UWORD8 *apu1_final[4];
   2716         WORD32 ai4_ref_stride[4];
   2717         /*************************************************************************/
   2718         /* Ping pong design for interpolated buffers. We use a min id, which     */
   2719         /* tracks the id of the ppu1_interp_out that stores the best result.     */
    /* When new interp to be done, it uses 1 - best result id to do the interp*/
   2721         /* min id is toggled when any new result becomes the best result.        */
   2722         /*************************************************************************/
   2723 
   2724         for(i4_i = 0; i4_i < i4_num_qpel_refine; i4_i++)
   2725         {
   2726             e_min_id = PT_C;
   2727 
   2728             mvx_qpel = i4_mv_x;
   2729             mvy_qpel = i4_mv_y;
   2730             hme_qpel_interp_comprehensive(
   2731                 &s_interp_prms,
   2732                 apu1_final,
   2733                 ai4_ref_stride,
   2734                 i4_mv_x,
   2735                 i4_mv_y,
   2736                 i4_grid_mask,
   2737                 ps_me_optimised_function_list);
   2738             if(i4_grid_mask & BIT_EN(PT_L))
   2739             {
   2740                 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
   2741                     ps_dedup_enabler,
   2742                     num_unique_nodes,
   2743                     mvx_qpel - 1,
   2744                     mvy_qpel - 0,
   2745                     check_for_duplicate);
   2746 
   2747                 if(!check_for_duplicate)
   2748                 {
   2749                     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
   2750                     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
   2751 
   2752                     s_err_prms.pu1_ref = apu1_final[0];
   2753                     s_err_prms.i4_ref_stride = ai4_ref_stride[0];
   2754 
   2755                     /* Update the mv's with the current candt motion vectors */
   2756                     s_result_prms.i2_mv_x = mvx_qpel - 1;
   2757                     s_result_prms.i2_mv_y = mvy_qpel;
   2758                     s_temp_search_node.s_mv.i2_mvx = mvx_qpel - 1;
   2759                     s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
   2760 
   2761                     pf_err_compute(&s_err_prms, &s_result_prms);
   2762                     //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
   2763 
   2764                     i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
   2765                     if(i4_tot_cost < i4_min_cost)
   2766                     {
   2767                         e_min_id = PT_L;
   2768                         i4_min_cost = i4_tot_cost;
   2769                         i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
   2770                     }
   2771                 }
   2772             }
   2773             if(i4_grid_mask & BIT_EN(PT_T))
   2774             {
   2775                 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
   2776                     ps_dedup_enabler,
   2777                     num_unique_nodes,
   2778                     mvx_qpel - 0,
   2779                     mvy_qpel - 1,
   2780                     check_for_duplicate);
   2781 
   2782                 if(!check_for_duplicate)
   2783                 {
   2784                     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
   2785                     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
   2786 
   2787                     s_err_prms.pu1_ref = apu1_final[1];
   2788                     s_err_prms.i4_ref_stride = ai4_ref_stride[1];
   2789 
   2790                     /* Update the mv's with the current candt motion vectors */
   2791                     s_result_prms.i2_mv_x = mvx_qpel;
   2792                     s_result_prms.i2_mv_y = mvy_qpel - 1;
   2793 
   2794                     s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
   2795                     s_temp_search_node.s_mv.i2_mvy = mvy_qpel - 1;
   2796 
   2797                     pf_err_compute(&s_err_prms, &s_result_prms);
   2798 
   2799                     //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
   2800                     i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
   2801                     if(i4_tot_cost < i4_min_cost)
   2802                     {
   2803                         e_min_id = PT_T;
   2804                         i4_min_cost = i4_tot_cost;
   2805                         i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
   2806                     }
   2807                 }
   2808             }
   2809             if(i4_grid_mask & BIT_EN(PT_R))
   2810             {
   2811                 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
   2812                     ps_dedup_enabler, num_unique_nodes, mvx_qpel + 1, mvy_qpel, check_for_duplicate);
   2813 
   2814                 if(!check_for_duplicate)
   2815                 {
   2816                     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
   2817                     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
   2818 
   2819                     s_err_prms.pu1_ref = apu1_final[2];
   2820                     s_err_prms.i4_ref_stride = ai4_ref_stride[2];
   2821 
   2822                     /* Update the mv's with the current candt motion vectors */
   2823                     s_result_prms.i2_mv_x = mvx_qpel + 1;
   2824                     s_result_prms.i2_mv_y = mvy_qpel;
   2825 
   2826                     s_temp_search_node.s_mv.i2_mvx = mvx_qpel + 1;
   2827                     s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
   2828 
   2829                     pf_err_compute(&s_err_prms, &s_result_prms);
   2830 
   2831                     //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
   2832 
   2833                     i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
   2834                     if(i4_tot_cost < i4_min_cost)
   2835                     {
   2836                         e_min_id = PT_R;
   2837                         i4_min_cost = i4_tot_cost;
   2838                         i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
   2839                     }
   2840                 }
   2841             }
   2842             /* i4_mv_x and i4_mv_y will always be the centre pt */
   2843             /* for qpel we  start with least hpel, and hence compute of center pt never reqd */
   2844             if(i4_grid_mask & BIT_EN(PT_B))
   2845             {
   2846                 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
   2847                     ps_dedup_enabler, num_unique_nodes, mvx_qpel, mvy_qpel + 1, check_for_duplicate);
   2848 
   2849                 if(!check_for_duplicate)
   2850                 {
   2851                     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
   2852                     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
   2853 
   2854                     s_err_prms.pu1_ref = apu1_final[3];
   2855                     s_err_prms.i4_ref_stride = ai4_ref_stride[3];
   2856 
   2857                     /* Update the mv's with the current candt motion vectors */
   2858                     s_result_prms.i2_mv_x = mvx_qpel;
   2859                     s_result_prms.i2_mv_y = mvy_qpel + 1;
   2860 
   2861                     s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
   2862                     s_temp_search_node.s_mv.i2_mvy = mvy_qpel + 1;
   2863 
   2864                     pf_err_compute(&s_err_prms, &s_result_prms);
   2865 
   2866                     //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
   2867                     i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
   2868                     if(i4_tot_cost < i4_min_cost)
   2869                     {
   2870                         e_min_id = PT_B;
   2871                         i4_min_cost = i4_tot_cost;
   2872                         i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
   2873                     }
   2874                 }
   2875             }
   2876 
   2877             /* New QPEL mv x and y */
   2878             if(e_min_id == PT_C)
   2879                 break;
   2880             i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
   2881             i4_mv_x += gai1_grid_id_to_x[e_min_id];
   2882             i4_mv_y += gai1_grid_id_to_y[e_min_id];
   2883             ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
   2884             ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
   2885             i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
   2886         }
   2887     }
   2888 
   2889     /* update modified motion vectors and cost at end of subpel */
   2890     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
   2891     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
   2892     ps_search_node->i4_tot_cost = i4_min_cost;
   2893     ps_search_node->i4_sad = i4_min_sad;
   2894 
   2895     /********************************************************************************/
   2896     /* TODO: Restoring back Sad lambda from Hadamard lambda                         */
   2897     /* Need to pass the had/satd lambda in more cleaner way for subpel cost compute */
   2898     /********************************************************************************/
   2899     //ps_pred_ctxt->lambda >>= 1;
   2900 
   2901     return (i4_min_cost);
   2902 }
   2903 #elif DIAMOND_GRID == 0
   2904 S32 hme_subpel_refine_search_node_high_speed(
   2905     search_node_t *ps_search_node,
   2906     hme_subpel_prms_t *ps_prms,
   2907     layer_ctxt_t *ps_curr_layer,
   2908     BLK_SIZE_T e_blk_size,
   2909     S32 x_off,
   2910     S32 y_off,
   2911     search_results_t *ps_search_results,
   2912     S32 pred_lx,
   2913     S32 i4_part_mask,
   2914     S32 *pi4_valid_part_ids,
   2915     S32 search_idx,
   2916     subpel_dedup_enabler_t *ps_dedup_enabler,
   2917     me_func_selector_t *ps_func_selector)
   2918 {
   2919     S32 i4_num_hpel_refine, i4_num_qpel_refine;
   2920     S32 i4_offset, i4_grid_mask;
   2921     S08 i1_ref_idx;
   2922     S32 i4_blk_wd, i4_blk_ht;
   2923     S32 i4_ref_stride, i4_i;
   2924     pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
   2925     result_upd_prms_t s_result_prms;
   2926 
   2927     /*************************************************************************/
   2928     /* Tracks current MV with the fractional component.                      */
   2929     /*************************************************************************/
   2930     S32 i4_mv_x, i4_mv_y;
   2931     S32 i4_frac_x, i4_frac_y;
   2932 
   2933     /*************************************************************************/
   2934     /* Function pointer for SAD/SATD, array and prms structure to pass to    */
   2935     /* This function                                                         */
   2936     /*************************************************************************/
   2937     PF_SAD_FXN_T pf_err_compute;
   2938     S32 ai4_sad_grid[9][17], i4_tot_cost;
   2939     err_prms_t s_err_prms;
   2940 
   2941     /*************************************************************************/
   2942     /* Allowed MV RANGE                                                      */
   2943     /*************************************************************************/
   2944     range_prms_t *ps_range_prms;
   2945 
   2946     /*************************************************************************/
   2947     /* stores min id in grid with associated min cost.                       */
   2948     /*************************************************************************/
   2949     S32 i4_min_cost, i4_min_sad;
   2950     GRID_PT_T e_min_id;
   2951 
   2952     PF_INTERP_FXN_T pf_qpel_interp;
   2953     /*************************************************************************/
   2954     /* For hpel and qpel we move in diamonds and hence each point in the     */
   2955     /* diamond will belong to a completely different plane. To simplify the  */
   2956     /* look up of the ref ptr, we declare a 2x2 array of ref ptrs for the    */
   2957     /* hpel planes which are interpolated during recon.                      */
   2958     /*************************************************************************/
   2959     U08 *apu1_hpel_ref[4], *pu1_ref;
   2960 
   2961     interp_prms_t s_interp_prms;
   2962 
   2963     /*************************************************************************/
   2964     /* Maintains the minimum id of interpolated buffers, and the pointer that*/
   2965     /* points to the corresponding predicted buf with its stride.            */
   2966     /* Note that the pointer cannot be derived just from the id, since the   */
   2967     /* pointer may also point to the hpel buffer (in case we request interp  */
   2968     /* of a hpel pt, which already exists in the recon hpel planes)          */
   2969     /*************************************************************************/
   2970     U08 *pu1_final_out;
   2971     S32 i4_final_out_stride;
   2972     S32 part_id;
   2973     S32 check_for_duplicate = 0;
   2974 
   2975     S32 mvx_qpel;
   2976     S32 mvy_qpel;
   2977 
   2978     /*************************************************************************/
   2979     /* Appropriate Err compute fxn, depends on SAD/SATD, blk size and remains*/
   2980     /* fixed through this subpel refinement for this partition.              */
   2981     /* Note, we do not enable grid sads since each pt is different buffers.  */
   2982     /* Hence, part mask is also nearly dont care and we use 2Nx2N enabled.   */
   2983     /*************************************************************************/
   2984     if(ps_prms->i4_use_satd)
   2985     {
   2986         pf_err_compute = hme_evalsatd_update_1_best_result_pt_pu_16x16;
   2987     }
   2988     else
   2989     {
   2990         pf_err_compute = hme_evalsad_grid_pu_16x16; /* hme_evalsad_pt_pu_16x16; */
   2991     }
   2992 
   2993     i4_num_hpel_refine = ps_prms->i4_num_steps_hpel_refine;
   2994     i4_num_qpel_refine = ps_prms->i4_num_steps_qpel_refine;
   2995 
    /* Prediction context should now deal with qpel units */
   2997     HME_SET_MVPRED_RES(ps_pred_ctxt, MV_RES_QPEL);
   2998 
   2999     /* Buffer allocation for subpel */
   3000     /* Current design is that there may be many partitions and different mvs */
    /* that attempt subpel refinement. While there is possibility of overlap, the */
   3002     /* hashing to detect and avoid overlap may be very complex. So, currently,   */
   3003     /* the only thing done is to store the eventual predicted buffer with every  */
    /* ctb node that holds the result of the best subpel search */
   3005 
   3006     /* Compute the base pointer for input, interpolated buffers */
   3007     /* The base pointers point as follows:
   3008     /* fx fy : 0, 0 :: fx, hy : 0, 0.5, hx, fy: 0.5, 0, hx, fy: 0.5, 0.5 */
   3009     /* To these, we need to add the offset of the current node */
   3010     i4_ref_stride = ps_curr_layer->i4_rec_stride;
   3011     i4_offset = x_off + (y_off * i4_ref_stride);
   3012     i1_ref_idx = ps_search_node->i1_ref_idx;
   3013 
   3014     apu1_hpel_ref[0] = ps_curr_layer->ppu1_list_rec_fxfy[i1_ref_idx] + i4_offset;
   3015     apu1_hpel_ref[1] = ps_curr_layer->ppu1_list_rec_hxfy[i1_ref_idx] + i4_offset;
   3016     apu1_hpel_ref[2] = ps_curr_layer->ppu1_list_rec_fxhy[i1_ref_idx] + i4_offset;
   3017     apu1_hpel_ref[3] = ps_curr_layer->ppu1_list_rec_hxhy[i1_ref_idx] + i4_offset;
   3018 
   3019     /* Initialize result params used for partition update */
   3020     s_result_prms.pf_mv_cost_compute = NULL;
   3021     s_result_prms.ps_search_results = ps_search_results;
   3022     s_result_prms.pi4_valid_part_ids = pi4_valid_part_ids;
   3023     s_result_prms.i1_ref_idx = search_idx;
   3024     s_result_prms.i4_part_mask = i4_part_mask;
   3025     s_result_prms.ps_search_node_base = ps_search_node;
   3026     s_result_prms.pi4_sad_grid = &ai4_sad_grid[0][0];
   3027     s_result_prms.i4_grid_mask = 1;
   3028 
   3029     /* convert to hpel units */
   3030     i4_mv_x = ps_search_node->s_mv.i2_mvx >> 1;
   3031     i4_mv_y = ps_search_node->s_mv.i2_mvy >> 1;
   3032 
   3033     /* for first pt, we compute at all locations in the grid, 4 + 1 centre */
   3034     ps_range_prms = ps_prms->ps_mv_range_qpel;
   3035     i4_grid_mask = (GRID_ALL_PTS_VALID);
   3036     i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
   3037 
   3038     i4_min_cost = MAX_32BIT_VAL;
   3039     i4_min_sad = MAX_32BIT_VAL;
   3040 
   3041     /*************************************************************************/
   3042     /* Prepare the input params to SAD/SATD function. Note that input is     */
    /* passed from the calling function since it may be I (normal subpel     */
   3044     /* refinement) or 2I - P0 in case of bidirect subpel refinement.         */
   3045     /* Both cases are handled here.                                          */
   3046     /*************************************************************************/
   3047     s_err_prms.pu1_inp = (U08 *)ps_prms->pv_inp;
   3048     s_err_prms.i4_inp_stride = ps_prms->i4_inp_stride;
   3049     s_err_prms.i4_ref_stride = i4_ref_stride;
   3050     s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
   3051     s_err_prms.i4_grid_mask = 1;
   3052     s_err_prms.pi4_sad_grid = &ai4_sad_grid[0][0];
   3053     s_err_prms.i4_blk_wd = i4_blk_wd = gau1_blk_size_to_wd[e_blk_size];
   3054     s_err_prms.i4_blk_ht = i4_blk_ht = gau1_blk_size_to_ht[e_blk_size];
   3055 
   3056     /* TODO: Currently doubling lambda for Hadamard Sad instead of 1.9*sadlambda */
   3057     //ps_pred_ctxt->lambda <<= 1;
   3058     part_id = ps_search_node->u1_part_id;
   3059     for(i4_i = 0; i4_i < i4_num_hpel_refine; i4_i++)
   3060     {
   3061         e_min_id = PT_C;
   3062 
   3063         mvx_qpel = i4_mv_x << 1;
   3064         mvy_qpel = i4_mv_y << 1;
   3065 
   3066         /* Central pt */
   3067         if(i4_grid_mask & BIT_EN(PT_C))
   3068         {
   3069             //ps_search_node->i2_mv_x = (S16)i4_mv_x;
   3070             //ps_search_node->i2_mv_x = (S16)i4_mv_y;
   3071             /* central pt is i4_mv_x, i4_mv_y */
   3072             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
   3073                 ps_dedup_enabler, 1, mvx_qpel, mvy_qpel, check_for_duplicate);
   3074 
   3075             i4_frac_x = i4_mv_x & 1;
   3076             i4_frac_y = i4_mv_y & 1;
   3077             pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
   3078             s_err_prms.pu1_ref = pu1_ref + (i4_mv_x >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
   3079             pf_err_compute(&s_err_prms);
   3080             /* Update the mv's with the current candt motion vectors */
   3081             s_result_prms.i2_mv_x = mvx_qpel;
   3082             s_result_prms.i2_mv_y = mvy_qpel;
   3083             hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
   3084             i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
   3085             if(i4_tot_cost < i4_min_cost)
   3086             {
   3087                 i4_min_cost = i4_tot_cost;
   3088                 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
   3089                 e_min_id = PT_C;
   3090                 pu1_final_out = s_err_prms.pu1_ref;
   3091             }
   3092         }
   3093 
   3094         /* left pt */
   3095         if(i4_grid_mask & BIT_EN(PT_L))
   3096         {
   3097             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
   3098                 ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel, check_for_duplicate);
   3099 
   3100             if(!check_for_duplicate)
   3101             {
   3102                 /* search node mv is stored in qpel units */
   3103                 ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x - 1) << 1);
   3104                 ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
   3105                 /* central pt is i4_mv_x - 1, i4_mv_y */
   3106                 i4_frac_x = (i4_mv_x - 1) & 1;  // same as (x-1)&1
   3107                 i4_frac_y = i4_mv_y & 1;
   3108                 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
   3109                 s_err_prms.pu1_ref =
   3110                     pu1_ref + ((i4_mv_x - 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
   3111 
   3112                 pf_err_compute(&s_err_prms);
   3113                 /* Update the mv's with the current candt motion vectors */
   3114                 s_result_prms.i2_mv_x = mvx_qpel;
   3115                 s_result_prms.i2_mv_y = mvy_qpel;
   3116                 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
   3117 
   3118                 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
   3119 
   3120                 if(i4_tot_cost < i4_min_cost)
   3121                 {
   3122                     i4_min_cost = i4_tot_cost;
   3123                     i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
   3124                     e_min_id = PT_L;
   3125                     pu1_final_out = s_err_prms.pu1_ref;
   3126                 }
   3127             }
   3128         }
   3129         /* top pt */
   3130         if(i4_grid_mask & BIT_EN(PT_T))
   3131         {
   3132             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
   3133                 ps_dedup_enabler, 1, mvx_qpel, mvy_qpel - 2, check_for_duplicate);
   3134 
   3135             if(!check_for_duplicate)
   3136             {
   3137                 /* search node mv is stored in qpel units */
   3138                 ps_search_node->s_mv.i2_mvx = (S16)(i4_mv_x << 1);
   3139                 ps_search_node->s_mv.i2_mvy = (S16)((i4_mv_y - 1) << 1);
   3140                 /* top pt is i4_mv_x, i4_mv_y - 1 */
   3141                 i4_frac_x = i4_mv_x & 1;
   3142                 i4_frac_y = (i4_mv_y - 1) & 1;
   3143                 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
   3144                 s_err_prms.pu1_ref =
   3145                     pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y - 1) >> 1) * i4_ref_stride);
   3146                 pf_err_compute(&s_err_prms);
   3147                 /* Update the mv's with the current candt motion vectors */
   3148                 s_result_prms.i2_mv_x = mvx_qpel;
   3149                 s_result_prms.i2_mv_y = mvy_qpel - 2;
   3150                 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
   3151 
   3152                 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
   3153 
   3154                 if(i4_tot_cost < i4_min_cost)
   3155                 {
   3156                     i4_min_cost = i4_tot_cost;
   3157                     i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
   3158                     e_min_id = PT_T;
   3159                     pu1_final_out = s_err_prms.pu1_ref;
   3160                 }
   3161             }
   3162         }
   3163         /* right pt */
   3164         if(i4_grid_mask & BIT_EN(PT_R))
   3165         {
   3166             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
   3167                 ps_dedup_enabler, 1, mvx_qpel + 2, mvy_qpel, check_for_duplicate);
   3168 
   3169             if(!check_for_duplicate)
   3170             {
   3171                 /* search node mv is stored in qpel units */
   3172                 ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x + 1) << 1);
   3173                 ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
   3174                 /* right pt is i4_mv_x + 1, i4_mv_y */
   3175                 i4_frac_x = (i4_mv_x + 1) & 1;
   3176                 i4_frac_y = i4_mv_y & 1;
   3177 
   3178                 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
   3179                 s_err_prms.pu1_ref =
   3180                     pu1_ref + ((i4_mv_x + 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
   3181                 pf_err_compute(&s_err_prms);
   3182                 /* Update the mv's with the current candt motion vectors */
   3183                 s_result_prms.i2_mv_x = mvx_qpel + 2;
   3184                 s_result_prms.i2_mv_y = mvy_qpel;
   3185                 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
   3186 
   3187                 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
   3188 
   3189                 if(i4_tot_cost < i4_min_cost)
   3190                 {
   3191                     i4_min_cost = i4_tot_cost;
   3192                     i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
   3193                     e_min_id = PT_R;
   3194                     pu1_final_out = s_err_prms.pu1_ref;
   3195                 }
   3196             }
   3197         }
   3198         /* bottom pt */
   3199         if(i4_grid_mask & BIT_EN(PT_B))
   3200         {
   3201             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
   3202                 ps_dedup_enabler, 1, mvx_qpel, mvy_qpel + 2, check_for_duplicate);
   3203 
   3204             if(!check_for_duplicate)
   3205             {
   3206                 /* search node mv is stored in qpel units */
   3207                 ps_search_node->s_mv.i2_mvx = ((S16)i4_mv_x << 1);
   3208                 ps_search_node->s_mv.i2_mvy = ((S16)(i4_mv_y + 1) << 1);
   3209                 i4_frac_x = i4_mv_x & 1;
   3210                 i4_frac_y = (i4_mv_y + 1) & 1;
   3211                 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
   3212                 s_err_prms.pu1_ref =
   3213                     pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y + 1) >> 1) * i4_ref_stride);
   3214 
   3215                 pf_err_compute(&s_err_prms);
   3216                 /* Update the mv's with the current candt motion vectors */
   3217                 s_result_prms.i2_mv_x = mvx_qpel;
   3218                 s_result_prms.i2_mv_y = mvy_qpel + 2;
   3219                 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
   3220 
   3221                 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
   3222 
   3223                 if(i4_tot_cost < i4_min_cost)
   3224                 {
   3225                     i4_min_cost = i4_tot_cost;
   3226                     i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
   3227                     e_min_id = PT_B;
   3228                     pu1_final_out = s_err_prms.pu1_ref;
   3229                 }
   3230             }
   3231         }
   3232         if(e_min_id == PT_C)
   3233         {
   3234             if(!i4_i)
   3235             {
   3236                 /* TL pt */
   3237                 if(i4_grid_mask & BIT_EN(PT_TL))
   3238                 {
   3239                     S32 mvx_minus_1 = (i4_mv_x - 1);
   3240                     S32 mvy_minus_1 = (i4_mv_y - 1);
   3241 
   3242                     CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
   3243                         ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel - 2, check_for_duplicate);
   3244 
   3245                     if(!check_for_duplicate)
   3246                     {
   3247                         /* search node mv is stored in qpel units */
   3248                         ps_search_node->s_mv.i2_mvx = ((S16)mvx_minus_1 << 1);
   3249                         ps_search_node->s_mv.i2_mvy = ((S16)mvy_minus_1 << 1);
   3250                         i4_frac_x = mvx_minus_1 & 1;
   3251                         i4_frac_y = mvy_minus_1 & 1;
   3252                         pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
   3253                         s_err_prms.pu1_ref =
   3254                             pu1_ref + (mvx_minus_1 >> 1) + ((mvy_minus_1 >> 1) * i4_ref_stride);
   3255 
   3256                         pf_err_compute(&s_err_prms);
   3257                         /* Update the mv's with the current candt motion vectors */
   3258                         s_result_prms.i2_mv_x = mvx_qpel - 2;
   3259                         s_result_prms.i2_mv_y = mvy_qpel - 2;
   3260                         hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
   3261 
   3262                         i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
   3263 
   3264                         if(i4_tot_cost < i4_min_cost)
   3265                         {
   3266                             i4_min_cost = i4_tot_cost;
   3267                             i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
   3268                             e_min_id = PT_TL;
   3269                             pu1_final_out = s_err_prms.pu1_ref;
   3270                         }
   3271                     }
   3272                 }
   3273                 /* TR pt */
   3274                 if(i4_grid_mask & BIT_EN(PT_TR))
   3275                 {
   3276                     S32 mvx_plus_1 = (i4_mv_x + 1);
   3277                     S32 mvy_minus_1 = (i4_mv_y - 1);
   3278 
   3279                     CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
   3280                         ps_dedup_enabler, 1, mvx_qpel + 2, mvy_qpel - 2, check_for_duplicate);
   3281 
   3282                     if(!check_for_duplicate)
   3283                     {
   3284                         /* search node mv is stored in qpel units */
   3285                         ps_search_node->s_mv.i2_mvx = ((S16)mvx_plus_1 << 1);
   3286                         ps_search_node->s_mv.i2_mvy = ((S16)mvy_minus_1 << 1);
   3287                         i4_frac_x = mvx_plus_1 & 1;
   3288                         i4_frac_y = mvy_minus_1 & 1;
   3289                         pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
   3290                         s_err_prms.pu1_ref =
   3291                             pu1_ref + (mvx_plus_1 >> 1) + ((mvy_minus_1 >> 1) * i4_ref_stride);
   3292 
   3293                         pf_err_compute(&s_err_prms);
   3294                         /* Update the mv's with the current candt motion vectors */
   3295                         s_result_prms.i2_mv_x = mvx_qpel + 2;
   3296                         s_result_prms.i2_mv_y = mvy_qpel - 2;
   3297                         hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
   3298 
   3299                         i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
   3300 
   3301                         if(i4_tot_cost < i4_min_cost)
   3302                         {
   3303                             i4_min_cost = i4_tot_cost;
   3304                             i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
   3305                             e_min_id = PT_TR;
   3306                             pu1_final_out = s_err_prms.pu1_ref;
   3307                         }
   3308                     }
   3309                 }
   3310                 /* BL pt */
   3311                 if(i4_grid_mask & BIT_EN(PT_BL))
   3312                 {
   3313                     S32 mvx_minus_1 = (i4_mv_x - 1);
   3314                     S32 mvy_plus_1 = (i4_mv_y + 1);
   3315 
   3316                     CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
   3317                         ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel + 2, check_for_duplicate);
   3318 
   3319                     if(!check_for_duplicate)
   3320                     {
   3321                         /* search node mv is stored in qpel units */
   3322                         ps_search_node->s_mv.i2_mvx = ((S16)mvx_minus_1 << 1);
   3323                         ps_search_node->s_mv.i2_mvy = ((S16)mvy_plus_1 << 1);
   3324                         i4_frac_x = mvx_minus_1 & 1;
   3325                         i4_frac_y = mvy_plus_1 & 1;
   3326                         pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
   3327                         s_err_prms.pu1_ref =
   3328                             pu1_ref + (mvx_minus_1 >> 1) + ((mvy_plus_1 >> 1) * i4_ref_stride);
   3329 
   3330                         pf_err_compute(&s_err_prms);
   3331                         /* Update the mv's with the current candt motion vectors */
   3332                         s_result_prms.i2_mv_x = mvx_qpel - 2;
   3333                         s_result_prms.i2_mv_y = mvy_qpel + 2;
   3334                         hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
   3335 
   3336                         i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
   3337 
   3338                         if(i4_tot_cost < i4_min_cost)
   3339                         {
   3340                             i4_min_cost = i4_tot_cost;
   3341                             i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
   3342                             e_min_id = PT_BL;
   3343                             pu1_final_out = s_err_prms.pu1_ref;
   3344                         }
   3345                     }
   3346                 }
   3347                 /* BR pt */
   3348                 if(i4_grid_mask & BIT_EN(PT_BR))
   3349                 {
   3350                     S32 mvx_plus_1 = (i4_mv_x + 1);
   3351                     S32 mvy_plus_1 = (i4_mv_y + 1);
   3352                     CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
   3353                         ps_dedup_enabler, 1, mvx_qpel + 2, mvy_qpel + 2, check_for_duplicate);
   3354 
   3355                     if(!check_for_duplicate)
   3356                     {
   3357                         /* search node mv is stored in qpel units */
   3358                         ps_search_node->s_mv.i2_mvx = ((S16)mvx_plus_1 << 1);
   3359                         ps_search_node->s_mv.i2_mvy = ((S16)mvy_plus_1 << 1);
   3360                         i4_frac_x = mvx_plus_1 & 1;
   3361                         i4_frac_y = mvy_plus_1 & 1;
   3362                         pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
   3363                         s_err_prms.pu1_ref =
   3364                             pu1_ref + (mvx_plus_1 >> 1) + ((mvy_plus_1 >> 1) * i4_ref_stride);
   3365 
   3366                         pf_err_compute(&s_err_prms);
   3367                         /* Update the mv's with the current candt motion vectors */
   3368                         s_result_prms.i2_mv_x = mvx_qpel + 2;
   3369                         s_result_prms.i2_mv_y = mvy_qpel + 2;
   3370                         hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
   3371 
   3372                         i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
   3373 
   3374                         if(i4_tot_cost < i4_min_cost)
   3375                         {
   3376                             i4_min_cost = i4_tot_cost;
   3377                             i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
   3378                             e_min_id = PT_BR;
   3379                             pu1_final_out = s_err_prms.pu1_ref;
   3380                         }
   3381                     }
   3382                 }
   3383                 if(e_min_id == PT_C)
   3384                 {
   3385                     break;
   3386                 }
   3387             }
   3388             else
   3389             {
   3390                 break;
   3391             }
   3392         }
   3393 
   3394         /*********************************************************************/
   3395         /* Depending on the best result location, we may be able to skip     */
    3396         /* at least two pts, centre pt and one more pt. E.g. if right pt is  */
   3397         /* the best result, the next iteration need not do centre, left pts  */
   3398         /*********************************************************************/
   3399         if(i4_i)
   3400         {
   3401             i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
   3402         }
   3403         else
   3404         {
   3405             i4_grid_mask = gai4_opt_grid_mask_conventional[e_min_id];
   3406         }
   3407         i4_mv_x += gai1_grid_id_to_x[e_min_id];
   3408         i4_mv_y += gai1_grid_id_to_y[e_min_id];
   3409         ps_search_node->s_mv.i2_mvx = (S16)(i4_mv_x << 1);
   3410         ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
   3411         i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
   3412     }
   3413 
   3414     /* Convert to QPEL units */
   3415     i4_mv_x <<= 1;
   3416     i4_mv_y <<= 1;
   3417 
   3418     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
   3419     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
   3420 
   3421     /* Early exit if this partition is visiting same hpel mv again */
    3422     /* Assumption : Checking for early exit in best result of partition */
   3423     if((ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_x ==
   3424         ps_search_node->s_mv.i2_mvx) &&
   3425        (ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_y ==
   3426         ps_search_node->s_mv.i2_mvy))
   3427     {
   3428         return (ps_search_results->aps_part_results[search_idx][part_id][0].i4_tot_cost);
   3429     }
   3430     else
   3431     {
   3432         /* Store the best hpel mv for future early exit checks */
   3433         ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_x =
   3434             (S16)i4_mv_x;
   3435         ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_y =
   3436             (S16)i4_mv_y;
   3437     }
   3438 
   3439     /* Early exit if this partition is visiting same hpel mv again */
    3440     /* Assumption : Checking for early exit in second best result of partition */
   3441     if((ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_x ==
   3442         ps_search_node->s_mv.i2_mvx) &&
   3443        (ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_y ==
   3444         ps_search_node->s_mv.i2_mvy))
   3445     {
   3446         return (ps_search_results->aps_part_results[search_idx][part_id][1].i4_tot_cost);
   3447     }
   3448     else
   3449     {
   3450         /* Store the best hpel mv for future early exit checks */
   3451         ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_x =
   3452             (S16)i4_mv_x;
   3453         ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_y =
   3454             (S16)i4_mv_y;
   3455     }
   3456 
   3457     /* Exact interpolation or averaging chosen here */
   3458     pf_qpel_interp = ps_prms->pf_qpel_interp;
   3459 
   3460     /* Next QPEL ME */
   3461     /* In this case, we have option of doing exact QPEL interpolation or avg */
   3462     /*************************************************************************/
   3463     /*        x                                                              */
   3464     /*    A b C d                                                            */
   3465     /*    e f g h                                                            */
   3466     /*    I j K l                                                            */
   3467     /*    m n o p                                                            */
   3468     /*    Q r S t                                                            */
   3469     /*                                                                       */
   3470     /*    Approximate QPEL logic                                             */
   3471     /*    b = avg(A,C) f = avg(I,C), g= avg(C,K) j=avg(I,K)                  */
   3472     /*    for any given pt, we can get all the information required about    */
   3473     /*    the surrounding 4 pts. For example, given point C (0.5, 0)         */
   3474     /*     surrounding pts info:                                             */
   3475     /*     b : qpel offset: 1, 0, generated by averaging. buffer1: fpel buf  */
   3476     /*           buffer 2: hxfy, offsets for both are 0, 0                   */
   3477     /*    similarly for other pts the info can be gotten                     */
   3478     /*************************************************************************/
   3479     i4_grid_mask = GRID_ALL_PTS_VALID ^ (BIT_EN(PT_C));
   3480     i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
   3481 
   3482     /*************************************************************************/
   3483     /* One time preparation of non changing interpolation params. These      */
   3484     /* include a set of ping pong result buf ptrs, input buf ptrs and some   */
   3485     /* working memory (not used though in case of averaging).                */
   3486     /*************************************************************************/
   3487     s_interp_prms.ppu1_ref = &apu1_hpel_ref[0];
   3488     s_interp_prms.i4_ref_stride = i4_ref_stride;
   3489     s_interp_prms.i4_blk_wd = i4_blk_wd;
   3490     s_interp_prms.i4_blk_ht = i4_blk_ht;
   3491 
   3492     i4_final_out_stride = i4_ref_stride;
   3493 
   3494     {
   3495         U08 *pu1_mem;
   3496         /*********************************************************************/
   3497         /* Allocation of working memory for interpolated buffers. We maintain*/
   3498         /* an intermediate working buffer, and 2 ping pong interpolated out  */
   3499         /* buffers, purpose of ping pong explained later below               */
   3500         /*********************************************************************/
   3501         pu1_mem = ps_prms->pu1_wkg_mem;
   3502         s_interp_prms.pu1_wkg_mem = pu1_mem;
   3503 
   3504         //pu1_mem += (INTERP_INTERMED_BUF_SIZE);
   3505         s_interp_prms.apu1_interp_out[0] = pu1_mem;
   3506 
   3507         pu1_mem += (INTERP_OUT_BUF_SIZE);
   3508         s_interp_prms.apu1_interp_out[1] = pu1_mem;
   3509 
   3510         pu1_mem += (INTERP_OUT_BUF_SIZE);
   3511         s_interp_prms.apu1_interp_out[2] = pu1_mem;
   3512 
   3513         pu1_mem += (INTERP_OUT_BUF_SIZE);
   3514         s_interp_prms.apu1_interp_out[3] = pu1_mem;
   3515 
   3516         pu1_mem += (INTERP_OUT_BUF_SIZE);
   3517         s_interp_prms.apu1_interp_out[4] = pu1_mem;
   3518 
   3519         /*********************************************************************/
   3520         /* Stride of interpolated output is just a function of blk width of  */
   3521         /* this partition and hence remains constant for this partition      */
   3522         /*********************************************************************/
   3523         s_interp_prms.i4_out_stride = (i4_blk_wd);
   3524     }
   3525 
   3526     {
   3527         UWORD8 *apu1_final[4];
   3528         WORD32 ai4_ref_stride[4];
   3529         /*************************************************************************/
   3530         /* Ping pong design for interpolated buffers. We use a min id, which     */
   3531         /* tracks the id of the ppu1_interp_out that stores the best result.     */
    3532         /* When new interp to be done, it uses 1 - best result id to do interp   */
   3533         /* min id is toggled when any new result becomes the best result.        */
   3534         /*************************************************************************/
   3535 
   3536         for(i4_i = 0; i4_i < i4_num_qpel_refine; i4_i++)
   3537         {
   3538             e_min_id = PT_C;
   3539 
   3540             hme_qpel_interp_comprehensive(
   3541                 &s_interp_prms, apu1_final, ai4_ref_stride, i4_mv_x, i4_mv_y, i4_grid_mask);
   3542 
   3543             mvx_qpel = i4_mv_x;
   3544             mvy_qpel = i4_mv_y;
   3545 
   3546             if(i4_grid_mask & BIT_EN(PT_L))
   3547             {
   3548                 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
   3549                     ps_dedup_enabler, 1, mvx_qpel - 1, mvy_qpel - 0, check_for_duplicate);
   3550 
   3551                 if(!check_for_duplicate)
   3552                 {
   3553                     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
   3554                     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
   3555 
   3556                     s_err_prms.pu1_ref = apu1_final[0];
   3557                     s_err_prms.i4_ref_stride = ai4_ref_stride[0];
   3558 
   3559                     pf_err_compute(&s_err_prms);
   3560                     /* Update the mv's with the current candt motion vectors */
   3561                     s_result_prms.i2_mv_x = mvx_qpel - 1;
   3562                     s_result_prms.i2_mv_y = mvy_qpel;
   3563                     hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
   3564 
   3565                     i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
   3566                     if(i4_tot_cost < i4_min_cost)
   3567                     {
   3568                         e_min_id = PT_L;
   3569                         i4_min_cost = i4_tot_cost;
   3570                         i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
   3571                     }
   3572                 }
   3573             }
   3574             if(i4_grid_mask & BIT_EN(PT_T))
   3575             {
   3576                 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
   3577                     ps_dedup_enabler, 1, mvx_qpel - 0, mvy_qpel - 1, check_for_duplicate);
   3578 
   3579                 if(!check_for_duplicate)
   3580                 {
   3581                     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
   3582                     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
   3583 
   3584                     s_err_prms.pu1_ref = apu1_final[1];
   3585                     s_err_prms.i4_ref_stride = ai4_ref_stride[1];
   3586 
   3587                     pf_err_compute(&s_err_prms);
   3588                     /* Update the mv's with the current candt motion vectors */
   3589                     s_result_prms.i2_mv_x = mvx_qpel;
   3590                     s_result_prms.i2_mv_y = mvy_qpel - 1;
   3591                     hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
   3592                     i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
   3593                     if(i4_tot_cost < i4_min_cost)
   3594                     {
   3595                         e_min_id = PT_T;
   3596                         i4_min_cost = i4_tot_cost;
   3597                         i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
   3598                     }
   3599                 }
   3600             }
   3601             if(i4_grid_mask & BIT_EN(PT_R))
   3602             {
   3603                 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
   3604                     ps_dedup_enabler, 1, mvx_qpel + 1, mvy_qpel, check_for_duplicate);
   3605 
   3606                 if(!check_for_duplicate)
   3607                 {
   3608                     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
   3609                     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
   3610 
   3611                     s_err_prms.pu1_ref = apu1_final[2];
   3612                     s_err_prms.i4_ref_stride = ai4_ref_stride[2];
   3613 
   3614                     pf_err_compute(&s_err_prms);
   3615                     /* Update the mv's with the current candt motion vectors */
   3616                     s_result_prms.i2_mv_x = mvx_qpel + 1;
   3617                     s_result_prms.i2_mv_y = mvy_qpel;
   3618                     hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
   3619 
   3620                     i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
   3621                     if(i4_tot_cost < i4_min_cost)
   3622                     {
   3623                         e_min_id = PT_R;
   3624                         i4_min_cost = i4_tot_cost;
   3625                         i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
   3626                     }
   3627                 }
   3628             }
   3629             /* i4_mv_x and i4_mv_y will always be the centre pt */
   3630             /* for qpel we  start with least hpel, and hence compute of center pt never reqd */
   3631             if(i4_grid_mask & BIT_EN(PT_B))
   3632             {
   3633                 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
   3634                     ps_dedup_enabler, 1, mvx_qpel, mvy_qpel + 1, check_for_duplicate);
   3635 
   3636                 if(!check_for_duplicate)
   3637                 {
   3638                     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
   3639                     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
   3640 
   3641                     s_err_prms.pu1_ref = apu1_final[3];
   3642                     s_err_prms.i4_ref_stride = ai4_ref_stride[3];
   3643 
   3644                     pf_err_compute(&s_err_prms);
   3645                     /* Update the mv's with the current candt motion vectors */
   3646                     s_result_prms.i2_mv_x = mvx_qpel;
   3647                     s_result_prms.i2_mv_y = mvy_qpel + 1;
   3648                     hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
   3649                     i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
   3650                     if(i4_tot_cost < i4_min_cost)
   3651                     {
   3652                         e_min_id = PT_B;
   3653                         i4_min_cost = i4_tot_cost;
   3654                         i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
   3655                     }
   3656                 }
   3657             }
   3658 
   3659             if(e_min_id == PT_C)
   3660             {
   3661                 if(!i4_i)
   3662                 {
   3663                     S32 i4_interp_buf_id = 0;
   3664 
   3665                     if(i4_grid_mask & BIT_EN(PT_TL))
   3666                     {
   3667                         CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
   3668                             ps_dedup_enabler, 1, mvx_qpel - 1, mvy_qpel - 1, check_for_duplicate);
   3669 
   3670                         if(!check_for_duplicate)
   3671                         {
   3672                             ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
   3673                             ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
   3674 
   3675                             /* Carry out the interpolation */
   3676                             pf_qpel_interp(
   3677                                 &s_interp_prms, i4_mv_x - 1, i4_mv_y - 1, i4_interp_buf_id);
   3678 
   3679                             s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
   3680                             s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
   3681 
   3682                             pf_err_compute(&s_err_prms);
   3683                             /* Update the mv's with the current candt motion vectors */
   3684                             s_result_prms.i2_mv_x = mvx_qpel - 1;
   3685                             s_result_prms.i2_mv_y = mvy_qpel - 1;
   3686                             hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
   3687 
   3688                             i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
   3689 
   3690                             if(i4_tot_cost < i4_min_cost)
   3691                             {
   3692                                 e_min_id = PT_TL;
   3693                                 i4_min_cost = i4_tot_cost;
   3694                                 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
   3695                             }
   3696                         }
   3697                     }
   3698                     if(i4_grid_mask & BIT_EN(PT_TR))
   3699                     {
   3700                         CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
   3701                             ps_dedup_enabler, 1, mvx_qpel + 1, mvy_qpel - 1, check_for_duplicate);
   3702 
   3703                         if(!check_for_duplicate)
   3704                         {
   3705                             ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
   3706                             ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
   3707 
   3708                             /* Carry out the interpolation */
   3709                             pf_qpel_interp(
   3710                                 &s_interp_prms, i4_mv_x + 1, i4_mv_y - 1, i4_interp_buf_id);
   3711 
   3712                             s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
   3713                             s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
   3714 
   3715                             pf_err_compute(&s_err_prms);
   3716                             /* Update the mv's with the current candt motion vectors */
   3717                             s_result_prms.i2_mv_x = mvx_qpel + 1;
   3718                             s_result_prms.i2_mv_y = mvy_qpel - 1;
   3719                             hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
   3720 
   3721                             i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
   3722 
   3723                             if(i4_tot_cost < i4_min_cost)
   3724                             {
   3725                                 e_min_id = PT_TR;
   3726                                 i4_min_cost = i4_tot_cost;
   3727                                 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
   3728                             }
   3729                         }
   3730                     }
   3731                     if(i4_grid_mask & BIT_EN(PT_BL))
   3732                     {
   3733                         CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
   3734                             ps_dedup_enabler, 1, mvx_qpel - 1, mvy_qpel + 1, check_for_duplicate);
   3735 
   3736                         if(!check_for_duplicate)
   3737                         {
   3738                             ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
   3739                             ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
   3740 
   3741                             /* Carry out the interpolation */
   3742                             pf_qpel_interp(
   3743                                 &s_interp_prms, i4_mv_x - 1, i4_mv_y + 1, i4_interp_buf_id);
   3744 
   3745                             s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
   3746                             s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
   3747 
   3748                             pf_err_compute(&s_err_prms);
   3749                             /* Update the mv's with the current candt motion vectors */
   3750                             s_result_prms.i2_mv_x = mvx_qpel - 1;
   3751                             s_result_prms.i2_mv_y = mvy_qpel + 1;
   3752                             hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
   3753 
   3754                             i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
   3755 
   3756                             if(i4_tot_cost < i4_min_cost)
   3757                             {
   3758                                 e_min_id = PT_BL;
   3759                                 i4_min_cost = i4_tot_cost;
   3760                                 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
   3761                             }
   3762                         }
   3763                     }
   3764                     /* i4_mv_x and i4_mv_y will always be the centre pt */
   3765                     /* for qpel we  start with least hpel, and hence compute of center pt never reqd */
   3766                     if(i4_grid_mask & BIT_EN(PT_BR))
   3767                     {
   3768                         CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
   3769                             ps_dedup_enabler, 1, mvx_qpel + 1, mvy_qpel + 1, check_for_duplicate);
   3770 
   3771                         if(!check_for_duplicate)
   3772                         {
   3773                             ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
   3774                             ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
   3775 
   3776                             /* Carry out the interpolation */
   3777                             pf_qpel_interp(
   3778                                 &s_interp_prms, i4_mv_x + 1, i4_mv_y + 1, i4_interp_buf_id);
   3779 
   3780                             s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
   3781                             s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
   3782 
   3783                             pf_err_compute(&s_err_prms);
   3784                             /* Update the mv's with the current candt motion vectors */
   3785                             s_result_prms.i2_mv_x = mvx_qpel + 1;
   3786                             s_result_prms.i2_mv_y = mvy_qpel + 1;
   3787                             hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
   3788 
   3789                             i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
   3790 
   3791                             if(i4_tot_cost < i4_min_cost)
   3792                             {
   3793                                 e_min_id = PT_BR;
   3794                                 i4_min_cost = i4_tot_cost;
   3795                                 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
   3796                             }
   3797                         }
   3798                     }
   3799                     if(e_min_id == PT_C)
   3800                     {
   3801                         break;
   3802                     }
   3803                 }
   3804                 else
   3805                 {
   3806                     break;
   3807                 }
   3808             }
   3809 
   3810             if(i4_i)
   3811             {
   3812                 i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
   3813             }
   3814             else
   3815             {
   3816                 i4_grid_mask = gai4_opt_grid_mask_conventional[e_min_id];
   3817             }
   3818             i4_mv_x += gai1_grid_id_to_x[e_min_id];
   3819             i4_mv_y += gai1_grid_id_to_y[e_min_id];
   3820             ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
   3821             ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
   3822             i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
   3823         }
   3824     }
   3825 
   3826     /* update modified motion vectors and cost at end of subpel */
   3827     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
   3828     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
   3829     ps_search_node->i4_tot_cost = i4_min_cost;
   3830     ps_search_node->i4_sad = i4_min_sad;
   3831 
   3832     /********************************************************************************/
   3833     /* TODO: Restoring back Sad lambda from Hadamard lambda                         */
   3834     /* Need to pass the had/satd lambda in more cleaner way for subpel cost compute */
   3835     /********************************************************************************/
   3836     //ps_pred_ctxt->lambda >>= 1;
   3837 
   3838     return (i4_min_cost);
   3839 }
   3840 #endif
   3841 
   3842 static void hme_subpel_refine_struct_to_search_results_struct_converter(
   3843     subpel_refine_ctxt_t *ps_subpel_refine_ctxt,
   3844     search_results_t *ps_search_results,
   3845     U08 u1_pred_dir,
   3846     ME_QUALITY_PRESETS_T e_quality_preset)
   3847 {
   3848     U08 i;
   3849 
   3850     U08 u1_num_results_per_part = ps_search_results->u1_num_results_per_part;
   3851 
   3852     for(i = 0; i < ps_subpel_refine_ctxt->i4_num_valid_parts; i++)
   3853     {
   3854         S32 index;
   3855         S32 i4_sad;
   3856 
   3857         S32 part_id = ps_subpel_refine_ctxt->ai4_part_id[i];
   3858 
   3859         search_node_t *ps_best_node = ps_search_results->aps_part_results[u1_pred_dir][part_id];
   3860 
   3861         if(ps_subpel_refine_ctxt->i4_num_valid_parts > 8)
   3862         {
   3863             index = part_id;
   3864         }
   3865         else
   3866         {
   3867             index = i;
   3868         }
   3869 
   3870         if(!ps_best_node->u1_subpel_done)
   3871         {
   3872             i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
   3873                      ps_subpel_refine_ctxt->i2_mv_cost[0][index];
   3874             ps_best_node[0].i4_sdi = 0;
   3875             ASSERT((e_quality_preset == ME_PRISTINE_QUALITY) ? (ps_best_node[0].i4_sdi >= 0) : 1);
   3876             ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
   3877 
   3878             if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
   3879             {
   3880                 i4_sad = MAX_SIGNED_16BIT_VAL;
   3881             }
   3882 
   3883             ps_best_node[0].i4_sad = i4_sad;
   3884             ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
   3885             ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
   3886             ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
   3887             ps_best_node[0].i1_ref_idx = (WORD8)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
   3888             ps_best_node->u1_subpel_done = 1;
   3889 
   3890             if(2 == u1_num_results_per_part)
   3891             {
   3892                 i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[1][index] -
   3893                          ps_subpel_refine_ctxt->i2_mv_cost[1][index];
   3894                 ps_best_node[1].i4_sdi = 0;
   3895                 ps_best_node[1].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[1][index];
   3896 
   3897                 if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] == MAX_SIGNED_16BIT_VAL)
   3898                 {
   3899                     i4_sad = MAX_SIGNED_16BIT_VAL;
   3900                 }
   3901 
   3902                 ps_best_node[1].i4_sad = i4_sad;
   3903                 ps_best_node[1].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[1][index];
   3904                 ps_best_node[1].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[1][index];
   3905                 ps_best_node[1].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[1][index];
   3906                 ps_best_node[1].i1_ref_idx = (WORD8)ps_subpel_refine_ctxt->i2_ref_idx[1][index];
   3907                 ps_best_node[1].u1_subpel_done = 1;
   3908             }
   3909         }
   3910         else if(
   3911             (2 == u1_num_results_per_part) &&
   3912             (ps_subpel_refine_ctxt->i2_tot_cost[0][index] < ps_best_node[1].i4_tot_cost))
   3913         {
   3914             if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] < ps_best_node[0].i4_tot_cost)
   3915             {
   3916                 i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
   3917                          ps_subpel_refine_ctxt->i2_mv_cost[0][index];
   3918                 ps_best_node[0].i4_sdi = 0;
   3919                 ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
   3920 
   3921                 if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
   3922                 {
   3923                     i4_sad = MAX_SIGNED_16BIT_VAL;
   3924                 }
   3925 
   3926                 ps_best_node[0].i4_sad = i4_sad;
   3927                 ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
   3928                 ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
   3929                 ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
   3930                 ps_best_node[0].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
   3931 
   3932                 i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[1][index] -
   3933                          ps_subpel_refine_ctxt->i2_mv_cost[1][index];
   3934                 ps_best_node[1].i4_sdi = 0;
   3935                 ps_best_node[1].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[1][index];
   3936 
   3937                 if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] == MAX_SIGNED_16BIT_VAL)
   3938                 {
   3939                     i4_sad = MAX_SIGNED_16BIT_VAL;
   3940                 }
   3941 
   3942                 ps_best_node[1].i4_sad = i4_sad;
   3943                 ps_best_node[1].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[1][index];
   3944                 ps_best_node[1].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[1][index];
   3945                 ps_best_node[1].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[1][index];
   3946                 ps_best_node[1].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[1][index];
   3947             }
   3948             else if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] > ps_best_node[0].i4_tot_cost)
   3949             {
   3950                 if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] >= ps_best_node[0].i4_tot_cost)
   3951                 {
   3952                     i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
   3953                              ps_subpel_refine_ctxt->i2_mv_cost[0][index];
   3954                     ps_best_node[1].i4_sdi = 0;
   3955                     ps_best_node[1].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
   3956 
   3957                     if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
   3958                     {
   3959                         i4_sad = MAX_SIGNED_16BIT_VAL;
   3960                     }
   3961 
   3962                     ps_best_node[1].i4_sad = i4_sad;
   3963                     ps_best_node[1].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
   3964                     ps_best_node[1].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
   3965                     ps_best_node[1].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
   3966                     ps_best_node[1].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
   3967                 }
   3968                 else if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] < ps_best_node[0].i4_tot_cost)
   3969                 {
   3970                     memmove(&ps_best_node[1], &ps_best_node[0], sizeof(search_node_t));
   3971 
   3972                     i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
   3973                              ps_subpel_refine_ctxt->i2_mv_cost[0][index];
   3974                     ps_best_node[0].i4_sdi = 0;
   3975                     ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
   3976 
   3977                     if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
   3978                     {
   3979                         i4_sad = MAX_SIGNED_16BIT_VAL;
   3980                     }
   3981 
   3982                     ps_best_node[0].i4_sad = i4_sad;
   3983                     ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
   3984                     ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
   3985                     ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
   3986                     ps_best_node[0].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
   3987                 }
   3988             }
   3989         }
   3990         else if(
   3991             (1 == u1_num_results_per_part) &&
   3992             (ps_subpel_refine_ctxt->i2_tot_cost[0][index] < ps_best_node[0].i4_tot_cost))
   3993         {
   3994             i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
   3995                      ps_subpel_refine_ctxt->i2_mv_cost[0][index];
   3996             ps_best_node[0].i4_sdi = 0;
   3997             ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
   3998 
   3999             if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
   4000             {
   4001                 i4_sad = MAX_SIGNED_16BIT_VAL;
   4002             }
   4003 
   4004             ps_best_node[0].i4_sad = i4_sad;
   4005             ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
   4006             ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
   4007             ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
   4008             ps_best_node[0].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
   4009         }
   4010     }
   4011 }
   4012 
   4013 /**
   4014 ********************************************************************************
   4015 *  @fn     S32 hme_subpel_refine_cu_hs
   4016 *
   4017 *  @brief  Evaluates the best subpel mvs for active partitions of an MB in L0
   4018 *          layer for the high speed preset. Recursive hadamard SATD / SAD
   4019 *          and mv cost is used for 2NxN and NxN partitions with active partition
   4020 *          update
   4021 *
   4022 *  @param[in]  ps_prms: subpel prms input to this function
   4023 *
   4024 *  @param[in]  ps_curr_layer: points to the current layer ctxt
   4025 *
*  @param[out] ps_search_results: points to the search results that get updated
   4027 *              with best results
   4028 *
   4029 *  @param[in]  search_idx:  ref id of the frame for which results get updated
   4030 *
   4031 *  @param[in]  ps_wt_inp_prms:  current frame input params
   4032 *
   4033 *  @return     None
   4034 ********************************************************************************
   4035 */
void hme_subpel_refine_cu_hs(
    hme_subpel_prms_t *ps_prms,
    layer_ctxt_t *ps_curr_layer,
    search_results_t *ps_search_results,
    S32 search_idx,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    WORD32 blk_8x8_mask,
    me_func_selector_t *ps_func_selector,
    ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list,
    ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
{
    /* Unique search node list for 2nx2n and nxn partitions */
    search_node_t as_nodes_2nx2n[MAX_RESULTS_PER_PART * 5];
    /* Per-reference dedup state used to avoid re-evaluating the same subpel
       position twice; an entry with u1_ref_idx == MAX_NUM_REF is unused */
    subpel_dedup_enabler_t as_subpel_dedup_enabler[MAX_NUM_REF];
    search_node_t *ps_search_node;

    S32 i, i4_part_mask, j;
    S32 i4_sad_grid;
    S32 max_subpel_cand;
    WORD32 index;
    S32 num_unique_nodes_2nx2n;
    S32 part_id;
    S32 x_off, y_off;
    S32 i4_inp_off;

    CU_SIZE_T e_cu_size;
    BLK_SIZE_T e_blk_size;

    subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_prms->ps_subpel_refine_ctxt;

    S32 i4_use_satd = ps_prms->i4_use_satd;
    S32 i4_num_act_refs = ps_prms->i4_num_act_ref_l0 + ps_prms->i4_num_act_ref_l1;

    ASSERT(ps_search_results->u1_num_results_per_part <= MAX_RESULTS_PER_PART);

    /* Subpel refinement is skipped entirely for noisy CUs when the build-time
       flag requests it; the else-branch below still converts MVs to QPEL units */
    if(!DISABLE_SUBPEL_REFINEMENT_WHEN_SRC_IS_NOISY || !ps_prms->u1_is_cu_noisy)
    {
        e_cu_size = ps_search_results->e_cu_size;
        i4_part_mask = ps_search_results->i4_part_mask;

        ps_prms->i4_inp_type = sizeof(U08);

        num_unique_nodes_2nx2n = 0;

        /* Mark every dedup entry as unused before candidate collection */
        for(i = 0; i < i4_num_act_refs; i++)
        {
            as_subpel_dedup_enabler[i].u1_ref_idx = MAX_NUM_REF;
        }

        /************************************************************************/
        /*                                                                      */
        /*  Initialize SATD cost for each valid partition id, one time before   */
        /*  doing subpel refinement. This is because of the following reasons:  */
        /*   1. Full pel cost was done in SAD while subpel is in SATD mode      */
        /*   2. Partitions like AMP, Nx2N and 2NxN are refined on the fly while */
        /*      doing Diamond search for 2Nx2N and NxN. These partitions are    */
        /*      not explicitly refined in high speed mode                       */
        /*                                                                      */
        /************************************************************************/
        for(i = 0; i < ps_subpel_refine_ctxt->i4_num_valid_parts; i++)
        {
            S32 enable_subpel = 0;
            S32 part_type;

            /* Derive the x and y offsets of this part id */
            part_id = ps_subpel_refine_ctxt->ai4_part_id[i];
            /* When more than 8 parts are valid, result arrays are indexed by
               part id directly; otherwise they are packed by loop position */
            if(ps_subpel_refine_ctxt->i4_num_valid_parts > 8)
            {
                index = part_id;
            }
            else
            {
                index = i;
            }

            part_type = ge_part_id_to_part_type[part_id];
            x_off = gas_part_attr_in_cu[part_id].u1_x_start << e_cu_size;
            y_off = gas_part_attr_in_cu[part_id].u1_y_start << e_cu_size;
            x_off += ps_search_results->u1_x_off;
            y_off += ps_search_results->u1_y_off;
            i4_inp_off = x_off + y_off * ps_prms->i4_inp_stride;
            e_blk_size = ge_part_id_to_blk_size[e_cu_size][part_id];

            x_off += ps_prms->i4_ctb_x_off;
            y_off += ps_prms->i4_ctb_y_off;

            max_subpel_cand = 0;

            /* Choose the minimum number of candidates to be used for Sub pel refinement */
            if(PART_ID_2Nx2N == part_type)
            {
                max_subpel_cand =
                    MIN(ps_prms->u1_max_subpel_candts_2Nx2N,
                        ps_search_results->u1_num_results_per_part);
            }
            else if(PRT_NxN == part_type)
            {
                max_subpel_cand = MIN(
                    ps_prms->u1_max_subpel_candts_NxN, ps_search_results->u1_num_results_per_part);
            }

            /* If incomplete CTB, NxN num candidates should be forced to min 1 */
            if((0 == max_subpel_cand) && (blk_8x8_mask != 15))
            {
                max_subpel_cand = 1;
            }

            /* Only square partitions are considered for explicit refinement */
            if((PART_ID_2Nx2N == part_type) || (PRT_NxN == part_type))
            {
                enable_subpel = 1;
            }

            /* Compute full pel SATD for each result per partition before subpel */
            /* refinement starts.                                                */
            /* Also prepare unique candidate list for 2Nx2N and NxN partitions   */
            for(j = 0; j < ps_search_results->u1_num_results_per_part; j++)
            {
                err_prms_t s_err_prms;
                S32 i4_satd = 0;
                S32 i1_ref_idx;
                U08 *pu1_ref_base;
                S32 i4_ref_stride = ps_curr_layer->i4_rec_stride;
                S32 i4_mv_x, i4_mv_y;

                ps_search_node = ps_search_results->aps_part_results[search_idx][part_id] + j;

                /* INTRA_MV marks an invalid/absent candidate: nothing to refine */
                if(ps_subpel_refine_ctxt->i2_mv_x[j][index] == INTRA_MV)
                {
                    ps_search_node->u1_subpel_done = 1;
                    continue;
                }

                i1_ref_idx = ps_subpel_refine_ctxt->i2_ref_idx[j][index];
                ps_prms->pv_inp = (void *)(ps_wt_inp_prms->apu1_wt_inp[i1_ref_idx] + i4_inp_off);
                pu1_ref_base = ps_curr_layer->ppu1_list_rec_fxfy[i1_ref_idx];

                i4_mv_x = ps_subpel_refine_ctxt->i2_mv_x[j][index];
                i4_mv_y = ps_subpel_refine_ctxt->i2_mv_y[j][index];

                /* Re-evaluate the fullpel cost in SATD so it is comparable with
                   the SATD-based costs produced by subpel refinement */
                if(i4_use_satd)
                {
                    s_err_prms.pu1_inp = (U08 *)ps_prms->pv_inp;
                    s_err_prms.i4_inp_stride = ps_prms->i4_inp_stride;
                    s_err_prms.pu1_ref = pu1_ref_base + x_off + (y_off * i4_ref_stride) + i4_mv_x +
                                         (i4_mv_y * i4_ref_stride);

                    s_err_prms.i4_ref_stride = i4_ref_stride;
                    s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
                    s_err_prms.i4_grid_mask = 1;
                    s_err_prms.pi4_sad_grid = &i4_sad_grid;
                    s_err_prms.i4_blk_wd = gau1_blk_size_to_wd[e_blk_size];
                    s_err_prms.i4_blk_ht = gau1_blk_size_to_ht[e_blk_size];

                    s_err_prms.ps_cmn_utils_optimised_function_list =
                        ps_cmn_utils_optimised_function_list;

                    compute_satd_8bit(&s_err_prms);

                    i4_satd = s_err_prms.pi4_sad_grid[0];

                    /* Total cost saturated to 16 bits (storage is S16 arrays) */
                    ps_subpel_refine_ctxt->i2_tot_cost[j][index] =
                        CLIP_S16(ps_subpel_refine_ctxt->i2_mv_cost[j][index] + i4_satd);
                    ps_subpel_refine_ctxt->ai2_fullpel_satd[j][index] = i4_satd;
                }

                /* Sub-pel candidate filtration: candidates after the first are
                   dropped if they are too close to the best MV (refinement would
                   duplicate work) or too costly relative to the best SAD */
                if(j)
                {
                    S16 i2_best_sad;
                    S32 i4_best_mvx;
                    S32 i4_best_mvy;

                    search_node_t *ps_node =
                        ps_search_results->aps_part_results[search_idx][part_id];

                    U08 u1_is_subpel_done = ps_node->u1_subpel_done;
                    S16 i2_curr_sad = ps_subpel_refine_ctxt->ai2_fullpel_satd[j][index];
                    /* Current MV in QPEL units for the proximity test below */
                    S32 i4_curr_mvx = i4_mv_x << 2;
                    S32 i4_curr_mvy = i4_mv_y << 2;

                    /* Pick the reference SAD/MV from either the already-refined
                       best node or the stored best fullpel result; an MV is only
                       comparable when it refers to the same reference picture */
                    if(u1_is_subpel_done)
                    {
                        i2_best_sad = ps_node->i4_sad;

                        if(ps_node->i1_ref_idx == i1_ref_idx)
                        {
                            i4_best_mvx = ps_node->s_mv.i2_mvx;
                            i4_best_mvy = ps_node->s_mv.i2_mvy;
                        }
                        else if(i1_ref_idx == ps_subpel_refine_ctxt->i2_ref_idx[0][index])
                        {
                            i4_best_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
                            i4_best_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
                        }
                        else
                        {
                            i4_best_mvx = INTRA_MV;
                            i4_best_mvy = INTRA_MV;
                        }
                    }
                    else
                    {
                        i2_best_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
                                      ps_subpel_refine_ctxt->i2_mv_cost[0][index];

                        if(i1_ref_idx == ps_subpel_refine_ctxt->i2_ref_idx[0][index])
                        {
                            i4_best_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
                            i4_best_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
                        }
                        else
                        {
                            i4_best_mvx = INTRA_MV;
                            i4_best_mvy = INTRA_MV;
                        }
                    }

                    /* Widen the best SAD by a preset-controlled fraction to form
                       the rejection threshold for this candidate */
                    i2_best_sad += (i2_best_sad >> ps_prms->u1_subpel_candt_threshold);

                    if(((ABS(i4_curr_mvx - i4_best_mvx) < 2) &&
                        (ABS(i4_curr_mvy - i4_best_mvy) < 2)) ||
                       (i2_curr_sad > i2_best_sad))
                    {
                        enable_subpel = 0;
                    }
                }

                ps_search_node->u1_part_id = part_id;

                /* Convert mvs in part results from FPEL to QPEL units */
                ps_subpel_refine_ctxt->i2_mv_x[j][index] <<= 2;
                ps_subpel_refine_ctxt->i2_mv_y[j][index] <<= 2;

                /* If the candidate number is more than the number of candts
                set initally, do not add those candts for refinement */
                if(j >= max_subpel_cand)
                {
                    enable_subpel = 0;
                }

                if(enable_subpel)
                {
                    /* First accepted candidate seeds the dedup map for its ref */
                    if(num_unique_nodes_2nx2n == 0)
                    {
                        S32 i4_index = ps_subpel_refine_ctxt->i2_ref_idx[j][index];

                        as_subpel_dedup_enabler[i4_index].i2_mv_x =
                            ps_subpel_refine_ctxt->i2_mv_x[j][index];
                        as_subpel_dedup_enabler[i4_index].i2_mv_y =
                            ps_subpel_refine_ctxt->i2_mv_y[j][index];
                        as_subpel_dedup_enabler[i4_index].u1_ref_idx =
                            (U08)ps_subpel_refine_ctxt->i2_ref_idx[j][index];
                        memset(
                            as_subpel_dedup_enabler[i4_index].au4_node_map,
                            0,
                            sizeof(U32) * 2 * MAP_X_MAX);
                    }
                    /* Append to the unique candidate list, skipping duplicates */
                    INSERT_NEW_NODE_NOMAP_ALTERNATE(
                        as_nodes_2nx2n, num_unique_nodes_2nx2n, ps_subpel_refine_ctxt, j, i);
                }
            }

            /*********************************************************************************************/
            /* If sad_1 < sad_2, then satd_1 need not be lesser than satd_2. Therefore, after conversion */
            /* to satd, tot_cost_1 may not be lesser than tot_cost_2. So we need to sort the search nodes*/
            /* for each partition again, based on the new costs                                          */
            /*********************************************************************************************/
            /*********************************************************************************************/
            /* Because right now, we store only the two best candidates for each partition, the sort will*/
            /* converge to a simple swap.                                                                */
            /* ASSUMPTION : We store only two best results per partition                                 */
            /*********************************************************************************************/
            if(ps_search_results->u1_num_results_per_part == 2)
            {
                if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] >
                   ps_subpel_refine_ctxt->i2_tot_cost[1][index])
                {
                    /* Swap every per-candidate field so result 0 stays the best */
                    SWAP(
                        ps_subpel_refine_ctxt->i2_tot_cost[0][index],
                        ps_subpel_refine_ctxt->i2_tot_cost[1][index]);

                    SWAP(
                        ps_subpel_refine_ctxt->i2_mv_cost[0][index],
                        ps_subpel_refine_ctxt->i2_mv_cost[1][index]);

                    SWAP(
                        ps_subpel_refine_ctxt->i2_mv_x[0][index],
                        ps_subpel_refine_ctxt->i2_mv_x[1][index]);

                    SWAP(
                        ps_subpel_refine_ctxt->i2_mv_y[0][index],
                        ps_subpel_refine_ctxt->i2_mv_y[1][index]);

                    SWAP(
                        ps_subpel_refine_ctxt->i2_ref_idx[0][index],
                        ps_subpel_refine_ctxt->i2_ref_idx[1][index]);

                    SWAP(
                        ps_subpel_refine_ctxt->ai2_fullpel_satd[0][index],
                        ps_subpel_refine_ctxt->ai2_fullpel_satd[1][index]);
                }
            }
        }

        /* For a complete CTB, cap the number of refinement centres per preset;
           incomplete CTBs (blk_8x8_mask != 0xf) keep all unique candidates */
        if(blk_8x8_mask == 0xf)
        {
            num_unique_nodes_2nx2n =
                MIN(num_unique_nodes_2nx2n, ps_prms->u1_max_num_subpel_refine_centers);
        }
        {
            /* Offsets recomputed for part id 0 (2Nx2N covers the whole CU) */
            x_off = gas_part_attr_in_cu[0].u1_x_start << e_cu_size;
            y_off = gas_part_attr_in_cu[0].u1_y_start << e_cu_size;
            x_off += ps_search_results->u1_x_off;
            y_off += ps_search_results->u1_y_off;
            i4_inp_off = x_off + y_off * ps_prms->i4_inp_stride;
            e_blk_size = ge_part_id_to_blk_size[e_cu_size][0];

            /* Run high-speed subpel refinement around each unique centre */
            for(j = 0; j < num_unique_nodes_2nx2n; j++)
            {
                S32 pred_lx;
                ps_search_node = &as_nodes_2nx2n[j];

                if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
                {
                    continue;
                }

                {
                    S08 i1_ref_idx = ps_search_node->i1_ref_idx;
                    subpel_dedup_enabler_t *ps_dedup_enabler =
                        &(as_subpel_dedup_enabler[i1_ref_idx]);

                    /* Lazily seed the dedup map for refs not initialized above */
                    if(ps_dedup_enabler->u1_ref_idx == MAX_NUM_REF)
                    {
                        as_subpel_dedup_enabler[i1_ref_idx].i2_mv_x = ps_search_node->s_mv.i2_mvx;
                        as_subpel_dedup_enabler[i1_ref_idx].i2_mv_y = ps_search_node->s_mv.i2_mvy;
                        as_subpel_dedup_enabler[i1_ref_idx].u1_ref_idx = i1_ref_idx;
                        memset(
                            as_subpel_dedup_enabler[i1_ref_idx].au4_node_map,
                            0,
                            sizeof(U32) * 2 * MAP_X_MAX);
                    }
                }

                pred_lx = search_idx;
                ps_prms->pv_inp =
                    (void *)(ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off);

                hme_subpel_refine_search_node_high_speed(
                    ps_search_node,
                    ps_prms,
                    ps_curr_layer,
                    e_blk_size,
                    x_off + ps_prms->i4_ctb_x_off,
                    y_off + ps_prms->i4_ctb_y_off,
                    ps_search_results,
                    pred_lx,
                    i4_part_mask,
                    &ps_subpel_refine_ctxt->ai4_part_id[0],
                    search_idx,
                    &(as_subpel_dedup_enabler[ps_search_node->i1_ref_idx]),
                    ps_func_selector,
                    ps_me_optimised_function_list);
            }
        }
    }
    else
    {
        /* Noisy CU with refinement disabled: no subpel search is done, but the
           stored fullpel MVs still have to be converted to QPEL units so the
           converter below sees a consistent representation */
        for(i = 0; i < ps_subpel_refine_ctxt->i4_num_valid_parts; i++)
        {
            S32 i4_index;

            S32 i4_part_id = ps_subpel_refine_ctxt->ai4_part_id[i];

            /* Same dual indexing scheme as in the refinement path above */
            if(ps_subpel_refine_ctxt->i4_num_valid_parts > 8)
            {
                i4_index = i4_part_id;
            }
            else
            {
                i4_index = i;
            }

            for(j = 0; j < ps_search_results->u1_num_results_per_part; j++)
            {
                ps_subpel_refine_ctxt->i2_mv_x[j][i4_index] <<= 2;
                ps_subpel_refine_ctxt->i2_mv_y[j][i4_index] <<= 2;
            }
        }
    }

    /* Publish refined results back into the generic search_results_t layout */
    hme_subpel_refine_struct_to_search_results_struct_converter(
        ps_subpel_refine_ctxt, ps_search_results, search_idx, ps_prms->e_me_quality_presets);
}
   4430