1 /****************************************************************************** 2 * 3 * Copyright (C) 2018 The Android Open Source Project 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ***************************************************************************** 18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore 19 */ 20 21 /** 22 ****************************************************************************** 23 * @file hme_subpel.c 24 * 25 * @brief 26 * Subpel refinement modules for ME algo 27 * 28 * @author 29 * Ittiam 30 * 31 * 32 * List of Functions 33 * hme_qpel_interp_avg() 34 * hme_subpel_refine_ctblist_bck() 35 * hme_subpel_refine_ctblist_fwd() 36 * hme_refine_bidirect() 37 * hme_subpel_refinement() 38 * hme_subpel_refine_ctb_fwd() 39 * hme_subpel_refine_ctb_bck() 40 * hme_create_bck_inp() 41 * hme_subpel_refine_search_node() 42 ****************************************************************************** 43 */ 44 45 /*****************************************************************************/ 46 /* File Includes */ 47 /*****************************************************************************/ 48 /* System include files */ 49 #include <stdio.h> 50 #include <string.h> 51 #include <stdlib.h> 52 #include <assert.h> 53 #include <stdarg.h> 54 #include <math.h> 55 #include <limits.h> 56 57 /* User include files */ 58 #include "ihevc_typedefs.h" 59 #include "itt_video_api.h" 60 #include "ihevce_api.h" 
61 62 #include "rc_cntrl_param.h" 63 #include "rc_frame_info_collector.h" 64 #include "rc_look_ahead_params.h" 65 66 #include "ihevc_defs.h" 67 #include "ihevc_structs.h" 68 #include "ihevc_platform_macros.h" 69 #include "ihevc_deblk.h" 70 #include "ihevc_itrans_recon.h" 71 #include "ihevc_chroma_itrans_recon.h" 72 #include "ihevc_chroma_intra_pred.h" 73 #include "ihevc_intra_pred.h" 74 #include "ihevc_inter_pred.h" 75 #include "ihevc_mem_fns.h" 76 #include "ihevc_padding.h" 77 #include "ihevc_weighted_pred.h" 78 #include "ihevc_sao.h" 79 #include "ihevc_resi_trans.h" 80 #include "ihevc_quant_iquant_ssd.h" 81 #include "ihevc_cabac_tables.h" 82 83 #include "ihevce_defs.h" 84 #include "ihevce_lap_enc_structs.h" 85 #include "ihevce_multi_thrd_structs.h" 86 #include "ihevce_multi_thrd_funcs.h" 87 #include "ihevce_me_common_defs.h" 88 #include "ihevce_had_satd.h" 89 #include "ihevce_error_codes.h" 90 #include "ihevce_bitstream.h" 91 #include "ihevce_cabac.h" 92 #include "ihevce_rdoq_macros.h" 93 #include "ihevce_function_selector.h" 94 #include "ihevce_enc_structs.h" 95 #include "ihevce_entropy_structs.h" 96 #include "ihevce_cmn_utils_instr_set_router.h" 97 #include "ihevce_enc_loop_structs.h" 98 #include "ihevce_bs_compute_ctb.h" 99 #include "ihevce_global_tables.h" 100 #include "ihevce_dep_mngr_interface.h" 101 #include "hme_datatype.h" 102 #include "hme_interface.h" 103 #include "hme_common_defs.h" 104 #include "hme_defs.h" 105 #include "ihevce_me_instr_set_router.h" 106 #include "hme_globals.h" 107 #include "hme_utils.h" 108 #include "hme_coarse.h" 109 #include "hme_fullpel.h" 110 #include "hme_subpel.h" 111 #include "hme_refine.h" 112 #include "hme_err_compute.h" 113 #include "hme_common_utils.h" 114 #include "hme_search_algo.h" 115 #include "ihevce_stasino_helpers.h" 116 #include "ihevce_common_utils.h" 117 118 /*****************************************************************************/ 119 /* Function Definitions */ 120 
/*****************************************************************************/

/**
********************************************************************************
*  @fn     hme_qpel_interp_avg(interp_prms_t *ps_prms, S32 i4_mv_x,
*              S32 i4_mv_y, S32 i4_buf_id)
*
*  @brief  Produces the prediction for a QPEL-accurate MV: either averages the
*          two nearest full/half pel planes into apu1_interp_out[i4_buf_id],
*          or, when the MV lands exactly on a full/half pel position, returns
*          a pointer directly into that source plane (no averaging done).
*
*  @param[in,out] ps_prms : source plane pointers/strides and block dims in;
*          pu1_final_out / i4_final_out_stride are written as output
*
*  @param[in] i4_mv_x : x component of motion vector in QPEL units
*
*  @param[in] i4_mv_y : y component of motion vector in QPEL units
*
*  @param[in] i4_buf_id : index into apu1_interp_out[] used as the averaging
*          destination when an actual average has to be computed
*
*  @return None
********************************************************************************
*/
void hme_qpel_interp_avg(interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, S32 i4_buf_id)
{
    U08 *pu1_src1, *pu1_src2, *pu1_dst;
    qpel_input_buf_cfg_t *ps_inp_cfg;
    S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;

    /*************************************************************************/
    /* For a given QPEL pt, we need to determine the 2 source pts that are   */
    /* needed to do the QPEL averaging. The logic to do this is as follows   */
    /* i4_mv_x and i4_mv_y are the motion vectors in QPEL units that are     */
    /* pointing to the pt of interest. Obviously, they are w.r.t. the 0,0    */
    /* pt of the reference blk that is colocated to the inp blk.             */
    /*    A j E k B                                                          */
    /*    l m n o p                                                          */
    /*    F q G r H                                                          */
    /*    s t u v w                                                          */
    /*    C x I y D                                                          */
    /* In above diagram, A, B, C, D are full pts at offsets (0,0),(1,0),(0,1)*/
    /* and (1,1) respectively in the fpel buffer (id = 0)                    */
    /* E and I are hxfy pts in offsets (0,0),(0,1) respectively in hxfy buf  */
    /* F and H are fxhy pts in offsets (0,0),(1,0) respectively in fxhy buf  */
    /* G is hxhy pt in offset 0,0 in hxhy buf                                */
    /* All above offsets are computed w.r.t. motion displaced pt in          */
    /* respective bufs. This means that A corresponds to (i4_mv_x >> 2) and  */
    /* (i4_mv_y >> 2) in fxfy buf. Ditto with E, F and G                     */
    /* fxfy buf is buf id 0, hxfy is buf id 1, fxhy is buf id 2, hxhy is 3   */
    /* If we consider pt v to be derived. v has a fractional comp of 3, 3    */
    /* v is avg of H and I. So the table look up of v should give following  */
    /* buf 1 (H) : offset = (1, 0) buf id = 2.                               */
    /* buf 2 (I) : offset = (0, 1) buf id = 1.                               */
    /* NOTE: For pts that are fxfy/hxfy/fxhy/hxhy, bufid 1 will be -1.       */
    /*************************************************************************/
    i4_mv_x_frac = i4_mv_x & 3;
    i4_mv_y_frac = i4_mv_y & 3;

    /* Integer-pel displacement, common to both source planes */
    i4_offset = (i4_mv_x >> 2) + (i4_mv_y >> 2) * ps_prms->i4_ref_stride;

    /* Derive the descriptor that has all offset and size info */
    ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];

    if(ps_inp_cfg->i1_buf_id1 == ps_inp_cfg->i1_buf_id2)
    {
        /* This is case for fxfy/hxfy/fxhy/hxhy: the pt already exists in a */
        /* single source plane, so just point into it — no averaging needed */
        ps_prms->pu1_final_out = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
        ps_prms->pu1_final_out += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
        ps_prms->pu1_final_out += (ps_inp_cfg->i1_buf_yoff1 * ps_prms->i4_ref_stride);
        ps_prms->i4_final_out_stride = ps_prms->i4_ref_stride;

        return;
    }

    /* True QPEL pt: set up the two source pointers described by the table */
    pu1_src1 = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
    pu1_src1 += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
    pu1_src1 += (ps_inp_cfg->i1_buf_yoff1 * ps_prms->i4_ref_stride);

    pu1_src2 = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id2];
    pu1_src2 += ps_inp_cfg->i1_buf_xoff2 + i4_offset;
    pu1_src2 += (ps_inp_cfg->i1_buf_yoff2 * ps_prms->i4_ref_stride);

    /* Average the two planes into the caller-selected interp output buffer */
    pu1_dst = ps_prms->apu1_interp_out[i4_buf_id];
    hevc_avg_2d(
        pu1_src1,
        pu1_src2,
        ps_prms->i4_ref_stride,
        ps_prms->i4_ref_stride,
        ps_prms->i4_blk_wd,
        ps_prms->i4_blk_ht,
        pu1_dst,
        ps_prms->i4_out_stride);
    ps_prms->pu1_final_out = pu1_dst;
    ps_prms->i4_final_out_stride = ps_prms->i4_out_stride;
}

/* Computes the two vertically neighbouring QPEL pts — below (grid idx 3,    */
/* mv_y + 1) and above (grid idx 1, mv_y - 1) — via two independent 1-pt     */
/* interpolation calls, i.e. without reusing intermediate results            */
static __inline void hme_qpel_interp_avg_2pt_vert_no_reuse(
    interp_prms_t *ps_prms,
    S32 i4_mv_x,
    S32 i4_mv_y,
    U08 **ppu1_final,
    S32 *pi4_final_stride,
    FT_QPEL_INTERP_AVG_1PT *pf_qpel_interp_avg_1pt)
{
    pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x, i4_mv_y + 1, 3, ppu1_final, pi4_final_stride);

    pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x, i4_mv_y - 1, 1, ppu1_final, pi4_final_stride);
}

/* Computes the two horizontally neighbouring QPEL pts — right (grid idx 2,  */
/* mv_x + 1) and left (grid idx 0, mv_x - 1) — via two independent 1-pt      */
/* interpolation calls, i.e. without reusing intermediate results            */
static __inline void hme_qpel_interp_avg_2pt_horz_no_reuse(
    interp_prms_t *ps_prms,
    S32 i4_mv_x,
    S32 i4_mv_y,
    U08 **ppu1_final,
    S32 *pi4_final_stride,
    FT_QPEL_INTERP_AVG_1PT *pf_qpel_interp_avg_1pt)
{
    pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x + 1, i4_mv_y, 2, ppu1_final, pi4_final_stride);

    pf_qpel_interp_avg_1pt(ps_prms, i4_mv_x - 1, i4_mv_y, 0, ppu1_final, pi4_final_stride);
}

/********************************************************************************
*  @fn     hme_qpel_interp_comprehensive
*
*  @brief  Computes the top/bottom and left/right QPEL neighbours of the MV
*          requested by i4_grid_mask, by hpel averaging where required; each
*          computed pt's buffer and stride are stored at a fixed grid index
*          (0 = left, 1 = top, 2 = right, 3 = bottom)
*
*  @param[in,out] ps_prms: Both input buffer ptrs and location of output
*
*  @param[in] i4_mv_x : x component of motion vector in QPEL units
*
*  @param[in] i4_mv_y : y component of motion vector in QPEL units
*
*  @param[in] i4_grid_mask : mask which determines qpels to be computed
*
*  @param[out] ppu1_final : storage for final buffer pointers
*
*  @param[out] pi4_final_stride : storage for final buffer strides
*
*  @param[in] ps_me_optimised_function_list : arch-optimised interpolation
*             function pointers used for the averaging variants
*
*  @return None
********************************************************************************
*/
static __inline void hme_qpel_interp_comprehensive(
    interp_prms_t *ps_prms,
    U08 **ppu1_final,
    S32 *pi4_final_stride,
    S32 i4_mv_x,
    S32 i4_mv_y,
    S32 i4_grid_mask,
    ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
{
    S32 pt_select_for_TB, pt_select_for_LR;
    S32 dx, dy, dydx;
    S32 vert_func_selector, horz_func_selector;

    S32 i4_ref_stride = ps_prms->i4_ref_stride;

    /* 2-bit selectors extracted from the grid mask: bit 0 = bottom/right   */
    /* pt requested, bit 1 = top/left pt requested                           */
    pt_select_for_TB =
        ((i4_grid_mask & (1 << PT_B)) >> PT_B) + ((i4_grid_mask & (1 << PT_T)) >> (PT_T - 1));

    pt_select_for_LR =
        ((i4_grid_mask & (1 << PT_R)) >> PT_R) + ((i4_grid_mask & (1 << PT_L)) >> (PT_L - 1));

    /* QPEL fractional parts of the MV, combined into a 4-bit table index */
    dx = (i4_mv_x & 3);
    dy = (i4_mv_y & 3);
    dydx = (dx + (dy << 2));

    vert_func_selector = gai4_select_qpel_function_vert[pt_select_for_TB][dydx];
    horz_func_selector = gai4_select_qpel_function_horz[pt_select_for_LR][dydx];

    /* case descriptions */
    /* Let T = (gridmask & T) & B = (gridmask & B) */
    /* & hp = pt is an hpel or an fpel */
    /* & r = reuse possible */
    /* 0 => T || B = 0 */
    /* 1 => (!T) && (B) && hp */
    /* 2 => (T) && (!B) && hp */
    /* 3 => (!T) && (B) && !hp */
    /* 4 => (T) && (!B) && !hp */
    /* 5 => (T) && (B) && !hp && r */
    /* 6 => (T) && (B) && !hp && !r */
    /* 7 => (T) && (B) && hp */

    switch(vert_func_selector)
    {
        case 0:
        {
            /* neither top nor bottom pt requested */
            break;
        }
        case 1:
        {
            /* bottom pt only, and it lies on an fpel/hpel plane: direct ptr */
            S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
            qpel_input_buf_cfg_t *ps_inp_cfg;
            S32 i4_mvyp1 = (i4_mv_y + 1);

            i4_mv_x_frac = dx;
            i4_mv_y_frac = i4_mvyp1 & 3;

            i4_offset = (i4_mv_x >> 2) + (i4_mvyp1 >> 2) * i4_ref_stride;

            /* Derive the descriptor that has all offset and size info */
            ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];

            ppu1_final[3] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
            ppu1_final[3] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
            ppu1_final[3] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
            pi4_final_stride[3] = i4_ref_stride;

            break;
        }
        case 2:
        {
            /* top pt only, and it lies on an fpel/hpel plane: direct ptr */
            S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
            qpel_input_buf_cfg_t *ps_inp_cfg;
            S32 i4_mvym1 = (i4_mv_y - 1);

            i4_mv_x_frac = dx;
            i4_mv_y_frac = i4_mvym1 & 3;

            i4_offset = (i4_mv_x >> 2) + (i4_mvym1 >> 2) * i4_ref_stride;

            /* Derive the descriptor that has all offset and size info */
            ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];

            ppu1_final[1] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
            ppu1_final[1] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
            ppu1_final[1] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
            pi4_final_stride[1] = i4_ref_stride;

            break;
        }
        case 3:
        {
            /* bottom pt only, true qpel: single averaging interpolation */
            ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
                ps_prms, i4_mv_x, i4_mv_y + 1, 3, ppu1_final, pi4_final_stride);

            break;
        }
        case 4:
        {
            /* top pt only, true qpel: single averaging interpolation */
            ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
                ps_prms, i4_mv_x, i4_mv_y - 1, 1, ppu1_final, pi4_final_stride);

            break;
        }
        case 5:
        {
            /* both pts, true qpel, sources shared: reuse-optimised variant */
            ps_me_optimised_function_list->pf_qpel_interp_avg_2pt_vert_with_reuse(
                ps_prms, i4_mv_x, i4_mv_y, ppu1_final, pi4_final_stride);
            break;
        }
        case 6:
        {
            /* both pts, true qpel, no reuse possible: two 1-pt calls */
            hme_qpel_interp_avg_2pt_vert_no_reuse(
                ps_prms,
                i4_mv_x,
                i4_mv_y,
                ppu1_final,
                pi4_final_stride,
                ps_me_optimised_function_list->pf_qpel_interp_avg_1pt);
            break;
        }
        case 7:
        {
            /* both pts lie on fpel/hpel planes: two direct ptr set-ups */
            S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
            qpel_input_buf_cfg_t *ps_inp_cfg;

            S32 i4_mvyp1 = (i4_mv_y + 1);
            S32 i4_mvym1 = (i4_mv_y - 1);

            i4_mv_x_frac = dx;
            i4_mv_y_frac = i4_mvyp1 & 3;

            i4_offset = (i4_mv_x >> 2) + (i4_mvyp1 >> 2) * i4_ref_stride;

            /* Derive the descriptor that has all offset and size info */
            ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];

            ppu1_final[3] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
            ppu1_final[3] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
            ppu1_final[3] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
            pi4_final_stride[3] = i4_ref_stride;

            i4_mv_y_frac = i4_mvym1 & 3;

            i4_offset = (i4_mv_x >> 2) + (i4_mvym1 >> 2) * i4_ref_stride;

            /* Derive the descriptor that has all offset and size info */
            ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];

            ppu1_final[1] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
            ppu1_final[1] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
            ppu1_final[1] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
            pi4_final_stride[1] = i4_ref_stride;

            break;
        }
    }

    /* case descriptions */
    /* Let L = (gridmask & L) & R = (gridmask & R) */
    /* & hp = pt is an hpel or an fpel */
    /* & r = reuse possible */
    /* 0 => L || R = 0 */
    /* 1 => (!L) && (R) && hp */
    /* 2 => (L) && (!R) && hp */
    /* 3 => (!L) && (R) && !hp */
    /* 4 => (L) && (!R) && !hp */
    /* 5 => (L) && (R) && !hp && r */
    /* 6 => (L) && (R) && !hp && !r */
    /* 7 => (L) && (R) && hp */

    switch(horz_func_selector)
    {
        case 0:
        {
            /* neither left nor right pt requested */
            break;
        }
        case 1:
        {
            /* right pt only, and it lies on an fpel/hpel plane: direct ptr */
            S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
            qpel_input_buf_cfg_t *ps_inp_cfg;
            S32 i4_mvxp1 = (i4_mv_x + 1);

            i4_mv_x_frac = i4_mvxp1 & 3;
            i4_mv_y_frac = dy;

            i4_offset = (i4_mvxp1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;

            /* Derive the descriptor that has all offset and size info */
            ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];

            ppu1_final[2] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
            ppu1_final[2] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
            ppu1_final[2] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
            pi4_final_stride[2] = i4_ref_stride;

            break;
        }
        case 2:
        {
            /* left pt only, and it lies on an fpel/hpel plane: direct ptr */
            S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
            qpel_input_buf_cfg_t *ps_inp_cfg;
            S32 i4_mvxm1 = (i4_mv_x - 1);

            i4_mv_x_frac = i4_mvxm1 & 3;
            i4_mv_y_frac = dy;

            i4_offset = (i4_mvxm1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;

            /* Derive the descriptor that has all offset and size info */
            ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];

            ppu1_final[0] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
            ppu1_final[0] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
            ppu1_final[0] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
            pi4_final_stride[0] = i4_ref_stride;

            break;
        }
        case 3:
        {
            /* right pt only, true qpel: single averaging interpolation */
            ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
                ps_prms, i4_mv_x + 1, i4_mv_y, 2, ppu1_final, pi4_final_stride);

            break;
        }
        case 4:
        {
            /* left pt only, true qpel: single averaging interpolation */
            ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
                ps_prms, i4_mv_x - 1, i4_mv_y, 0, ppu1_final, pi4_final_stride);

            break;
        }
        case 5:
        {
            /* both pts, true qpel, sources shared: reuse-optimised variant */
            ps_me_optimised_function_list->pf_qpel_interp_avg_2pt_horz_with_reuse(
                ps_prms, i4_mv_x, i4_mv_y, ppu1_final, pi4_final_stride);
            break;
        }
        case 6:
        {
            /* both pts, true qpel, no reuse possible: two 1-pt calls */
            hme_qpel_interp_avg_2pt_horz_no_reuse(
                ps_prms,
                i4_mv_x,
                i4_mv_y,
                ppu1_final,
                pi4_final_stride,
                ps_me_optimised_function_list->pf_qpel_interp_avg_1pt);
            break;
        }
        case 7:
        {
            /* both pts lie on fpel/hpel planes: two direct ptr set-ups */
            S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
            qpel_input_buf_cfg_t *ps_inp_cfg;

            S32 i4_mvxp1 = (i4_mv_x + 1);
            S32 i4_mvxm1 = (i4_mv_x - 1);

            i4_mv_x_frac = i4_mvxp1 & 3;
            i4_mv_y_frac = dy;

            i4_offset = (i4_mvxp1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;

            /* Derive the descriptor that has all offset and size info */
            ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];

            ppu1_final[2] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
            ppu1_final[2] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
            ppu1_final[2] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
            pi4_final_stride[2] = i4_ref_stride;

            i4_mv_x_frac = i4_mvxm1 & 3;

            i4_offset = (i4_mvxm1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;

            /* Derive the descriptor that has all offset and size info */
            ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];

            ppu1_final[0] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
            ppu1_final[0] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
            ppu1_final[0] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
            pi4_final_stride[0] = i4_ref_stride;

            break;
        }
    }
}

/**
********************************************************************************
*  @fn     S32 hme_compute_pred_and_evaluate_bi(hme_subpel_prms_t *ps_prms,
*                                   search_results_t *ps_search_results,
*                                   layer_ctxt_t *ps_curr_layer,
*                                   U08 **ppu1_pred)
*
*
*  @brief  Evaluates the best bipred cost as avg(P0, P1) where P0 and P1 are
*          best L0 and L1 bufs respectively for the entire CU
*
*  @param[in]
ps_prms: subpel prms input to this function 539 * 540 * @param[in] ps_curr_layer: points to the current layer ctxt 541 * 542 * @return The best BI cost of best uni cost, whichever better 543 ******************************************************************************** 544 */ 545 void hme_compute_pred_and_evaluate_bi( 546 inter_cu_results_t *ps_cu_results, 547 inter_pu_results_t *ps_pu_results, 548 inter_ctb_prms_t *ps_inter_ctb_prms, 549 part_type_results_t *ps_part_type_result, 550 ULWORD64 *pu8_winning_pred_sigmaXSquare, 551 ULWORD64 *pu8_winning_pred_sigmaX, 552 ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list, 553 ihevce_me_optimised_function_list_t *ps_me_optimised_function_list) 554 { 555 /* Idx0 - Uni winner */ 556 /* Idx1 - Uni runner-up */ 557 /* Idx2 - Bi winner */ 558 hme_pred_buf_info_t as_pred_buf_data[3][NUM_INTER_PU_PARTS]; 559 err_prms_t s_err_prms; 560 interp_prms_t s_interp_prms; 561 562 PF_SAD_FXN_T pf_err_compute; 563 564 S32 i, j; 565 S32 x_off, y_off, x_pic, y_pic; 566 S32 i4_sad_grid; 567 U08 e_cu_size; 568 S32 i4_part_type; 569 U08 u1_cu_size; 570 S32 shift; 571 S32 x_part, y_part, num_parts; 572 S32 inp_stride, ref_stride; 573 U08 au1_pred_buf_array_indixes[3]; 574 S32 cur_iter_best_cost; 575 S32 uni_cost, bi_cost, best_cost, tot_cost; 576 /* Idx0 - Uni winner */ 577 /* Idx1 - Bi winner */ 578 ULWORD64 au8_sigmaX[2][NUM_INTER_PU_PARTS]; 579 ULWORD64 au8_sigmaXSquared[2][NUM_INTER_PU_PARTS]; 580 #if USE_NOISE_TERM_DURING_BICAND_SEARCH 581 S32 i4_noise_term; 582 #endif 583 584 interp_prms_t *ps_interp_prms = &s_interp_prms; 585 586 S32 best_cand_in_opp_dir_idx = 0; 587 S32 is_best_cand_an_intra = 0; 588 U08 u1_is_cu_noisy = ps_inter_ctb_prms->u1_is_cu_noisy; 589 #if USE_NOISE_TERM_DURING_BICAND_SEARCH 590 const S32 i4_default_src_wt = ((1 << 15) + (WGHT_DEFAULT >> 1)) / WGHT_DEFAULT; 591 #endif 592 tot_cost = 0; 593 594 /* Start of the CU w.r.t. 
CTB */ 595 x_off = ps_cu_results->u1_x_off; 596 y_off = ps_cu_results->u1_y_off; 597 598 inp_stride = ps_inter_ctb_prms->i4_inp_stride; 599 ref_stride = ps_inter_ctb_prms->i4_rec_stride; 600 601 ps_interp_prms->i4_ref_stride = ref_stride; 602 603 /* Start of the CU w.r.t. Pic 0,0 */ 604 x_pic = x_off + ps_inter_ctb_prms->i4_ctb_x_off; 605 y_pic = y_off + ps_inter_ctb_prms->i4_ctb_y_off; 606 607 u1_cu_size = ps_cu_results->u1_cu_size; 608 e_cu_size = u1_cu_size; 609 shift = (S32)e_cu_size; 610 i4_part_type = ps_part_type_result->u1_part_type; 611 num_parts = gau1_num_parts_in_part_type[i4_part_type]; 612 613 for(i = 0; i < 3; i++) 614 { 615 hme_init_pred_buf_info( 616 &as_pred_buf_data[i], 617 &ps_inter_ctb_prms->s_pred_buf_mngr, 618 (ps_part_type_result->as_pu_results->pu.b4_wd + 1) << 2, 619 (ps_part_type_result->as_pu_results->pu.b4_ht + 1) << 2, 620 (PART_TYPE_T)i4_part_type); 621 622 au1_pred_buf_array_indixes[i] = as_pred_buf_data[i][0].u1_pred_buf_array_id; 623 } 624 625 for(j = 0; j < num_parts; j++) 626 { 627 UWORD8 *apu1_hpel_ref[2][4]; 628 PART_ID_T e_part_id; 629 BLK_SIZE_T e_blk_size; 630 WORD8 i1_ref_idx; 631 UWORD8 pred_dir; 632 WORD32 ref_offset, inp_offset, wd, ht; 633 pu_result_t *ps_pu_node1, *ps_pu_node2, *ps_pu_result; 634 mv_t *aps_mv[2]; 635 UWORD8 num_active_ref_opp; 636 UWORD8 num_results_per_part; 637 WORD32 luma_weight_ref1, luma_offset_ref1; 638 WORD32 luma_weight_ref2, luma_offset_ref2; 639 WORD32 pu_node2_found = 0; 640 641 e_part_id = ge_part_type_to_part_id[i4_part_type][j]; 642 e_blk_size = ge_part_id_to_blk_size[e_cu_size][e_part_id]; 643 644 x_part = gas_part_attr_in_cu[e_part_id].u1_x_start << shift; 645 y_part = gas_part_attr_in_cu[e_part_id].u1_y_start << shift; 646 647 ref_offset = (x_part + x_pic) + (y_pic + y_part) * ref_stride; 648 inp_offset = (x_part + y_part * inp_stride) + ps_cu_results->i4_inp_offset; 649 650 pred_dir = ps_part_type_result->as_pu_results[j].pu.b2_pred_mode; 651 652 ps_pu_node1 = 
&(ps_part_type_result->as_pu_results[j]); 653 654 if(PRED_L0 == pred_dir) 655 { 656 i1_ref_idx = ps_pu_node1->pu.mv.i1_l0_ref_idx; 657 aps_mv[0] = &(ps_pu_node1->pu.mv.s_l0_mv); 658 659 num_active_ref_opp = 660 ps_inter_ctb_prms->u1_num_active_ref_l1 * (ps_inter_ctb_prms->i4_bidir_enabled); 661 num_results_per_part = ps_pu_results->u1_num_results_per_part_l0[e_part_id]; 662 663 ps_pu_result = ps_pu_results->aps_pu_results[PRED_L0][e_part_id]; 664 665 ASSERT(i1_ref_idx >= 0); 666 667 apu1_hpel_ref[0][0] = 668 (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->s_yuv_buf_desc.pv_y_buf) + 669 ref_offset; 670 apu1_hpel_ref[0][1] = 671 ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[0] + 672 ref_offset; 673 apu1_hpel_ref[0][2] = 674 ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[1] + 675 ref_offset; 676 apu1_hpel_ref[0][3] = 677 ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[2] + 678 ref_offset; 679 680 luma_weight_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx] 681 ->s_weight_offset.i2_luma_weight; 682 luma_offset_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx] 683 ->s_weight_offset.i2_luma_offset; 684 } 685 else 686 { 687 i1_ref_idx = ps_pu_node1->pu.mv.i1_l1_ref_idx; 688 aps_mv[0] = &(ps_pu_node1->pu.mv.s_l1_mv); 689 690 ASSERT(i1_ref_idx >= 0); 691 692 num_active_ref_opp = 693 ps_inter_ctb_prms->u1_num_active_ref_l0 * (ps_inter_ctb_prms->i4_bidir_enabled); 694 num_results_per_part = ps_pu_results->u1_num_results_per_part_l1[e_part_id]; 695 696 ps_pu_result = ps_pu_results->aps_pu_results[PRED_L1][e_part_id]; 697 698 apu1_hpel_ref[0][0] = 699 (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->s_yuv_buf_desc.pv_y_buf) + 700 ref_offset; 701 apu1_hpel_ref[0][1] = 702 ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[0] + 703 ref_offset; 704 apu1_hpel_ref[0][2] = 705 ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[1] + 
706 ref_offset; 707 apu1_hpel_ref[0][3] = 708 ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[2] + 709 ref_offset; 710 711 luma_weight_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx] 712 ->s_weight_offset.i2_luma_weight; 713 luma_offset_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx] 714 ->s_weight_offset.i2_luma_offset; 715 } 716 717 if(aps_mv[0]->i2_mvx == INTRA_MV) 718 { 719 uni_cost = ps_pu_node1->i4_tot_cost; 720 cur_iter_best_cost = ps_pu_node1->i4_tot_cost; 721 best_cost = MIN(uni_cost, cur_iter_best_cost); 722 tot_cost += best_cost; 723 continue; 724 } 725 726 ps_interp_prms->i4_blk_wd = wd = gau1_blk_size_to_wd[e_blk_size]; 727 ps_interp_prms->i4_blk_ht = ht = gau1_blk_size_to_ht[e_blk_size]; 728 ps_interp_prms->i4_out_stride = MAX_CU_SIZE; 729 730 if(num_active_ref_opp) 731 { 732 if(PRED_L0 == pred_dir) 733 { 734 if(ps_pu_results->u1_num_results_per_part_l1[e_part_id]) 735 { 736 ps_pu_node2 = ps_pu_results->aps_pu_results[1][e_part_id]; 737 pu_node2_found = 1; 738 } 739 } 740 else 741 { 742 if(ps_pu_results->u1_num_results_per_part_l0[e_part_id]) 743 { 744 ps_pu_node2 = ps_pu_results->aps_pu_results[0][e_part_id]; 745 pu_node2_found = 1; 746 } 747 } 748 } 749 750 if(!pu_node2_found) 751 { 752 bi_cost = INT_MAX >> 1; 753 754 s_interp_prms.apu1_interp_out[0] = as_pred_buf_data[0][j].pu1_pred; 755 ps_interp_prms->ppu1_ref = &apu1_hpel_ref[0][0]; 756 757 ps_me_optimised_function_list->pf_qpel_interp_avg_generic( 758 ps_interp_prms, aps_mv[0]->i2_mvx, aps_mv[0]->i2_mvy, 0); 759 760 if(ps_interp_prms->pu1_final_out != s_interp_prms.apu1_interp_out[0]) 761 { 762 as_pred_buf_data[0][j].u1_pred_buf_array_id = UCHAR_MAX; 763 as_pred_buf_data[0][j].pu1_pred = ps_interp_prms->pu1_final_out; 764 as_pred_buf_data[0][j].i4_pred_stride = ps_interp_prms->i4_final_out_stride; 765 } 766 767 if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier) 768 { 769 hme_compute_sigmaX_and_sigmaXSquared( 770 
as_pred_buf_data[0][j].pu1_pred, 771 as_pred_buf_data[0][j].i4_pred_stride, 772 &au8_sigmaX[0][j], 773 &au8_sigmaXSquared[0][j], 774 ps_interp_prms->i4_blk_wd, 775 ps_interp_prms->i4_blk_ht, 776 ps_interp_prms->i4_blk_wd, 777 ps_interp_prms->i4_blk_ht, 778 0, 779 1); 780 } 781 } 782 else 783 { 784 i = 0; 785 bi_cost = MAX_32BIT_VAL; 786 is_best_cand_an_intra = 0; 787 best_cand_in_opp_dir_idx = 0; 788 789 pred_dir = ps_pu_node2[i].pu.b2_pred_mode; 790 791 if(PRED_L0 == pred_dir) 792 { 793 i1_ref_idx = ps_pu_node2[i].pu.mv.i1_l0_ref_idx; 794 aps_mv[1] = &(ps_pu_node2[i].pu.mv.s_l0_mv); 795 796 ASSERT(i1_ref_idx >= 0); 797 798 apu1_hpel_ref[1][0] = 799 (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx] 800 ->s_yuv_buf_desc.pv_y_buf) + 801 ref_offset; //>ppu1_list_rec_fxfy[0][i1_ref_idx] + ref_offset; 802 apu1_hpel_ref[1][1] = 803 ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[0] + 804 ref_offset; 805 apu1_hpel_ref[1][2] = 806 ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[1] + 807 ref_offset; 808 apu1_hpel_ref[1][3] = 809 ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[2] + 810 ref_offset; 811 812 luma_weight_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx] 813 ->s_weight_offset.i2_luma_weight; 814 luma_offset_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx] 815 ->s_weight_offset.i2_luma_offset; 816 } 817 else 818 { 819 i1_ref_idx = ps_pu_node2[i].pu.mv.i1_l1_ref_idx; 820 aps_mv[1] = &(ps_pu_node2[i].pu.mv.s_l1_mv); 821 822 ASSERT(i1_ref_idx >= 0); 823 824 apu1_hpel_ref[1][0] = 825 (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx] 826 ->s_yuv_buf_desc.pv_y_buf) + 827 ref_offset; //>ppu1_list_rec_fxfy[0][i1_ref_idx] + ref_offset; 828 apu1_hpel_ref[1][1] = 829 ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[0] + 830 ref_offset; 831 apu1_hpel_ref[1][2] = 832 ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[1] + 833 
ref_offset; 834 apu1_hpel_ref[1][3] = 835 ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[2] + 836 ref_offset; 837 838 luma_weight_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx] 839 ->s_weight_offset.i2_luma_weight; 840 luma_offset_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx] 841 ->s_weight_offset.i2_luma_offset; 842 } 843 844 if(aps_mv[1]->i2_mvx == INTRA_MV) 845 { 846 uni_cost = ps_pu_node1->i4_tot_cost; 847 cur_iter_best_cost = ps_pu_node2[i].i4_tot_cost; 848 849 if(cur_iter_best_cost < bi_cost) 850 { 851 bi_cost = cur_iter_best_cost; 852 best_cand_in_opp_dir_idx = i; 853 is_best_cand_an_intra = 1; 854 } 855 856 best_cost = MIN(uni_cost, bi_cost); 857 tot_cost += best_cost; 858 continue; 859 } 860 861 s_interp_prms.apu1_interp_out[0] = as_pred_buf_data[0][j].pu1_pred; 862 ps_interp_prms->ppu1_ref = &apu1_hpel_ref[0][0]; 863 864 ps_me_optimised_function_list->pf_qpel_interp_avg_generic( 865 ps_interp_prms, aps_mv[0]->i2_mvx, aps_mv[0]->i2_mvy, 0); 866 867 if(ps_interp_prms->pu1_final_out != s_interp_prms.apu1_interp_out[0]) 868 { 869 as_pred_buf_data[0][j].u1_pred_buf_array_id = UCHAR_MAX; 870 as_pred_buf_data[0][j].pu1_pred = ps_interp_prms->pu1_final_out; 871 as_pred_buf_data[0][j].i4_pred_stride = ps_interp_prms->i4_final_out_stride; 872 } 873 874 if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier) 875 { 876 hme_compute_sigmaX_and_sigmaXSquared( 877 as_pred_buf_data[0][j].pu1_pred, 878 as_pred_buf_data[0][j].i4_pred_stride, 879 &au8_sigmaX[0][j], 880 &au8_sigmaXSquared[0][j], 881 ps_interp_prms->i4_blk_wd, 882 ps_interp_prms->i4_blk_ht, 883 ps_interp_prms->i4_blk_wd, 884 ps_interp_prms->i4_blk_ht, 885 0, 886 1); 887 } 888 889 s_interp_prms.apu1_interp_out[0] = as_pred_buf_data[1][j].pu1_pred; 890 ps_interp_prms->ppu1_ref = &apu1_hpel_ref[1][0]; 891 892 ps_me_optimised_function_list->pf_qpel_interp_avg_generic( 893 ps_interp_prms, aps_mv[1]->i2_mvx, aps_mv[1]->i2_mvy, 0); 894 895 
if(ps_interp_prms->pu1_final_out != s_interp_prms.apu1_interp_out[0]) 896 { 897 as_pred_buf_data[1][j].u1_pred_buf_array_id = UCHAR_MAX; 898 as_pred_buf_data[1][j].pu1_pred = ps_interp_prms->pu1_final_out; 899 as_pred_buf_data[1][j].i4_pred_stride = ps_interp_prms->i4_final_out_stride; 900 } 901 902 ps_cmn_utils_optimised_function_list->pf_wt_avg_2d( 903 as_pred_buf_data[0][j].pu1_pred, 904 as_pred_buf_data[1][j].pu1_pred, 905 as_pred_buf_data[0][j].i4_pred_stride, 906 as_pred_buf_data[1][j].i4_pred_stride, 907 wd, 908 ht, 909 as_pred_buf_data[2][j].pu1_pred, 910 as_pred_buf_data[2][j].i4_pred_stride, 911 luma_weight_ref1, 912 luma_weight_ref2, 913 luma_offset_ref1, 914 luma_offset_ref2, 915 ps_inter_ctb_prms->wpred_log_wdc); 916 917 if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier) 918 { 919 hme_compute_sigmaX_and_sigmaXSquared( 920 as_pred_buf_data[2][j].pu1_pred, 921 as_pred_buf_data[2][j].i4_pred_stride, 922 &au8_sigmaX[1][j], 923 &au8_sigmaXSquared[1][j], 924 ps_interp_prms->i4_blk_wd, 925 ps_interp_prms->i4_blk_ht, 926 ps_interp_prms->i4_blk_wd, 927 ps_interp_prms->i4_blk_ht, 928 0, 929 1); 930 } 931 932 s_err_prms.pu1_inp = (U08 *)ps_inter_ctb_prms->pu1_non_wt_inp + inp_offset; 933 s_err_prms.i4_inp_stride = inp_stride; 934 s_err_prms.i4_ref_stride = as_pred_buf_data[2][j].i4_pred_stride; 935 s_err_prms.i4_part_mask = (ENABLE_2Nx2N); 936 s_err_prms.i4_grid_mask = 1; 937 s_err_prms.pi4_sad_grid = &i4_sad_grid; 938 s_err_prms.i4_blk_wd = wd; 939 s_err_prms.i4_blk_ht = ht; 940 s_err_prms.pu1_ref = as_pred_buf_data[2][j].pu1_pred; 941 s_err_prms.ps_cmn_utils_optimised_function_list = ps_cmn_utils_optimised_function_list; 942 943 if(ps_inter_ctb_prms->u1_use_satd) 944 { 945 pf_err_compute = compute_satd_8bit; 946 } 947 else 948 { 949 pf_err_compute = ps_me_optimised_function_list->pf_evalsad_pt_npu_mxn_8bit; 950 } 951 952 pf_err_compute(&s_err_prms); 953 954 #if USE_NOISE_TERM_DURING_BICAND_SEARCH 955 if(u1_is_cu_noisy && 
ps_inter_ctb_prms->i4_alpha_stim_multiplier) 956 { 957 unsigned long u4_shift_val; 958 ULWORD64 u8_src_variance, u8_pred_variance, u8_pred_sigmaSquareX; 959 ULWORD64 u8_temp_var, u8_temp_var1; 960 S32 i4_bits_req; 961 962 S32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT; 963 964 u8_pred_sigmaSquareX = (au8_sigmaX[1][j] * au8_sigmaX[1][j]); 965 u8_pred_variance = au8_sigmaXSquared[1][j] - u8_pred_sigmaSquareX; 966 967 if(e_cu_size == CU_8x8) 968 { 969 PART_ID_T e_part_id = 970 (PART_ID_T)((PART_ID_NxN_TL) + (x_off & 1) + ((y_off & 1) << 1)); 971 972 u4_shift_val = ihevce_calc_stim_injected_variance( 973 ps_inter_ctb_prms->pu8_part_src_sigmaX, 974 ps_inter_ctb_prms->pu8_part_src_sigmaXSquared, 975 &u8_src_variance, 976 i4_default_src_wt, 977 0, 978 ps_inter_ctb_prms->wpred_log_wdc, 979 e_part_id); 980 } 981 else 982 { 983 u4_shift_val = ihevce_calc_stim_injected_variance( 984 ps_inter_ctb_prms->pu8_part_src_sigmaX, 985 ps_inter_ctb_prms->pu8_part_src_sigmaXSquared, 986 &u8_src_variance, 987 i4_default_src_wt, 988 0, 989 ps_inter_ctb_prms->wpred_log_wdc, 990 e_part_id); 991 } 992 993 u8_pred_variance = u8_pred_variance >> u4_shift_val; 994 995 GETRANGE64(i4_bits_req, u8_pred_variance); 996 997 if(i4_bits_req > 27) 998 { 999 u8_pred_variance = u8_pred_variance >> (i4_bits_req - 27); 1000 u8_src_variance = u8_src_variance >> (i4_bits_req - 27); 1001 } 1002 1003 if(u8_src_variance == u8_pred_variance) 1004 { 1005 u8_temp_var = (1 << STIM_Q_FORMAT); 1006 } 1007 else 1008 { 1009 u8_temp_var = (2 * u8_src_variance * u8_pred_variance); 1010 u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT)); 1011 u8_temp_var1 = 1012 (u8_src_variance * u8_src_variance) + (u8_pred_variance * u8_pred_variance); 1013 u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2)); 1014 u8_temp_var = (u8_temp_var / u8_temp_var1); 1015 } 1016 1017 i4_noise_term = (UWORD32)u8_temp_var; 1018 1019 i4_noise_term *= ps_inter_ctb_prms->i4_alpha_stim_multiplier; 1020 1021 ASSERT(i4_noise_term >= 0); 1022 1023 
u8_temp_var = i4_sad_grid; 1024 u8_temp_var *= ((1 << (i4_q_level)) - (i4_noise_term)); 1025 u8_temp_var += (1 << ((i4_q_level)-1)); 1026 i4_sad_grid = (UWORD32)(u8_temp_var >> (i4_q_level)); 1027 } 1028 #endif 1029 1030 cur_iter_best_cost = i4_sad_grid; 1031 cur_iter_best_cost += ps_pu_node1->i4_mv_cost; 1032 cur_iter_best_cost += ps_pu_node2[i].i4_mv_cost; 1033 1034 if(cur_iter_best_cost < bi_cost) 1035 { 1036 bi_cost = cur_iter_best_cost; 1037 best_cand_in_opp_dir_idx = i; 1038 is_best_cand_an_intra = 0; 1039 } 1040 } 1041 1042 uni_cost = ps_pu_node1->i4_tot_cost; 1043 1044 #if USE_NOISE_TERM_DURING_BICAND_SEARCH 1045 if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier) 1046 { 1047 unsigned long u4_shift_val; 1048 ULWORD64 u8_src_variance, u8_pred_variance, u8_pred_sigmaSquareX; 1049 ULWORD64 u8_temp_var, u8_temp_var1; 1050 S32 i4_bits_req; 1051 1052 S32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT; 1053 1054 S08 i1_ref_idx = 1055 (PRED_L0 == ps_pu_node1->pu.b2_pred_mode) 1056 ? 
ps_inter_ctb_prms->pi1_past_list[ps_pu_node1->pu.mv.i1_l0_ref_idx] 1057 : ps_inter_ctb_prms->pi1_future_list[ps_pu_node1->pu.mv.i1_l1_ref_idx]; 1058 S32 i4_sad = ps_pu_node1->i4_tot_cost - ps_pu_node1->i4_mv_cost; 1059 1060 u8_pred_sigmaSquareX = (au8_sigmaX[0][j] * au8_sigmaX[0][j]); 1061 u8_pred_variance = au8_sigmaXSquared[0][j] - u8_pred_sigmaSquareX; 1062 1063 if(e_cu_size == CU_8x8) 1064 { 1065 PART_ID_T e_part_id = 1066 (PART_ID_T)((PART_ID_NxN_TL) + (x_off & 1) + ((y_off & 1) << 1)); 1067 1068 u4_shift_val = ihevce_calc_stim_injected_variance( 1069 ps_inter_ctb_prms->pu8_part_src_sigmaX, 1070 ps_inter_ctb_prms->pu8_part_src_sigmaXSquared, 1071 &u8_src_variance, 1072 ps_inter_ctb_prms->pi4_inv_wt[i1_ref_idx], 1073 ps_inter_ctb_prms->pi4_inv_wt_shift_val[i1_ref_idx], 1074 ps_inter_ctb_prms->wpred_log_wdc, 1075 e_part_id); 1076 } 1077 else 1078 { 1079 u4_shift_val = ihevce_calc_stim_injected_variance( 1080 ps_inter_ctb_prms->pu8_part_src_sigmaX, 1081 ps_inter_ctb_prms->pu8_part_src_sigmaXSquared, 1082 &u8_src_variance, 1083 ps_inter_ctb_prms->pi4_inv_wt[i1_ref_idx], 1084 ps_inter_ctb_prms->pi4_inv_wt_shift_val[i1_ref_idx], 1085 ps_inter_ctb_prms->wpred_log_wdc, 1086 e_part_id); 1087 } 1088 1089 u8_pred_variance = u8_pred_variance >> (u4_shift_val); 1090 1091 GETRANGE64(i4_bits_req, u8_pred_variance); 1092 1093 if(i4_bits_req > 27) 1094 { 1095 u8_pred_variance = u8_pred_variance >> (i4_bits_req - 27); 1096 u8_src_variance = u8_src_variance >> (i4_bits_req - 27); 1097 } 1098 1099 if(u8_src_variance == u8_pred_variance) 1100 { 1101 u8_temp_var = (1 << STIM_Q_FORMAT); 1102 } 1103 else 1104 { 1105 u8_temp_var = (2 * u8_src_variance * u8_pred_variance); 1106 u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT)); 1107 u8_temp_var1 = 1108 (u8_src_variance * u8_src_variance) + (u8_pred_variance * u8_pred_variance); 1109 u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2)); 1110 u8_temp_var = (u8_temp_var / u8_temp_var1); 1111 } 1112 1113 i4_noise_term = (UWORD32)u8_temp_var; 
1114 1115 i4_noise_term *= ps_inter_ctb_prms->i4_alpha_stim_multiplier; 1116 1117 ASSERT(i4_noise_term >= 0); 1118 1119 u8_temp_var = i4_sad; 1120 u8_temp_var *= ((1 << (i4_q_level)) - (i4_noise_term)); 1121 u8_temp_var += (1 << ((i4_q_level)-1)); 1122 i4_sad = (UWORD32)(u8_temp_var >> (i4_q_level)); 1123 1124 uni_cost = i4_sad + ps_pu_node1->i4_mv_cost; 1125 1126 pu8_winning_pred_sigmaX[j] = au8_sigmaX[0][j]; 1127 pu8_winning_pred_sigmaXSquare[j] = au8_sigmaXSquared[0][j]; 1128 } 1129 #endif 1130 1131 if((bi_cost < uni_cost) && (!is_best_cand_an_intra)) 1132 { 1133 if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier) 1134 { 1135 pu8_winning_pred_sigmaX[j] = au8_sigmaX[1][j]; 1136 pu8_winning_pred_sigmaXSquare[j] = au8_sigmaXSquared[1][j]; 1137 } 1138 1139 if(PRED_L0 == ps_pu_node1->pu.b2_pred_mode) 1140 { 1141 ps_pu_node1->pu.b2_pred_mode = PRED_BI; 1142 1143 if(PRED_L0 == ps_pu_node2[best_cand_in_opp_dir_idx].pu.b2_pred_mode) 1144 { 1145 ps_pu_node1->pu.mv.i1_l1_ref_idx = 1146 ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l0_ref_idx; 1147 ps_pu_node1->pu.mv.s_l1_mv.i2_mvx = 1148 ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvx; 1149 ps_pu_node1->pu.mv.s_l1_mv.i2_mvy = 1150 ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvy; 1151 } 1152 else 1153 { 1154 ps_pu_node1->pu.mv.i1_l1_ref_idx = 1155 ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l1_ref_idx; 1156 ps_pu_node1->pu.mv.s_l1_mv.i2_mvx = 1157 ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvx; 1158 ps_pu_node1->pu.mv.s_l1_mv.i2_mvy = 1159 ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvy; 1160 } 1161 } 1162 else 1163 { 1164 ps_pu_node1->pu.b2_pred_mode = PRED_BI; 1165 1166 if(PRED_L0 == ps_pu_node2[best_cand_in_opp_dir_idx].pu.b2_pred_mode) 1167 { 1168 ps_pu_node1->pu.mv.i1_l0_ref_idx = 1169 ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l0_ref_idx; 1170 ps_pu_node1->pu.mv.s_l0_mv.i2_mvx = 1171 ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvx; 1172 
ps_pu_node1->pu.mv.s_l0_mv.i2_mvy = 1173 ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvy; 1174 } 1175 else 1176 { 1177 ps_pu_node1->pu.mv.i1_l0_ref_idx = 1178 ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l1_ref_idx; 1179 ps_pu_node1->pu.mv.s_l0_mv.i2_mvx = 1180 ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvx; 1181 ps_pu_node1->pu.mv.s_l0_mv.i2_mvy = 1182 ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvy; 1183 } 1184 } 1185 1186 ps_part_type_result->as_pu_results[j].i4_tot_cost = bi_cost; 1187 } 1188 1189 best_cost = MIN(uni_cost, bi_cost); 1190 tot_cost += best_cost; 1191 } 1192 1193 hme_debrief_bipred_eval( 1194 ps_part_type_result, 1195 as_pred_buf_data, 1196 &ps_inter_ctb_prms->s_pred_buf_mngr, 1197 au1_pred_buf_array_indixes, 1198 ps_cmn_utils_optimised_function_list); 1199 1200 ps_part_type_result->i4_tot_cost = tot_cost; 1201 } 1202 1203 WORD32 hme_evalsatd_pt_pu_8x8_tu_rec( 1204 err_prms_t *ps_prms, 1205 WORD32 lambda, 1206 WORD32 lambda_q_shift, 1207 WORD32 i4_frm_qstep, 1208 me_func_selector_t *ps_func_selector) 1209 { 1210 S32 ai4_satd_4x4[4]; /* num 4x4s in a 8x8 */ 1211 S32 i4_satd_8x8; 1212 S16 *pi2_had_out; 1213 S32 i4_tu_split_flag = 0; 1214 S32 i4_tu_early_cbf = 0; 1215 1216 S32 i4_early_cbf = 1; 1217 // S32 i4_i, i4_k; 1218 S32 i4_total_satd_cost = 0; 1219 S32 best_cost_tu_split; 1220 1221 /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */ 1222 S32 *api4_satd_pu[HAD_32x32 + 1]; 1223 S32 *api4_tu_split[HAD_32x32 + 1]; 1224 S32 *api4_tu_early_cbf[HAD_32x32 + 1]; 1225 1226 S32 *pi4_sad_grid = ps_prms->pi4_sad_grid; 1227 S32 *pi4_tu_split = ps_prms->pi4_tu_split_flags; 1228 S32 *pi4_early_cbf = ps_prms->pi4_tu_early_cbf; 1229 1230 U08 *pu1_inp = ps_prms->pu1_inp; 1231 U08 *pu1_ref = ps_prms->pu1_ref; 1232 1233 S32 inp_stride = ps_prms->i4_inp_stride; 1234 S32 ref_stride = ps_prms->i4_ref_stride; 1235 1236 /* Initialize tu_split_cost to "0" */ 1237 ps_prms->i4_tu_split_cost = 0; 1238 pi2_had_out 
= (S16 *)ps_prms->pu1_wkg_mem; 1239 1240 api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0]; 1241 api4_satd_pu[HAD_8x8] = &i4_satd_8x8; 1242 api4_satd_pu[HAD_16x16] = NULL; 1243 api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */ 1244 1245 api4_tu_split[HAD_4x4] = NULL; 1246 api4_tu_split[HAD_8x8] = &i4_tu_split_flag; 1247 api4_tu_split[HAD_16x16] = NULL; 1248 api4_tu_split[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */ 1249 1250 api4_tu_early_cbf[HAD_4x4] = NULL; 1251 api4_tu_early_cbf[HAD_8x8] = &i4_tu_early_cbf; 1252 api4_tu_early_cbf[HAD_16x16] = NULL; 1253 api4_tu_early_cbf[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */ 1254 1255 /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */ 1256 1257 /* Return value is merge of both best_stad_cost and tu_split_flags */ 1258 best_cost_tu_split = ps_func_selector->pf_had_8x8_using_4_4x4_r( 1259 pu1_inp, 1260 inp_stride, 1261 pu1_ref, 1262 ref_stride, 1263 pi2_had_out, 1264 8, 1265 api4_satd_pu, 1266 api4_tu_split, 1267 api4_tu_early_cbf, 1268 0, 1269 2, 1270 0, 1271 0, 1272 i4_frm_qstep, 1273 0, 1274 ps_prms->u1_max_tr_depth, 1275 ps_prms->u1_max_tr_size, 1276 &(ps_prms->i4_tu_split_cost), 1277 NULL); 1278 1279 /* For SATD computation following TU size are assumed for a 8x8 CU */ 1280 /* 8 for 2Nx2N, 4 for Nx2N,2NxN */ 1281 1282 i4_total_satd_cost = best_cost_tu_split >> 2; 1283 1284 /* Second last bit has the tu pslit flag */ 1285 i4_tu_split_flag = (best_cost_tu_split & 0x3) >> 1; 1286 1287 /* Last bit corrsponds to the Early CBF flag */ 1288 i4_early_cbf = (best_cost_tu_split & 0x1); 1289 1290 /* Update 8x8 SATDs */ 1291 pi4_sad_grid[PART_ID_2Nx2N] = i4_satd_8x8; 1292 pi4_tu_split[PART_ID_2Nx2N] = i4_tu_split_flag; 1293 pi4_early_cbf[PART_ID_2Nx2N] = i4_early_cbf; 1294 1295 return i4_total_satd_cost; 1296 } 1297 //#endif 1298 /** 1299 ******************************************************************************** 1300 * @fn S32 
hme_evalsatd_update_1_best_result_pt_pu_16x16 1301 * 1302 * @brief Evaluates the SATD with partial updates for all the best partitions 1303 * of a 16x16 CU based on recursive Hadamard 16x16, 8x8 and 4x4 satds 1304 * 1305 * @param[inout] ps_prms: error prms containg current and ref ptr, strides, 1306 * pointer to sad grid of each partitions 1307 * 1308 * @return None 1309 ******************************************************************************** 1310 */ 1311 1312 void hme_evalsatd_update_2_best_results_pt_pu_16x16( 1313 err_prms_t *ps_prms, result_upd_prms_t *ps_result_prms) 1314 { 1315 S32 ai4_satd_4x4[16]; /* num 4x4s in a 16x16 */ 1316 S32 ai4_satd_8x8[4]; /* num 8x8s in a 16x16 */ 1317 S32 i4_satd_16x16; /* 16x16 satd cost */ 1318 S32 i; 1319 S16 ai2_8x8_had[256]; 1320 S16 *pi2_y0; 1321 U08 *pu1_src, *pu1_pred; 1322 S32 pos_x_y_4x4_0, pos_x_y_4x4 = 0; 1323 S32 *ppi4_hsad; 1324 1325 /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */ 1326 S32 *api4_satd_pu[HAD_32x32 + 1]; 1327 S32 *pi4_sad_grid = ps_prms->pi4_sad_grid; 1328 1329 U08 *pu1_inp = ps_prms->pu1_inp; 1330 U08 *pu1_ref = ps_prms->pu1_ref; 1331 1332 S32 inp_stride = ps_prms->i4_inp_stride; 1333 S32 ref_stride = ps_prms->i4_ref_stride; 1334 1335 api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0]; 1336 api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0]; 1337 api4_satd_pu[HAD_16x16] = &i4_satd_16x16; 1338 api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */ 1339 1340 ppi4_hsad = api4_satd_pu[HAD_16x16]; 1341 1342 /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */ 1343 for(i = 0; i < 4; i++) 1344 { 1345 pu1_src = pu1_inp + (i & 0x01) * 8 + (i >> 1) * inp_stride * 8; 1346 pu1_pred = pu1_ref + (i & 0x01) * 8 + (i >> 1) * ref_stride * 8; 1347 pi2_y0 = ai2_8x8_had + (i & 0x01) * 8 + (i >> 1) * 16 * 8; 1348 pos_x_y_4x4_0 = pos_x_y_4x4 + (i & 0x01) * 2 + (i >> 1) * (2 << 16); 1349 1350 ihevce_had_8x8_using_4_4x4( 1351 pu1_src, inp_stride, pu1_pred, 
ref_stride, pi2_y0, 16, api4_satd_pu, pos_x_y_4x4_0, 4); 1352 } 1353 1354 /* For SATD computation following TU size are assumed for a 16x16 CU */ 1355 /* 16 for 2Nx2N, 8 for NxN/Nx2N,2NxN and mix of 4 and 8 for AMPs */ 1356 1357 /* Update 8x8 SATDs */ 1358 /* Modified to cost calculation using only 4x4 SATD */ 1359 1360 // ai4_satd_8x8[0] = ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5]; 1361 // ai4_satd_8x8[1] = ai4_satd_4x4[2] + ai4_satd_4x4[3] + ai4_satd_4x4[6] + ai4_satd_4x4[7]; 1362 // ai4_satd_8x8[2] = ai4_satd_4x4[8] + ai4_satd_4x4[9] + ai4_satd_4x4[12] + ai4_satd_4x4[13]; 1363 // ai4_satd_8x8[3] = ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15]; 1364 1365 /* Update 16x16 SATDs */ 1366 pi4_sad_grid[PART_ID_2Nx2N] = 1367 ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3]; 1368 1369 pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_8x8[0]; 1370 pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_8x8[1]; 1371 pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_8x8[2]; 1372 pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_8x8[3]; 1373 1374 /* Update 8x16 / 16x8 SATDs */ 1375 pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_8x8[0] + ai4_satd_8x8[2]; 1376 pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[3]; 1377 pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_8x8[0] + ai4_satd_8x8[1]; 1378 pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_8x8[2] + ai4_satd_8x8[3]; 1379 1380 /* Update AMP SATDs 16x12,16x4, 12x16,4x16 */ 1381 pi4_sad_grid[PART_ID_nLx2N_L] = 1382 ai4_satd_4x4[0] + ai4_satd_4x4[4] + ai4_satd_4x4[8] + ai4_satd_4x4[12]; 1383 1384 pi4_sad_grid[PART_ID_nLx2N_R] = ai4_satd_4x4[1] + ai4_satd_4x4[5] + ai4_satd_4x4[9] + 1385 ai4_satd_4x4[13] + pi4_sad_grid[PART_ID_Nx2N_R]; 1386 1387 pi4_sad_grid[PART_ID_nRx2N_L] = ai4_satd_4x4[2] + ai4_satd_4x4[6] + ai4_satd_4x4[10] + 1388 ai4_satd_4x4[14] + pi4_sad_grid[PART_ID_Nx2N_L]; 1389 1390 pi4_sad_grid[PART_ID_nRx2N_R] = 1391 ai4_satd_4x4[3] + ai4_satd_4x4[7] + ai4_satd_4x4[11] + ai4_satd_4x4[15]; 1392 1393 
pi4_sad_grid[PART_ID_2NxnU_T] = 1394 ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[2] + ai4_satd_4x4[3]; 1395 1396 pi4_sad_grid[PART_ID_2NxnU_B] = ai4_satd_4x4[4] + ai4_satd_4x4[5] + ai4_satd_4x4[6] + 1397 ai4_satd_4x4[7] + pi4_sad_grid[PART_ID_2NxN_B]; 1398 1399 pi4_sad_grid[PART_ID_2NxnD_T] = ai4_satd_4x4[8] + ai4_satd_4x4[9] + ai4_satd_4x4[10] + 1400 ai4_satd_4x4[11] + pi4_sad_grid[PART_ID_2NxN_T]; 1401 1402 pi4_sad_grid[PART_ID_2NxnD_B] = 1403 ai4_satd_4x4[12] + ai4_satd_4x4[13] + ai4_satd_4x4[14] + ai4_satd_4x4[15]; 1404 1405 /* Call the update results function */ 1406 { 1407 S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost; 1408 mv_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt; 1409 S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0]; 1410 S32 best_node_cost; 1411 S32 second_best_node_cost; 1412 1413 /*For each valid partition, update the refine_prm structure to reflect the best and second 1414 best candidates for that partition*/ 1415 1416 for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++) 1417 { 1418 S32 update_required = 0; 1419 S32 part_id = pi4_valid_part_ids[i4_count]; 1420 S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count; 1421 1422 /* Use a pre-computed cost instead of freshly evaluating subpel cost */ 1423 i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index]; 1424 1425 /*Calculate total cost*/ 1426 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff); 1427 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost); 1428 1429 /*****************************************************************/ 1430 /* We do not labor through the results if the total cost worse */ 1431 /* than the last of the results. 
*/ 1432 /*****************************************************************/ 1433 best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]); 1434 second_best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[1][index]); 1435 1436 if(i4_tot_cost < second_best_node_cost) 1437 { 1438 update_required = 2; 1439 1440 /*************************************************************/ 1441 /* Identify where the current result isto be placed.Basically*/ 1442 /* find the node which has cost just higher thannodeundertest*/ 1443 /*************************************************************/ 1444 if(i4_tot_cost < best_node_cost) 1445 { 1446 update_required = 1; 1447 } 1448 else if(i4_tot_cost == ps_subpel_refine_ctxt->i2_tot_cost[0][index]) 1449 { 1450 update_required = 0; 1451 } 1452 if(update_required == 2) 1453 { 1454 ps_subpel_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost; 1455 ps_subpel_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost; 1456 ps_subpel_refine_ctxt->i2_mv_x[1][index] = ps_result_prms->i2_mv_x; 1457 ps_subpel_refine_ctxt->i2_mv_y[1][index] = ps_result_prms->i2_mv_y; 1458 ps_subpel_refine_ctxt->i2_ref_idx[1][index] = ps_result_prms->i1_ref_idx; 1459 } 1460 else if(update_required == 1) 1461 { 1462 ps_subpel_refine_ctxt->i2_tot_cost[1][index] = 1463 ps_subpel_refine_ctxt->i2_tot_cost[0][index]; 1464 ps_subpel_refine_ctxt->i2_mv_cost[1][index] = 1465 ps_subpel_refine_ctxt->i2_mv_cost[0][index]; 1466 ps_subpel_refine_ctxt->i2_mv_x[1][index] = 1467 ps_subpel_refine_ctxt->i2_mv_x[0][index]; 1468 ps_subpel_refine_ctxt->i2_mv_y[1][index] = 1469 ps_subpel_refine_ctxt->i2_mv_y[0][index]; 1470 ps_subpel_refine_ctxt->i2_ref_idx[1][index] = 1471 ps_subpel_refine_ctxt->i2_ref_idx[0][index]; 1472 1473 ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost; 1474 ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost; 1475 ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x; 1476 ps_subpel_refine_ctxt->i2_mv_y[0][index] = 
ps_result_prms->i2_mv_y; 1477 ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx; 1478 } 1479 } 1480 } 1481 } 1482 } 1483 1484 //#if COMPUTE_16x16_R == C 1485 void hme_evalsatd_update_1_best_result_pt_pu_16x16( 1486 err_prms_t *ps_prms, result_upd_prms_t *ps_result_prms) 1487 { 1488 S32 ai4_satd_4x4[16]; /* num 4x4s in a 16x16 */ 1489 S32 ai4_satd_8x8[4]; /* num 8x8s in a 16x16 */ 1490 S32 i4_satd_16x16; /* 16x16 satd cost */ 1491 S32 i; 1492 S16 ai2_8x8_had[256]; 1493 S16 *pi2_y0; 1494 U08 *pu1_src, *pu1_pred; 1495 S32 pos_x_y_4x4_0, pos_x_y_4x4 = 0; 1496 S32 *ppi4_hsad; 1497 1498 /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */ 1499 S32 *api4_satd_pu[HAD_32x32 + 1]; 1500 S32 *pi4_sad_grid = ps_prms->pi4_sad_grid; 1501 1502 U08 *pu1_inp = ps_prms->pu1_inp; 1503 U08 *pu1_ref = ps_prms->pu1_ref; 1504 1505 S32 inp_stride = ps_prms->i4_inp_stride; 1506 S32 ref_stride = ps_prms->i4_ref_stride; 1507 1508 api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0]; 1509 api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0]; 1510 api4_satd_pu[HAD_16x16] = &i4_satd_16x16; 1511 api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */ 1512 1513 ppi4_hsad = api4_satd_pu[HAD_16x16]; 1514 1515 /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */ 1516 for(i = 0; i < 4; i++) 1517 { 1518 pu1_src = pu1_inp + (i & 0x01) * 8 + (i >> 1) * inp_stride * 8; 1519 pu1_pred = pu1_ref + (i & 0x01) * 8 + (i >> 1) * ref_stride * 8; 1520 pi2_y0 = ai2_8x8_had + (i & 0x01) * 8 + (i >> 1) * 16 * 8; 1521 pos_x_y_4x4_0 = pos_x_y_4x4 + (i & 0x01) * 2 + (i >> 1) * (2 << 16); 1522 1523 ihevce_had_8x8_using_4_4x4( 1524 pu1_src, inp_stride, pu1_pred, ref_stride, pi2_y0, 16, api4_satd_pu, pos_x_y_4x4_0, 4); 1525 } 1526 1527 /* For SATD computation following TU size are assumed for a 16x16 CU */ 1528 /* 16 for 2Nx2N, 8 for NxN/Nx2N,2NxN and mix of 4 and 8 for AMPs */ 1529 1530 /* Update 8x8 SATDs */ 1531 /* Modified to cost calculation using only 
4x4 SATD */ 1532 1533 // ai4_satd_8x8[0] = ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5]; 1534 // ai4_satd_8x8[1] = ai4_satd_4x4[2] + ai4_satd_4x4[3] + ai4_satd_4x4[6] + ai4_satd_4x4[7]; 1535 // ai4_satd_8x8[2] = ai4_satd_4x4[8] + ai4_satd_4x4[9] + ai4_satd_4x4[12] + ai4_satd_4x4[13]; 1536 // ai4_satd_8x8[3] = ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15]; 1537 1538 /* Update 16x16 SATDs */ 1539 pi4_sad_grid[PART_ID_2Nx2N] = 1540 ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3]; 1541 1542 pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_8x8[0]; 1543 pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_8x8[1]; 1544 pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_8x8[2]; 1545 pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_8x8[3]; 1546 1547 /* Update 8x16 / 16x8 SATDs */ 1548 pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_8x8[0] + ai4_satd_8x8[2]; 1549 pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[3]; 1550 pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_8x8[0] + ai4_satd_8x8[1]; 1551 pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_8x8[2] + ai4_satd_8x8[3]; 1552 1553 /* Update AMP SATDs 16x12,16x4, 12x16,4x16 */ 1554 pi4_sad_grid[PART_ID_nLx2N_L] = 1555 ai4_satd_4x4[0] + ai4_satd_4x4[2] + ai4_satd_4x4[8] + ai4_satd_4x4[10]; 1556 pi4_sad_grid[PART_ID_nRx2N_R] = 1557 ai4_satd_4x4[5] + ai4_satd_4x4[7] + ai4_satd_4x4[13] + ai4_satd_4x4[15]; 1558 pi4_sad_grid[PART_ID_2NxnU_T] = 1559 ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5]; 1560 pi4_sad_grid[PART_ID_2NxnD_B] = 1561 ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15]; 1562 1563 pi4_sad_grid[PART_ID_nLx2N_R] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L]; 1564 pi4_sad_grid[PART_ID_nRx2N_L] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R]; 1565 pi4_sad_grid[PART_ID_2NxnU_B] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T]; 1566 pi4_sad_grid[PART_ID_2NxnD_T] = pi4_sad_grid[PART_ID_2Nx2N] - 
pi4_sad_grid[PART_ID_2NxnD_B]; 1567 1568 /* Call the update results function */ 1569 { 1570 S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost; 1571 mv_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt; 1572 S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0]; 1573 S32 best_node_cost; 1574 S32 second_best_node_cost; 1575 1576 /*For each valid partition, update the refine_prm structure to reflect the best and second 1577 best candidates for that partition*/ 1578 1579 for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++) 1580 { 1581 S32 update_required = 0; 1582 S32 part_id = pi4_valid_part_ids[i4_count]; 1583 S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count; 1584 1585 /* Use a pre-computed cost instead of freshly evaluating subpel cost */ 1586 i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index]; 1587 1588 /*Calculate total cost*/ 1589 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff); 1590 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost); 1591 1592 /*****************************************************************/ 1593 /* We do not labor through the results if the total cost worse */ 1594 /* than the last of the results. 
*/ 1595 /*****************************************************************/ 1596 best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]); 1597 second_best_node_cost = SHRT_MAX; 1598 1599 if(i4_tot_cost < second_best_node_cost) 1600 { 1601 update_required = 0; 1602 1603 /*************************************************************/ 1604 /* Identify where the current result isto be placed.Basically*/ 1605 /* find the node which has cost just higher thannodeundertest*/ 1606 /*************************************************************/ 1607 if(i4_tot_cost < best_node_cost) 1608 { 1609 update_required = 1; 1610 } 1611 else if(i4_tot_cost == ps_subpel_refine_ctxt->i2_tot_cost[0][index]) 1612 { 1613 update_required = 0; 1614 } 1615 if(update_required == 2) 1616 { 1617 ps_subpel_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost; 1618 ps_subpel_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost; 1619 ps_subpel_refine_ctxt->i2_mv_x[1][index] = ps_result_prms->i2_mv_x; 1620 ps_subpel_refine_ctxt->i2_mv_y[1][index] = ps_result_prms->i2_mv_y; 1621 ps_subpel_refine_ctxt->i2_ref_idx[1][index] = ps_result_prms->i1_ref_idx; 1622 } 1623 else if(update_required == 1) 1624 { 1625 ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost; 1626 ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost; 1627 ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x; 1628 ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y; 1629 ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx; 1630 } 1631 } 1632 } 1633 } 1634 } 1635 1636 WORD32 hme_evalsatd_pt_pu_16x16_tu_rec( 1637 err_prms_t *ps_prms, 1638 WORD32 lambda, 1639 WORD32 lambda_q_shift, 1640 WORD32 i4_frm_qstep, 1641 me_func_selector_t *ps_func_selector) 1642 { 1643 S32 ai4_satd_4x4[16]; /* num 4x4s in a 16x16 */ 1644 S32 ai4_satd_8x8[4]; /* num 8x8s in a 16x16 */ 1645 S32 ai4_tu_split_8x8[16]; 1646 S32 i4_satd_16x16; /* 16x16 satd cost */ 1647 1648 S32 ai4_tu_early_cbf_8x8[16]; 
1649 1650 //S16 ai2_had_out[256]; 1651 S16 *pi2_had_out; 1652 S32 tu_split_flag = 0; 1653 S32 early_cbf_flag = 0; 1654 S32 total_satd_cost = 0; 1655 1656 /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */ 1657 S32 *api4_satd_pu[HAD_32x32 + 1]; 1658 S32 *api4_tu_split[HAD_32x32 + 1]; 1659 S32 *api4_tu_early_cbf[HAD_32x32 + 1]; 1660 1661 U08 *pu1_inp = ps_prms->pu1_inp; 1662 U08 *pu1_ref = ps_prms->pu1_ref; 1663 1664 S32 inp_stride = ps_prms->i4_inp_stride; 1665 S32 ref_stride = ps_prms->i4_ref_stride; 1666 1667 /* Initialize tu_split_cost to "0" */ 1668 ps_prms->i4_tu_split_cost = 0; 1669 1670 pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem; 1671 1672 api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0]; 1673 api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0]; 1674 api4_satd_pu[HAD_16x16] = &i4_satd_16x16; 1675 api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */ 1676 1677 api4_tu_split[HAD_4x4] = NULL; 1678 api4_tu_split[HAD_8x8] = &ai4_tu_split_8x8[0]; 1679 api4_tu_split[HAD_16x16] = &tu_split_flag; 1680 api4_tu_split[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */ 1681 1682 api4_tu_early_cbf[HAD_4x4] = NULL; 1683 api4_tu_early_cbf[HAD_8x8] = &ai4_tu_early_cbf_8x8[0]; 1684 api4_tu_early_cbf[HAD_16x16] = &early_cbf_flag; 1685 api4_tu_early_cbf[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */ 1686 1687 /* Call recursive 16x16 HAD module; updates satds for 4x4, 8x8 and 16x16 */ 1688 ps_func_selector->pf_had_16x16_r( 1689 pu1_inp, 1690 inp_stride, 1691 pu1_ref, 1692 ref_stride, 1693 pi2_had_out, 1694 16, 1695 api4_satd_pu, 1696 api4_tu_split, 1697 api4_tu_early_cbf, 1698 0, 1699 4, 1700 lambda, 1701 lambda_q_shift, 1702 i4_frm_qstep, 1703 0, 1704 ps_prms->u1_max_tr_depth, 1705 ps_prms->u1_max_tr_size, 1706 &(ps_prms->i4_tu_split_cost), 1707 NULL); 1708 1709 total_satd_cost = i4_satd_16x16; 1710 1711 ps_prms->pi4_tu_split_flags[0] = tu_split_flag; 1712 1713 ps_prms->pi4_tu_early_cbf[0] = early_cbf_flag; 1714 1715 
return total_satd_cost; 1716 } 1717 1718 /** 1719 ******************************************************************************** 1720 * @fn S32 hme_evalsatd_pt_pu_32x32 1721 * 1722 * @brief Evaluates the SATD with partial updates for all the best partitions 1723 * of a 32x32 CU based on recursive Hadamard 16x16, 8x8 and 4x4 satds 1724 * 1725 * @param[inout] ps_prms: error prms containg current and ref ptr, strides, 1726 * pointer to sad grid of each partitions 1727 * 1728 * @return None 1729 ******************************************************************************** 1730 */ 1731 void hme_evalsatd_pt_pu_32x32(err_prms_t *ps_prms) 1732 { 1733 //S32 ai4_satd_4x4[64]; /* num 4x4s in a 32x32 */ 1734 S32 ai4_satd_8x8[16]; /* num 8x8s in a 32x32 */ 1735 S32 ai4_satd_16x16[4]; /* num 16x16 in a 32x32 */ 1736 S32 i4_satd_32x32; 1737 // S16 ai2_had_out[32*32]; 1738 U08 *pu1_src; 1739 U08 *pu1_pred; 1740 S32 i; 1741 1742 /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */ 1743 S32 *api4_satd_pu[HAD_32x32 + 1]; 1744 S32 *pi4_sad_grid = ps_prms->pi4_sad_grid; 1745 1746 U08 *pu1_inp = ps_prms->pu1_inp; 1747 U08 *pu1_ref = ps_prms->pu1_ref; 1748 1749 S32 inp_stride = ps_prms->i4_inp_stride; 1750 S32 ref_stride = ps_prms->i4_ref_stride; 1751 1752 //api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0]; 1753 api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0]; 1754 api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[0]; 1755 api4_satd_pu[HAD_32x32] = &i4_satd_32x32; 1756 1757 /* 32x32 SATD is calculates as the sum of the 4 8x8's in the block */ 1758 for(i = 0; i < 16; i++) 1759 { 1760 pu1_src = pu1_inp + ((i & 0x3) << 3) + ((i >> 2) * inp_stride * 8); 1761 1762 pu1_pred = pu1_ref + ((i & 0x3) << 3) + ((i >> 2) * ref_stride * 8); 1763 1764 ai4_satd_8x8[i] = ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit( 1765 pu1_src, inp_stride, pu1_pred, ref_stride, NULL, 1); 1766 } 1767 1768 /* Modified to cost calculation using only 8x8 SATD for 32x32*/ 1769 ai4_satd_16x16[0] = 
ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[4] + ai4_satd_8x8[5]; 1770 ai4_satd_16x16[1] = ai4_satd_8x8[2] + ai4_satd_8x8[3] + ai4_satd_8x8[6] + ai4_satd_8x8[7]; 1771 ai4_satd_16x16[2] = ai4_satd_8x8[8] + ai4_satd_8x8[9] + ai4_satd_8x8[12] + ai4_satd_8x8[13]; 1772 ai4_satd_16x16[3] = ai4_satd_8x8[10] + ai4_satd_8x8[11] + ai4_satd_8x8[14] + ai4_satd_8x8[15]; 1773 1774 /* Update 32x32 SATD */ 1775 pi4_sad_grid[PART_ID_2Nx2N] = 1776 ai4_satd_16x16[0] + ai4_satd_16x16[1] + ai4_satd_16x16[2] + ai4_satd_16x16[3]; 1777 1778 /* Update 16x16 SATDs */ 1779 pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_16x16[0]; 1780 pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_16x16[1]; 1781 pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_16x16[2]; 1782 pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_16x16[3]; 1783 1784 /* Update 16x32 / 32x16 SATDs */ 1785 pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_16x16[0] + ai4_satd_16x16[2]; 1786 pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_16x16[1] + ai4_satd_16x16[3]; 1787 pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_16x16[0] + ai4_satd_16x16[1]; 1788 pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_16x16[2] + ai4_satd_16x16[3]; 1789 1790 /* Update AMP SATDs 32x24,32x8, 24x32,8x32 */ 1791 pi4_sad_grid[PART_ID_nLx2N_L] = 1792 ai4_satd_8x8[0] + ai4_satd_8x8[4] + ai4_satd_8x8[8] + ai4_satd_8x8[12]; 1793 1794 pi4_sad_grid[PART_ID_nLx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[5] + ai4_satd_8x8[9] + 1795 ai4_satd_8x8[13] + pi4_sad_grid[PART_ID_Nx2N_R]; 1796 1797 pi4_sad_grid[PART_ID_nRx2N_L] = ai4_satd_8x8[2] + ai4_satd_8x8[6] + ai4_satd_8x8[10] + 1798 ai4_satd_8x8[14] + pi4_sad_grid[PART_ID_Nx2N_L]; 1799 1800 pi4_sad_grid[PART_ID_nRx2N_R] = 1801 ai4_satd_8x8[3] + ai4_satd_8x8[7] + ai4_satd_8x8[11] + ai4_satd_8x8[15]; 1802 1803 pi4_sad_grid[PART_ID_2NxnU_T] = 1804 ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3]; 1805 1806 pi4_sad_grid[PART_ID_2NxnU_B] = ai4_satd_8x8[4] + ai4_satd_8x8[5] + ai4_satd_8x8[6] + 1807 ai4_satd_8x8[7] + pi4_sad_grid[PART_ID_2NxN_B]; 1808 1809 
pi4_sad_grid[PART_ID_2NxnD_T] = ai4_satd_8x8[8] + ai4_satd_8x8[9] + ai4_satd_8x8[10] + 1810 ai4_satd_8x8[11] + pi4_sad_grid[PART_ID_2NxN_T]; 1811 1812 pi4_sad_grid[PART_ID_2NxnD_B] = 1813 ai4_satd_8x8[12] + ai4_satd_8x8[13] + ai4_satd_8x8[14] + ai4_satd_8x8[15]; 1814 } 1815 1816 WORD32 hme_evalsatd_pt_pu_32x32_tu_rec( 1817 err_prms_t *ps_prms, 1818 WORD32 lambda, 1819 WORD32 lambda_q_shift, 1820 WORD32 i4_frm_qstep, 1821 me_func_selector_t *ps_func_selector) 1822 { 1823 S32 ai4_satd_4x4[64]; /* num 4x4s in a 32x32 */ 1824 S32 ai4_satd_8x8[16]; /* num 8x8s in a 32x32 */ 1825 S32 ai4_tu_split_8x8[16]; 1826 S32 ai4_satd_16x16[4]; /* num 16x16 in a 32x32 */ 1827 S32 ai4_tu_split_16x16[4]; 1828 S32 i4_satd_32x32; 1829 1830 S32 ai4_tu_early_cbf_8x8[16]; 1831 S32 ai4_tu_early_cbf_16x16[4]; 1832 S32 early_cbf_flag; 1833 1834 S16 *pi2_had_out; 1835 1836 /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */ 1837 S32 *api4_satd_pu[HAD_32x32 + 1]; 1838 S32 *api4_tu_split[HAD_32x32 + 1]; 1839 S32 *api4_tu_early_cbf[HAD_32x32 + 1]; 1840 1841 S32 *pi4_sad_grid = ps_prms->pi4_sad_grid; 1842 S32 *pi4_tu_split_flag = ps_prms->pi4_tu_split_flags; 1843 S32 *pi4_tu_early_cbf = ps_prms->pi4_tu_early_cbf; 1844 1845 S32 tu_split_flag = 0; 1846 S32 total_satd_cost = 0; 1847 1848 U08 *pu1_inp = ps_prms->pu1_inp; 1849 U08 *pu1_ref = ps_prms->pu1_ref; 1850 1851 S32 inp_stride = ps_prms->i4_inp_stride; 1852 S32 ref_stride = ps_prms->i4_ref_stride; 1853 1854 /* Initialize tu_split_cost to "0" */ 1855 ps_prms->i4_tu_split_cost = 0; 1856 1857 pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem; 1858 1859 api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0]; 1860 api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0]; 1861 api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[0]; 1862 api4_satd_pu[HAD_32x32] = &i4_satd_32x32; 1863 1864 api4_tu_split[HAD_4x4] = NULL; 1865 api4_tu_split[HAD_8x8] = &ai4_tu_split_8x8[0]; 1866 api4_tu_split[HAD_16x16] = &ai4_tu_split_16x16[0]; 1867 api4_tu_split[HAD_32x32] = &tu_split_flag; 
1868 1869 api4_tu_early_cbf[HAD_4x4] = NULL; 1870 api4_tu_early_cbf[HAD_8x8] = &ai4_tu_early_cbf_8x8[0]; 1871 api4_tu_early_cbf[HAD_16x16] = &ai4_tu_early_cbf_16x16[0]; 1872 api4_tu_early_cbf[HAD_32x32] = &early_cbf_flag; 1873 1874 /* Call recursive 32x32 HAD module; updates satds for 4x4, 8x8, 16x16 and 32x32 */ 1875 ihevce_had_32x32_r( 1876 pu1_inp, 1877 inp_stride, 1878 pu1_ref, 1879 ref_stride, 1880 pi2_had_out, 1881 32, 1882 api4_satd_pu, 1883 api4_tu_split, 1884 api4_tu_early_cbf, 1885 0, 1886 8, 1887 lambda, 1888 lambda_q_shift, 1889 i4_frm_qstep, 1890 0, 1891 ps_prms->u1_max_tr_depth, 1892 ps_prms->u1_max_tr_size, 1893 &(ps_prms->i4_tu_split_cost), 1894 ps_func_selector); 1895 1896 total_satd_cost = i4_satd_32x32; 1897 1898 /*The structure of the TU_SPLIT flag for the current 32x32 is as follows 1899 TL_16x16 - 5bits (4 for child and LSBit for 16x16 split) 1900 TR_16x16 - 5bits (4 for child and LSBit for 16x16 split) 1901 BL_16x16 - 5bits (4 for child and LSBit for 16x16 split) 1902 BR_16x16 - 5bits (4 for child and LSBit for 16x16 split) 1903 32x32_split - 1bit (LSBit) 1904 1905 TU_SPLIT : (TL_16x16)_(TR_16x16)_(BL_16x16)_(BR_16x16)_32x32_split (21bits)*/ 1906 1907 pi4_sad_grid[PART_ID_2Nx2N] = total_satd_cost; 1908 pi4_tu_split_flag[PART_ID_2Nx2N] = tu_split_flag; 1909 pi4_tu_early_cbf[PART_ID_2Nx2N] = early_cbf_flag; 1910 1911 return total_satd_cost; 1912 } 1913 1914 /** 1915 ******************************************************************************** 1916 * @fn S32 hme_evalsatd_pt_pu_64x64 1917 * 1918 * @brief Evaluates the SATD with partial updates for all the best partitions 1919 * of a 64x64 CU based on accumulated Hadamard 32x32 and 16x16 satds 1920 * 1921 * Note : 64x64 SATD does not do hadamard Transform using 32x32 hadamard 1922 * outputs but directly uses four 32x32 SATD and 16 16x16 SATDS as 1923 * TU size of 64 is not supported in HEVC 1924 * 1925 * @param[inout] ps_prms: error prms containg current and ref ptr, strides, 1926 * pointer to 
sad grid of each partitions 1927 * 1928 * @return None 1929 ******************************************************************************** 1930 */ 1931 1932 void hme_evalsatd_pt_pu_64x64(err_prms_t *ps_prms) 1933 { 1934 //S32 ai4_satd_4x4[4][64]; /* num 4x4s in a 32x32 * num 32x32 in 64x64 */ 1935 S32 ai4_satd_8x8[4][16]; /* num 8x8s in a 32x32 * num 32x32 in 64x64 */ 1936 S32 ai4_satd_16x16[4][4]; /* num 16x16 in a 32x32* num 32x32 in 64x64 */ 1937 S32 ai4_satd_32x32[4]; /* num 32x32 in 64x64 */ 1938 // S16 ai2_had_out[32*32]; 1939 S32 i, j; 1940 1941 // S32 ai4_tu_split_8x8[4][16]; 1942 // S32 ai4_tu_split_16x16[4][4]; 1943 // S32 ai4_tu_split_32x32[4]; 1944 1945 /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */ 1946 S32 *api4_satd_pu[HAD_32x32 + 1]; 1947 // S32 *api4_tu_split[HAD_32x32 + 1]; 1948 1949 S32 *pi4_sad_grid = ps_prms->pi4_sad_grid; 1950 1951 U08 *pu1_inp = ps_prms->pu1_inp; 1952 U08 *pu1_ref = ps_prms->pu1_ref; 1953 U08 *pu1_src; 1954 U08 *pu1_pred; 1955 1956 S32 inp_stride = ps_prms->i4_inp_stride; 1957 S32 ref_stride = ps_prms->i4_ref_stride; 1958 1959 for(i = 0; i < 4; i++) 1960 { 1961 S32 blkx = (i & 0x1); 1962 S32 blky = (i >> 1); 1963 U08 *pu1_pi0, *pu1_pi1; 1964 1965 //api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[i][0]; 1966 api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[i][0]; 1967 api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[i][0]; 1968 api4_satd_pu[HAD_32x32] = &ai4_satd_32x32[i]; 1969 1970 pu1_pi0 = pu1_inp + (blkx * 32) + (blky * 32 * inp_stride); 1971 pu1_pi1 = pu1_ref + (blkx * 32) + (blky * 32 * ref_stride); 1972 1973 /* 64x64 SATD is calculates as the sum of the 4 16x16's in the block */ 1974 for(j = 0; j < 16; j++) 1975 { 1976 pu1_src = pu1_pi0 + ((j & 0x3) << 3) + ((j >> 2) * inp_stride * 8); 1977 1978 pu1_pred = pu1_pi1 + ((j & 0x3) << 3) + ((j >> 2) * ref_stride * 8); 1979 1980 ai4_satd_8x8[i][j] = ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit( 1981 pu1_src, inp_stride, pu1_pred, ref_stride, NULL, 1); 
1982 } 1983 1984 /* Modified to cost calculation using only 8x8 SATD for 32x32*/ 1985 ai4_satd_16x16[i][0] = 1986 ai4_satd_8x8[i][0] + ai4_satd_8x8[i][1] + ai4_satd_8x8[i][4] + ai4_satd_8x8[i][5]; 1987 ai4_satd_16x16[i][1] = 1988 ai4_satd_8x8[i][2] + ai4_satd_8x8[i][3] + ai4_satd_8x8[i][6] + ai4_satd_8x8[i][7]; 1989 ai4_satd_16x16[i][2] = 1990 ai4_satd_8x8[i][8] + ai4_satd_8x8[i][9] + ai4_satd_8x8[i][12] + ai4_satd_8x8[i][13]; 1991 ai4_satd_16x16[i][3] = 1992 ai4_satd_8x8[i][10] + ai4_satd_8x8[i][11] + ai4_satd_8x8[i][14] + ai4_satd_8x8[i][15]; 1993 } 1994 1995 /* Modified to cost calculation using only 8x8 SATD for 32x32*/ 1996 1997 ai4_satd_32x32[0] = 1998 ai4_satd_16x16[0][0] + ai4_satd_16x16[0][1] + ai4_satd_16x16[0][2] + ai4_satd_16x16[0][3]; 1999 ai4_satd_32x32[1] = 2000 ai4_satd_16x16[1][0] + ai4_satd_16x16[1][1] + ai4_satd_16x16[1][2] + ai4_satd_16x16[1][3]; 2001 ai4_satd_32x32[2] = 2002 ai4_satd_16x16[2][0] + ai4_satd_16x16[2][1] + ai4_satd_16x16[2][2] + ai4_satd_16x16[2][3]; 2003 ai4_satd_32x32[3] = 2004 ai4_satd_16x16[3][0] + ai4_satd_16x16[3][1] + ai4_satd_16x16[3][2] + ai4_satd_16x16[3][3]; 2005 2006 /* Update 64x64 SATDs */ 2007 pi4_sad_grid[PART_ID_2Nx2N] = 2008 ai4_satd_32x32[0] + ai4_satd_32x32[1] + ai4_satd_32x32[2] + ai4_satd_32x32[3]; 2009 2010 /* Update 32x32 SATDs */ 2011 pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_32x32[0]; 2012 pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_32x32[1]; 2013 pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_32x32[2]; 2014 pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_32x32[3]; 2015 2016 /* Update 32x64 / 64x32 SATDs */ 2017 pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_32x32[0] + ai4_satd_32x32[2]; 2018 pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_32x32[1] + ai4_satd_32x32[3]; 2019 pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_32x32[0] + ai4_satd_32x32[1]; 2020 pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_32x32[2] + ai4_satd_32x32[3]; 2021 2022 /* Update AMP SATDs 64x48,64x16, 48x64,16x64 */ 2023 pi4_sad_grid[PART_ID_nLx2N_L] = 2024 ai4_satd_16x16[0][0] + 
ai4_satd_16x16[0][2] + ai4_satd_16x16[2][0] + ai4_satd_16x16[2][2]; 2025 2026 pi4_sad_grid[PART_ID_nLx2N_R] = ai4_satd_16x16[0][1] + ai4_satd_16x16[0][3] + 2027 ai4_satd_16x16[2][1] + ai4_satd_16x16[2][3] + 2028 pi4_sad_grid[PART_ID_Nx2N_R]; 2029 2030 pi4_sad_grid[PART_ID_nRx2N_L] = ai4_satd_16x16[1][0] + ai4_satd_16x16[1][2] + 2031 ai4_satd_16x16[3][0] + ai4_satd_16x16[3][2] + 2032 pi4_sad_grid[PART_ID_Nx2N_L]; 2033 2034 pi4_sad_grid[PART_ID_nRx2N_R] = 2035 ai4_satd_16x16[1][1] + ai4_satd_16x16[1][3] + ai4_satd_16x16[3][1] + ai4_satd_16x16[3][3]; 2036 2037 pi4_sad_grid[PART_ID_2NxnU_T] = 2038 ai4_satd_16x16[0][0] + ai4_satd_16x16[0][1] + ai4_satd_16x16[1][0] + ai4_satd_16x16[1][1]; 2039 2040 pi4_sad_grid[PART_ID_2NxnU_B] = ai4_satd_16x16[0][2] + ai4_satd_16x16[0][3] + 2041 ai4_satd_16x16[1][2] + ai4_satd_16x16[1][3] + 2042 pi4_sad_grid[PART_ID_2NxN_B]; 2043 2044 pi4_sad_grid[PART_ID_2NxnD_T] = ai4_satd_16x16[2][0] + ai4_satd_16x16[2][1] + 2045 ai4_satd_16x16[3][0] + ai4_satd_16x16[3][1] + 2046 pi4_sad_grid[PART_ID_2NxN_T]; 2047 2048 pi4_sad_grid[PART_ID_2NxnD_B] = 2049 ai4_satd_16x16[2][2] + ai4_satd_16x16[2][3] + ai4_satd_16x16[3][2] + ai4_satd_16x16[3][3]; 2050 } 2051 2052 WORD32 hme_evalsatd_pt_pu_64x64_tu_rec( 2053 err_prms_t *ps_prms, 2054 WORD32 lambda, 2055 WORD32 lambda_q_shift, 2056 WORD32 i4_frm_qstep, 2057 me_func_selector_t *ps_func_selector) 2058 { 2059 S32 ai4_satd_4x4[64]; /* num 4x4s in a 32x32 * num 32x32 in 64x64 */ 2060 S32 ai4_satd_8x8[16]; /* num 8x8s in a 32x32 * num 32x32 in 64x64 */ 2061 S32 ai4_satd_16x16[4]; /* num 16x16 in a 32x32* num 32x32 in 64x64 */ 2062 S32 ai4_satd_32x32[4]; /* num 32x32 in 64x64 */ 2063 2064 S32 ai4_tu_split_8x8[16]; 2065 S32 ai4_tu_split_16x16[4]; 2066 2067 S32 ai4_tu_early_cbf_8x8[16]; 2068 S32 ai4_tu_early_cbf_16x16[4]; 2069 2070 S16 *pi2_had_out; 2071 S32 i; 2072 2073 /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */ 2074 S32 *api4_satd_pu[HAD_32x32 + 1]; 2075 S32 
*api4_tu_split[HAD_32x32 + 1]; 2076 S32 *api4_tu_early_cbf[HAD_32x32 + 1]; 2077 2078 S32 *pi4_sad_grid = ps_prms->pi4_sad_grid; 2079 2080 S32 tu_split_flag = 0; 2081 S32 total_satd_cost = 0; 2082 2083 U08 *pu1_inp = ps_prms->pu1_inp; 2084 U08 *pu1_ref = ps_prms->pu1_ref; 2085 2086 S32 inp_stride = ps_prms->i4_inp_stride; 2087 S32 ref_stride = ps_prms->i4_ref_stride; 2088 2089 /* Initialize tu_split_cost to "0" */ 2090 ps_prms->i4_tu_split_cost = 0; 2091 2092 pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem; 2093 2094 for(i = 0; i < 4; i++) 2095 { 2096 S32 blkx = (i & 0x1); 2097 S32 blky = (i >> 1); 2098 U08 *pu1_pi0, *pu1_pi1; 2099 tu_split_flag = 0; 2100 2101 api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0]; 2102 api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0]; 2103 api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[0]; 2104 api4_satd_pu[HAD_32x32] = &ai4_satd_32x32[i]; 2105 2106 api4_tu_split[HAD_4x4] = NULL; 2107 api4_tu_split[HAD_8x8] = &ai4_tu_split_8x8[0]; 2108 api4_tu_split[HAD_16x16] = &ai4_tu_split_16x16[0]; 2109 api4_tu_split[HAD_32x32] = &ps_prms->pi4_tu_split_flags[i]; 2110 2111 api4_tu_early_cbf[HAD_4x4] = NULL; 2112 api4_tu_early_cbf[HAD_8x8] = &ai4_tu_early_cbf_8x8[0]; 2113 api4_tu_early_cbf[HAD_16x16] = &ai4_tu_early_cbf_16x16[0]; 2114 api4_tu_early_cbf[HAD_32x32] = &ps_prms->pi4_tu_early_cbf[i]; 2115 2116 pu1_pi0 = pu1_inp + (blkx * 32) + (blky * 32 * inp_stride); 2117 pu1_pi1 = pu1_ref + (blkx * 32) + (blky * 32 * ref_stride); 2118 2119 /* Call recursive 32x32 HAD module; updates satds for 4x4, 8x8, 16x16 and 32x32 */ 2120 ihevce_had_32x32_r( 2121 pu1_pi0, 2122 inp_stride, 2123 pu1_pi1, 2124 ref_stride, 2125 pi2_had_out, 2126 32, 2127 api4_satd_pu, 2128 api4_tu_split, 2129 api4_tu_early_cbf, 2130 0, 2131 8, 2132 lambda, 2133 lambda_q_shift, 2134 i4_frm_qstep, 2135 1, 2136 ps_prms->u1_max_tr_depth, 2137 ps_prms->u1_max_tr_size, 2138 &(ps_prms->i4_tu_split_cost), 2139 ps_func_selector); 2140 } 2141 2142 total_satd_cost = ai4_satd_32x32[0] + ai4_satd_32x32[1] + ai4_satd_32x32[2] + 
ai4_satd_32x32[3]; 2143 2144 /* Update 64x64 SATDs */ 2145 pi4_sad_grid[PART_ID_2Nx2N] = 2146 ai4_satd_32x32[0] + ai4_satd_32x32[1] + ai4_satd_32x32[2] + ai4_satd_32x32[3]; 2147 2148 return total_satd_cost; 2149 } 2150 2151 /** 2152 ******************************************************************************** 2153 * @fn void hme_subpel_refine_search_node(search_node_t *ps_search_node, 2154 * hme_subpel_prms_t *ps_prms, 2155 * layer_ctxt_t *ps_curr_layer, 2156 * BLK_SIZE_T e_blk_size, 2157 * S32 x_off, 2158 * S32 y_off) 2159 * 2160 * @brief Refines a given partition within a CU 2161 * 2162 * @param[in,out] ps_search_node: supplies starting mv and also ref id. 2163 * updated with the accurate subpel mv 2164 * 2165 * @param[in] ps_prms: subpel prms input to this function 2166 * 2167 * @param[in] ps_curr_layer : layer context 2168 * 2169 * @param[in] e_blk_size : Block size enumeration 2170 * 2171 * @param[in] x_off : x offset of the partition w.r.t. pic start 2172 * 2173 * @param[in] y_off : y offset of the partition w.r.t. 
 pic start
 *
 * @return     None
 ********************************************************************************
 */

/**
********************************************************************************
*  @fn     hme_get_calc_sad_and_result_subpel_fxn
*
*  @brief  Picks the error-compute + best-result-update kernel for subpel
*          refinement based on the metric (SAD vs SATD), the number of valid
*          partitions to update and the number of results kept per partition.
*
*  @param[in] ps_func_selector : SATD kernel table
*  @param[in] ps_me_optimised_function_list : optimised SAD kernel table
*  @param[in] i4_part_mask : bitmask of enabled partitions
*  @param[in] u1_use_satd : non-zero selects SATD kernels, else SAD kernels
*  @param[in] u1_num_parts : number of valid partitions (1 / <=8 / <=16)
*  @param[in] u1_num_results : results tracked per partition (only 1 or 2)
*
*  @return   function pointer of type PF_SAD_RESULT_FXN_T
********************************************************************************
*/
static __inline PF_SAD_RESULT_FXN_T hme_get_calc_sad_and_result_subpel_fxn(
    me_func_selector_t *ps_func_selector,
    ihevce_me_optimised_function_list_t *ps_me_optimised_function_list,
    S32 i4_part_mask,
    U08 u1_use_satd,
    U08 u1_num_parts,
    U08 u1_num_results)
{
    PF_SAD_RESULT_FXN_T pf_err_compute;

    ASSERT((1 == u1_num_results) || (2 == u1_num_results));

    if(1 == u1_num_results)
    {
        if(u1_use_satd)
        {
            if(u1_num_parts == 1)
            {
                pf_err_compute =
                    ps_func_selector->pf_evalsatd_update_1_best_result_pt_pu_16x16_num_part_eq_1;
            }
            else if((u1_num_parts > 1) && (u1_num_parts <= 8))
            {
                pf_err_compute =
                    ps_func_selector->pf_evalsatd_update_1_best_result_pt_pu_16x16_num_part_lt_9;
            }
            else
            {
                pf_err_compute =
                    ps_func_selector->pf_evalsatd_update_1_best_result_pt_pu_16x16_num_part_lt_17;
            }
        }
        else
        {
            if(u1_num_parts == 1)
            {
                pf_err_compute = ps_me_optimised_function_list
                                     ->pf_calc_sad_and_1_best_result_subpel_num_part_eq_1;
            }
            /* special-cased kernel when exactly the 5 square parts are valid */
            else if(((i4_part_mask & ENABLE_SQUARE_PARTS) != 0) && (u1_num_parts == 5))
            {
                pf_err_compute =
                    ps_me_optimised_function_list->pf_calc_sad_and_1_best_result_subpel_square_parts;
            }
            else if((u1_num_parts > 1) && (u1_num_parts <= 8))
            {
                pf_err_compute = ps_me_optimised_function_list
                                     ->pf_calc_sad_and_1_best_result_subpel_num_part_lt_9;
            }
            else
            {
                pf_err_compute = ps_me_optimised_function_list
                                     ->pf_calc_sad_and_1_best_result_subpel_num_part_lt_17;
            }
        }
    }
    else
    {
        if(u1_use_satd)
        {
            if(u1_num_parts == 1)
            {
                pf_err_compute =
                    ps_func_selector->pf_evalsatd_update_2_best_results_pt_pu_16x16_num_part_eq_1;
            }
            else if((u1_num_parts > 1) && (u1_num_parts <= 8))
            {
                pf_err_compute =
                    ps_func_selector->pf_evalsatd_update_2_best_results_pt_pu_16x16_num_part_lt_9;
            }
            else
            {
                pf_err_compute =
                    ps_func_selector->pf_evalsatd_update_2_best_results_pt_pu_16x16_num_part_lt_17;
            }
        }
        else
        {
            if(u1_num_parts == 1)
            {
                pf_err_compute = ps_me_optimised_function_list
                                     ->pf_calc_sad_and_2_best_results_subpel_num_part_eq_1;
            }
            else if(((i4_part_mask & ENABLE_SQUARE_PARTS) != 0) && (u1_num_parts == 5))
            {
                pf_err_compute = ps_me_optimised_function_list
                                     ->pf_calc_sad_and_2_best_results_subpel_square_parts;
            }
            else if((u1_num_parts > 1) && (u1_num_parts <= 8))
            {
                pf_err_compute = ps_me_optimised_function_list
                                     ->pf_calc_sad_and_2_best_results_subpel_num_part_lt_9;
            }
            else
            {
                pf_err_compute = ps_me_optimised_function_list
                                     ->pf_calc_sad_and_2_best_results_subpel_num_part_lt_17;
            }
        }
    }

    return pf_err_compute;
}

#if DIAMOND_GRID == 1
/* High-speed subpel refinement (diamond grid): hpel diamond search followed */
/* by qpel diamond search around the best hpel point; returns the best cost. */
S32 hme_subpel_refine_search_node_high_speed(
    search_node_t *ps_search_node,
    hme_subpel_prms_t *ps_prms,
    layer_ctxt_t *ps_curr_layer,
    BLK_SIZE_T e_blk_size,
    S32 x_off,
    S32 y_off,
    search_results_t *ps_search_results,
    S32 pred_lx,
    S32 i4_part_mask,
    S32 *pi4_valid_part_ids,
    S32 search_idx,
    subpel_dedup_enabler_t *ps_dedup_enabler,
    me_func_selector_t *ps_func_selector,
    ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
{
    S32 i4_num_hpel_refine, i4_num_qpel_refine;
    S32 i4_offset, i4_grid_mask;
    S08 i1_ref_idx;
    S32 i4_blk_wd, i4_blk_ht;
    S32 i4_ref_stride, i4_i;
    pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
    result_upd_prms_t s_result_prms;
    search_node_t s_temp_search_node;

    /*************************************************************************/
    /* Tracks current MV with the fractional component.                      */
    /*************************************************************************/
    S32 i4_mv_x, i4_mv_y;
    S32 i4_frac_x, i4_frac_y;

    /*************************************************************************/
    /* Function pointer for SAD/SATD, array and prms structure to pass to    */
    /* This function                                                         */
    /*************************************************************************/
    PF_SAD_RESULT_FXN_T pf_err_compute;

    S32 ai4_sad_grid[17], i4_tot_cost;
    err_prms_t s_err_prms;

    /*************************************************************************/
    /* Allowed MV RANGE                                                      */
    /*************************************************************************/
    range_prms_t *ps_range_prms;

    /*************************************************************************/
    /* stores min id in grid with associated min cost.                       */
    /*************************************************************************/
    S32 i4_min_cost, i4_min_sad;
    GRID_PT_T e_min_id;

    PF_INTERP_FXN_T pf_qpel_interp;
    /*************************************************************************/
    /* For hpel and qpel we move in diamonds and hence each point in the     */
    /* diamond will belong to a completely different plane. To simplify the  */
    /* look up of the ref ptr, we declare a 2x2 array of ref ptrs for the    */
    /* hpel planes which are interpolated during recon.                      */
    /*************************************************************************/
    U08 *apu1_hpel_ref[4], *pu1_ref;

    interp_prms_t s_interp_prms;

    /*************************************************************************/
    /* Maintains the minimum id of interpolated buffers, and the pointer that*/
    /* points to the corresponding predicted buf with its stride.            */
    /* Note that the pointer cannot be derived just from the id, since the   */
    /* pointer may also point to the hpel buffer (in case we request interp  */
    /* of a hpel pt, which already exists in the recon hpel planes)          */
    /*************************************************************************/
    U08 *pu1_final_out;
    S32 i4_final_out_stride;
    S32 part_id;
    S32 check_for_duplicate = 0;

    subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_prms->ps_subpel_refine_ctxt;

    S32 mvx_qpel;
    S32 mvy_qpel;

    pf_err_compute = hme_get_calc_sad_and_result_subpel_fxn(
        ps_func_selector,
        ps_me_optimised_function_list,
        i4_part_mask,
        ps_prms->i4_use_satd,
        ps_subpel_refine_ctxt->i4_num_valid_parts,
        ps_search_results->u1_num_results_per_part);

    i4_num_hpel_refine = ps_prms->i4_num_steps_hpel_refine;
    i4_num_qpel_refine = ps_prms->i4_num_steps_qpel_refine;

    /* Prediction contet should now deal with qpel units */
    HME_SET_MVPRED_RES(ps_pred_ctxt, MV_RES_QPEL);

    /* Buffer allocation for subpel */
    /* Current design is that there may be many partitions and different mvs */
    /* that attempt subpel refinemnt. While there is possibility of overlap, the */
    /* hashing to detect and avoid overlap may be very complex. So, currently, */
    /* the only thing done is to store the eventual predicted buffer with every */
    /* ctb node that holds the result of hte best subpel search */

    /* Compute the base pointer for input, interpolated buffers */
    /* The base pointers point as follows: */
    /* fx fy : 0, 0 :: fx, hy : 0, 0.5, hx, fy: 0.5, 0, hx, fy: 0.5, 0.5 */
    /* To these, we need to add the offset of the current node */
    i4_ref_stride = ps_curr_layer->i4_rec_stride;
    i4_offset = x_off + (y_off * i4_ref_stride);
    i1_ref_idx = ps_search_node->i1_ref_idx;

    apu1_hpel_ref[0] = ps_curr_layer->ppu1_list_rec_fxfy[i1_ref_idx] + i4_offset;
    apu1_hpel_ref[1] = ps_curr_layer->ppu1_list_rec_hxfy[i1_ref_idx] + i4_offset;
    apu1_hpel_ref[2] = ps_curr_layer->ppu1_list_rec_fxhy[i1_ref_idx] + i4_offset;
    apu1_hpel_ref[3] = ps_curr_layer->ppu1_list_rec_hxhy[i1_ref_idx] + i4_offset;

    /* Initialize result params used for partition update */
    s_result_prms.pf_mv_cost_compute = NULL;
    s_result_prms.ps_search_results = ps_search_results;
    s_result_prms.pi4_valid_part_ids = pi4_valid_part_ids;
    s_result_prms.i1_ref_idx = ps_search_node->i1_ref_idx;
    s_result_prms.u1_pred_lx = search_idx;
    s_result_prms.i4_part_mask = i4_part_mask;
    s_result_prms.ps_search_node_base = ps_search_node;
    s_result_prms.pi4_sad_grid = &ai4_sad_grid[0];
    s_result_prms.i4_grid_mask = 1;
    s_result_prms.ps_search_node = &s_temp_search_node;
    s_temp_search_node.i1_ref_idx = ps_search_node->i1_ref_idx;

    /* convert to hpel units */
    i4_mv_x = ps_search_node->s_mv.i2_mvx >> 1;
    i4_mv_y = ps_search_node->s_mv.i2_mvy >> 1;

    /* for first pt, we compute at all locations in the grid, 4 + 1 centre */
    ps_range_prms = ps_prms->aps_mv_range_qpel[i1_ref_idx];
    i4_grid_mask = (GRID_DIAMOND_ENABLE_ALL);
    i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);

    i4_min_cost = MAX_32BIT_VAL;
    i4_min_sad = MAX_32BIT_VAL;

    /*************************************************************************/
    /* Prepare the input params to SAD/SATD function. Note that input is     */
    /* passed from the calling funcion since it may be I (normal subpel      */
    /* refinement) or 2I - P0 in case of bidirect subpel refinement.         */
    /* Both cases are handled here.                                          */
    /*************************************************************************/
    s_err_prms.pu1_inp = (U08 *)ps_prms->pv_inp;
    s_err_prms.i4_inp_stride = ps_prms->i4_inp_stride;
    s_err_prms.i4_ref_stride = i4_ref_stride;
    s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
    s_err_prms.i4_grid_mask = 1;
    s_err_prms.pi4_sad_grid = &ai4_sad_grid[0];
    s_err_prms.i4_blk_wd = i4_blk_wd = gau1_blk_size_to_wd[e_blk_size];
    s_err_prms.i4_blk_ht = i4_blk_ht = gau1_blk_size_to_ht[e_blk_size];

    s_result_prms.ps_subpel_refine_ctxt = ps_subpel_refine_ctxt;

    part_id = ps_search_node->u1_part_id;

    /* ---------------- HPEL diamond refinement ---------------- */
    /* NOTE(review): the first two dedup-macro calls pass the literal 1 while */
    /* later ones pass num_unique_nodes; presumably harmless because the      */
    /* macro ignores that argument when dedup is disabled — TODO confirm.     */
    for(i4_i = 0; i4_i < i4_num_hpel_refine; i4_i++)
    {
        e_min_id = PT_C;

        mvx_qpel = i4_mv_x << 1;
        mvy_qpel = i4_mv_y << 1;

        /* Central pt */
        if(i4_grid_mask & BIT_EN(PT_C))
        {
            //ps_search_node->i2_mv_x = (S16)i4_mv_x;
            //ps_search_node->i2_mv_x = (S16)i4_mv_y;
            /* central pt is i4_mv_x, i4_mv_y */
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
                ps_dedup_enabler, 1, mvx_qpel, mvy_qpel, check_for_duplicate);

            /* LSB of the hpel-unit mv selects the hpel plane (fx/hx, fy/hy) */
            i4_frac_x = i4_mv_x & 1;
            i4_frac_y = i4_mv_y & 1;
            pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
            s_err_prms.pu1_ref = pu1_ref + (i4_mv_x >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);

            /* Update the mv's with the current candt motion vectors */
            s_result_prms.i2_mv_x = mvx_qpel;
            s_result_prms.i2_mv_y = mvy_qpel;
            s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
            s_temp_search_node.s_mv.i2_mvy = mvy_qpel;

            pf_err_compute(&s_err_prms, &s_result_prms);

            i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
            if(i4_tot_cost < i4_min_cost)
            {
                i4_min_cost = i4_tot_cost;
                i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
                e_min_id = PT_C;
                pu1_final_out = s_err_prms.pu1_ref;
            }
        }

        /* left pt */
        if(i4_grid_mask & BIT_EN(PT_L))
        {
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
                ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel, check_for_duplicate);

            if(!check_for_duplicate)
            {
                /* search node mv is stored in qpel units */
                ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x - 1) << 1);
                ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
                /* central pt is i4_mv_x - 1, i4_mv_y */
                i4_frac_x = (i4_mv_x - 1) & 1;  // same as (x-1)&1
                i4_frac_y = i4_mv_y & 1;
                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
                s_err_prms.pu1_ref =
                    pu1_ref + ((i4_mv_x - 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);

                /* Update the mv's with the current candt motion vectors */
                s_result_prms.i2_mv_x = mvx_qpel - 2;
                s_result_prms.i2_mv_y = mvy_qpel;
                s_temp_search_node.s_mv.i2_mvx = mvx_qpel - 2;
                s_temp_search_node.s_mv.i2_mvy = mvy_qpel;

                pf_err_compute(&s_err_prms, &s_result_prms);
                //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
                if(i4_tot_cost < i4_min_cost)
                {
                    i4_min_cost = i4_tot_cost;
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
                    e_min_id = PT_L;
                    pu1_final_out = s_err_prms.pu1_ref;
                }
            }
        }
        /* top pt */
        if(i4_grid_mask & BIT_EN(PT_T))
        {
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
                ps_dedup_enabler, 1, mvx_qpel, mvy_qpel - 2, check_for_duplicate);

            if(!check_for_duplicate)
            {
                /* search node mv is stored in qpel units */
                ps_search_node->s_mv.i2_mvx = (S16)(i4_mv_x << 1);
                ps_search_node->s_mv.i2_mvy = (S16)((i4_mv_y - 1) << 1);
                /* top pt is i4_mv_x, i4_mv_y - 1 */
                i4_frac_x = i4_mv_x & 1;
                i4_frac_y = (i4_mv_y - 1) & 1;
                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
                s_err_prms.pu1_ref =
                    pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y - 1) >> 1) * i4_ref_stride);

                /* Update the mv's with the current candt motion vectors */
                s_result_prms.i2_mv_x = mvx_qpel;
                s_result_prms.i2_mv_y = mvy_qpel - 2;
                s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
                s_temp_search_node.s_mv.i2_mvy = mvy_qpel - 2;

                pf_err_compute(&s_err_prms, &s_result_prms);
                //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
                if(i4_tot_cost < i4_min_cost)
                {
                    i4_min_cost = i4_tot_cost;
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
                    e_min_id = PT_T;
                    pu1_final_out = s_err_prms.pu1_ref;
                }
            }
        }
        /* right pt */
        if(i4_grid_mask & BIT_EN(PT_R))
        {
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
                ps_dedup_enabler, num_unique_nodes, mvx_qpel + 2, mvy_qpel, check_for_duplicate);
            if(!check_for_duplicate)
            {
                /* search node mv is stored in qpel units */
                ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x + 1) << 1);
                ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
                /* right pt is i4_mv_x + 1, i4_mv_y */
                i4_frac_x = (i4_mv_x + 1) & 1;
                i4_frac_y = i4_mv_y & 1;

                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
                s_err_prms.pu1_ref =
                    pu1_ref + ((i4_mv_x + 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);

                /* Update the mv's with the current candt motion vectors */
                s_result_prms.i2_mv_x = mvx_qpel + 2;
                s_result_prms.i2_mv_y = mvy_qpel;
                s_temp_search_node.s_mv.i2_mvx = mvx_qpel + 2;
                s_temp_search_node.s_mv.i2_mvy = mvy_qpel;

                pf_err_compute(&s_err_prms, &s_result_prms);
                //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
                if(i4_tot_cost < i4_min_cost)
                {
                    i4_min_cost = i4_tot_cost;
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
                    e_min_id = PT_R;
                    pu1_final_out = s_err_prms.pu1_ref;
                }
            }
        }
        /* bottom pt */
        if(i4_grid_mask & BIT_EN(PT_B))
        {
            CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
                ps_dedup_enabler, num_unique_nodes, mvx_qpel, mvy_qpel + 2, check_for_duplicate);
            if(!check_for_duplicate)
            {
                /* search node mv is stored in qpel units */
                ps_search_node->s_mv.i2_mvx = ((S16)i4_mv_x << 1);
                ps_search_node->s_mv.i2_mvy = ((S16)(i4_mv_y + 1) << 1);
                i4_frac_x = i4_mv_x & 1;
                i4_frac_y = (i4_mv_y + 1) & 1;
                pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
                s_err_prms.pu1_ref =
                    pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y + 1) >> 1) * i4_ref_stride);

                /* Update the mv's with the current candt motion vectors */
                s_result_prms.i2_mv_x = mvx_qpel;
                s_result_prms.i2_mv_y = mvy_qpel + 2;
                s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
                s_temp_search_node.s_mv.i2_mvy = mvy_qpel + 2;

                pf_err_compute(&s_err_prms, &s_result_prms);
                //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
                i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
                if(i4_tot_cost < i4_min_cost)
                {
                    i4_min_cost = i4_tot_cost;
                    i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
                    e_min_id = PT_B;
                    pu1_final_out = s_err_prms.pu1_ref;
                }
            }
        }
        /* Early exit in case of central point */
        if(e_min_id == PT_C)
            break;

        /*********************************************************************/
        /* Depending on the best result location, we may be able to skip    */
        /* atleast two pts, centre pt and one more pt. E.g. if right pt is  */
        /* the best result, the next iteration need not do centre, left pts */
        /*********************************************************************/
        i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
        i4_mv_x += gai1_grid_id_to_x[e_min_id];
        i4_mv_y += gai1_grid_id_to_y[e_min_id];
        ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
        ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
        i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
    }

    /* Convert to QPEL units */
    i4_mv_x <<= 1;
    i4_mv_y <<= 1;

    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;

    /* Exact interpolation or averaging chosen here */
    pf_qpel_interp = ps_prms->pf_qpel_interp;

    /* Next QPEL ME */
    /* In this case, we have option of doing exact QPEL interpolation or avg */
    /*************************************************************************/
    /*        x                                                              */
    /*    A b C d                                                            */
    /*    e f g h                                                            */
    /*    I j K l                                                            */
    /*    m n o p                                                            */
    /*    Q r S t                                                            */
    /*                                                                       */
    /*   Approximate QPEL logic                                              */
    /*   b = avg(A,C) f = avg(I,C), g= avg(C,K) j=avg(I,K)                   */
    /*   for any given pt, we can get all the information required about     */
    /*   the surrounding 4 pts. For example, given point C (0.5, 0)          */
    /*   surrounding pts info:                                               */
    /*   b : qpel offset: 1, 0, generated by averaging. buffer1: fpel buf    */
    /*       buffer 2: hxfy, offsets for both are 0, 0                       */
    /*   similarly for other pts the info can be gotten                      */
    /*************************************************************************/
    i4_grid_mask = GRID_DIAMOND_ENABLE_ALL ^ (BIT_EN(PT_C));
    i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);

    /*************************************************************************/
    /* One time preparation of non changing interpolation params. These     */
    /* include a set of ping pong result buf ptrs, input buf ptrs and some  */
    /* working memory (not used though in case of averaging).               */
    /*************************************************************************/
    s_interp_prms.ppu1_ref = &apu1_hpel_ref[0];
    s_interp_prms.i4_ref_stride = i4_ref_stride;
    s_interp_prms.i4_blk_wd = i4_blk_wd;
    s_interp_prms.i4_blk_ht = i4_blk_ht;

    i4_final_out_stride = i4_ref_stride;

    {
        U08 *pu1_mem;
        /*********************************************************************/
        /* Allocation of working memory for interpolated buffers. We maintain*/
        /* an intermediate working buffer, and 2 ping pong interpolated out  */
        /* buffers, purpose of ping pong explained later below               */
        /*********************************************************************/
        pu1_mem = ps_prms->pu1_wkg_mem;
        s_interp_prms.pu1_wkg_mem = pu1_mem;

        //pu1_mem += (INTERP_INTERMED_BUF_SIZE);
        s_interp_prms.apu1_interp_out[0] = pu1_mem;

        pu1_mem += (INTERP_OUT_BUF_SIZE);
        s_interp_prms.apu1_interp_out[1] = pu1_mem;

        pu1_mem += (INTERP_OUT_BUF_SIZE);
        s_interp_prms.apu1_interp_out[2] = pu1_mem;

        pu1_mem += (INTERP_OUT_BUF_SIZE);
        s_interp_prms.apu1_interp_out[3] = pu1_mem;

        pu1_mem += (INTERP_OUT_BUF_SIZE);
        s_interp_prms.apu1_interp_out[4] = pu1_mem;

        /*********************************************************************/
        /* Stride of interpolated output is just a function of blk width of  */
        /* this partition and hence remains constant for this partition      */
        /*********************************************************************/
        s_interp_prms.i4_out_stride = (i4_blk_wd);
    }

    {
        UWORD8 *apu1_final[4];
        WORD32 ai4_ref_stride[4];
        /*************************************************************************/
        /* Ping pong design for interpolated buffers. We use a min id, which    */
        /* tracks the id of the ppu1_interp_out that stores the best result.    */
        /* When new interp to be done, it uses 1 - bes result id to do the interp*/
        /* min id is toggled when any new result becomes the best result.       */
        /*************************************************************************/

        /* ---------------- QPEL diamond refinement ---------------- */
        for(i4_i = 0; i4_i < i4_num_qpel_refine; i4_i++)
        {
            e_min_id = PT_C;

            mvx_qpel = i4_mv_x;
            mvy_qpel = i4_mv_y;
            /* Interpolates (or averages) the 4 diamond neighbours in one call;  */
            /* apu1_final[0..3] = L/T/R/B predicted bufs with their strides.     */
            hme_qpel_interp_comprehensive(
                &s_interp_prms,
                apu1_final,
                ai4_ref_stride,
                i4_mv_x,
                i4_mv_y,
                i4_grid_mask,
                ps_me_optimised_function_list);
            if(i4_grid_mask & BIT_EN(PT_L))
            {
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
                    ps_dedup_enabler,
                    num_unique_nodes,
                    mvx_qpel - 1,
                    mvy_qpel - 0,
                    check_for_duplicate);

                if(!check_for_duplicate)
                {
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;

                    s_err_prms.pu1_ref = apu1_final[0];
                    s_err_prms.i4_ref_stride = ai4_ref_stride[0];

                    /* Update the mv's with the current candt motion vectors */
                    s_result_prms.i2_mv_x = mvx_qpel - 1;
                    s_result_prms.i2_mv_y = mvy_qpel;
                    s_temp_search_node.s_mv.i2_mvx = mvx_qpel - 1;
                    s_temp_search_node.s_mv.i2_mvy = mvy_qpel;

                    pf_err_compute(&s_err_prms, &s_result_prms);
                    //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);

                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
                    if(i4_tot_cost < i4_min_cost)
                    {
                        e_min_id = PT_L;
                        i4_min_cost = i4_tot_cost;
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
                    }
                }
            }
            if(i4_grid_mask & BIT_EN(PT_T))
            {
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
                    ps_dedup_enabler,
                    num_unique_nodes,
                    mvx_qpel - 0,
                    mvy_qpel - 1,
                    check_for_duplicate);

                if(!check_for_duplicate)
                {
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;

                    s_err_prms.pu1_ref = apu1_final[1];
                    s_err_prms.i4_ref_stride = ai4_ref_stride[1];

                    /* Update the mv's with the current candt motion vectors */
                    s_result_prms.i2_mv_x = mvx_qpel;
                    s_result_prms.i2_mv_y = mvy_qpel - 1;

                    s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
                    s_temp_search_node.s_mv.i2_mvy = mvy_qpel - 1;

                    pf_err_compute(&s_err_prms, &s_result_prms);

                    //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
                    if(i4_tot_cost < i4_min_cost)
                    {
                        e_min_id = PT_T;
                        i4_min_cost = i4_tot_cost;
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
                    }
                }
            }
            if(i4_grid_mask & BIT_EN(PT_R))
            {
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
                    ps_dedup_enabler, num_unique_nodes, mvx_qpel + 1, mvy_qpel, check_for_duplicate);

                if(!check_for_duplicate)
                {
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;

                    s_err_prms.pu1_ref = apu1_final[2];
                    s_err_prms.i4_ref_stride = ai4_ref_stride[2];

                    /* Update the mv's with the current candt motion vectors */
                    s_result_prms.i2_mv_x = mvx_qpel + 1;
                    s_result_prms.i2_mv_y = mvy_qpel;

                    s_temp_search_node.s_mv.i2_mvx = mvx_qpel + 1;
                    s_temp_search_node.s_mv.i2_mvy = mvy_qpel;

                    pf_err_compute(&s_err_prms, &s_result_prms);

                    //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);

                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
                    if(i4_tot_cost < i4_min_cost)
                    {
                        e_min_id = PT_R;
                        i4_min_cost = i4_tot_cost;
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
                    }
                }
            }
            /* i4_mv_x and i4_mv_y will always be the centre pt */
            /* for qpel we start with least hpel, and hence compute of center pt never reqd */
            if(i4_grid_mask & BIT_EN(PT_B))
            {
                CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
                    ps_dedup_enabler, num_unique_nodes, mvx_qpel, mvy_qpel + 1, check_for_duplicate);

                if(!check_for_duplicate)
                {
                    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
                    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;

                    s_err_prms.pu1_ref = apu1_final[3];
                    s_err_prms.i4_ref_stride = ai4_ref_stride[3];

                    /* Update the mv's with the current candt motion vectors */
                    s_result_prms.i2_mv_x = mvx_qpel;
                    s_result_prms.i2_mv_y = mvy_qpel + 1;

                    s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
                    s_temp_search_node.s_mv.i2_mvy = mvy_qpel + 1;

                    pf_err_compute(&s_err_prms, &s_result_prms);

                    //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
                    i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
                    if(i4_tot_cost < i4_min_cost)
                    {
                        e_min_id = PT_B;
                        i4_min_cost = i4_tot_cost;
                        i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
                    }
                }
            }

            /* New QPEL mv x and y */
            if(e_min_id == PT_C)
                break;
            i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
            i4_mv_x += gai1_grid_id_to_x[e_min_id];
            i4_mv_y += gai1_grid_id_to_y[e_min_id];
            ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
            ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
            i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
        }
    }

    /* update modified motion vectors and cost at end of subpel */
    ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
    ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
    ps_search_node->i4_tot_cost = i4_min_cost;
    ps_search_node->i4_sad = i4_min_sad;

    /********************************************************************************/
    /* TODO: Restoring back Sad lambda from Hadamard lambda                         */
    /* Need to pass the had/satd lambda in more cleaner way for subpel cost compute */
    /********************************************************************************/
    //ps_pred_ctxt->lambda >>= 1;

    return (i4_min_cost);
}
#elif DIAMOND_GRID == 0
S32 hme_subpel_refine_search_node_high_speed(
    search_node_t *ps_search_node,
    hme_subpel_prms_t *ps_prms,
    layer_ctxt_t *ps_curr_layer,
    BLK_SIZE_T e_blk_size,
    S32 x_off,
    S32 y_off,
    search_results_t *ps_search_results,
    S32 pred_lx,
    S32 i4_part_mask,
    S32 *pi4_valid_part_ids,
    S32 search_idx,
    subpel_dedup_enabler_t *ps_dedup_enabler,
    me_func_selector_t *ps_func_selector)
{
    S32 i4_num_hpel_refine, i4_num_qpel_refine;
    S32 i4_offset, i4_grid_mask;
    S08 i1_ref_idx;
    S32 i4_blk_wd, i4_blk_ht;
    S32 i4_ref_stride, i4_i;
    pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
    result_upd_prms_t s_result_prms;

    /*************************************************************************/
    /* Tracks current MV with the fractional component.                      */
    /*************************************************************************/
    S32 i4_mv_x, i4_mv_y;
    S32 i4_frac_x, i4_frac_y;

    /*************************************************************************/
    /* Function pointer for SAD/SATD, array and prms structure to pass to    */
    /* This function                                                         */
    /*************************************************************************/
    PF_SAD_FXN_T pf_err_compute;
    S32 ai4_sad_grid[9][17], i4_tot_cost;
    err_prms_t s_err_prms;

    /*************************************************************************/
    /* Allowed MV RANGE                                                      */
    /*************************************************************************/
    range_prms_t *ps_range_prms;

    /*************************************************************************/
    /* stores min id in grid with associated min cost.
*/ 2948 /*************************************************************************/ 2949 S32 i4_min_cost, i4_min_sad; 2950 GRID_PT_T e_min_id; 2951 2952 PF_INTERP_FXN_T pf_qpel_interp; 2953 /*************************************************************************/ 2954 /* For hpel and qpel we move in diamonds and hence each point in the */ 2955 /* diamond will belong to a completely different plane. To simplify the */ 2956 /* look up of the ref ptr, we declare a 2x2 array of ref ptrs for the */ 2957 /* hpel planes which are interpolated during recon. */ 2958 /*************************************************************************/ 2959 U08 *apu1_hpel_ref[4], *pu1_ref; 2960 2961 interp_prms_t s_interp_prms; 2962 2963 /*************************************************************************/ 2964 /* Maintains the minimum id of interpolated buffers, and the pointer that*/ 2965 /* points to the corresponding predicted buf with its stride. */ 2966 /* Note that the pointer cannot be derived just from the id, since the */ 2967 /* pointer may also point to the hpel buffer (in case we request interp */ 2968 /* of a hpel pt, which already exists in the recon hpel planes) */ 2969 /*************************************************************************/ 2970 U08 *pu1_final_out; 2971 S32 i4_final_out_stride; 2972 S32 part_id; 2973 S32 check_for_duplicate = 0; 2974 2975 S32 mvx_qpel; 2976 S32 mvy_qpel; 2977 2978 /*************************************************************************/ 2979 /* Appropriate Err compute fxn, depends on SAD/SATD, blk size and remains*/ 2980 /* fixed through this subpel refinement for this partition. */ 2981 /* Note, we do not enable grid sads since each pt is different buffers. */ 2982 /* Hence, part mask is also nearly dont care and we use 2Nx2N enabled. 
*/ 2983 /*************************************************************************/ 2984 if(ps_prms->i4_use_satd) 2985 { 2986 pf_err_compute = hme_evalsatd_update_1_best_result_pt_pu_16x16; 2987 } 2988 else 2989 { 2990 pf_err_compute = hme_evalsad_grid_pu_16x16; /* hme_evalsad_pt_pu_16x16; */ 2991 } 2992 2993 i4_num_hpel_refine = ps_prms->i4_num_steps_hpel_refine; 2994 i4_num_qpel_refine = ps_prms->i4_num_steps_qpel_refine; 2995 2996 /* Prediction contet should now deal with qpel units */ 2997 HME_SET_MVPRED_RES(ps_pred_ctxt, MV_RES_QPEL); 2998 2999 /* Buffer allocation for subpel */ 3000 /* Current design is that there may be many partitions and different mvs */ 3001 /* that attempt subpel refinemnt. While there is possibility of overlap, the */ 3002 /* hashing to detect and avoid overlap may be very complex. So, currently, */ 3003 /* the only thing done is to store the eventual predicted buffer with every */ 3004 /* ctb node that holds the result of hte best subpel search */ 3005 3006 /* Compute the base pointer for input, interpolated buffers */ 3007 /* The base pointers point as follows: 3008 /* fx fy : 0, 0 :: fx, hy : 0, 0.5, hx, fy: 0.5, 0, hx, fy: 0.5, 0.5 */ 3009 /* To these, we need to add the offset of the current node */ 3010 i4_ref_stride = ps_curr_layer->i4_rec_stride; 3011 i4_offset = x_off + (y_off * i4_ref_stride); 3012 i1_ref_idx = ps_search_node->i1_ref_idx; 3013 3014 apu1_hpel_ref[0] = ps_curr_layer->ppu1_list_rec_fxfy[i1_ref_idx] + i4_offset; 3015 apu1_hpel_ref[1] = ps_curr_layer->ppu1_list_rec_hxfy[i1_ref_idx] + i4_offset; 3016 apu1_hpel_ref[2] = ps_curr_layer->ppu1_list_rec_fxhy[i1_ref_idx] + i4_offset; 3017 apu1_hpel_ref[3] = ps_curr_layer->ppu1_list_rec_hxhy[i1_ref_idx] + i4_offset; 3018 3019 /* Initialize result params used for partition update */ 3020 s_result_prms.pf_mv_cost_compute = NULL; 3021 s_result_prms.ps_search_results = ps_search_results; 3022 s_result_prms.pi4_valid_part_ids = pi4_valid_part_ids; 3023 s_result_prms.i1_ref_idx = 
search_idx; 3024 s_result_prms.i4_part_mask = i4_part_mask; 3025 s_result_prms.ps_search_node_base = ps_search_node; 3026 s_result_prms.pi4_sad_grid = &ai4_sad_grid[0][0]; 3027 s_result_prms.i4_grid_mask = 1; 3028 3029 /* convert to hpel units */ 3030 i4_mv_x = ps_search_node->s_mv.i2_mvx >> 1; 3031 i4_mv_y = ps_search_node->s_mv.i2_mvy >> 1; 3032 3033 /* for first pt, we compute at all locations in the grid, 4 + 1 centre */ 3034 ps_range_prms = ps_prms->ps_mv_range_qpel; 3035 i4_grid_mask = (GRID_ALL_PTS_VALID); 3036 i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms); 3037 3038 i4_min_cost = MAX_32BIT_VAL; 3039 i4_min_sad = MAX_32BIT_VAL; 3040 3041 /*************************************************************************/ 3042 /* Prepare the input params to SAD/SATD function. Note that input is */ 3043 /* passed from the calling funcion since it may be I (normal subpel */ 3044 /* refinement) or 2I - P0 in case of bidirect subpel refinement. */ 3045 /* Both cases are handled here. 
*/ 3046 /*************************************************************************/ 3047 s_err_prms.pu1_inp = (U08 *)ps_prms->pv_inp; 3048 s_err_prms.i4_inp_stride = ps_prms->i4_inp_stride; 3049 s_err_prms.i4_ref_stride = i4_ref_stride; 3050 s_err_prms.i4_part_mask = (ENABLE_2Nx2N); 3051 s_err_prms.i4_grid_mask = 1; 3052 s_err_prms.pi4_sad_grid = &ai4_sad_grid[0][0]; 3053 s_err_prms.i4_blk_wd = i4_blk_wd = gau1_blk_size_to_wd[e_blk_size]; 3054 s_err_prms.i4_blk_ht = i4_blk_ht = gau1_blk_size_to_ht[e_blk_size]; 3055 3056 /* TODO: Currently doubling lambda for Hadamard Sad instead of 1.9*sadlambda */ 3057 //ps_pred_ctxt->lambda <<= 1; 3058 part_id = ps_search_node->u1_part_id; 3059 for(i4_i = 0; i4_i < i4_num_hpel_refine; i4_i++) 3060 { 3061 e_min_id = PT_C; 3062 3063 mvx_qpel = i4_mv_x << 1; 3064 mvy_qpel = i4_mv_y << 1; 3065 3066 /* Central pt */ 3067 if(i4_grid_mask & BIT_EN(PT_C)) 3068 { 3069 //ps_search_node->i2_mv_x = (S16)i4_mv_x; 3070 //ps_search_node->i2_mv_x = (S16)i4_mv_y; 3071 /* central pt is i4_mv_x, i4_mv_y */ 3072 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( 3073 ps_dedup_enabler, 1, mvx_qpel, mvy_qpel, check_for_duplicate); 3074 3075 i4_frac_x = i4_mv_x & 1; 3076 i4_frac_y = i4_mv_y & 1; 3077 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x]; 3078 s_err_prms.pu1_ref = pu1_ref + (i4_mv_x >> 1) + ((i4_mv_y >> 1) * i4_ref_stride); 3079 pf_err_compute(&s_err_prms); 3080 /* Update the mv's with the current candt motion vectors */ 3081 s_result_prms.i2_mv_x = mvx_qpel; 3082 s_result_prms.i2_mv_y = mvy_qpel; 3083 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); 3084 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; 3085 if(i4_tot_cost < i4_min_cost) 3086 { 3087 i4_min_cost = i4_tot_cost; 3088 i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; 3089 e_min_id = PT_C; 3090 pu1_final_out = s_err_prms.pu1_ref; 3091 } 3092 } 3093 3094 /* left pt */ 3095 if(i4_grid_mask & BIT_EN(PT_L)) 3096 { 3097 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( 3098 ps_dedup_enabler, 1, 
mvx_qpel - 2, mvy_qpel, check_for_duplicate); 3099 3100 if(!check_for_duplicate) 3101 { 3102 /* search node mv is stored in qpel units */ 3103 ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x - 1) << 1); 3104 ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1); 3105 /* central pt is i4_mv_x - 1, i4_mv_y */ 3106 i4_frac_x = (i4_mv_x - 1) & 1; // same as (x-1)&1 3107 i4_frac_y = i4_mv_y & 1; 3108 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x]; 3109 s_err_prms.pu1_ref = 3110 pu1_ref + ((i4_mv_x - 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride); 3111 3112 pf_err_compute(&s_err_prms); 3113 /* Update the mv's with the current candt motion vectors */ 3114 s_result_prms.i2_mv_x = mvx_qpel; 3115 s_result_prms.i2_mv_y = mvy_qpel; 3116 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); 3117 3118 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; 3119 3120 if(i4_tot_cost < i4_min_cost) 3121 { 3122 i4_min_cost = i4_tot_cost; 3123 i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; 3124 e_min_id = PT_L; 3125 pu1_final_out = s_err_prms.pu1_ref; 3126 } 3127 } 3128 } 3129 /* top pt */ 3130 if(i4_grid_mask & BIT_EN(PT_T)) 3131 { 3132 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( 3133 ps_dedup_enabler, 1, mvx_qpel, mvy_qpel - 2, check_for_duplicate); 3134 3135 if(!check_for_duplicate) 3136 { 3137 /* search node mv is stored in qpel units */ 3138 ps_search_node->s_mv.i2_mvx = (S16)(i4_mv_x << 1); 3139 ps_search_node->s_mv.i2_mvy = (S16)((i4_mv_y - 1) << 1); 3140 /* top pt is i4_mv_x, i4_mv_y - 1 */ 3141 i4_frac_x = i4_mv_x & 1; 3142 i4_frac_y = (i4_mv_y - 1) & 1; 3143 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x]; 3144 s_err_prms.pu1_ref = 3145 pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y - 1) >> 1) * i4_ref_stride); 3146 pf_err_compute(&s_err_prms); 3147 /* Update the mv's with the current candt motion vectors */ 3148 s_result_prms.i2_mv_x = mvx_qpel; 3149 s_result_prms.i2_mv_y = mvy_qpel - 2; 3150 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); 3151 3152 i4_tot_cost = 
s_err_prms.pi4_sad_grid[part_id]; 3153 3154 if(i4_tot_cost < i4_min_cost) 3155 { 3156 i4_min_cost = i4_tot_cost; 3157 i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; 3158 e_min_id = PT_T; 3159 pu1_final_out = s_err_prms.pu1_ref; 3160 } 3161 } 3162 } 3163 /* right pt */ 3164 if(i4_grid_mask & BIT_EN(PT_R)) 3165 { 3166 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( 3167 ps_dedup_enabler, 1, mvx_qpel + 2, mvy_qpel, check_for_duplicate); 3168 3169 if(!check_for_duplicate) 3170 { 3171 /* search node mv is stored in qpel units */ 3172 ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x + 1) << 1); 3173 ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1); 3174 /* right pt is i4_mv_x + 1, i4_mv_y */ 3175 i4_frac_x = (i4_mv_x + 1) & 1; 3176 i4_frac_y = i4_mv_y & 1; 3177 3178 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x]; 3179 s_err_prms.pu1_ref = 3180 pu1_ref + ((i4_mv_x + 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride); 3181 pf_err_compute(&s_err_prms); 3182 /* Update the mv's with the current candt motion vectors */ 3183 s_result_prms.i2_mv_x = mvx_qpel + 2; 3184 s_result_prms.i2_mv_y = mvy_qpel; 3185 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); 3186 3187 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; 3188 3189 if(i4_tot_cost < i4_min_cost) 3190 { 3191 i4_min_cost = i4_tot_cost; 3192 i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; 3193 e_min_id = PT_R; 3194 pu1_final_out = s_err_prms.pu1_ref; 3195 } 3196 } 3197 } 3198 /* bottom pt */ 3199 if(i4_grid_mask & BIT_EN(PT_B)) 3200 { 3201 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( 3202 ps_dedup_enabler, 1, mvx_qpel, mvy_qpel + 2, check_for_duplicate); 3203 3204 if(!check_for_duplicate) 3205 { 3206 /* search node mv is stored in qpel units */ 3207 ps_search_node->s_mv.i2_mvx = ((S16)i4_mv_x << 1); 3208 ps_search_node->s_mv.i2_mvy = ((S16)(i4_mv_y + 1) << 1); 3209 i4_frac_x = i4_mv_x & 1; 3210 i4_frac_y = (i4_mv_y + 1) & 1; 3211 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x]; 3212 s_err_prms.pu1_ref = 3213 pu1_ref + (i4_mv_x >> 1) 
+ (((i4_mv_y + 1) >> 1) * i4_ref_stride); 3214 3215 pf_err_compute(&s_err_prms); 3216 /* Update the mv's with the current candt motion vectors */ 3217 s_result_prms.i2_mv_x = mvx_qpel; 3218 s_result_prms.i2_mv_y = mvy_qpel + 2; 3219 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); 3220 3221 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; 3222 3223 if(i4_tot_cost < i4_min_cost) 3224 { 3225 i4_min_cost = i4_tot_cost; 3226 i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; 3227 e_min_id = PT_B; 3228 pu1_final_out = s_err_prms.pu1_ref; 3229 } 3230 } 3231 } 3232 if(e_min_id == PT_C) 3233 { 3234 if(!i4_i) 3235 { 3236 /* TL pt */ 3237 if(i4_grid_mask & BIT_EN(PT_TL)) 3238 { 3239 S32 mvx_minus_1 = (i4_mv_x - 1); 3240 S32 mvy_minus_1 = (i4_mv_y - 1); 3241 3242 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( 3243 ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel - 2, check_for_duplicate); 3244 3245 if(!check_for_duplicate) 3246 { 3247 /* search node mv is stored in qpel units */ 3248 ps_search_node->s_mv.i2_mvx = ((S16)mvx_minus_1 << 1); 3249 ps_search_node->s_mv.i2_mvy = ((S16)mvy_minus_1 << 1); 3250 i4_frac_x = mvx_minus_1 & 1; 3251 i4_frac_y = mvy_minus_1 & 1; 3252 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x]; 3253 s_err_prms.pu1_ref = 3254 pu1_ref + (mvx_minus_1 >> 1) + ((mvy_minus_1 >> 1) * i4_ref_stride); 3255 3256 pf_err_compute(&s_err_prms); 3257 /* Update the mv's with the current candt motion vectors */ 3258 s_result_prms.i2_mv_x = mvx_qpel - 2; 3259 s_result_prms.i2_mv_y = mvy_qpel - 2; 3260 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); 3261 3262 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; 3263 3264 if(i4_tot_cost < i4_min_cost) 3265 { 3266 i4_min_cost = i4_tot_cost; 3267 i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; 3268 e_min_id = PT_TL; 3269 pu1_final_out = s_err_prms.pu1_ref; 3270 } 3271 } 3272 } 3273 /* TR pt */ 3274 if(i4_grid_mask & BIT_EN(PT_TR)) 3275 { 3276 S32 mvx_plus_1 = (i4_mv_x + 1); 3277 S32 mvy_minus_1 = (i4_mv_y - 1); 3278 3279 
CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( 3280 ps_dedup_enabler, 1, mvx_qpel + 2, mvy_qpel - 2, check_for_duplicate); 3281 3282 if(!check_for_duplicate) 3283 { 3284 /* search node mv is stored in qpel units */ 3285 ps_search_node->s_mv.i2_mvx = ((S16)mvx_plus_1 << 1); 3286 ps_search_node->s_mv.i2_mvy = ((S16)mvy_minus_1 << 1); 3287 i4_frac_x = mvx_plus_1 & 1; 3288 i4_frac_y = mvy_minus_1 & 1; 3289 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x]; 3290 s_err_prms.pu1_ref = 3291 pu1_ref + (mvx_plus_1 >> 1) + ((mvy_minus_1 >> 1) * i4_ref_stride); 3292 3293 pf_err_compute(&s_err_prms); 3294 /* Update the mv's with the current candt motion vectors */ 3295 s_result_prms.i2_mv_x = mvx_qpel + 2; 3296 s_result_prms.i2_mv_y = mvy_qpel - 2; 3297 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); 3298 3299 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; 3300 3301 if(i4_tot_cost < i4_min_cost) 3302 { 3303 i4_min_cost = i4_tot_cost; 3304 i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; 3305 e_min_id = PT_TR; 3306 pu1_final_out = s_err_prms.pu1_ref; 3307 } 3308 } 3309 } 3310 /* BL pt */ 3311 if(i4_grid_mask & BIT_EN(PT_BL)) 3312 { 3313 S32 mvx_minus_1 = (i4_mv_x - 1); 3314 S32 mvy_plus_1 = (i4_mv_y + 1); 3315 3316 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( 3317 ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel + 2, check_for_duplicate); 3318 3319 if(!check_for_duplicate) 3320 { 3321 /* search node mv is stored in qpel units */ 3322 ps_search_node->s_mv.i2_mvx = ((S16)mvx_minus_1 << 1); 3323 ps_search_node->s_mv.i2_mvy = ((S16)mvy_plus_1 << 1); 3324 i4_frac_x = mvx_minus_1 & 1; 3325 i4_frac_y = mvy_plus_1 & 1; 3326 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x]; 3327 s_err_prms.pu1_ref = 3328 pu1_ref + (mvx_minus_1 >> 1) + ((mvy_plus_1 >> 1) * i4_ref_stride); 3329 3330 pf_err_compute(&s_err_prms); 3331 /* Update the mv's with the current candt motion vectors */ 3332 s_result_prms.i2_mv_x = mvx_qpel - 2; 3333 s_result_prms.i2_mv_y = mvy_qpel + 2; 3334 
hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); 3335 3336 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; 3337 3338 if(i4_tot_cost < i4_min_cost) 3339 { 3340 i4_min_cost = i4_tot_cost; 3341 i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; 3342 e_min_id = PT_BL; 3343 pu1_final_out = s_err_prms.pu1_ref; 3344 } 3345 } 3346 } 3347 /* BR pt */ 3348 if(i4_grid_mask & BIT_EN(PT_BR)) 3349 { 3350 S32 mvx_plus_1 = (i4_mv_x + 1); 3351 S32 mvy_plus_1 = (i4_mv_y + 1); 3352 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( 3353 ps_dedup_enabler, 1, mvx_qpel + 2, mvy_qpel + 2, check_for_duplicate); 3354 3355 if(!check_for_duplicate) 3356 { 3357 /* search node mv is stored in qpel units */ 3358 ps_search_node->s_mv.i2_mvx = ((S16)mvx_plus_1 << 1); 3359 ps_search_node->s_mv.i2_mvy = ((S16)mvy_plus_1 << 1); 3360 i4_frac_x = mvx_plus_1 & 1; 3361 i4_frac_y = mvy_plus_1 & 1; 3362 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x]; 3363 s_err_prms.pu1_ref = 3364 pu1_ref + (mvx_plus_1 >> 1) + ((mvy_plus_1 >> 1) * i4_ref_stride); 3365 3366 pf_err_compute(&s_err_prms); 3367 /* Update the mv's with the current candt motion vectors */ 3368 s_result_prms.i2_mv_x = mvx_qpel + 2; 3369 s_result_prms.i2_mv_y = mvy_qpel + 2; 3370 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); 3371 3372 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; 3373 3374 if(i4_tot_cost < i4_min_cost) 3375 { 3376 i4_min_cost = i4_tot_cost; 3377 i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; 3378 e_min_id = PT_BR; 3379 pu1_final_out = s_err_prms.pu1_ref; 3380 } 3381 } 3382 } 3383 if(e_min_id == PT_C) 3384 { 3385 break; 3386 } 3387 } 3388 else 3389 { 3390 break; 3391 } 3392 } 3393 3394 /*********************************************************************/ 3395 /* Depending on the best result location, we may be able to skip */ 3396 /* atleast two pts, centre pt and one more pt. E.g. 
if right pt is */ 3397 /* the best result, the next iteration need not do centre, left pts */ 3398 /*********************************************************************/ 3399 if(i4_i) 3400 { 3401 i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id]; 3402 } 3403 else 3404 { 3405 i4_grid_mask = gai4_opt_grid_mask_conventional[e_min_id]; 3406 } 3407 i4_mv_x += gai1_grid_id_to_x[e_min_id]; 3408 i4_mv_y += gai1_grid_id_to_y[e_min_id]; 3409 ps_search_node->s_mv.i2_mvx = (S16)(i4_mv_x << 1); 3410 ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1); 3411 i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms); 3412 } 3413 3414 /* Convert to QPEL units */ 3415 i4_mv_x <<= 1; 3416 i4_mv_y <<= 1; 3417 3418 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x; 3419 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y; 3420 3421 /* Early exit if this partition is visiting same hpel mv again */ 3422 /* Assumption : Checkin for early exit in best result of partition */ 3423 if((ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_x == 3424 ps_search_node->s_mv.i2_mvx) && 3425 (ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_y == 3426 ps_search_node->s_mv.i2_mvy)) 3427 { 3428 return (ps_search_results->aps_part_results[search_idx][part_id][0].i4_tot_cost); 3429 } 3430 else 3431 { 3432 /* Store the best hpel mv for future early exit checks */ 3433 ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_x = 3434 (S16)i4_mv_x; 3435 ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_y = 3436 (S16)i4_mv_y; 3437 } 3438 3439 /* Early exit if this partition is visiting same hpel mv again */ 3440 /* Assumption : Checkin for early exit in second best result of partition */ 3441 if((ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_x == 3442 ps_search_node->s_mv.i2_mvx) && 3443 (ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_y == 3444 
ps_search_node->s_mv.i2_mvy)) 3445 { 3446 return (ps_search_results->aps_part_results[search_idx][part_id][1].i4_tot_cost); 3447 } 3448 else 3449 { 3450 /* Store the best hpel mv for future early exit checks */ 3451 ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_x = 3452 (S16)i4_mv_x; 3453 ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_y = 3454 (S16)i4_mv_y; 3455 } 3456 3457 /* Exact interpolation or averaging chosen here */ 3458 pf_qpel_interp = ps_prms->pf_qpel_interp; 3459 3460 /* Next QPEL ME */ 3461 /* In this case, we have option of doing exact QPEL interpolation or avg */ 3462 /*************************************************************************/ 3463 /* x */ 3464 /* A b C d */ 3465 /* e f g h */ 3466 /* I j K l */ 3467 /* m n o p */ 3468 /* Q r S t */ 3469 /* */ 3470 /* Approximate QPEL logic */ 3471 /* b = avg(A,C) f = avg(I,C), g= avg(C,K) j=avg(I,K) */ 3472 /* for any given pt, we can get all the information required about */ 3473 /* the surrounding 4 pts. For example, given point C (0.5, 0) */ 3474 /* surrounding pts info: */ 3475 /* b : qpel offset: 1, 0, generated by averaging. buffer1: fpel buf */ 3476 /* buffer 2: hxfy, offsets for both are 0, 0 */ 3477 /* similarly for other pts the info can be gotten */ 3478 /*************************************************************************/ 3479 i4_grid_mask = GRID_ALL_PTS_VALID ^ (BIT_EN(PT_C)); 3480 i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms); 3481 3482 /*************************************************************************/ 3483 /* One time preparation of non changing interpolation params. These */ 3484 /* include a set of ping pong result buf ptrs, input buf ptrs and some */ 3485 /* working memory (not used though in case of averaging). 
*/ 3486 /*************************************************************************/ 3487 s_interp_prms.ppu1_ref = &apu1_hpel_ref[0]; 3488 s_interp_prms.i4_ref_stride = i4_ref_stride; 3489 s_interp_prms.i4_blk_wd = i4_blk_wd; 3490 s_interp_prms.i4_blk_ht = i4_blk_ht; 3491 3492 i4_final_out_stride = i4_ref_stride; 3493 3494 { 3495 U08 *pu1_mem; 3496 /*********************************************************************/ 3497 /* Allocation of working memory for interpolated buffers. We maintain*/ 3498 /* an intermediate working buffer, and 2 ping pong interpolated out */ 3499 /* buffers, purpose of ping pong explained later below */ 3500 /*********************************************************************/ 3501 pu1_mem = ps_prms->pu1_wkg_mem; 3502 s_interp_prms.pu1_wkg_mem = pu1_mem; 3503 3504 //pu1_mem += (INTERP_INTERMED_BUF_SIZE); 3505 s_interp_prms.apu1_interp_out[0] = pu1_mem; 3506 3507 pu1_mem += (INTERP_OUT_BUF_SIZE); 3508 s_interp_prms.apu1_interp_out[1] = pu1_mem; 3509 3510 pu1_mem += (INTERP_OUT_BUF_SIZE); 3511 s_interp_prms.apu1_interp_out[2] = pu1_mem; 3512 3513 pu1_mem += (INTERP_OUT_BUF_SIZE); 3514 s_interp_prms.apu1_interp_out[3] = pu1_mem; 3515 3516 pu1_mem += (INTERP_OUT_BUF_SIZE); 3517 s_interp_prms.apu1_interp_out[4] = pu1_mem; 3518 3519 /*********************************************************************/ 3520 /* Stride of interpolated output is just a function of blk width of */ 3521 /* this partition and hence remains constant for this partition */ 3522 /*********************************************************************/ 3523 s_interp_prms.i4_out_stride = (i4_blk_wd); 3524 } 3525 3526 { 3527 UWORD8 *apu1_final[4]; 3528 WORD32 ai4_ref_stride[4]; 3529 /*************************************************************************/ 3530 /* Ping pong design for interpolated buffers. We use a min id, which */ 3531 /* tracks the id of the ppu1_interp_out that stores the best result. 
*/ 3532 /* When new interp to be done, it uses 1 - bes result id to do the interp*/ 3533 /* min id is toggled when any new result becomes the best result. */ 3534 /*************************************************************************/ 3535 3536 for(i4_i = 0; i4_i < i4_num_qpel_refine; i4_i++) 3537 { 3538 e_min_id = PT_C; 3539 3540 hme_qpel_interp_comprehensive( 3541 &s_interp_prms, apu1_final, ai4_ref_stride, i4_mv_x, i4_mv_y, i4_grid_mask); 3542 3543 mvx_qpel = i4_mv_x; 3544 mvy_qpel = i4_mv_y; 3545 3546 if(i4_grid_mask & BIT_EN(PT_L)) 3547 { 3548 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( 3549 ps_dedup_enabler, 1, mvx_qpel - 1, mvy_qpel - 0, check_for_duplicate); 3550 3551 if(!check_for_duplicate) 3552 { 3553 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1; 3554 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y; 3555 3556 s_err_prms.pu1_ref = apu1_final[0]; 3557 s_err_prms.i4_ref_stride = ai4_ref_stride[0]; 3558 3559 pf_err_compute(&s_err_prms); 3560 /* Update the mv's with the current candt motion vectors */ 3561 s_result_prms.i2_mv_x = mvx_qpel - 1; 3562 s_result_prms.i2_mv_y = mvy_qpel; 3563 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); 3564 3565 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; 3566 if(i4_tot_cost < i4_min_cost) 3567 { 3568 e_min_id = PT_L; 3569 i4_min_cost = i4_tot_cost; 3570 i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; 3571 } 3572 } 3573 } 3574 if(i4_grid_mask & BIT_EN(PT_T)) 3575 { 3576 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( 3577 ps_dedup_enabler, 1, mvx_qpel - 0, mvy_qpel - 1, check_for_duplicate); 3578 3579 if(!check_for_duplicate) 3580 { 3581 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x; 3582 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1; 3583 3584 s_err_prms.pu1_ref = apu1_final[1]; 3585 s_err_prms.i4_ref_stride = ai4_ref_stride[1]; 3586 3587 pf_err_compute(&s_err_prms); 3588 /* Update the mv's with the current candt motion vectors */ 3589 s_result_prms.i2_mv_x = mvx_qpel; 3590 s_result_prms.i2_mv_y = mvy_qpel - 1; 3591 
hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); 3592 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; 3593 if(i4_tot_cost < i4_min_cost) 3594 { 3595 e_min_id = PT_T; 3596 i4_min_cost = i4_tot_cost; 3597 i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; 3598 } 3599 } 3600 } 3601 if(i4_grid_mask & BIT_EN(PT_R)) 3602 { 3603 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( 3604 ps_dedup_enabler, 1, mvx_qpel + 1, mvy_qpel, check_for_duplicate); 3605 3606 if(!check_for_duplicate) 3607 { 3608 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1; 3609 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y; 3610 3611 s_err_prms.pu1_ref = apu1_final[2]; 3612 s_err_prms.i4_ref_stride = ai4_ref_stride[2]; 3613 3614 pf_err_compute(&s_err_prms); 3615 /* Update the mv's with the current candt motion vectors */ 3616 s_result_prms.i2_mv_x = mvx_qpel + 1; 3617 s_result_prms.i2_mv_y = mvy_qpel; 3618 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); 3619 3620 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; 3621 if(i4_tot_cost < i4_min_cost) 3622 { 3623 e_min_id = PT_R; 3624 i4_min_cost = i4_tot_cost; 3625 i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; 3626 } 3627 } 3628 } 3629 /* i4_mv_x and i4_mv_y will always be the centre pt */ 3630 /* for qpel we start with least hpel, and hence compute of center pt never reqd */ 3631 if(i4_grid_mask & BIT_EN(PT_B)) 3632 { 3633 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( 3634 ps_dedup_enabler, 1, mvx_qpel, mvy_qpel + 1, check_for_duplicate); 3635 3636 if(!check_for_duplicate) 3637 { 3638 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x; 3639 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1; 3640 3641 s_err_prms.pu1_ref = apu1_final[3]; 3642 s_err_prms.i4_ref_stride = ai4_ref_stride[3]; 3643 3644 pf_err_compute(&s_err_prms); 3645 /* Update the mv's with the current candt motion vectors */ 3646 s_result_prms.i2_mv_x = mvx_qpel; 3647 s_result_prms.i2_mv_y = mvy_qpel + 1; 3648 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); 3649 i4_tot_cost = 
s_err_prms.pi4_sad_grid[part_id]; 3650 if(i4_tot_cost < i4_min_cost) 3651 { 3652 e_min_id = PT_B; 3653 i4_min_cost = i4_tot_cost; 3654 i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; 3655 } 3656 } 3657 } 3658 3659 if(e_min_id == PT_C) 3660 { 3661 if(!i4_i) 3662 { 3663 S32 i4_interp_buf_id = 0; 3664 3665 if(i4_grid_mask & BIT_EN(PT_TL)) 3666 { 3667 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( 3668 ps_dedup_enabler, 1, mvx_qpel - 1, mvy_qpel - 1, check_for_duplicate); 3669 3670 if(!check_for_duplicate) 3671 { 3672 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1; 3673 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1; 3674 3675 /* Carry out the interpolation */ 3676 pf_qpel_interp( 3677 &s_interp_prms, i4_mv_x - 1, i4_mv_y - 1, i4_interp_buf_id); 3678 3679 s_err_prms.pu1_ref = s_interp_prms.pu1_final_out; 3680 s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride; 3681 3682 pf_err_compute(&s_err_prms); 3683 /* Update the mv's with the current candt motion vectors */ 3684 s_result_prms.i2_mv_x = mvx_qpel - 1; 3685 s_result_prms.i2_mv_y = mvy_qpel - 1; 3686 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); 3687 3688 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; 3689 3690 if(i4_tot_cost < i4_min_cost) 3691 { 3692 e_min_id = PT_TL; 3693 i4_min_cost = i4_tot_cost; 3694 i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; 3695 } 3696 } 3697 } 3698 if(i4_grid_mask & BIT_EN(PT_TR)) 3699 { 3700 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( 3701 ps_dedup_enabler, 1, mvx_qpel + 1, mvy_qpel - 1, check_for_duplicate); 3702 3703 if(!check_for_duplicate) 3704 { 3705 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1; 3706 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1; 3707 3708 /* Carry out the interpolation */ 3709 pf_qpel_interp( 3710 &s_interp_prms, i4_mv_x + 1, i4_mv_y - 1, i4_interp_buf_id); 3711 3712 s_err_prms.pu1_ref = s_interp_prms.pu1_final_out; 3713 s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride; 3714 3715 pf_err_compute(&s_err_prms); 3716 /* Update the mv's with 
the current candt motion vectors */ 3717 s_result_prms.i2_mv_x = mvx_qpel + 1; 3718 s_result_prms.i2_mv_y = mvy_qpel - 1; 3719 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); 3720 3721 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; 3722 3723 if(i4_tot_cost < i4_min_cost) 3724 { 3725 e_min_id = PT_TR; 3726 i4_min_cost = i4_tot_cost; 3727 i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; 3728 } 3729 } 3730 } 3731 if(i4_grid_mask & BIT_EN(PT_BL)) 3732 { 3733 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( 3734 ps_dedup_enabler, 1, mvx_qpel - 1, mvy_qpel + 1, check_for_duplicate); 3735 3736 if(!check_for_duplicate) 3737 { 3738 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1; 3739 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1; 3740 3741 /* Carry out the interpolation */ 3742 pf_qpel_interp( 3743 &s_interp_prms, i4_mv_x - 1, i4_mv_y + 1, i4_interp_buf_id); 3744 3745 s_err_prms.pu1_ref = s_interp_prms.pu1_final_out; 3746 s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride; 3747 3748 pf_err_compute(&s_err_prms); 3749 /* Update the mv's with the current candt motion vectors */ 3750 s_result_prms.i2_mv_x = mvx_qpel - 1; 3751 s_result_prms.i2_mv_y = mvy_qpel + 1; 3752 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); 3753 3754 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; 3755 3756 if(i4_tot_cost < i4_min_cost) 3757 { 3758 e_min_id = PT_BL; 3759 i4_min_cost = i4_tot_cost; 3760 i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; 3761 } 3762 } 3763 } 3764 /* i4_mv_x and i4_mv_y will always be the centre pt */ 3765 /* for qpel we start with least hpel, and hence compute of center pt never reqd */ 3766 if(i4_grid_mask & BIT_EN(PT_BR)) 3767 { 3768 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES( 3769 ps_dedup_enabler, 1, mvx_qpel + 1, mvy_qpel + 1, check_for_duplicate); 3770 3771 if(!check_for_duplicate) 3772 { 3773 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1; 3774 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1; 3775 3776 /* Carry out the interpolation */ 3777 
pf_qpel_interp( 3778 &s_interp_prms, i4_mv_x + 1, i4_mv_y + 1, i4_interp_buf_id); 3779 3780 s_err_prms.pu1_ref = s_interp_prms.pu1_final_out; 3781 s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride; 3782 3783 pf_err_compute(&s_err_prms); 3784 /* Update the mv's with the current candt motion vectors */ 3785 s_result_prms.i2_mv_x = mvx_qpel + 1; 3786 s_result_prms.i2_mv_y = mvy_qpel + 1; 3787 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms); 3788 3789 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id]; 3790 3791 if(i4_tot_cost < i4_min_cost) 3792 { 3793 e_min_id = PT_BR; 3794 i4_min_cost = i4_tot_cost; 3795 i4_min_sad = s_err_prms.pi4_sad_grid[part_id]; 3796 } 3797 } 3798 } 3799 if(e_min_id == PT_C) 3800 { 3801 break; 3802 } 3803 } 3804 else 3805 { 3806 break; 3807 } 3808 } 3809 3810 if(i4_i) 3811 { 3812 i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id]; 3813 } 3814 else 3815 { 3816 i4_grid_mask = gai4_opt_grid_mask_conventional[e_min_id]; 3817 } 3818 i4_mv_x += gai1_grid_id_to_x[e_min_id]; 3819 i4_mv_y += gai1_grid_id_to_y[e_min_id]; 3820 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x; 3821 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y; 3822 i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms); 3823 } 3824 } 3825 3826 /* update modified motion vectors and cost at end of subpel */ 3827 ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x; 3828 ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y; 3829 ps_search_node->i4_tot_cost = i4_min_cost; 3830 ps_search_node->i4_sad = i4_min_sad; 3831 3832 /********************************************************************************/ 3833 /* TODO: Restoring back Sad lambda from Hadamard lambda */ 3834 /* Need to pass the had/satd lambda in more cleaner way for subpel cost compute */ 3835 /********************************************************************************/ 3836 //ps_pred_ctxt->lambda >>= 1; 3837 3838 return (i4_min_cost); 3839 } 3840 #endif 3841 3842 static void 
hme_subpel_refine_struct_to_search_results_struct_converter( 3843 subpel_refine_ctxt_t *ps_subpel_refine_ctxt, 3844 search_results_t *ps_search_results, 3845 U08 u1_pred_dir, 3846 ME_QUALITY_PRESETS_T e_quality_preset) 3847 { 3848 U08 i; 3849 3850 U08 u1_num_results_per_part = ps_search_results->u1_num_results_per_part; 3851 3852 for(i = 0; i < ps_subpel_refine_ctxt->i4_num_valid_parts; i++) 3853 { 3854 S32 index; 3855 S32 i4_sad; 3856 3857 S32 part_id = ps_subpel_refine_ctxt->ai4_part_id[i]; 3858 3859 search_node_t *ps_best_node = ps_search_results->aps_part_results[u1_pred_dir][part_id]; 3860 3861 if(ps_subpel_refine_ctxt->i4_num_valid_parts > 8) 3862 { 3863 index = part_id; 3864 } 3865 else 3866 { 3867 index = i; 3868 } 3869 3870 if(!ps_best_node->u1_subpel_done) 3871 { 3872 i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] - 3873 ps_subpel_refine_ctxt->i2_mv_cost[0][index]; 3874 ps_best_node[0].i4_sdi = 0; 3875 ASSERT((e_quality_preset == ME_PRISTINE_QUALITY) ? (ps_best_node[0].i4_sdi >= 0) : 1); 3876 ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index]; 3877 3878 if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL) 3879 { 3880 i4_sad = MAX_SIGNED_16BIT_VAL; 3881 } 3882 3883 ps_best_node[0].i4_sad = i4_sad; 3884 ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index]; 3885 ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index]; 3886 ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index]; 3887 ps_best_node[0].i1_ref_idx = (WORD8)ps_subpel_refine_ctxt->i2_ref_idx[0][index]; 3888 ps_best_node->u1_subpel_done = 1; 3889 3890 if(2 == u1_num_results_per_part) 3891 { 3892 i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[1][index] - 3893 ps_subpel_refine_ctxt->i2_mv_cost[1][index]; 3894 ps_best_node[1].i4_sdi = 0; 3895 ps_best_node[1].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[1][index]; 3896 3897 if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] == MAX_SIGNED_16BIT_VAL) 3898 { 
3899 i4_sad = MAX_SIGNED_16BIT_VAL; 3900 } 3901 3902 ps_best_node[1].i4_sad = i4_sad; 3903 ps_best_node[1].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[1][index]; 3904 ps_best_node[1].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[1][index]; 3905 ps_best_node[1].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[1][index]; 3906 ps_best_node[1].i1_ref_idx = (WORD8)ps_subpel_refine_ctxt->i2_ref_idx[1][index]; 3907 ps_best_node[1].u1_subpel_done = 1; 3908 } 3909 } 3910 else if( 3911 (2 == u1_num_results_per_part) && 3912 (ps_subpel_refine_ctxt->i2_tot_cost[0][index] < ps_best_node[1].i4_tot_cost)) 3913 { 3914 if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] < ps_best_node[0].i4_tot_cost) 3915 { 3916 i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] - 3917 ps_subpel_refine_ctxt->i2_mv_cost[0][index]; 3918 ps_best_node[0].i4_sdi = 0; 3919 ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index]; 3920 3921 if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL) 3922 { 3923 i4_sad = MAX_SIGNED_16BIT_VAL; 3924 } 3925 3926 ps_best_node[0].i4_sad = i4_sad; 3927 ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index]; 3928 ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index]; 3929 ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index]; 3930 ps_best_node[0].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index]; 3931 3932 i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[1][index] - 3933 ps_subpel_refine_ctxt->i2_mv_cost[1][index]; 3934 ps_best_node[1].i4_sdi = 0; 3935 ps_best_node[1].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[1][index]; 3936 3937 if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] == MAX_SIGNED_16BIT_VAL) 3938 { 3939 i4_sad = MAX_SIGNED_16BIT_VAL; 3940 } 3941 3942 ps_best_node[1].i4_sad = i4_sad; 3943 ps_best_node[1].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[1][index]; 3944 ps_best_node[1].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[1][index]; 3945 
ps_best_node[1].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[1][index]; 3946 ps_best_node[1].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[1][index]; 3947 } 3948 else if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] > ps_best_node[0].i4_tot_cost) 3949 { 3950 if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] >= ps_best_node[0].i4_tot_cost) 3951 { 3952 i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] - 3953 ps_subpel_refine_ctxt->i2_mv_cost[0][index]; 3954 ps_best_node[1].i4_sdi = 0; 3955 ps_best_node[1].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index]; 3956 3957 if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL) 3958 { 3959 i4_sad = MAX_SIGNED_16BIT_VAL; 3960 } 3961 3962 ps_best_node[1].i4_sad = i4_sad; 3963 ps_best_node[1].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index]; 3964 ps_best_node[1].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index]; 3965 ps_best_node[1].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index]; 3966 ps_best_node[1].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index]; 3967 } 3968 else if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] < ps_best_node[0].i4_tot_cost) 3969 { 3970 memmove(&ps_best_node[1], &ps_best_node[0], sizeof(search_node_t)); 3971 3972 i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] - 3973 ps_subpel_refine_ctxt->i2_mv_cost[0][index]; 3974 ps_best_node[0].i4_sdi = 0; 3975 ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index]; 3976 3977 if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL) 3978 { 3979 i4_sad = MAX_SIGNED_16BIT_VAL; 3980 } 3981 3982 ps_best_node[0].i4_sad = i4_sad; 3983 ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index]; 3984 ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index]; 3985 ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index]; 3986 ps_best_node[0].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index]; 3987 } 3988 } 3989 } 3990 else 
if( 3991 (1 == u1_num_results_per_part) && 3992 (ps_subpel_refine_ctxt->i2_tot_cost[0][index] < ps_best_node[0].i4_tot_cost)) 3993 { 3994 i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] - 3995 ps_subpel_refine_ctxt->i2_mv_cost[0][index]; 3996 ps_best_node[0].i4_sdi = 0; 3997 ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index]; 3998 3999 if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL) 4000 { 4001 i4_sad = MAX_SIGNED_16BIT_VAL; 4002 } 4003 4004 ps_best_node[0].i4_sad = i4_sad; 4005 ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index]; 4006 ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index]; 4007 ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index]; 4008 ps_best_node[0].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index]; 4009 } 4010 } 4011 } 4012 4013 /** 4014 ******************************************************************************** 4015 * @fn S32 hme_subpel_refine_cu_hs 4016 * 4017 * @brief Evaluates the best subpel mvs for active partitions of an MB in L0 4018 * layer for the high speed preset. 
Recursive hadamard SATD / SAD 4019 * and mv cost is used for 2NxN and NxN partitions with active partition 4020 * update 4021 * 4022 * @param[in] ps_prms: subpel prms input to this function 4023 * 4024 * @param[in] ps_curr_layer: points to the current layer ctxt 4025 * 4026 * @param[out] ps_search_results: points to the search resutls that get updated 4027 * with best results 4028 * 4029 * @param[in] search_idx: ref id of the frame for which results get updated 4030 * 4031 * @param[in] ps_wt_inp_prms: current frame input params 4032 * 4033 * @return None 4034 ******************************************************************************** 4035 */ 4036 void hme_subpel_refine_cu_hs( 4037 hme_subpel_prms_t *ps_prms, 4038 layer_ctxt_t *ps_curr_layer, 4039 search_results_t *ps_search_results, 4040 S32 search_idx, 4041 wgt_pred_ctxt_t *ps_wt_inp_prms, 4042 WORD32 blk_8x8_mask, 4043 me_func_selector_t *ps_func_selector, 4044 ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list, 4045 ihevce_me_optimised_function_list_t *ps_me_optimised_function_list) 4046 { 4047 /* Unique search node list for 2nx2n and nxn partitions */ 4048 search_node_t as_nodes_2nx2n[MAX_RESULTS_PER_PART * 5]; 4049 subpel_dedup_enabler_t as_subpel_dedup_enabler[MAX_NUM_REF]; 4050 search_node_t *ps_search_node; 4051 4052 S32 i, i4_part_mask, j; 4053 S32 i4_sad_grid; 4054 S32 max_subpel_cand; 4055 WORD32 index; 4056 S32 num_unique_nodes_2nx2n; 4057 S32 part_id; 4058 S32 x_off, y_off; 4059 S32 i4_inp_off; 4060 4061 CU_SIZE_T e_cu_size; 4062 BLK_SIZE_T e_blk_size; 4063 4064 subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_prms->ps_subpel_refine_ctxt; 4065 4066 S32 i4_use_satd = ps_prms->i4_use_satd; 4067 S32 i4_num_act_refs = ps_prms->i4_num_act_ref_l0 + ps_prms->i4_num_act_ref_l1; 4068 4069 ASSERT(ps_search_results->u1_num_results_per_part <= MAX_RESULTS_PER_PART); 4070 4071 if(!DISABLE_SUBPEL_REFINEMENT_WHEN_SRC_IS_NOISY || !ps_prms->u1_is_cu_noisy) 4072 { 4073 e_cu_size = 
ps_search_results->e_cu_size; 4074 i4_part_mask = ps_search_results->i4_part_mask; 4075 4076 ps_prms->i4_inp_type = sizeof(U08); 4077 4078 num_unique_nodes_2nx2n = 0; 4079 4080 for(i = 0; i < i4_num_act_refs; i++) 4081 { 4082 as_subpel_dedup_enabler[i].u1_ref_idx = MAX_NUM_REF; 4083 } 4084 4085 /************************************************************************/ 4086 /* */ 4087 /* Initialize SATD cost for each valid partition id.one time before */ 4088 /* doing full pel time. This is because of the following reasons: */ 4089 /* 1. Full pel cost was done in SAD while subpel is in SATD mode */ 4090 /* 2. Partitions like AMP, Nx2N and 2NxN are refined on the fly while */ 4091 /* doing Diamond search for 2Nx2N and NxN. This partitions are */ 4092 /* not explicitly refine in high speed mode */ 4093 /* */ 4094 /************************************************************************/ 4095 for(i = 0; i < ps_subpel_refine_ctxt->i4_num_valid_parts; i++) 4096 { 4097 S32 enable_subpel = 0; 4098 S32 part_type; 4099 4100 /* Derive the x and y offsets of this part id */ 4101 part_id = ps_subpel_refine_ctxt->ai4_part_id[i]; 4102 if(ps_subpel_refine_ctxt->i4_num_valid_parts > 8) 4103 { 4104 index = part_id; 4105 } 4106 else 4107 { 4108 index = i; 4109 } 4110 4111 part_type = ge_part_id_to_part_type[part_id]; 4112 x_off = gas_part_attr_in_cu[part_id].u1_x_start << e_cu_size; 4113 y_off = gas_part_attr_in_cu[part_id].u1_y_start << e_cu_size; 4114 x_off += ps_search_results->u1_x_off; 4115 y_off += ps_search_results->u1_y_off; 4116 i4_inp_off = x_off + y_off * ps_prms->i4_inp_stride; 4117 e_blk_size = ge_part_id_to_blk_size[e_cu_size][part_id]; 4118 4119 x_off += ps_prms->i4_ctb_x_off; 4120 y_off += ps_prms->i4_ctb_y_off; 4121 4122 max_subpel_cand = 0; 4123 4124 /* Choose the minimum number of candidates to be used for Sub pel refinement */ 4125 if(PART_ID_2Nx2N == part_type) 4126 { 4127 max_subpel_cand = 4128 MIN(ps_prms->u1_max_subpel_candts_2Nx2N, 4129 
ps_search_results->u1_num_results_per_part); 4130 } 4131 else if(PRT_NxN == part_type) 4132 { 4133 max_subpel_cand = MIN( 4134 ps_prms->u1_max_subpel_candts_NxN, ps_search_results->u1_num_results_per_part); 4135 } 4136 4137 /* If incomplete CTB, NxN num candidates should be forced to min 1 */ 4138 if((0 == max_subpel_cand) && (blk_8x8_mask != 15)) 4139 { 4140 max_subpel_cand = 1; 4141 } 4142 4143 if((PART_ID_2Nx2N == part_type) || (PRT_NxN == part_type)) 4144 { 4145 enable_subpel = 1; 4146 } 4147 4148 /* Compute full pel SATD for each result per partition before subpel */ 4149 /* refinement starts. */ 4150 /* Also prepare unique candidate list for 2Nx2N and NxN partitions */ 4151 for(j = 0; j < ps_search_results->u1_num_results_per_part; j++) 4152 { 4153 err_prms_t s_err_prms; 4154 S32 i4_satd = 0; 4155 S32 i1_ref_idx; 4156 U08 *pu1_ref_base; 4157 S32 i4_ref_stride = ps_curr_layer->i4_rec_stride; 4158 S32 i4_mv_x, i4_mv_y; 4159 4160 ps_search_node = ps_search_results->aps_part_results[search_idx][part_id] + j; 4161 4162 if(ps_subpel_refine_ctxt->i2_mv_x[j][index] == INTRA_MV) 4163 { 4164 ps_search_node->u1_subpel_done = 1; 4165 continue; 4166 } 4167 4168 i1_ref_idx = ps_subpel_refine_ctxt->i2_ref_idx[j][index]; 4169 ps_prms->pv_inp = (void *)(ps_wt_inp_prms->apu1_wt_inp[i1_ref_idx] + i4_inp_off); 4170 pu1_ref_base = ps_curr_layer->ppu1_list_rec_fxfy[i1_ref_idx]; 4171 4172 i4_mv_x = ps_subpel_refine_ctxt->i2_mv_x[j][index]; 4173 i4_mv_y = ps_subpel_refine_ctxt->i2_mv_y[j][index]; 4174 4175 if(i4_use_satd) 4176 { 4177 s_err_prms.pu1_inp = (U08 *)ps_prms->pv_inp; 4178 s_err_prms.i4_inp_stride = ps_prms->i4_inp_stride; 4179 s_err_prms.pu1_ref = pu1_ref_base + x_off + (y_off * i4_ref_stride) + i4_mv_x + 4180 (i4_mv_y * i4_ref_stride); 4181 4182 s_err_prms.i4_ref_stride = i4_ref_stride; 4183 s_err_prms.i4_part_mask = (ENABLE_2Nx2N); 4184 s_err_prms.i4_grid_mask = 1; 4185 s_err_prms.pi4_sad_grid = &i4_sad_grid; 4186 s_err_prms.i4_blk_wd = gau1_blk_size_to_wd[e_blk_size]; 
4187 s_err_prms.i4_blk_ht = gau1_blk_size_to_ht[e_blk_size]; 4188 4189 s_err_prms.ps_cmn_utils_optimised_function_list = 4190 ps_cmn_utils_optimised_function_list; 4191 4192 compute_satd_8bit(&s_err_prms); 4193 4194 i4_satd = s_err_prms.pi4_sad_grid[0]; 4195 4196 ps_subpel_refine_ctxt->i2_tot_cost[j][index] = 4197 CLIP_S16(ps_subpel_refine_ctxt->i2_mv_cost[j][index] + i4_satd); 4198 ps_subpel_refine_ctxt->ai2_fullpel_satd[j][index] = i4_satd; 4199 } 4200 4201 /* Sub-pel candidate filtration */ 4202 if(j) 4203 { 4204 S16 i2_best_sad; 4205 S32 i4_best_mvx; 4206 S32 i4_best_mvy; 4207 4208 search_node_t *ps_node = 4209 ps_search_results->aps_part_results[search_idx][part_id]; 4210 4211 U08 u1_is_subpel_done = ps_node->u1_subpel_done; 4212 S16 i2_curr_sad = ps_subpel_refine_ctxt->ai2_fullpel_satd[j][index]; 4213 S32 i4_curr_mvx = i4_mv_x << 2; 4214 S32 i4_curr_mvy = i4_mv_y << 2; 4215 4216 if(u1_is_subpel_done) 4217 { 4218 i2_best_sad = ps_node->i4_sad; 4219 4220 if(ps_node->i1_ref_idx == i1_ref_idx) 4221 { 4222 i4_best_mvx = ps_node->s_mv.i2_mvx; 4223 i4_best_mvy = ps_node->s_mv.i2_mvy; 4224 } 4225 else if(i1_ref_idx == ps_subpel_refine_ctxt->i2_ref_idx[0][index]) 4226 { 4227 i4_best_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index]; 4228 i4_best_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index]; 4229 } 4230 else 4231 { 4232 i4_best_mvx = INTRA_MV; 4233 i4_best_mvy = INTRA_MV; 4234 } 4235 } 4236 else 4237 { 4238 i2_best_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] - 4239 ps_subpel_refine_ctxt->i2_mv_cost[0][index]; 4240 4241 if(i1_ref_idx == ps_subpel_refine_ctxt->i2_ref_idx[0][index]) 4242 { 4243 i4_best_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index]; 4244 i4_best_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index]; 4245 } 4246 else 4247 { 4248 i4_best_mvx = INTRA_MV; 4249 i4_best_mvy = INTRA_MV; 4250 } 4251 } 4252 4253 i2_best_sad += (i2_best_sad >> ps_prms->u1_subpel_candt_threshold); 4254 4255 if(((ABS(i4_curr_mvx - i4_best_mvx) < 2) && 4256 (ABS(i4_curr_mvy - 
i4_best_mvy) < 2)) || 4257 (i2_curr_sad > i2_best_sad)) 4258 { 4259 enable_subpel = 0; 4260 } 4261 } 4262 4263 ps_search_node->u1_part_id = part_id; 4264 4265 /* Convert mvs in part results from FPEL to QPEL units */ 4266 ps_subpel_refine_ctxt->i2_mv_x[j][index] <<= 2; 4267 ps_subpel_refine_ctxt->i2_mv_y[j][index] <<= 2; 4268 4269 /* If the candidate number is more than the number of candts 4270 set initally, do not add those candts for refinement */ 4271 if(j >= max_subpel_cand) 4272 { 4273 enable_subpel = 0; 4274 } 4275 4276 if(enable_subpel) 4277 { 4278 if(num_unique_nodes_2nx2n == 0) 4279 { 4280 S32 i4_index = ps_subpel_refine_ctxt->i2_ref_idx[j][index]; 4281 4282 as_subpel_dedup_enabler[i4_index].i2_mv_x = 4283 ps_subpel_refine_ctxt->i2_mv_x[j][index]; 4284 as_subpel_dedup_enabler[i4_index].i2_mv_y = 4285 ps_subpel_refine_ctxt->i2_mv_y[j][index]; 4286 as_subpel_dedup_enabler[i4_index].u1_ref_idx = 4287 (U08)ps_subpel_refine_ctxt->i2_ref_idx[j][index]; 4288 memset( 4289 as_subpel_dedup_enabler[i4_index].au4_node_map, 4290 0, 4291 sizeof(U32) * 2 * MAP_X_MAX); 4292 } 4293 INSERT_NEW_NODE_NOMAP_ALTERNATE( 4294 as_nodes_2nx2n, num_unique_nodes_2nx2n, ps_subpel_refine_ctxt, j, i); 4295 } 4296 } 4297 4298 /*********************************************************************************************/ 4299 /* If sad_1 < sad_2, then satd_1 need not be lesser than satd_2. Therefore, after conversion */ 4300 /* to satd, tot_cost_1 may not be lesser than tot_cost_2. So we need to sort the search nodes*/ 4301 /* for each partition again, based on the new costs */ 4302 /*********************************************************************************************/ 4303 /*********************************************************************************************/ 4304 /* Because right now, we store only the two best candidates for each partition, the sort will*/ 4305 /* converge to a simple swap. 
*/ 4306 /* ASSUMPTION : We store only two best results per partition */ 4307 /*********************************************************************************************/ 4308 if(ps_search_results->u1_num_results_per_part == 2) 4309 { 4310 if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] > 4311 ps_subpel_refine_ctxt->i2_tot_cost[1][index]) 4312 { 4313 SWAP( 4314 ps_subpel_refine_ctxt->i2_tot_cost[0][index], 4315 ps_subpel_refine_ctxt->i2_tot_cost[1][index]); 4316 4317 SWAP( 4318 ps_subpel_refine_ctxt->i2_mv_cost[0][index], 4319 ps_subpel_refine_ctxt->i2_mv_cost[1][index]); 4320 4321 SWAP( 4322 ps_subpel_refine_ctxt->i2_mv_x[0][index], 4323 ps_subpel_refine_ctxt->i2_mv_x[1][index]); 4324 4325 SWAP( 4326 ps_subpel_refine_ctxt->i2_mv_y[0][index], 4327 ps_subpel_refine_ctxt->i2_mv_y[1][index]); 4328 4329 SWAP( 4330 ps_subpel_refine_ctxt->i2_ref_idx[0][index], 4331 ps_subpel_refine_ctxt->i2_ref_idx[1][index]); 4332 4333 SWAP( 4334 ps_subpel_refine_ctxt->ai2_fullpel_satd[0][index], 4335 ps_subpel_refine_ctxt->ai2_fullpel_satd[1][index]); 4336 } 4337 } 4338 } 4339 4340 if(blk_8x8_mask == 0xf) 4341 { 4342 num_unique_nodes_2nx2n = 4343 MIN(num_unique_nodes_2nx2n, ps_prms->u1_max_num_subpel_refine_centers); 4344 } 4345 { 4346 x_off = gas_part_attr_in_cu[0].u1_x_start << e_cu_size; 4347 y_off = gas_part_attr_in_cu[0].u1_y_start << e_cu_size; 4348 x_off += ps_search_results->u1_x_off; 4349 y_off += ps_search_results->u1_y_off; 4350 i4_inp_off = x_off + y_off * ps_prms->i4_inp_stride; 4351 e_blk_size = ge_part_id_to_blk_size[e_cu_size][0]; 4352 4353 for(j = 0; j < num_unique_nodes_2nx2n; j++) 4354 { 4355 S32 pred_lx; 4356 ps_search_node = &as_nodes_2nx2n[j]; 4357 4358 if(ps_search_node->s_mv.i2_mvx == INTRA_MV) 4359 { 4360 continue; 4361 } 4362 4363 { 4364 S08 i1_ref_idx = ps_search_node->i1_ref_idx; 4365 subpel_dedup_enabler_t *ps_dedup_enabler = 4366 &(as_subpel_dedup_enabler[i1_ref_idx]); 4367 4368 if(ps_dedup_enabler->u1_ref_idx == MAX_NUM_REF) 4369 { 4370 
as_subpel_dedup_enabler[i1_ref_idx].i2_mv_x = ps_search_node->s_mv.i2_mvx; 4371 as_subpel_dedup_enabler[i1_ref_idx].i2_mv_y = ps_search_node->s_mv.i2_mvy; 4372 as_subpel_dedup_enabler[i1_ref_idx].u1_ref_idx = i1_ref_idx; 4373 memset( 4374 as_subpel_dedup_enabler[i1_ref_idx].au4_node_map, 4375 0, 4376 sizeof(U32) * 2 * MAP_X_MAX); 4377 } 4378 } 4379 4380 pred_lx = search_idx; 4381 ps_prms->pv_inp = 4382 (void *)(ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off); 4383 4384 hme_subpel_refine_search_node_high_speed( 4385 ps_search_node, 4386 ps_prms, 4387 ps_curr_layer, 4388 e_blk_size, 4389 x_off + ps_prms->i4_ctb_x_off, 4390 y_off + ps_prms->i4_ctb_y_off, 4391 ps_search_results, 4392 pred_lx, 4393 i4_part_mask, 4394 &ps_subpel_refine_ctxt->ai4_part_id[0], 4395 search_idx, 4396 &(as_subpel_dedup_enabler[ps_search_node->i1_ref_idx]), 4397 ps_func_selector, 4398 ps_me_optimised_function_list); 4399 } 4400 } 4401 } 4402 else 4403 { 4404 for(i = 0; i < ps_subpel_refine_ctxt->i4_num_valid_parts; i++) 4405 { 4406 S32 i4_index; 4407 4408 S32 i4_part_id = ps_subpel_refine_ctxt->ai4_part_id[i]; 4409 4410 if(ps_subpel_refine_ctxt->i4_num_valid_parts > 8) 4411 { 4412 i4_index = i4_part_id; 4413 } 4414 else 4415 { 4416 i4_index = i; 4417 } 4418 4419 for(j = 0; j < ps_search_results->u1_num_results_per_part; j++) 4420 { 4421 ps_subpel_refine_ctxt->i2_mv_x[j][i4_index] <<= 2; 4422 ps_subpel_refine_ctxt->i2_mv_y[j][i4_index] <<= 2; 4423 } 4424 } 4425 } 4426 4427 hme_subpel_refine_struct_to_search_results_struct_converter( 4428 ps_subpel_refine_ctxt, ps_search_results, search_idx, ps_prms->e_me_quality_presets); 4429 } 4430