1 /****************************************************************************** 2 * 3 * Copyright (C) 2018 The Android Open Source Project 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ***************************************************************************** 18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore 19 */ 20 21 /*! 22 ****************************************************************************** 23 * \file ihevce_decomp_pre_intra_pass.c 24 * 25 * \brief 26 * This file contains definitions related to frame decomposition done during 27 * pre intra processing 28 * 29 * \date 30 * 19/02/2013 31 * 32 * \author 33 * Ittiam 34 * 35 * List of Functions 36 * ihevce_intra_populate_mode_bits_cost() 37 * ihevce_8x8_sad_computer() 38 * ihevce_4x4_sad_computer() 39 * ihevce_ed_4x4_find_best_modes() 40 * ihevce_ed_calc_4x4_blk() 41 * ihevce_ed_calc_8x8_blk() 42 * ihevce_ed_calc_incomplete_ctb() 43 * ihevce_cu_level_qp_mod() 44 * ihevce_ed_calc_ctb() 45 * ihevce_ed_frame_init() 46 * ihevce_scale_by_2() 47 * ihevce_decomp_pre_intra_process_row() 48 * ihevce_decomp_pre_intra_process() 49 * ihevce_decomp_pre_intra_get_num_mem_recs() 50 * ihevce_decomp_pre_intra_get_mem_recs() 51 * ihevce_decomp_pre_intra_init() 52 * ihevce_decomp_pre_intra_frame_init() 53 * ihevce_merge_sort() 54 * ihevce_decomp_pre_intra_curr_frame_pre_intra_deinit() 55 * 56 
****************************************************************************** 57 */ 58 59 /*****************************************************************************/ 60 /* File Includes */ 61 /*****************************************************************************/ 62 /* System include files */ 63 #include <stdio.h> 64 #include <string.h> 65 #include <stdlib.h> 66 #include <assert.h> 67 #include <stdarg.h> 68 #include <math.h> 69 #include <limits.h> 70 71 /* User include files */ 72 #include "ihevc_typedefs.h" 73 #include "itt_video_api.h" 74 #include "ihevce_api.h" 75 76 #include "rc_cntrl_param.h" 77 #include "rc_frame_info_collector.h" 78 #include "rc_look_ahead_params.h" 79 80 #include "ihevc_defs.h" 81 #include "ihevc_debug.h" 82 #include "ihevc_structs.h" 83 #include "ihevc_platform_macros.h" 84 #include "ihevc_deblk.h" 85 #include "ihevc_itrans_recon.h" 86 #include "ihevc_chroma_itrans_recon.h" 87 #include "ihevc_chroma_intra_pred.h" 88 #include "ihevc_intra_pred.h" 89 #include "ihevc_inter_pred.h" 90 #include "ihevc_mem_fns.h" 91 #include "ihevc_padding.h" 92 #include "ihevc_weighted_pred.h" 93 #include "ihevc_sao.h" 94 #include "ihevc_resi_trans.h" 95 #include "ihevc_quant_iquant_ssd.h" 96 #include "ihevc_cabac_tables.h" 97 98 #include "ihevce_defs.h" 99 #include "ihevce_hle_interface.h" 100 #include "ihevce_lap_enc_structs.h" 101 #include "ihevce_multi_thrd_structs.h" 102 #include "ihevce_multi_thrd_funcs.h" 103 #include "ihevce_me_common_defs.h" 104 #include "ihevce_had_satd.h" 105 #include "ihevce_error_codes.h" 106 #include "ihevce_bitstream.h" 107 #include "ihevce_cabac.h" 108 #include "ihevce_rdoq_macros.h" 109 #include "ihevce_function_selector.h" 110 #include "ihevce_enc_structs.h" 111 #include "ihevce_entropy_structs.h" 112 #include "ihevce_cmn_utils_instr_set_router.h" 113 #include "ihevce_ipe_instr_set_router.h" 114 #include "ihevce_decomp_pre_intra_structs.h" 115 #include "ihevce_decomp_pre_intra_pass.h" 116 #include 
"ihevce_enc_loop_structs.h"
#include "hme_datatype.h"
#include "hme_interface.h"
#include "hme_common_defs.h"
#include "ihevce_global_tables.h"

/*****************************************************************************/
/* Typedefs                                                                  */
/*****************************************************************************/

/* Common signature for the CTB-level early-decision (ED) workers: one
 * implementation handles complete CTBs, another handles incomplete
 * (frame-border) CTBs. The caller dispatches through this pointer so row
 * processing does not need to special-case the border. */
typedef void (*pf_ed_calc_ctb)(
    ihevce_ed_ctxt_t *ps_ed_ctxt,
    ihevce_ed_blk_t *ps_ed_ctb,
    ihevce_ed_ctb_l1_t *ps_ed_ctb_l1,
    UWORD8 *pu1_src,
    WORD32 src_stride,
    WORD32 num_4x4_blks_x,
    WORD32 num_4x4_blks_y,
    WORD32 *nbr_flags,
    WORD32 i4_layer_id,
    WORD32 row_block_no,
    WORD32 col_block_no,
    ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list,
    ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list);

/*****************************************************************************/
/* Constant Macros                                                           */
/*****************************************************************************/
/* NOTE(review): these thresholds appear to tune the SATD-based noise
 * detection / CU-level QP modulation done elsewhere in this file — exact
 * units (SATD per 4x4/8x8 block) should be confirmed against the consumers. */
#define SATD_NOISE_FLOOR_THRESHOLD 16
#define MINIMUM_VARIANCE 15
#define SCALE_FACTOR_VARIANCE 20
#define SCALE_FACTOR_VARIANCE_8x8 60
#define MIN_SATD_THRSHLD 0
#define MAX_SATD_THRSHLD 64
#define SUB_NOISE_THRSHLD 0
#define MIN_BLKS 2

/*****************************************************************************/
/* Global variables                                                          */
/*****************************************************************************/

/**
*****************************************************************************
* @brief list of pointers to luma intra pred functions, indexed via
*        g_i4_ip_funcs[mode]; populated during encoder init (not here)
*****************************************************************************
*/
pf_intra_pred g_apf_lum_ip[NUM_IP_FUNCS];

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/

/*!
168 ****************************************************************************** 169 * \if Function name : ihevce_intra_populate_mode_bits_cost \endif 170 * 171 * \brief: look-up table of cost of signalling an intra mode in the 172 * bitstream 173 * 174 ***************************************************************************** 175 */ 176 void ihevce_intra_populate_mode_bits_cost( 177 WORD32 top_intra_mode, 178 WORD32 left_intra_mode, 179 WORD32 available_top, 180 WORD32 available_left, 181 WORD32 cu_pos_y, 182 UWORD16 *mode_bits_cost, 183 WORD32 lambda) 184 { 185 WORD32 i; 186 // 5.5 * lambda 187 UWORD16 five_bits_cost = COMPUTE_RATE_COST_CLIP30(11, lambda, (LAMBDA_Q_SHIFT + 1)); 188 189 (void)top_intra_mode; 190 (void)left_intra_mode; 191 (void)available_top; 192 (void)available_left; 193 (void)cu_pos_y; 194 for(i = 0; i < NUM_MODES; i++) 195 { 196 mode_bits_cost[i] = five_bits_cost; 197 } 198 } 199 200 /*! 201 ****************************************************************************** 202 * \if Function name : ihevce_8x8_sad_computer \endif 203 * 204 * \brief: compute sad between 2 8x8 blocks 205 * 206 ***************************************************************************** 207 */ 208 UWORD16 209 ihevce_8x8_sad_computer(UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd) 210 { 211 UWORD16 sad = 0; 212 WORD32 i, j; 213 214 for(i = 0; i < 8; i++) 215 { 216 for(j = 0; j < 8; j++) 217 { 218 sad += ABS(*pu1_src - *pu1_pred); 219 pu1_src++; 220 pu1_pred++; 221 } 222 pu1_src = pu1_src + (src_strd - 8); 223 pu1_pred = pu1_pred + (pred_strd - 8); 224 } 225 226 return sad; 227 } 228 229 /*! 
******************************************************************************
* \if Function name : ihevce_4x4_sad_computer \endif
*
* \brief: compute sad between 2 4x4 blocks
*
*****************************************************************************
*/
UWORD16
ihevce_4x4_sad_computer(UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD32 src_strd, WORD32 pred_strd)
{
    UWORD16 sad = 0;
    WORD32 i, j;

    for(i = 0; i < 4; i++)
    {
        for(j = 0; j < 4; j++)
        {
            sad += ABS(*pu1_src - *pu1_pred);
            pu1_src++;
            pu1_pred++;
        }
        /* Rewind past the 4 consumed pixels and step to the next row */
        pu1_src = pu1_src + (src_strd - 4);
        pu1_pred = pu1_pred + (pred_strd - 4);
    }

    return sad;
}

/*!
******************************************************************************
* \if Function name : ihevce_ed_4x4_find_best_modes \endif
*
* \brief: evaluate input 4x4 block for pre-selected list of angular and normal
* intra modes and return best sad, cost
*
* Outputs: pu1_best_modes[0]/pu1_best_sad_costs[0] hold the best angular
* mode and its cost; index [1] holds the best non-angular (planar/DC)
* result, written only when u1_low_resol is 1 (L1/L2 layers).
*
*****************************************************************************
*/
void ihevce_ed_4x4_find_best_modes(
    UWORD8 *pu1_src,
    WORD32 src_stride,
    UWORD8 *ref,
    UWORD16 *mode_bits_cost,
    UWORD8 *pu1_best_modes,
    WORD32 *pu1_best_sad_costs,
    WORD32 u1_low_resol,
    FT_SAD_COMPUTER *pf_4x4_sad_computer)
{
    WORD32 i;
    UWORD8 mode = 0, best_amode = 0, best_nmode = 0;
    UWORD8 pred[16];
    WORD32 sad = 0;
    WORD32 sad_cost = 0;
    WORD32 best_asad_cost = 0xFFFFF;
    WORD32 best_nsad_cost = 0xFFFFF;

    /* If lower layers, l1 or l2, all the 11 modes are evaluated */
    /* If L0 layer, all modes excluding DC and Planar are evaluated */
    if(1 == u1_low_resol)
        i = 0;
    else
        i = 2;

    /* Find the best non-angular and angular mode till level 4 */
    for(; i < 11; i++)
    {
        mode = gau1_modes_to_eval[i];
        g_apf_lum_ip[g_i4_ip_funcs[mode]](&ref[0], 0, &pred[0], 4, 4, mode);
        sad = pf_4x4_sad_computer(pu1_src, &pred[0], src_stride, 4);
        sad_cost = sad;
        sad_cost += mode_bits_cost[mode];
        /* Modes 0/1 are planar/DC (non-angular); track them separately */
        if(mode < 2)
        {
            if(sad_cost < best_nsad_cost)
            {
                best_nmode = mode;
                best_nsad_cost = sad_cost;
            }
        }
        else
        {
            if(sad_cost < best_asad_cost)
            {
                best_amode = mode;
                best_asad_cost = sad_cost;
            }
        }
    }

    pu1_best_modes[0] = best_amode;
    pu1_best_sad_costs[0] = best_asad_cost;

    /* Accumulate the best non-angular mode and cost for the l1 and l2 layers */
    if(1 == u1_low_resol)
    {
        pu1_best_modes[1] = best_nmode;
        pu1_best_sad_costs[1] = best_nsad_cost;
    }
}

/*!
******************************************************************************
* \if Function name : ihevce_ed_calc_4x4_blk \endif
*
* \brief: evaluate input 4x4 block for all intra modes and return best sad &
* cost
*
* Coarse-to-fine search: a level-4 angular sweep (via the optimised
* find-best-modes kernel), then a +/-2 refinement, and for better-quality
* presets a further +/-1 refinement. Winner (angular vs non-angular) is
* written into ps_ed->best_mode.
*
*****************************************************************************
*/
static void ihevce_ed_calc_4x4_blk(
    ihevce_ed_blk_t *ps_ed,
    UWORD8 *pu1_src,
    WORD32 src_stride,
    UWORD8 *ref,
    UWORD16 *mode_bits_cost,
    WORD32 *sad_ptr,
    WORD32 *pi4_best_satd,
    WORD32 i4_quality_preset,
    WORD32 *pi4_best_sad_cost,
    ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list)
{
    WORD32 i, i_end;
    UWORD8 mode, best_amode, best_nmode;
    UWORD8 pred[16];

    UWORD16 sad;
    WORD32 sad_cost = 0;
    WORD32 best_asad_cost = 0xFFFFF;
    WORD32 best_nsad_cost = 0xFFFFF;

    UWORD8 au1_best_modes[2];
    WORD32 ai4_best_sad_costs[2];

    /* L1/L2 resolution hence low resolution enable */
    WORD32 u1_low_resol = 1;

    UWORD8 modes_to_eval[2];

    /* The *pi4_best_satd will be consumed only if current
    layer has odd number of 4x4 blocks in either x or y
    direction. But the function hme_derive_num_layers() makes
    sure that every layer has width and height such that each one
    is a multiple of 16. Which makes pi4_best_satd useless. Hence
    feel free to remove pi4_best_satd. Concluded on 29th Aug13 */
    *pi4_best_satd = -1;
    ps_ipe_optimised_function_list->pf_ed_4x4_find_best_modes(
        pu1_src,
        src_stride,
        ref,
        mode_bits_cost,
        au1_best_modes,
        ai4_best_sad_costs,
        u1_low_resol,
        ps_ipe_optimised_function_list->pf_4x4_sad_computer);

    best_nmode = au1_best_modes[1];
    best_amode = au1_best_modes[0];
    best_nsad_cost = ai4_best_sad_costs[1];
    best_asad_cost = ai4_best_sad_costs[0];

    /* Update of pi4_best_satd here is needed iff the mode given by
    ihevce_ed_4x4_find_best_modes() comes out to be
    the best mode at the end of the function */
    *pi4_best_satd = best_asad_cost - mode_bits_cost[best_amode];

    /* Around best level 4 angular mode, search for best level 2 mode */
    /* NOTE(review): when best_amode == 3, best_amode - 2 evaluates mode 1
       (DC) in the "angular" refinement — presumably harmless since costs
       still compare correctly; confirm intent. */
    modes_to_eval[0] = best_amode - 2;
    modes_to_eval[1] = best_amode + 2;
    i = 0;
    i_end = 2;
    if(best_amode == 2)
        i = 1;
    else if(best_amode == 34)
        i_end = 1;
    for(; i < i_end; i++)
    {
        mode = modes_to_eval[i];
        g_apf_lum_ip[g_i4_ip_funcs[mode]](&ref[0], 0, &pred[0], 4, 4, mode);
        sad = ps_ipe_optimised_function_list->pf_4x4_sad_computer(pu1_src, &pred[0], src_stride, 4);
        sad_cost = sad;
        sad_cost += mode_bits_cost[mode];
        if(sad_cost < best_asad_cost)
        {
            best_amode = mode;
            best_asad_cost = sad_cost;
            *pi4_best_satd = sad;
        }
        sad_ptr[mode] = sad;
    }

    /*To be done : Add a flag here instead of preset condn*/
    if((i4_quality_preset < IHEVCE_QUALITY_P4))
    {
        /* Around best level 2 angular mode, search for best level 1 mode */
        modes_to_eval[0] = best_amode - 1;
        modes_to_eval[1] = best_amode + 1;
        i = 0;
        i_end = 2;
        if(best_amode == 2)
            i = 1;
        else if(best_amode == 34)
            i_end = 1;
        for(; i < i_end; i++)
        {
            mode = modes_to_eval[i];
            g_apf_lum_ip[g_i4_ip_funcs[mode]](&ref[0], 0, &pred[0], 4, 4, mode);
            sad = ps_ipe_optimised_function_list->pf_4x4_sad_computer(
                pu1_src, &pred[0], src_stride, 4);
            sad_cost = sad;
            sad_cost += mode_bits_cost[mode];
            if(sad_cost < best_asad_cost)
            {
                best_amode = mode;
                best_asad_cost = sad_cost;
                *pi4_best_satd = sad;
            }
            sad_ptr[mode] = sad;
        }
    }

    /* Final angular-vs-non-angular decision on total (sad + mode bits) cost */
    if(best_asad_cost < best_nsad_cost)
    {
        ps_ed->best_mode = best_amode;
        *pi4_best_sad_cost = best_asad_cost;
    }
    else
    {
        ps_ed->best_mode = best_nmode;
        *pi4_best_sad_cost = best_nsad_cost;
    }
    ps_ed->intra_or_inter = 0;
    ps_ed->merge_success = 0;
}

/*!
******************************************************************************
* \if Function name : ihevce_ed_calc_8x8_blk \endif
*
* \brief: evaluate input 8x8 block for intra modes basing on the intra mode
* decisions made at 4x4 level. This function also makes a decision whether
* to split blk in to 4x4 partitions or not.
 *
 *****************************************************************************
 */
static void ihevce_ed_calc_8x8_blk(
    ihevce_ed_ctxt_t *ps_ed_ctxt,
    ihevce_ed_blk_t *ps_ed_8x8,
    UWORD8 *pu1_src,
    WORD32 src_stride,
    WORD32 *nbr_flags_ptr,
    WORD32 *top_intra_mode_ptr,
    WORD32 *left_intra_mode_ptr,
    WORD32 cu_pos_y,
    WORD32 lambda,
    WORD32 *sad_ptr_8x8,
    WORD32 *pi4_best_satd,
    WORD32 i4_layer_id,
    WORD32 i4_quality_preset,
    WORD32 i4_slice_type,
    WORD32 *pi4_best_sad_cost_8x8_l1_ipe,
    WORD32 *pi4_best_sad_8x8_l1_ipe,
    WORD32 *pi4_sum_4x4_satd,
    WORD32 *pi4_min_4x4_satd,
    ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list,
    ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list)
{
    WORD32 i, j;
    WORD32 nbr_flags, nbr_flags_TR;
    UWORD8 *pu1_src_4x4;
    WORD32 top_available;
    WORD32 left_available;
    /* ps_ed_4x4 walks the four 4x4 sub-block records starting at ps_ed_8x8 */
    ihevce_ed_blk_t *ps_ed_4x4 = ps_ed_8x8;
    WORD32 top_intra_mode;
    WORD32 left_intra_mode;
    WORD32 next_left_intra_mode;
    WORD32 *sad_ptr = sad_ptr_8x8;
    UWORD8 *pu1_src_arr[4];
    WORD32 i4_4x4_best_sad_cost[4];
    func_selector_t *ps_func_selector = ps_ed_ctxt->ps_func_selector;
    ihevc_intra_pred_luma_ref_substitution_ft *pf_intra_pred_luma_ref_substitution =
        ps_func_selector->ihevc_intra_pred_luma_ref_substitution_fptr;

    (void)i4_slice_type;

    /* Compute ref samples for 8x8 merge block */
    nbr_flags = nbr_flags_ptr[0];
    nbr_flags_TR = nbr_flags_ptr[1];

    /* For the merged 8x8, top-right availability comes from the neighbour
       4x4's flags; bottom-left is normalised from this block's own flags */
    if(CHECK_TR_AVAILABLE(nbr_flags_TR))
    {
        SET_TR_AVAILABLE(nbr_flags);
    }
    else
    {
        SET_TR_UNAVAILABLE(nbr_flags);
    }

    if(CHECK_BL_AVAILABLE(nbr_flags))
    {
        SET_BL_AVAILABLE(nbr_flags);
    }
    else
    {
        SET_BL_UNAVAILABLE(nbr_flags);
    }

    /* call the function which populates ref data for intra prediction */
    pf_intra_pred_luma_ref_substitution(
        pu1_src - src_stride - 1,
        pu1_src - src_stride,
        pu1_src - 1,
        src_stride,
        8,
        nbr_flags,
        &ps_ed_ctxt->au1_ref_8x8[0][0],
        0);

    /* Evaluate the four 4x4 sub-blocks in raster order inside the 8x8;
       ps_ed records are stored in z-scan order via ps_ed_4x4 increment */
    for(i = 0; i < 2; i++)
    {
        pu1_src_4x4 = pu1_src + i * 4 * src_stride;
        cu_pos_y += i * 4;
        next_left_intra_mode = left_intra_mode_ptr[i];
        for(j = 0; j < 2; j++)
        {
            WORD32 i4_best_satd;
            pu1_src_arr[i * 2 + j] = pu1_src_4x4;
            nbr_flags = nbr_flags_ptr[i * 8 + j];
            top_intra_mode = top_intra_mode_ptr[j];
            left_intra_mode = next_left_intra_mode;
            /* call the function which populates ref data for intra prediction */
            pf_intra_pred_luma_ref_substitution(
                pu1_src_4x4 - src_stride - 1,
                pu1_src_4x4 - src_stride,
                pu1_src_4x4 - 1,
                src_stride,
                4,
                nbr_flags,
                &ps_ed_ctxt->au1_ref_full_ctb[i * 2 + j][0],
                0);

            top_available = CHECK_T_AVAILABLE(nbr_flags);
            left_available = CHECK_L_AVAILABLE(nbr_flags);
            /* call the function which populates sad cost for all the modes */
            ihevce_intra_populate_mode_bits_cost(
                top_intra_mode,
                left_intra_mode,
                top_available,
                left_available,
                cu_pos_y,
                &ps_ed_ctxt->au2_mode_bits_cost_full_ctb[i * 2 + j][0],
                lambda);
            ihevce_ed_calc_4x4_blk(
                ps_ed_4x4,
                pu1_src_4x4,
                src_stride,
                &ps_ed_ctxt->au1_ref_full_ctb[i * 2 + j][0],
                &ps_ed_ctxt->au2_mode_bits_cost_full_ctb[i * 2 + j][0],
                sad_ptr,
                &i4_best_satd,
                i4_quality_preset,
                &i4_4x4_best_sad_cost[i * 2 + j],
                ps_ipe_optimised_function_list);

            /* Propagate the decided mode as top/left context for neighbours */
            top_intra_mode_ptr[j] = ps_ed_4x4->best_mode;
            next_left_intra_mode = ps_ed_4x4->best_mode;
            pu1_src_4x4 += 4;
            ps_ed_4x4 += 1;
            sad_ptr += NUM_MODES;
        }
        left_intra_mode_ptr[i] = next_left_intra_mode;
    }

    /* 8x8 merge */
    {
        UWORD8 modes_to_eval[6];
        WORD32 sad;
        UWORD8 pred[16];
        UWORD8 pred_8x8[64] = { 0 };
        WORD32 merge_success;
        UWORD8 mode;

        ps_ed_4x4 = ps_ed_8x8;
        mode = (ps_ed_4x4)->best_mode;

        *pi4_best_satd = -1;

        /* Stage-1 merge: succeeds only when all four 4x4 best modes agree */
        merge_success =
            ((((ps_ed_4x4)->best_mode == (ps_ed_4x4 + 1)->best_mode) +
              ((ps_ed_4x4)->best_mode == (ps_ed_4x4 + 2)->best_mode) +
              ((ps_ed_4x4)->best_mode == (ps_ed_4x4 + 3)->best_mode)) == 3);

        {
            WORD32 i4_satd;
            //UWORD16 au2_4x4_sad_cost_array[4];/*SAD of 4x4 blocks*/
            UWORD16 u2_sum_best_4x4_sad_cost; /*Sum of 4x4 sad costs*/
            UWORD16 u2_sum_best_4x4_satd_cost; /*Sum of 4x4 satd costs*/
            UWORD8 u1_best_8x8_mode; /*8x8 mode.*/
            UWORD16 u2_best_8x8_cost; /*8x8 Cost. Can store SATD/SAD cost*/
            WORD32 i4_best_8x8_sad_satd; /* SATD/SAD value of 8x8 block*/
            UWORD16 au2_8x8_costs[6] = { 0 }; /*Cost of 8x8 block for 6 modes*/
            UWORD8 u1_cond_4x4_satd; /*condition if 4x4 SATD needs to be done*/
            UWORD8 u1_cond_8x8_satd; /*condition if 8x8 SATD needs to be done*/
            UWORD8 u1_good_quality;
            WORD32 i4_merge_success_stage2;

            /*Initialization*/
            *pi4_best_satd = 0;
            u2_best_8x8_cost = (UWORD16)(-1) /*max value*/;
            u2_sum_best_4x4_sad_cost = 0;
            *pi4_sum_4x4_satd = -1;
            *pi4_min_4x4_satd = 0x7FFFFFFF;
            i4_best_8x8_sad_satd = 0;
            u2_sum_best_4x4_satd_cost = 0;
            u1_best_8x8_mode = ps_ed_4x4->best_mode;

            /*We thought of "replacing" SATDs by SADs for 4x4 vs 8x8 decision
            for speed improvement, but it gave opposite results. Setting
            good_quality to 1 in order to throw away the idea of "replacing".*/
            u1_good_quality = 1;
            //u1_good_quality = ((i4_quality_preset != IHEVCE_QUALITY_P5)
            //    && (i4_quality_preset != IHEVCE_QUALITY_P4));

            /*Needed to disable some processing based on speed preset*/
            i4_merge_success_stage2 = 0;

            /*Store SAD cost of 4x4 blocks */
            for(i = 0; i < 4; i++)
            {
                //au2_4x4_sad_cost_array[i] = (ps_ed_4x4 + i)->best_sad_cost;
                u2_sum_best_4x4_sad_cost +=
                    i4_4x4_best_sad_cost[i]; //(ps_ed_4x4 + i)->best_sad_cost;
                modes_to_eval[i] = (ps_ed_4x4 + i)->best_mode;
                /*NOTE_01: i4_4x4_satd is not used anywhere at present.
                Setting it to zero to avoid ASSERT failure */
                /*Now taken care of incomplete CTB*/
                //(ps_ed_4x4 + i)->i4_4x4_satd = 0;
            }

            /*Calculate SATD/SAD for 4x4 blocks*/
            /*For (layer_2 && high_speed): No need to get 4x4 SATDs bcoz
            it won't have any impact on quality but speed will improve.*/
            u1_cond_4x4_satd = ((1 == i4_layer_id) || (u1_good_quality && (!merge_success)));

            if(u1_cond_4x4_satd)
            {
                *pi4_sum_4x4_satd = 0;
                /*FYI: 1. Level 2 doesn't need the SATD.
                2. The 4x4 vs. 8x8 decision for high_speed will
                happen based on SAD. */
                /*Get SATD for 4x4 blocks */
                for(i = 0; i < 4; i++)
                {
                    mode = modes_to_eval[i];
                    g_apf_lum_ip[g_i4_ip_funcs[mode]](
                        &ps_ed_ctxt->au1_ref_full_ctb[i][0], 0, &pred[0], 4, 4, mode);

                    i4_satd = ps_cmn_utils_optimised_function_list->pf_HAD_4x4_8bit(
                        pu1_src_arr[i], src_stride, &pred[0], 4, NULL, 0);

                    {
                        /*Save 4x4 satd in ed blk struct */
                        (ps_ed_4x4 + i)->i4_4x4_satd = i4_satd;
                    }

                    /*(ps_ed_4x4 + i)->i4_4x4_satd = i4_satd; // See NOTE_01*/
                    u2_sum_best_4x4_satd_cost +=
                        ((UWORD16)i4_satd + ps_ed_ctxt->au2_mode_bits_cost_full_ctb[i][mode]);
                    *pi4_best_satd += i4_satd;
                }
            }
            /* Not being used in current code */
            else /* (Level_2 && extreme_speed) */
            {
                /******DONT ENTER HERE AT aNY COST***************************/
                /* Transistor killers lie ahead!!!!!!! */
                /*This else part is not getting executed as of now*/
                if(2 != i4_layer_id)
                    ASSERT(0);
                /*Update values by SAD_cost_array */
                for(i = 0; i < 4; i++)
                {
                    mode = modes_to_eval[i];
                    //u2_sum_best_4x4_satd_cost += au2_4x4_sad_cost_array[i];
                    //sad = (WORD32)((ps_ed_4x4 + i)->best_sad_cost - ps_ed_ctxt->au2_mode_bits_cost_full_ctb[i][mode]);
                    sad = (WORD32)(
                        i4_4x4_best_sad_cost[i] - ps_ed_ctxt->au2_mode_bits_cost_full_ctb[i][mode]);
                    *pi4_sum_4x4_satd += sad;
                    /*(ps_ed_4x4 + i)->i4_4x4_satd = sad;// See NOTE_01*/
                    *pi4_best_satd += sad;

                    if(*pi4_min_4x4_satd > sad)
                        *pi4_min_4x4_satd = sad;
                }
            }
            if(!merge_success) /*If the modes are not identical*/
            {
                UWORD8 i1_start; /* no of modes to evaluate */
                UWORD8 ai1_modes[6];

                /* Prepare 6 candidates for 8x8 block. Two are DC and planar */
                ai1_modes[4] = 0;
                ai1_modes[5] = 1;
                i1_start = 4;

                /*Assign along with removing duplicates rest 4 candidates. */
                for(i = 3; i >= 0; i--)
                {
                    WORD8 i1_fresh_mode_flag = 1;
                    mode = modes_to_eval[i];
                    /*Check if duplicate already exists in ai1_modes*/
                    for(j = i1_start; j < 6; j++)
                    {
                        if(mode == ai1_modes[j])
                            i1_fresh_mode_flag = 0;
                    }
                    if(i1_fresh_mode_flag)
                    {
                        i1_start--;
                        ai1_modes[i1_start] = mode;
                    }
                }

                /*Calculate SATD/SAD of 8x8 block for all modes*/
                /*If (u1_good_quality == 0) then SATD gets replaced by SAD*/
                if(u1_good_quality && (i4_quality_preset <= IHEVCE_QUALITY_P4))
                {
                    //7.5 * lambda to incorporate transform flags
                    u2_sum_best_4x4_satd_cost +=
                        (COMPUTE_RATE_COST_CLIP30(12, lambda, (LAMBDA_Q_SHIFT + 1)));

                    /*Loop over all modes for calculating SATD*/
                    for(i = i1_start; i < 6; i++)
                    {
                        mode = ai1_modes[i];
                        g_apf_lum_ip[g_i4_ip_funcs[mode]](
                            &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred_8x8[0], 8, 8, mode);

                        i4_satd = ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
                            pu1_src_arr[0], src_stride, &pred_8x8[0], 8, NULL, 0);

                        au2_8x8_costs[i] =
                            ((UWORD16)i4_satd + ps_ed_ctxt->au2_mode_bits_cost_full_ctb[0][mode]);

                        /*Update data corresponding to least 8x8 cost */
                        if(au2_8x8_costs[i] <= u2_best_8x8_cost)
                        {
                            u2_best_8x8_cost = au2_8x8_costs[i];
                            i4_best_8x8_sad_satd = i4_satd;
                            u1_best_8x8_mode = mode;
                        }
                    }
                    /*8x8 vs 4x4 decision based on SATD values*/
                    if((u2_best_8x8_cost <= u2_sum_best_4x4_satd_cost) || (u2_best_8x8_cost <= 300))
                    {
                        i4_merge_success_stage2 = 1;
                    }

                    /* EIID: Early inter-intra decision */
                    /* Find the SAD based cost for 8x8 block for best mode */
                    if(/*(ISLICE != i4_slice_type) && */ (1 == i4_layer_id))
                    {
                        UWORD8 i4_best_8x8_mode = u1_best_8x8_mode;
                        WORD32 i4_best_8x8_sad_curr;

                        g_apf_lum_ip[g_i4_ip_funcs[i4_best_8x8_mode]](
                            &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred_8x8[0], 8, 8, i4_best_8x8_mode);

                        i4_best_8x8_sad_curr = ps_ipe_optimised_function_list->pf_8x8_sad_computer(
                            pu1_src_arr[0], &pred_8x8[0], src_stride, 8);

                        //register best sad in the context
                        //ps_ed_8x8->i4_best_sad_8x8_l1_ipe = i4_best_8x8_sad_curr;

                        //register the best cost in the context
                        //[0]th index is used since all 4 blocks are having same cost right now
                        //also it doesnt depends on mode. It only depends on the lambda

                        *pi4_best_sad_cost_8x8_l1_ipe =
                            i4_best_8x8_sad_curr +
                            ps_ed_ctxt->au2_mode_bits_cost_full_ctb[0][i4_best_8x8_mode];
                        *pi4_best_sad_8x8_l1_ipe = i4_best_8x8_sad_curr;
                    }
                }
                else /*If high_speed or extreme speed*/
                {
                    //7.5 * lambda to incorporate transform flags
                    u2_sum_best_4x4_sad_cost +=
                        (COMPUTE_RATE_COST_CLIP30(12, lambda, (LAMBDA_Q_SHIFT + 1)));

                    /*Loop over all modes for calculating SAD*/
                    for(i = i1_start; i < 6; i++)
                    {
                        mode = ai1_modes[i];
                        g_apf_lum_ip[g_i4_ip_funcs[mode]](
                            &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred_8x8[0], 8, 8, mode);

                        sad = ps_ipe_optimised_function_list->pf_8x8_sad_computer(
                            pu1_src_arr[0], &pred_8x8[0], src_stride, 8);

                        /* NOTE(review): '+=' here vs '=' in the SATD branch;
                           equivalent only because au2_8x8_costs is
                           zero-initialized — confirm before refactoring */
                        au2_8x8_costs[i] +=
                            ((UWORD16)sad + ps_ed_ctxt->au2_mode_bits_cost_full_ctb[0][mode]);

                        /*Find the data corresponding to least cost */
                        if(au2_8x8_costs[i] <= u2_best_8x8_cost)
                        {
                            u2_best_8x8_cost = au2_8x8_costs[i];
                            i4_best_8x8_sad_satd = sad;
                            u1_best_8x8_mode = mode;
                        }
                    }
                    /*8x8 vs 4x4 decision based on SAD values*/
                    if((u2_best_8x8_cost <= u2_sum_best_4x4_sad_cost) || (u2_best_8x8_cost <= 300))
                    {
                        i4_merge_success_stage2 = 1;
                    }

                    /* EIID: Early inter-intra decision */
                    /* Find the SAD based cost for 8x8 block for best mode */
                    if(/*(ISLICE != i4_slice_type) && */ (1 == i4_layer_id))
                    {
                        //UWORD8 i4_best_8x8_mode = u1_best_8x8_mode;
                        WORD32 i4_best_8x8_sad_cost_curr = u2_best_8x8_cost;

                        //register best sad in the context
                        //ps_ed_8x8->i4_best_sad_8x8_l1_ipe = i4_best_8x8_sad_curr;

                        //register the best cost in the context
                        *pi4_best_sad_cost_8x8_l1_ipe = i4_best_8x8_sad_cost_curr;
                        *pi4_best_sad_8x8_l1_ipe =
                            i4_best_8x8_sad_satd; //i4_best_8x8_sad_cost_curr;
                    }
                }
            }

            /***** Modes for 4x4 and 8x8 are decided before this point ****/
            if(merge_success || i4_merge_success_stage2)
            {
                /*FYI: 1. 8x8 SATD is not needed if merge is failed.
                2. For layer_2: SATD won't be calculated for 8x8. So
                the best_8x8_cost is SAD-cost. */

                /* Store the 8x8 level data in the first 4x4 block*/
                ps_ed_4x4->merge_success = 1;
                ps_ed_4x4->best_merge_mode = u1_best_8x8_mode;
                /* ps_ed_4x4->best_merge_sad_cost = u2_best_8x8_cost;
                This data is not getting consumed anywhere at present */

                top_intra_mode_ptr[0] = u1_best_8x8_mode;
                top_intra_mode_ptr[1] = u1_best_8x8_mode;
                left_intra_mode_ptr[0] = u1_best_8x8_mode;
                left_intra_mode_ptr[1] = u1_best_8x8_mode;

                /*If it is layer_1 and high_speed*/
                u1_cond_8x8_satd =
                    ((1 == i4_layer_id) &&
                     (merge_success || ((!u1_good_quality) && i4_merge_success_stage2)));
                if(u1_cond_8x8_satd)
                {
                    mode = u1_best_8x8_mode;
                    g_apf_lum_ip[g_i4_ip_funcs[mode]](
                        &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred_8x8[0], 8, 8, mode);

                    /* Faster presets fall back to SAD instead of Hadamard SATD */
                    if(i4_quality_preset > IHEVCE_QUALITY_P3)
                    {
                        i4_satd = ps_ipe_optimised_function_list->pf_8x8_sad_computer(
                            pu1_src_arr[0], &pred_8x8[0], src_stride, 8);
                    }
                    else
                    {
                        i4_satd = ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
                            pu1_src_arr[0], src_stride, &pred_8x8[0], 8, NULL, 0);
                    }
                    /* u2_best_8x8_cost = ((UWORD16)i4_satd + mode_bits_cost[0][mode]);
                    This data is not getting consumed at present */
                    i4_best_8x8_sad_satd = i4_satd;
                }
                *pi4_best_satd = i4_best_8x8_sad_satd;

                /* EIID: Early inter-intra decision */
                /* Find the SAD based cost for 8x8 block for best mode */
                if(/*(ISLICE != i4_slice_type) && */ (1 == i4_layer_id))
                {
                    UWORD8 i4_best_8x8_mode = u1_best_8x8_mode;
                    WORD32 i4_best_8x8_sad_curr;

                    g_apf_lum_ip[g_i4_ip_funcs[i4_best_8x8_mode]](
                        &ps_ed_ctxt->au1_ref_8x8[0][0], 0, &pred_8x8[0], 8, 8, i4_best_8x8_mode);

                    i4_best_8x8_sad_curr = ps_ipe_optimised_function_list->pf_8x8_sad_computer(
                        pu1_src_arr[0], &pred_8x8[0], src_stride, 8);
                    //register best sad in the context
                    //ps_ed_8x8->i4_best_sad_8x8_l1_ipe = i4_best_8x8_sad_curr;

                    //register the best cost in the context
                    //[0]th index is used since all 4 blocks are having same cost right now
                    //also it doesnt depends on mode. It only depends on the lambda

                    *pi4_best_sad_cost_8x8_l1_ipe =
                        i4_best_8x8_sad_curr +
                        ps_ed_ctxt->au2_mode_bits_cost_full_ctb[0][i4_best_8x8_mode];
                    *pi4_best_sad_8x8_l1_ipe = i4_best_8x8_sad_curr;

                } // EIID ends

            } //if(merge_success || i4_merge_success_stage2)
        }
    }
}

/*!
******************************************************************************
* \if Function name : ihevce_ed_calc_incomplete_ctb \endif
*
* \brief: performs L1 8x8 and 4x4 intra mode analysis
*
*****************************************************************************
*/
void ihevce_ed_calc_incomplete_ctb(
    ihevce_ed_ctxt_t *ps_ed_ctxt,
    ihevce_ed_blk_t *ps_ed_ctb,
    ihevce_ed_ctb_l1_t *ps_ed_ctb_l1,
    UWORD8 *pu1_src,
    WORD32 src_stride,
    WORD32 num_4x4_blks_x,
    WORD32 num_4x4_blks_y,
    WORD32 *nbr_flags,
    WORD32 i4_layer_id,
    WORD32 i4_row_block_no,
    WORD32 i4_col_block_no,
    ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list,
    ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list)
{
    WORD32 i, j, k;
    WORD32 z_scan_idx = 0;
    WORD32 z_scan_act_idx = 0;
    ihevc_intra_pred_luma_ref_substitution_ft *pf_intra_pred_luma_ref_substitution =
        ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_ref_substitution_fptr;
967 //UWORD8 ref[18]; 968 //WORD32 top_intra_modes[20]; 969 WORD32 *sad_ptr = &ps_ed_ctxt->sad[0]; 970 WORD32 lambda = ps_ed_ctxt->lambda; 971 //UWORD16 mode_bits_cost[NUM_MODES]; 972 973 UWORD8 *pu1_src_8x8; 974 ihevce_ed_blk_t *ps_ed_8x8, *ps_ed_4x4; 975 WORD32 *top_intra_mode_ptr; 976 WORD32 *left_intra_mode_ptr = ps_ed_ctxt->left_ctb_intra_modes; 977 WORD32 *nbr_flags_ptr; 978 WORD32 top_intra_mode; 979 WORD32 left_intra_mode; 980 WORD32 next_left_intra_mode; 981 WORD32 nbr_flag = 0; 982 WORD32 top_available; 983 WORD32 left_available; 984 UWORD8 *pu1_src_4x4; 985 WORD32 left_over_4x4_blks; 986 WORD32 i4_incomplete_sum_4x4_satd = 0; 987 WORD32 i4_incomplete_min_4x4_satd = 0x7FFFFFFF; 988 WORD32 i4_best_sad_cost_8x8_l1_ipe, i4_best_sad_8x8_l1_ipe, i4_sum_4x4_satd, i4_min_4x4_satd; 989 990 (void)i4_row_block_no; 991 (void)i4_col_block_no; 992 /*Find the modulated qp of 16*16 at L2 from 8*8 SATDs in L2 993 THis is used as 64*64 Qp in L0*/ 994 /*For Incomplete CTB, init all SATD to -1 and then popualate for the complete 8x8 blocks (CU 16 in L0)*/ 995 /* Not populated for 4x4 blocks (CU 8 in L0), can be done */ 996 /*Also, not 32x32 satd is not populated, as it would correspong to CU 64 and it is not an incomplete CTB */ 997 if(i4_layer_id == 1) 998 { 999 WORD32 i4_i; 1000 1001 for(i4_i = 0; i4_i < 64; i4_i++) 1002 { 1003 (ps_ed_ctb + i4_i)->i4_4x4_satd = -1; 1004 (ps_ed_ctb + i4_i)->i4_4x4_cur_satd = -1; 1005 } 1006 1007 for(i4_i = 0; i4_i < 16; i4_i++) 1008 { 1009 ps_ed_ctb_l1->i4_sum_4x4_satd[i4_i] = -2; 1010 ps_ed_ctb_l1->i4_min_4x4_satd[i4_i] = 0x7FFFFFFF; 1011 ps_ed_ctb_l1->i4_8x8_satd[i4_i][0] = -2; 1012 ps_ed_ctb_l1->i4_8x8_satd[i4_i][1] = -2; 1013 } 1014 1015 for(i4_i = 0; i4_i < 4; i4_i++) 1016 { 1017 ps_ed_ctb_l1->i4_16x16_satd[i4_i][0] = -2; 1018 ps_ed_ctb_l1->i4_16x16_satd[i4_i][1] = -2; 1019 ps_ed_ctb_l1->i4_16x16_satd[i4_i][2] = -2; 1020 } 1021 ps_ed_ctb_l1->i4_32x32_satd[0][0] = -2; 1022 ps_ed_ctb_l1->i4_32x32_satd[0][1] = -2; 1023 
ps_ed_ctb_l1->i4_32x32_satd[0][2] = -2; 1024 1025 ps_ed_ctb_l1->i4_32x32_satd[0][3] = -2; 1026 1027 for(i4_i = 0; i4_i < 16; i4_i++) 1028 { 1029 ps_ed_ctb_l1->i4_best_satd_8x8[i4_i] = -1; 1030 ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_ipe[i4_i] = -1; 1031 ps_ed_ctb_l1->i4_best_sad_8x8_l1_ipe[i4_i] = -1; 1032 ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_me[i4_i] = -1; 1033 ps_ed_ctb_l1->i4_sad_cost_me_for_ref[i4_i] = -1; 1034 ps_ed_ctb_l1->i4_sad_me_for_ref[i4_i] = -1; 1035 ps_ed_ctb_l1->i4_best_sad_8x8_l1_me[i4_i] = -1; 1036 1037 ps_ed_ctb_l1->i4_best_sad_8x8_l1_me_for_decide[i4_i] = -1; 1038 } 1039 } 1040 /* 1041 * src scan happens in raster scan order. ps_ed update happens in z-scan order. 1042 */ 1043 for(i = 0; i < num_4x4_blks_x; i++) 1044 { 1045 ps_ed_ctxt->ai4_top_intra_modes_ic_ctb[i] = INTRA_DC; 1046 } 1047 next_left_intra_mode = left_intra_mode_ptr[0]; 1048 for(i = 0; i < num_4x4_blks_y / 2; i++) 1049 { 1050 pu1_src_8x8 = pu1_src + i * 2 * 4 * src_stride; 1051 top_intra_mode_ptr = &ps_ed_ctxt->ai4_top_intra_modes_ic_ctb[0]; 1052 nbr_flags_ptr = &nbr_flags[0] + 2 * 8 * i; 1053 1054 for(j = 0; j < num_4x4_blks_x / 2; j++) 1055 { 1056 WORD32 i4_best_satd; 1057 // Multiply i by 16 since the 1058 // matrix is prepared for ctb_size = 64 1059 z_scan_idx = gau1_ctb_raster_to_zscan[i * 2 * 16 + j * 2]; 1060 z_scan_act_idx = gau1_ctb_raster_to_zscan[i * 16 + j]; 1061 ASSERT(z_scan_act_idx <= 15); 1062 ps_ed_8x8 = ps_ed_ctb + z_scan_idx; 1063 1064 ihevce_ed_calc_8x8_blk( 1065 ps_ed_ctxt, 1066 ps_ed_8x8, 1067 pu1_src_8x8, 1068 src_stride, 1069 nbr_flags_ptr, 1070 top_intra_mode_ptr, 1071 left_intra_mode_ptr, 1072 i * 8, 1073 lambda, 1074 sad_ptr + z_scan_idx * NUM_MODES, 1075 &i4_best_satd, 1076 i4_layer_id, 1077 ps_ed_ctxt->i4_quality_preset, 1078 ps_ed_ctxt->i4_slice_type, 1079 &i4_best_sad_cost_8x8_l1_ipe, 1080 &i4_best_sad_8x8_l1_ipe, 1081 &i4_sum_4x4_satd, 1082 &i4_min_4x4_satd, 1083 ps_ipe_optimised_function_list, 1084 ps_cmn_utils_optimised_function_list); 1085 1086 
ASSERT(i4_best_satd >= 0); 1087 if(i4_layer_id == 1) 1088 { 1089 ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_ipe[z_scan_act_idx] = 1090 i4_best_sad_cost_8x8_l1_ipe; 1091 ps_ed_ctb_l1->i4_best_sad_8x8_l1_ipe[z_scan_act_idx] = i4_best_sad_8x8_l1_ipe; 1092 ps_ed_ctb_l1->i4_best_satd_8x8[z_scan_act_idx] = i4_best_satd; 1093 ps_ed_ctxt->i8_sum_best_satd += i4_best_satd; 1094 ps_ed_ctxt->i8_sum_sq_best_satd += (i4_best_satd * i4_best_satd); 1095 //ps_ed_ctb_l1->i4_sum_4x4_satd[z_scan_act_idx] = i4_sum_4x4_satd; 1096 //ps_ed_ctb_l1->i4_min_4x4_satd[z_scan_act_idx] = i4_min_4x4_satd; 1097 } 1098 1099 pu1_src_8x8 += 8; 1100 //ps_ed_8x8 += 4; 1101 top_intra_mode_ptr += 2; 1102 nbr_flags_ptr += 2; 1103 } 1104 1105 next_left_intra_mode = left_intra_mode_ptr[0]; 1106 left_over_4x4_blks = (num_4x4_blks_x - (2 * (num_4x4_blks_x / 2))); 1107 left_over_4x4_blks = left_over_4x4_blks * 2; 1108 1109 pu1_src_4x4 = pu1_src_8x8; 1110 1111 i4_incomplete_sum_4x4_satd = 0; 1112 i4_incomplete_min_4x4_satd = 0x7FFFFFFF; 1113 1114 /* For leftover right 4x4 blks (num_4x4_blks_x - 2 *(num_4x4_blks_x/2))*/ 1115 for(k = 0; k < left_over_4x4_blks; k++) 1116 { 1117 WORD32 i4_best_satd; 1118 WORD32 i4_dummy_sad_cost; 1119 // Multiply i by 16 since the 1120 // matrix is prepared for ctb_size = 64 1121 ASSERT(left_over_4x4_blks == 2); 1122 z_scan_idx = gau1_ctb_raster_to_zscan[i * 2 * 16 + k * 16 + j * 2]; 1123 ps_ed_4x4 = ps_ed_ctb + z_scan_idx; 1124 1125 top_intra_mode = ps_ed_ctxt->ai4_top_intra_modes_ic_ctb[j]; 1126 left_intra_mode = next_left_intra_mode; 1127 1128 nbr_flag = nbr_flags[i * 2 * 8 + k * 8 + j * 2]; 1129 1130 /* call the function which populates ref data for intra predicion */ 1131 pf_intra_pred_luma_ref_substitution( 1132 pu1_src_4x4 - src_stride - 1, 1133 pu1_src_4x4 - src_stride, 1134 pu1_src_4x4 - 1, 1135 src_stride, 1136 4, 1137 nbr_flag, 1138 &ps_ed_ctxt->au1_ref_ic_ctb[0], 1139 0); 1140 1141 top_available = CHECK_T_AVAILABLE(nbr_flag); 1142 left_available = 
CHECK_L_AVAILABLE(nbr_flag); 1143 /* call the function which populates sad cost for all the modes */ 1144 ihevce_intra_populate_mode_bits_cost( 1145 top_intra_mode, 1146 left_intra_mode, 1147 top_available, 1148 left_available, 1149 i * 4, 1150 &ps_ed_ctxt->au2_mode_bits_cost_ic_ctb[0], 1151 lambda); 1152 1153 ihevce_ed_calc_4x4_blk( 1154 ps_ed_4x4, 1155 pu1_src_4x4, 1156 src_stride, 1157 &ps_ed_ctxt->au1_ref_ic_ctb[0], 1158 &ps_ed_ctxt->au2_mode_bits_cost_ic_ctb[0], 1159 sad_ptr + z_scan_idx * NUM_MODES, 1160 &i4_best_satd, 1161 ps_ed_ctxt->i4_quality_preset, 1162 &i4_dummy_sad_cost, 1163 ps_ipe_optimised_function_list); 1164 1165 ASSERT(i4_best_satd >= 0); 1166 if(i4_layer_id == 1) //Can we ignore this check? 1167 { 1168 z_scan_act_idx = gau1_ctb_raster_to_zscan[i * 16 + j]; 1169 /*Note : The satd population is not populated for last 4*4 block in incomplete CTB */ 1170 /* Which corresponds to CU 8 in L0 */ 1171 1172 /*MAM_VAR_L1 */ 1173 i4_incomplete_sum_4x4_satd = i4_incomplete_sum_4x4_satd + i4_best_satd; 1174 if(i4_incomplete_min_4x4_satd >= i4_best_satd) 1175 i4_incomplete_min_4x4_satd = i4_best_satd; 1176 ps_ed_ctxt->i8_sum_best_satd += i4_best_satd; 1177 ps_ed_ctxt->i8_sum_sq_best_satd += (i4_best_satd * i4_best_satd); 1178 if((k & 1) == 0) 1179 { 1180 ps_ed_ctb_l1->i4_best_satd_8x8[z_scan_act_idx] = 0; 1181 } 1182 ps_ed_ctb_l1->i4_best_satd_8x8[z_scan_act_idx] += i4_best_satd; 1183 } 1184 1185 ps_ed_ctxt->ai4_top_intra_modes_ic_ctb[j * 2] = ps_ed_4x4->best_mode; 1186 next_left_intra_mode = ps_ed_4x4->best_mode; 1187 pu1_src_4x4 += src_stride; 1188 left_intra_mode_ptr[k] = next_left_intra_mode; 1189 } 1190 left_intra_mode_ptr += 2; 1191 } 1192 1193 if(num_4x4_blks_y & 1) 1194 { 1195 /* For leftover bottom 4x4 blks. 
(num_4x4_blks_x) */ 1196 pu1_src_4x4 = pu1_src + i * 2 * 4 * src_stride; 1197 //memset(&ps_ed_ctb_l1->i4_best_satd_8x8[i][0],0,4*sizeof(WORD32)); 1198 for(j = 0; j < num_4x4_blks_x; j++) 1199 { 1200 WORD32 i4_best_satd; 1201 WORD32 i4_dummy_sad_cost; 1202 // Multiply i by 16 since the 1203 // matrix is prepared for ctb_size = 64 1204 z_scan_idx = gau1_ctb_raster_to_zscan[i * 2 * 16 + j]; 1205 ps_ed_4x4 = ps_ed_ctb + z_scan_idx; 1206 1207 if((j & 1) == 0) 1208 { 1209 i4_incomplete_sum_4x4_satd = 0; 1210 i4_incomplete_min_4x4_satd = 0x7FFFFFFF; 1211 } 1212 1213 top_intra_mode = ps_ed_ctxt->ai4_top_intra_modes_ic_ctb[j]; 1214 left_intra_mode = next_left_intra_mode; 1215 1216 nbr_flag = nbr_flags[i * 2 * 8 + j]; 1217 1218 /* call the function which populates ref data for intra predicion */ 1219 pf_intra_pred_luma_ref_substitution( 1220 pu1_src_4x4 - src_stride - 1, 1221 pu1_src_4x4 - src_stride, 1222 pu1_src_4x4 - 1, 1223 src_stride, 1224 4, 1225 nbr_flag, 1226 &ps_ed_ctxt->au1_ref_ic_ctb[0], 1227 0); 1228 1229 top_available = CHECK_T_AVAILABLE(nbr_flag); 1230 left_available = CHECK_L_AVAILABLE(nbr_flag); 1231 /* call the function which populates sad cost for all the modes */ 1232 ihevce_intra_populate_mode_bits_cost( 1233 top_intra_mode, 1234 left_intra_mode, 1235 top_available, 1236 left_available, 1237 i * 4, 1238 &ps_ed_ctxt->au2_mode_bits_cost_ic_ctb[0], 1239 lambda); 1240 1241 ihevce_ed_calc_4x4_blk( 1242 ps_ed_4x4, 1243 pu1_src_4x4, 1244 src_stride, 1245 &ps_ed_ctxt->au1_ref_ic_ctb[0], 1246 &ps_ed_ctxt->au2_mode_bits_cost_ic_ctb[0], 1247 sad_ptr + z_scan_idx * NUM_MODES, 1248 &i4_best_satd, 1249 ps_ed_ctxt->i4_quality_preset, 1250 &i4_dummy_sad_cost, 1251 ps_ipe_optimised_function_list); 1252 1253 /*Note : The satd population is not populated for last 4*4 block in incomplete CTB */ 1254 /* Which corresponds to CU 8 in L0 */ 1255 1256 /*MAM_VAR_L1 */ 1257 ASSERT(i4_best_satd >= 0); 1258 if(i4_layer_id == 1) //Can we ignore this check? 
1259 { 1260 z_scan_act_idx = gau1_ctb_raster_to_zscan[i * 16 + (j >> 1)]; 1261 if((j & 1) == 0) 1262 { 1263 ps_ed_ctb_l1->i4_best_satd_8x8[z_scan_act_idx] = 0; 1264 } 1265 ps_ed_ctb_l1->i4_best_satd_8x8[z_scan_act_idx] += i4_best_satd; 1266 ps_ed_ctxt->i8_sum_best_satd += i4_best_satd; 1267 ps_ed_ctxt->i8_sum_sq_best_satd += (i4_best_satd * i4_best_satd); 1268 i4_incomplete_sum_4x4_satd = i4_incomplete_sum_4x4_satd + i4_best_satd; 1269 if(i4_incomplete_min_4x4_satd >= i4_best_satd) 1270 i4_incomplete_min_4x4_satd = i4_best_satd; 1271 } 1272 1273 ps_ed_ctxt->ai4_top_intra_modes_ic_ctb[j] = ps_ed_4x4->best_mode; 1274 next_left_intra_mode = ps_ed_4x4->best_mode; 1275 pu1_src_4x4 += 4; 1276 } 1277 } 1278 left_intra_mode_ptr[0] = next_left_intra_mode; 1279 } 1280 1281 /*! 1282 ****************************************************************************** 1283 * \if Function name : ihevce_cu_level_qp_mod \endif 1284 * 1285 * \brief: Performs CU level QP modulation 1286 * 1287 ***************************************************************************** 1288 */ 1289 WORD32 ihevce_cu_level_qp_mod( 1290 WORD32 i4_qscale, 1291 WORD32 i4_satd, 1292 long double ld_curr_frame_log_avg_act, 1293 float f_mod_strength, 1294 WORD32 *pi4_act_factor, 1295 WORD32 *pi4_q_scale_mod, 1296 rc_quant_t *ps_rc_quant_ctxt) 1297 { 1298 WORD32 i4_temp_qscale; 1299 WORD32 i4_temp_qp; 1300 1301 if(i4_satd != -1) 1302 { 1303 WORD32 i4_loc_satd = i4_satd; 1304 if(i4_loc_satd < 1) 1305 { 1306 i4_loc_satd = 1; 1307 } 1308 if((WORD32)ld_curr_frame_log_avg_act == 0) 1309 { 1310 *pi4_act_factor = (1 << (QP_LEVEL_MOD_ACT_FACTOR)); 1311 } 1312 else 1313 { 1314 UWORD32 u4_log2_sq_cur_satd; 1315 ULWORD64 u8_sq_cur_satd; 1316 WORD32 qp_offset; 1317 1318 ASSERT(USE_SQRT_AVG_OF_SATD_SQR); 1319 u8_sq_cur_satd = (i4_loc_satd * i4_loc_satd); 1320 GET_POS_MSB_64(u4_log2_sq_cur_satd, u8_sq_cur_satd); 1321 if(ABS(( 1322 long double)(((1 << u4_log2_sq_cur_satd) * POW_2_TO_1_BY_4) - ((long double)u8_sq_cur_satd))) > 
1323 ABS(( 1324 long double)(((1 << u4_log2_sq_cur_satd) * POW_2_TO_3_BY_4) - ((long double)u8_sq_cur_satd)))) 1325 { 1326 u4_log2_sq_cur_satd += 1; 1327 } 1328 qp_offset = (WORD32)( 1329 f_mod_strength * 1330 (float)((long double)u4_log2_sq_cur_satd - ld_curr_frame_log_avg_act)); 1331 qp_offset = CLIP3(qp_offset, MIN_QP_MOD_OFFSET, MAX_QP_MOD_OFFSET); 1332 *pi4_act_factor = (WORD32)( 1333 gad_look_up_activity[qp_offset + ABS(MIN_QP_MOD_OFFSET)] * 1334 (1 << QP_LEVEL_MOD_ACT_FACTOR)); 1335 } 1336 1337 ASSERT(*pi4_act_factor > 0); 1338 i4_temp_qscale = ((i4_qscale * (*pi4_act_factor)) + (1 << (QP_LEVEL_MOD_ACT_FACTOR - 1))) >> 1339 QP_LEVEL_MOD_ACT_FACTOR; 1340 } 1341 else 1342 { 1343 i4_temp_qscale = i4_qscale; 1344 *pi4_act_factor = (1 << QP_LEVEL_MOD_ACT_FACTOR); 1345 } 1346 ASSERT(*pi4_act_factor > 0); 1347 1348 if(i4_temp_qscale > ps_rc_quant_ctxt->i2_max_qscale) 1349 { 1350 i4_temp_qscale = ps_rc_quant_ctxt->i2_max_qscale; 1351 } 1352 else if(i4_temp_qscale < ps_rc_quant_ctxt->i2_min_qscale) 1353 { 1354 i4_temp_qscale = ps_rc_quant_ctxt->i2_min_qscale; 1355 } 1356 /*store q scale for stat gen for I frame model*/ 1357 /*Here activity factor is not modified as the cu qp would be clipped in rd-opt stage*/ 1358 *pi4_q_scale_mod = i4_temp_qscale; 1359 i4_temp_qp = ps_rc_quant_ctxt->pi4_qscale_to_qp[i4_temp_qscale]; 1360 if(i4_temp_qp > ps_rc_quant_ctxt->i2_max_qp) 1361 { 1362 i4_temp_qp = ps_rc_quant_ctxt->i2_max_qp; 1363 } 1364 else if(i4_temp_qp < ps_rc_quant_ctxt->i2_min_qp) 1365 { 1366 i4_temp_qp = ps_rc_quant_ctxt->i2_min_qp; 1367 } 1368 return (i4_temp_qp); 1369 } 1370 1371 /*! 
1372 ****************************************************************************** 1373 * \if Function name : ihevce_ed_calc_ctb \endif 1374 * 1375 * \brief: performs L1 8x8 and 4x4 intra mode analysis 1376 * 1377 ***************************************************************************** 1378 */ 1379 void ihevce_ed_calc_ctb( 1380 ihevce_ed_ctxt_t *ps_ed_ctxt, 1381 ihevce_ed_blk_t *ps_ed_ctb, 1382 ihevce_ed_ctb_l1_t *ps_ed_ctb_l1, 1383 UWORD8 *pu1_src, 1384 WORD32 src_stride, 1385 WORD32 num_4x4_blks_x, 1386 WORD32 num_4x4_blks_y, 1387 WORD32 *nbr_flags, 1388 WORD32 i4_layer_id, 1389 WORD32 i4_row_block_no, 1390 WORD32 i4_col_block_no, 1391 ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list, 1392 ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list) 1393 { 1394 WORD32 i, j; 1395 WORD32 z_scan_idx = 0; 1396 WORD32 z_scan_act_idx = 0; 1397 ihevce_ed_blk_t *ps_ed_8x8; 1398 UWORD8 *pu1_src_8x8; 1399 1400 WORD32 top_intra_modes[20]; 1401 WORD32 *top_intra_mode_ptr; 1402 WORD32 *left_intra_mode_ptr = ps_ed_ctxt->left_ctb_intra_modes; 1403 1404 WORD32 *sad_ptr = &ps_ed_ctxt->sad[0]; 1405 WORD32 lambda = ps_ed_ctxt->lambda; 1406 WORD32 *nbr_flags_ptr; 1407 WORD32 i4_best_sad_cost_8x8_l1_ipe, i4_best_sad_8x8_l1_ipe, i4_sum_4x4_satd, i4_min_4x4_satd; 1408 1409 (void)num_4x4_blks_y; 1410 (void)i4_row_block_no; 1411 (void)i4_col_block_no; 1412 ASSERT(num_4x4_blks_x % 2 == 0); 1413 ASSERT(num_4x4_blks_y % 2 == 0); 1414 ASSERT((num_4x4_blks_x == 4) || (num_4x4_blks_x == 8)); 1415 ASSERT((num_4x4_blks_y == 4) || (num_4x4_blks_y == 8)); 1416 1417 if(i4_layer_id == 1) 1418 { 1419 WORD32 i4_i; 1420 1421 for(i4_i = 0; i4_i < 64; i4_i++) 1422 { 1423 (ps_ed_ctb + i4_i)->i4_4x4_satd = -1; 1424 (ps_ed_ctb + i4_i)->i4_4x4_cur_satd = -1; 1425 } 1426 1427 for(i4_i = 0; i4_i < 16; i4_i++) 1428 { 1429 ps_ed_ctb_l1->i4_sum_4x4_satd[i4_i] = -2; 1430 ps_ed_ctb_l1->i4_min_4x4_satd[i4_i] = 0x7FFFFFFF; 1431 ps_ed_ctb_l1->i4_8x8_satd[i4_i][0] = -2; 1432 
ps_ed_ctb_l1->i4_8x8_satd[i4_i][1] = -2; 1433 } 1434 1435 for(i4_i = 0; i4_i < 4; i4_i++) 1436 { 1437 ps_ed_ctb_l1->i4_16x16_satd[i4_i][0] = -2; 1438 ps_ed_ctb_l1->i4_16x16_satd[i4_i][1] = -2; 1439 ps_ed_ctb_l1->i4_16x16_satd[i4_i][2] = -2; 1440 } 1441 ps_ed_ctb_l1->i4_32x32_satd[0][0] = -2; 1442 ps_ed_ctb_l1->i4_32x32_satd[0][1] = -2; 1443 ps_ed_ctb_l1->i4_32x32_satd[0][2] = -2; 1444 ps_ed_ctb_l1->i4_32x32_satd[0][3] = -2; 1445 for(i4_i = 0; i4_i < 16; i4_i++) 1446 { 1447 ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_me[i4_i] = -2; 1448 ps_ed_ctb_l1->i4_sad_cost_me_for_ref[i4_i] = -2; 1449 ps_ed_ctb_l1->i4_sad_me_for_ref[i4_i] = -2; 1450 ps_ed_ctb_l1->i4_best_sad_8x8_l1_me[i4_i] = -2; 1451 1452 ps_ed_ctb_l1->i4_best_sad_8x8_l1_me_for_decide[i4_i] = -2; 1453 1454 ps_ed_ctb_l1->i4_best_satd_8x8[i4_i] = -2; 1455 ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_ipe[i4_i] = -2; 1456 ps_ed_ctb_l1->i4_best_sad_8x8_l1_ipe[i4_i] = -2; 1457 } 1458 } 1459 /* 1460 * src scan happens in raster scan order. ps_ed update happens in z-scan order. 
1461 */ 1462 for(i = 0; i < num_4x4_blks_x; i++) 1463 { 1464 top_intra_modes[i] = INTRA_DC; 1465 } 1466 for(i = 0; i < num_4x4_blks_x / 2; i++) 1467 { 1468 pu1_src_8x8 = pu1_src + i * 2 * 4 * src_stride; 1469 top_intra_mode_ptr = &top_intra_modes[0]; 1470 nbr_flags_ptr = &nbr_flags[0] + 2 * 8 * i; 1471 1472 for(j = 0; j < num_4x4_blks_x / 2; j++) 1473 { 1474 WORD32 i4_best_satd; 1475 ASSERT(i <= 3); 1476 ASSERT(j <= 3); 1477 1478 // Multiply i by 16 since the 1479 // matrix is prepared for ctb_size = 64 1480 z_scan_idx = gau1_ctb_raster_to_zscan[i * 2 * 16 + j * 2]; 1481 z_scan_act_idx = gau1_ctb_raster_to_zscan[i * 16 + j]; 1482 ASSERT(z_scan_act_idx <= 15); 1483 1484 ps_ed_8x8 = ps_ed_ctb + z_scan_idx; 1485 1486 ihevce_ed_calc_8x8_blk( 1487 ps_ed_ctxt, 1488 ps_ed_8x8, 1489 pu1_src_8x8, 1490 src_stride, 1491 nbr_flags_ptr, 1492 top_intra_mode_ptr, 1493 left_intra_mode_ptr, 1494 i * 8, 1495 lambda, 1496 sad_ptr + z_scan_idx * NUM_MODES, 1497 &i4_best_satd, 1498 i4_layer_id, 1499 ps_ed_ctxt->i4_quality_preset, 1500 ps_ed_ctxt->i4_slice_type, 1501 &i4_best_sad_cost_8x8_l1_ipe, 1502 &i4_best_sad_8x8_l1_ipe, 1503 &i4_sum_4x4_satd, 1504 &i4_min_4x4_satd, 1505 ps_ipe_optimised_function_list, 1506 ps_cmn_utils_optimised_function_list); 1507 1508 if(i4_layer_id == 1) 1509 { 1510 ps_ed_ctb_l1->i4_best_sad_cost_8x8_l1_ipe[z_scan_act_idx] = 1511 i4_best_sad_cost_8x8_l1_ipe; 1512 ps_ed_ctb_l1->i4_best_sad_8x8_l1_ipe[z_scan_act_idx] = i4_best_sad_8x8_l1_ipe; 1513 ps_ed_ctb_l1->i4_best_satd_8x8[z_scan_act_idx] = i4_best_satd; 1514 ps_ed_ctxt->i8_sum_best_satd += i4_best_satd; 1515 ps_ed_ctxt->i8_sum_sq_best_satd += (i4_best_satd * i4_best_satd); 1516 //ps_ed_ctb_l1->i4_sum_4x4_satd[z_scan_act_idx] = i4_sum_4x4_satd; 1517 //ps_ed_ctb_l1->i4_min_4x4_satd[z_scan_act_idx] = i4_min_4x4_satd; 1518 } 1519 1520 pu1_src_8x8 += 8; 1521 //ps_ed_8x8 += 4; 1522 top_intra_mode_ptr += 2; 1523 nbr_flags_ptr += 2; 1524 } 1525 left_intra_mode_ptr += 2; 1526 } 1527 } 1528 1529 /*! 
******************************************************************************
* \if Function name : ihevce_ed_frame_init \endif
*
* \brief: Initialize frame context for early decision
*
*****************************************************************************
*/
void ihevce_ed_frame_init(void *pv_ed_ctxt, WORD32 i4_layer_no)
{
    ihevce_ed_ctxt_t *ps_ed_ctxt = (ihevce_ed_ctxt_t *)pv_ed_ctxt;

    /* Hook the global luma intra-prediction dispatch table up to the
     * architecture-specific implementations chosen by the function selector.
     * Each IP_FUNC_MODE_* slot must map 1:1 to the corresponding HEVC luma
     * intra mode (or mode range) handler. */
    g_apf_lum_ip[IP_FUNC_MODE_0] = ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_planar_fptr;
    g_apf_lum_ip[IP_FUNC_MODE_1] = ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_dc_fptr;
    g_apf_lum_ip[IP_FUNC_MODE_2] = ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode2_fptr;
    g_apf_lum_ip[IP_FUNC_MODE_3TO9] =
        ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode_3_to_9_fptr;
    g_apf_lum_ip[IP_FUNC_MODE_10] = ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_horz_fptr;
    g_apf_lum_ip[IP_FUNC_MODE_11TO17] =
        ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode_11_to_17_fptr;
    g_apf_lum_ip[IP_FUNC_MODE_18_34] =
        ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode_18_34_fptr;
    g_apf_lum_ip[IP_FUNC_MODE_19TO25] =
        ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode_19_to_25_fptr;
    g_apf_lum_ip[IP_FUNC_MODE_26] = ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_ver_fptr;
    g_apf_lum_ip[IP_FUNC_MODE_27TO33] =
        ps_ed_ctxt->ps_func_selector->ihevc_intra_pred_luma_mode_27_to_33_fptr;

    /* reset the per-frame L1 SATD statistics accumulators */
    if(i4_layer_no == 1)
    {
        ps_ed_ctxt->i8_sum_best_satd = 0;
        ps_ed_ctxt->i8_sum_sq_best_satd = 0;
    }
}

/**
********************************************************************************
*
* @brief downscales by 2 in horz and vertical direction, creates output of
*        size wd/2 * ht/2
*
* @param[in] pu1_src : source pointer
* @param[in] src_stride : source stride
* @param[out] pu1_dst :
destination pointer. Starting of a row.
* @param[in] dst_stride : destination stride
* @param[in] wd : width
* @param[in] ht : height
* @param[in] pu1_wkg_mem : working memory (at least of size CEIL16(wd) * ht))
* @param[in] ht_offset : height offset of the block to be scaled
* @param[in] block_ht : height of the block to be scaled
* @param[in] wd_offset : width offset of the block to be scaled
* @param[in] block_wd : width of the block to be scaled
*
* @return void
*
* @remarks Assumption made block_ht should be multiple of 2. LANCZOS_SCALER
*
********************************************************************************
*/
void ihevce_scaling_filter_mxn(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_scrtch,
    WORD32 scrtch_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 ht,
    WORD32 wd)
{
#define FILT_TAP_Q 8
#define N_TAPS 7
    /* symmetric 7-tap filter in Q8; taps sum to 256 so DC is preserved */
    const WORD16 i4_ftaps[N_TAPS] = { -18, 0, 80, 132, 80, 0, -18 };
    WORD32 i, j;
    WORD32 tmp;
    UWORD8 *pu1_src_tmp = pu1_src - 3 * src_strd;
    UWORD8 *pu1_scrtch_tmp = pu1_scrtch;

    /* horizontal filtering */
    /* rows [-3, ht+1] are filtered so the vertical pass below has its
     * +/-3 row neighbourhood available in the scratch buffer */
    for(i = -3; i < ht + 2; i++)
    {
        for(j = 0; j < wd; j += 2)
        {
            /* decimate by 2 horizontally: one output per even column */
            tmp = (i4_ftaps[3] * pu1_src_tmp[j] +
                   i4_ftaps[2] * (pu1_src_tmp[j - 1] + pu1_src_tmp[j + 1]) +
                   i4_ftaps[1] * (pu1_src_tmp[j + 2] + pu1_src_tmp[j - 2]) +
                   i4_ftaps[0] * (pu1_src_tmp[j + 3] + pu1_src_tmp[j - 3]) +
                   (1 << (FILT_TAP_Q - 1))) >>
                  FILT_TAP_Q;
            pu1_scrtch_tmp[j >> 1] = CLIP_U8(tmp);
        }
        pu1_scrtch_tmp += scrtch_strd;
        pu1_src_tmp += src_strd;
    }
    /* vertical filtering */
    /* skip the 3 rows of top context written above */
    pu1_scrtch_tmp = pu1_scrtch + 3 * scrtch_strd;
    for(i = 0; i < ht; i += 2)
    {
        for(j = 0; j < (wd >> 1); j++)
        {
            /* decimate by 2 vertically: one output row per even input row */
            tmp =
                (i4_ftaps[3] * pu1_scrtch_tmp[j] +
                 i4_ftaps[2] * (pu1_scrtch_tmp[j + scrtch_strd] + pu1_scrtch_tmp[j - scrtch_strd]) +
                 i4_ftaps[1] *
                     (pu1_scrtch_tmp[j + 2 * scrtch_strd] + pu1_scrtch_tmp[j - 2 * scrtch_strd]) +
                 i4_ftaps[0] *
                     (pu1_scrtch_tmp[j + 3 * scrtch_strd] + pu1_scrtch_tmp[j - 3 * scrtch_strd]) +
                 (1 << (FILT_TAP_Q - 1))) >>
                FILT_TAP_Q;
            pu1_dst[j] = CLIP_U8(tmp);
        }
        pu1_dst += dst_strd;
        pu1_scrtch_tmp += (scrtch_strd << 1);
    }
}

/* Downscales one block of a layer by 2 in each direction and pads the scaled
 * output at picture boundaries (16-pel borders for the next layer's search).
 * Boundary blocks are first copied into a local buffer (au1_cpy) with a
 * 3-pel filter-support margin on each side, which is edge-extended before
 * filtering. */
void ihevce_scale_by_2(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 wd,
    WORD32 ht,
    UWORD8 *pu1_wkg_mem,
    WORD32 ht_offset,
    WORD32 block_ht,
    WORD32 wd_offset,
    WORD32 block_wd,
    FT_COPY_2D *pf_copy_2d,
    FT_SCALING_FILTER_BY_2 *pf_scaling_filter_mxn)
{
#define N_TAPS 7
#define MAX_BLK_SZ (MAX_CTB_SIZE + ((N_TAPS >> 1) << 1))
    UWORD8 au1_cpy[MAX_BLK_SZ * MAX_BLK_SZ];
    UWORD32 cpy_strd = MAX_BLK_SZ;
    /* point at the (0,0) pel of the block inside the margined copy buffer */
    UWORD8 *pu1_cpy = au1_cpy + cpy_strd * (N_TAPS >> 1) + (N_TAPS >> 1);

    UWORD8 *pu1_in, *pu1_out;
    WORD32 in_strd, wkg_mem_strd;

    WORD32 row_start, row_end;
    WORD32 col_start, col_end;
    WORD32 i, fun_select;
    WORD32 ht_tmp, wd_tmp;
    FT_SCALING_FILTER_BY_2 *ihevce_scaling_filters[2];

    assert((wd & 1) == 0);
    assert((ht & 1) == 0);
    assert(block_wd <= MAX_CTB_SIZE);
    assert(block_ht <= MAX_CTB_SIZE);

    /* function pointers for filtering different dimensions */
    ihevce_scaling_filters[0] = ihevce_scaling_filter_mxn;
    ihevce_scaling_filters[1] = pf_scaling_filter_mxn;

    /* handle boundary blks */
    /* a block is "boundary" when the 3-pel filter support would fall
     * outside the picture on that side */
    col_start = (wd_offset < (N_TAPS >> 1)) ? 1 : 0;
    row_start = (ht_offset < (N_TAPS >> 1)) ? 1 : 0;
    col_end = ((wd_offset + block_wd) > (wd - (N_TAPS >> 1))) ? 1 : 0;
    row_end = ((ht_offset + block_ht) > (ht - (N_TAPS >> 1))) ? 1 : 0;
    if(col_end && (wd % block_wd != 0))
    {
        block_wd = (wd % block_wd);
    }
    if(row_end && (ht % block_ht != 0))
    {
        block_ht = (ht % block_ht);
    }

    /* boundary blks needs to be padded, copy src to tmp buffer */
    if(col_start || col_end || row_end || row_start)
    {
        UWORD8 *pu1_src_tmp = pu1_src + wd_offset + ht_offset * src_strd;

        /* also copy the 3-pel margin on the sides where it exists in the
         * source; missing sides are synthesized by the padding below */
        pu1_cpy -= (3 * (1 - col_start) + cpy_strd * 3 * (1 - row_start));
        pu1_src_tmp -= (3 * (1 - col_start) + src_strd * 3 * (1 - row_start));
        ht_tmp = block_ht + 3 * (1 - row_start) + 3 * (1 - row_end);
        wd_tmp = block_wd + 3 * (1 - col_start) + 3 * (1 - col_end);
        pf_copy_2d(pu1_cpy, cpy_strd, pu1_src_tmp, src_strd, wd_tmp, ht_tmp);
        pu1_in = au1_cpy + cpy_strd * 3 + 3;
        in_strd = cpy_strd;
    }
    else
    {
        pu1_in = pu1_src + wd_offset + ht_offset * src_strd;
        in_strd = src_strd;
    }

    /*top padding*/
    /* replicate the first block row into the 3 margin rows above */
    if(row_start)
    {
        UWORD8 *pu1_cpy_tmp = au1_cpy + cpy_strd * 3;

        pu1_cpy = au1_cpy + cpy_strd * (3 - 1);
        memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
        pu1_cpy -= cpy_strd;
        memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
        pu1_cpy -= cpy_strd;
        memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
    }

    /*bottom padding*/
    /* replicate the last block row into the 3 margin rows below */
    if(row_end)
    {
        UWORD8 *pu1_cpy_tmp = au1_cpy + cpy_strd * 3 + (block_ht - 1) * cpy_strd;

        pu1_cpy = pu1_cpy_tmp + cpy_strd;
        memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
        pu1_cpy += cpy_strd;
        memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
        pu1_cpy += cpy_strd;
        memcpy(pu1_cpy, pu1_cpy_tmp, block_wd + 6);
    }

    /*left padding*/
    /* replicate the first block column into the 3 margin columns */
    if(col_start)
    {
        UWORD8 *pu1_cpy_tmp = au1_cpy + 3;

        pu1_cpy = au1_cpy;
        for(i = 0; i < block_ht + 6; i++)
        {
            pu1_cpy[0] = pu1_cpy[1] = pu1_cpy[2] = pu1_cpy_tmp[0];
            pu1_cpy += cpy_strd;
            pu1_cpy_tmp += cpy_strd;
        }
    }

    /*right padding*/
    /* replicate the last block column into the 3 margin columns */
    if(col_end)
    {
        UWORD8 *pu1_cpy_tmp = au1_cpy + 3 + block_wd - 1;

        pu1_cpy = au1_cpy + 3 + block_wd;
        for(i = 0; i < block_ht + 6; i++)
        {
            pu1_cpy[0] = pu1_cpy[1] = pu1_cpy[2] = pu1_cpy_tmp[0];
            pu1_cpy += cpy_strd;
            pu1_cpy_tmp += cpy_strd;
        }
    }

    wkg_mem_strd = block_wd >> 1;
    pu1_out = pu1_dst + (wd_offset >> 1);
    /* use the optimised filter only when the width allows its SIMD stride */
    fun_select = (block_wd % 16 == 0);
    ihevce_scaling_filters[fun_select](
        pu1_in, in_strd, pu1_wkg_mem, wkg_mem_strd, pu1_out, dst_strd, block_ht, block_wd);

    /* Left padding of 16 for 1st block of every row */
    if(wd_offset == 0)
    {
        UWORD8 u1_val;
        WORD32 pad_wd = 16;
        WORD32 pad_ht = block_ht >> 1;
        UWORD8 *dst = pu1_dst;

        for(i = 0; i < pad_ht; i++)
        {
            u1_val = dst[0];
            memset(&dst[-pad_wd], u1_val, pad_wd);
            dst += dst_strd;
        }
    }

    if(wd == wd_offset + block_wd)
    {
        /* Right padding of (16 + (CEIL16(wd/2))-wd/2) for last block of every row */
        /* Right padding is done only after processing of last block of that row is done*/
        UWORD8 u1_val;
        WORD32 pad_wd = 16 + CEIL16((wd >> 1)) - (wd >> 1) + 4;
        WORD32 pad_ht = block_ht >> 1;
        UWORD8 *dst = pu1_dst + (wd >> 1) - 1;

        for(i = 0; i < pad_ht; i++)
        {
            u1_val = dst[0];
            memset(&dst[1], u1_val, pad_wd);
            dst += dst_strd;
        }

        if(ht_offset == 0)
        {
            /* Top padding of 16 is done for 1st row only after we reach end of that row */
            WORD32 pad_wd = dst_strd;
            WORD32 pad_ht = 16;
            UWORD8 *dst = pu1_dst - 16;

            for(i = 1; i <= pad_ht; i++)
            {
                memcpy(dst - (i * dst_strd), dst, pad_wd);
            }
        }

        /* Bottom padding of (16 + (CEIL16(ht/2)) - ht/2) is done only if we have
        reached end of frame */
        if(ht - ht_offset - block_ht == 0)
        {
            WORD32 pad_wd = dst_strd;
            WORD32 pad_ht = 16 + CEIL16((ht >> 1)) - (ht >> 1) + 4;
            UWORD8 *dst = pu1_dst + (((block_ht >> 1) - 1) * dst_strd) - 16;

            for(i = 1; i <= pad_ht; i++)
                memcpy(dst + (i * dst_strd), dst, pad_wd);
        }
    }
}

/*!
******************************************************************************
* \if Function name : ihevce_decomp_pre_intra_process_row \endif
*
* \brief
*  Row level function which down scales a given row by 2 in horz and
*  vertical direction creates output of size wd/2 * ht/2.
*
* @param[in] pu1_src : source pointer
* @param[in] src_stride : source stride
* @param[out] pu1_dst : destination pointer
* @param[in] dst_stride : destination stride
* @param[in] layer_wd : layer width
* @param[in] layer_ht : layer height
* @param[in] ht_offset : height offset of the block to be scaled
* @param[in] block_ht : height of the block to be scaled
* @param[in] wd_offset : width offset of the block to be scaled
* @param[in] block_wd : width of the block to be scaled
* @param[in] num_col_blks : number of col blks in that row
*
* \return None
*
* @NOTE : When decomposition is done from L1 to L2, pre intra analysis is
*         done on L1
*
*****************************************************************************
*/
void ihevce_decomp_pre_intra_process_row(
    UWORD8 *pu1_src,
    WORD32 src_stride,
    UWORD8 *pu1_dst_decomp,
    WORD32 dst_stride,
    WORD32 layer_wd,
    WORD32 layer_ht,
    UWORD8 *pu1_wkg_mem,
    WORD32 ht_offset,
    WORD32 block_ht,
    WORD32 block_wd,
    WORD32 i4_cu_aligned_pic_wd,
    WORD32 i4_cu_aligned_pic_ht,
    WORD32 num_col_blks,
    WORD32 layer_no,
    ihevce_ed_ctxt_t *ps_ed_ctxt,
    ihevce_ed_blk_t *ps_ed_row,
    ihevce_ed_ctb_l1_t *ps_ed_ctb_l1_row,
    ihevce_8x8_L0_satd_t *ps_layer0_cur_satd,
    ihevce_8x8_L0_mean_t *ps_layer0_cur_mean,
    WORD32 num_4x4_blks_ctb_y,
    WORD32 num_4x4_blks_last_ctb_x,
WORD32 skip_decomp, 1885 WORD32 skip_pre_intra, 1886 WORD32 row_block_no, 1887 WORD32 i4_enable_noise_detection, 1888 ctb_analyse_t *ps_ctb_analyse, 1889 ihevce_ipe_optimised_function_list_t *ps_ipe_optimised_function_list, 1890 ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list) 1891 { 1892 WORD32 col_block_no; 1893 1894 //ihevce_ed_ctxt_t *ps_ed_ctxt = (ihevce_ed_ctxt_t *)pv_ed_ctxt; 1895 UWORD8 *pu1_src_pre_intra = pu1_src + (ht_offset * src_stride); 1896 WORD32 num_4x4_blks_in_ctb = block_wd >> 2; 1897 //WORD32 nbr_flags[64]; 1898 WORD32 *nbr_flags_ptr = &ps_ed_ctxt->ai4_nbr_flags[0]; 1899 WORD32 src_inc_pre_intra = num_4x4_blks_in_ctb * 4; 1900 WORD32 inc_ctb = 0; 1901 ihevce_ed_blk_t *ps_ed_ctb = ps_ed_row; 1902 ihevce_ed_ctb_l1_t *ps_ed_ctb_l1 = ps_ed_ctb_l1_row; 1903 WORD32 i, j; 1904 WORD32 do_pre_intra_analysis; 1905 pf_ed_calc_ctb ed_calc_ctb; 1906 ctb_analyse_t *ps_ctb_analyse_curr; 1907 1908 (void)i4_cu_aligned_pic_wd; 1909 (void)i4_cu_aligned_pic_ht; 1910 (void)ps_layer0_cur_satd; 1911 (void)ps_layer0_cur_mean; 1912 (void)i4_enable_noise_detection; 1913 /*increment the struct pointer to point to the first CTB of the current row. */ 1914 ps_ctb_analyse_curr = ps_ctb_analyse + row_block_no * num_col_blks; 1915 1916 //if((num_4x4_blks_ctb_x == num_4x4_blks_ctb_y) && (num_4x4_blks_in_ctb == num_4x4_blks_ctb_x) ) 1917 if(num_4x4_blks_in_ctb == num_4x4_blks_ctb_y) 1918 { 1919 ed_calc_ctb = ihevce_ed_calc_ctb; 1920 } 1921 else 1922 { 1923 ed_calc_ctb = ihevce_ed_calc_incomplete_ctb; 1924 } 1925 1926 inc_ctb = num_4x4_blks_in_ctb * num_4x4_blks_in_ctb; 1927 1928 do_pre_intra_analysis = ((layer_no == 1) || (layer_no == 2)) && (!skip_pre_intra); 1929 1930 /* 1931 * For optimal pre intra analysis first block is processed outside 1932 * the loop. 
1933 */ 1934 if(!skip_decomp) 1935 { 1936 ihevce_scale_by_2( 1937 pu1_src, 1938 src_stride, 1939 pu1_dst_decomp, 1940 dst_stride, 1941 layer_wd, 1942 layer_ht, 1943 pu1_wkg_mem, 1944 ht_offset, 1945 block_ht, 1946 block_wd * 0, 1947 block_wd, 1948 ps_cmn_utils_optimised_function_list->pf_copy_2d, 1949 ps_ipe_optimised_function_list->pf_scaling_filter_mxn); 1950 /* Disable noise detection */ 1951 ps_ctb_analyse_curr->s_ctb_noise_params.i4_noise_present = 0; 1952 1953 memset( 1954 ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy, 1955 0, 1956 sizeof(ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy)); 1957 } 1958 1959 /* 1960 * Pre intra analysis for the first ctb. 1961 * To analyse any given CTB we need to set the availability flags of the 1962 * following neighbouring CTB: BL,L,TL,T,TR. 1963 */ 1964 if(do_pre_intra_analysis) 1965 { 1966 /* 1967 * At the beginning of ctb row set left intra modes to default value. 1968 */ 1969 for(j = 0; j < num_4x4_blks_ctb_y; j++) 1970 { 1971 ps_ed_ctxt->left_ctb_intra_modes[j] = INTRA_DC; 1972 } 1973 1974 /* 1975 * Copy the neighbor flags for a general ctb (ctb inside the frame; not any corners). 1976 * The table gau4_nbr_flags_8x8_4x4blks generated for 16x16 4x4 blocks(ctb_size = 64). 1977 * But the same table holds good for other 4x4 blocks 2d arrays(eg 8x8 4x4 blks,4x4 4x4blks). 1978 * But the flags must be accessed with stride of 16 since the table has been generated for 1979 * ctb_size = 64. For odd 4x4 2d arrays(eg 3x3 4x4 blks) the flags needs modification. 1980 * The flags also need modification for corner ctbs. 
1981 */ 1982 memcpy( 1983 ps_ed_ctxt->ai4_nbr_flags, 1984 gau4_nbr_flags_8x8_4x4blks, 1985 sizeof(gau4_nbr_flags_8x8_4x4blks)); 1986 1987 /* 1988 * Since this is the fist ctb in the ctb row, set left flags unavailable for 1st CTB col 1989 */ 1990 for(j = 0; j < num_4x4_blks_ctb_y; j++) 1991 { 1992 SET_L_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]); 1993 SET_BL_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]); 1994 SET_TL_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]); 1995 } 1996 /* 1997 * If this is the fist ctb row, set top flags unavailable. 1998 */ 1999 if(ht_offset == 0) 2000 { 2001 for(j = 0; j < num_4x4_blks_in_ctb; j++) 2002 { 2003 SET_T_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j]); 2004 SET_TR_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j]); 2005 SET_TL_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j]); 2006 } 2007 } 2008 2009 /* If this is last ctb row,set BL as not available. */ 2010 if(ht_offset + block_ht >= layer_ht) 2011 { 2012 for(j = 0; j < num_4x4_blks_in_ctb; j++) 2013 { 2014 SET_BL_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[(num_4x4_blks_ctb_y - 1) * 8 + j]); 2015 } 2016 } 2017 col_block_no = 0; 2018 /* Call intra analysis for the ctb */ 2019 ed_calc_ctb( 2020 ps_ed_ctxt, 2021 ps_ed_ctb, 2022 ps_ed_ctb_l1, 2023 pu1_src_pre_intra, 2024 src_stride, 2025 num_4x4_blks_in_ctb, 2026 num_4x4_blks_ctb_y, 2027 nbr_flags_ptr, 2028 layer_no, 2029 row_block_no, 2030 col_block_no, 2031 ps_ipe_optimised_function_list, 2032 ps_cmn_utils_optimised_function_list 2033 2034 ); 2035 2036 pu1_src_pre_intra += src_inc_pre_intra; 2037 ps_ed_ctb += inc_ctb; 2038 ps_ed_ctb_l1 += 1; 2039 /* 2040 * For the rest of the ctbs, set left flags available. 
2041 */ 2042 for(j = 0; j < num_4x4_blks_ctb_y; j++) 2043 { 2044 SET_L_AVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]); 2045 } 2046 for(j = 0; j < num_4x4_blks_ctb_y - 1; j++) 2047 { 2048 SET_BL_AVAILABLE(ps_ed_ctxt->ai4_nbr_flags[j * 8]); 2049 SET_TL_AVAILABLE(ps_ed_ctxt->ai4_nbr_flags[(j + 1) * 8]); 2050 } 2051 if(ht_offset != 0) 2052 { 2053 SET_TL_AVAILABLE(ps_ed_ctxt->ai4_nbr_flags[0]); 2054 } 2055 } 2056 2057 /* The first ctb is processed before the loop. 2058 * The last one is processed after the loop. 2059 */ 2060 for(col_block_no = 1; col_block_no < num_col_blks - 1; col_block_no++) 2061 { 2062 if(!skip_decomp) 2063 { 2064 ihevce_scale_by_2( 2065 pu1_src, 2066 src_stride, 2067 pu1_dst_decomp, 2068 dst_stride, 2069 layer_wd, 2070 layer_ht, 2071 pu1_wkg_mem, 2072 ht_offset, 2073 block_ht, 2074 block_wd * col_block_no, 2075 block_wd, 2076 ps_cmn_utils_optimised_function_list->pf_copy_2d, 2077 ps_ipe_optimised_function_list->pf_scaling_filter_mxn); 2078 /* Disable noise detection */ 2079 memset( 2080 ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy, 2081 0, 2082 sizeof(ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy)); 2083 2084 ps_ctb_analyse_curr->s_ctb_noise_params.i4_noise_present = 0; 2085 } 2086 2087 if(do_pre_intra_analysis) 2088 { 2089 ed_calc_ctb( 2090 ps_ed_ctxt, 2091 ps_ed_ctb, 2092 ps_ed_ctb_l1, 2093 pu1_src_pre_intra, 2094 src_stride, 2095 num_4x4_blks_in_ctb, 2096 num_4x4_blks_ctb_y, 2097 nbr_flags_ptr, 2098 layer_no, 2099 row_block_no, 2100 col_block_no, 2101 ps_ipe_optimised_function_list, 2102 ps_cmn_utils_optimised_function_list); 2103 pu1_src_pre_intra += src_inc_pre_intra; 2104 ps_ed_ctb += inc_ctb; 2105 ps_ed_ctb_l1 += 1; 2106 } 2107 } 2108 2109 /* Last ctb in row */ 2110 if((!skip_decomp) && (col_block_no == (num_col_blks - 1))) 2111 { 2112 ihevce_scale_by_2( 2113 pu1_src, 2114 src_stride, 2115 pu1_dst_decomp, 2116 dst_stride, 2117 layer_wd, 2118 layer_ht, 2119 pu1_wkg_mem, 2120 ht_offset, 2121 block_ht, 2122 block_wd * 
col_block_no, 2123 block_wd, 2124 ps_cmn_utils_optimised_function_list->pf_copy_2d, 2125 ps_ipe_optimised_function_list->pf_scaling_filter_mxn); 2126 { 2127 /* Disable noise detection */ 2128 memset( 2129 ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy, 2130 0, 2131 sizeof(ps_ctb_analyse_curr->s_ctb_noise_params.au1_is_8x8Blk_noisy)); 2132 2133 ps_ctb_analyse_curr->s_ctb_noise_params.i4_noise_present = 0; 2134 } 2135 } 2136 2137 if(do_pre_intra_analysis && (col_block_no == (num_col_blks - 1))) 2138 { 2139 /* 2140 * The last ctb can be complete or incomplete. The complete 2141 * ctb is handled in the if and incomplete is handled in the 2142 * else case 2143 */ 2144 //if(num_4x4_blks_last_ctb == num_4x4_blks_in_ctb) 2145 if((num_4x4_blks_last_ctb_x == num_4x4_blks_ctb_y) && 2146 (num_4x4_blks_in_ctb == num_4x4_blks_last_ctb_x)) 2147 { 2148 /* Last ctb so set top right not available */ 2149 SET_TR_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[num_4x4_blks_in_ctb - 1]); 2150 2151 ed_calc_ctb( 2152 ps_ed_ctxt, 2153 ps_ed_ctb, 2154 ps_ed_ctb_l1, 2155 pu1_src_pre_intra, 2156 src_stride, 2157 num_4x4_blks_in_ctb, 2158 num_4x4_blks_in_ctb, 2159 nbr_flags_ptr, 2160 layer_no, 2161 row_block_no, 2162 col_block_no, 2163 ps_ipe_optimised_function_list, 2164 ps_cmn_utils_optimised_function_list); 2165 pu1_src_pre_intra += src_inc_pre_intra; 2166 ps_ed_ctb += inc_ctb; 2167 ps_ed_ctb_l1 += 1; 2168 } 2169 else 2170 { 2171 /* Last ctb so set top right not available */ 2172 for(i = 0; i < num_4x4_blks_ctb_y; i++) 2173 { 2174 SET_TR_UNAVAILABLE(ps_ed_ctxt->ai4_nbr_flags[i * 8 + num_4x4_blks_in_ctb - 1]); 2175 } 2176 2177 ihevce_ed_calc_incomplete_ctb( 2178 ps_ed_ctxt, 2179 ps_ed_ctb, 2180 ps_ed_ctb_l1, 2181 pu1_src_pre_intra, 2182 src_stride, 2183 num_4x4_blks_last_ctb_x, 2184 num_4x4_blks_ctb_y, 2185 nbr_flags_ptr, 2186 layer_no, 2187 row_block_no, 2188 col_block_no, 2189 ps_ipe_optimised_function_list, 2190 ps_cmn_utils_optimised_function_list); 2191 } 2192 } 2193 } 2194 2195 /*! 
******************************************************************************
* \if Function name : ihevce_decomp_pre_intra_process \endif
*
* \brief
*    Frame level function to decompose the input layer L0 into the coarser
*    layers (L1, L2, ...) and to run early-decision (pre intra) analysis on
*    the decomposed layers. Work is distributed across threads through the
*    pre-enc job queue; each job corresponds to one CTB row of one layer.
*
* \param[in] pv_ctxt : pointer to master context of decomp_pre_intra module
* \param[in] ps_lap_out_prms : pointer to LAP output params (carries the
*            input yuv frame buffer and picture-type/temporal-id info)
* \param[in] ps_frm_ctb_prms : frame level CTB parameters
* \param[in] pv_multi_thrd_ctxt : pointer to multithread context
* \param[in] thrd_id : thread id
* \param[in] i4_ping_pong : ping-pong buffer index for pre-enc stages
* \param[out] ps_layer0_cur_satd : per-8x8 L0 SATD output (QP modulation)
* \param[out] ps_layer0_cur_mean : per-8x8 L0 mean output (QP modulation)
*
* \return
*    None
*
* \author
*    Ittiam
*
*****************************************************************************
*/
void ihevce_decomp_pre_intra_process(
    void *pv_ctxt,
    ihevce_lap_output_params_t *ps_lap_out_prms,
    frm_ctb_ctxt_t *ps_frm_ctb_prms,
    void *pv_multi_thrd_ctxt,
    WORD32 thrd_id,
    WORD32 i4_ping_pong,
    ihevce_8x8_L0_satd_t *ps_layer0_cur_satd,
    ihevce_8x8_L0_mean_t *ps_layer0_cur_mean)
{
    WORD32 i4_layer_no;
    WORD32 i4_num_layers;
    WORD32 end_of_layer;
    UWORD8 *pu1_src, *pu1_dst;
    WORD32 src_stride, dst_stride;
    WORD32 i4_layer_wd, i4_layer_ht;
    WORD32 ht_offset, block_ht;
    WORD32 row_block_no, num_row_blocks;
    UWORD8 *pu1_wkg_mem;
    WORD32 block_wd;
    WORD32 num_col_blks;
    WORD32 skip_decomp, skip_pre_intra;
    WORD32 i4_cu_aligned_pic_wd, i4_cu_aligned_pic_ht;
    ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt =
        (ihevce_decomp_pre_intra_master_ctxt_t *)pv_ctxt;

    /* this thread's private context */
    ihevce_decomp_pre_intra_ctxt_t *ps_ctxt =
        ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thrd_id];
    multi_thrd_ctxt_t *ps_multi_thrd = (multi_thrd_ctxt_t *)pv_multi_thrd_ctxt;

    ihevce_ed_ctxt_t *ps_ed_ctxt;
    ihevce_ed_blk_t *ps_ed;
    ihevce_ed_ctb_l1_t *ps_ed_ctb_l1;
    WORD32 inc_ctb = 0;
    WORD32 num_4x4_blks_lyr;

    i4_num_layers = ps_ctxt->i4_num_layers;

    ASSERT(i4_num_layers >= 3);

    /*
     * Always force minimum layers as 4 so that we would have both l1 and l2
     * pre intra analysis
     */
    if(i4_num_layers == 3)
    {
        i4_num_layers = 4;
    }

    /* L0 layer is the actual input frame; register it in the layer array */
    ps_ctxt->as_layers[0].pu1_inp = (UWORD8 *)ps_lap_out_prms->s_input_buf.pv_y_buf;
    ps_ctxt->as_layers[0].i4_inp_stride = ps_lap_out_prms->s_input_buf.i4_y_strd;
    ps_ctxt->as_layers[0].i4_actual_wd = ps_lap_out_prms->s_input_buf.i4_y_wd;
    ps_ctxt->as_layers[0].i4_actual_ht = ps_lap_out_prms->s_input_buf.i4_y_ht;

    /* ------------ Loop over all the layers --------------- */
    /* This loop does only decomp for all layers by picking jobs from job queue */
    /* Decomp for all layers will completed with this for loop */
    for(i4_layer_no = 0; i4_layer_no < (i4_num_layers - 1); i4_layer_no++)
    {
        WORD32 idx = 0;
        src_stride = ps_ctxt->as_layers[i4_layer_no].i4_inp_stride;
        pu1_src = ps_ctxt->as_layers[i4_layer_no].pu1_inp;
        i4_layer_wd = ps_ctxt->as_layers[i4_layer_no].i4_actual_wd;
        i4_layer_ht = ps_ctxt->as_layers[i4_layer_no].i4_actual_ht;
        pu1_dst = ps_ctxt->as_layers[i4_layer_no + 1].pu1_inp;
        dst_stride = ps_ctxt->as_layers[i4_layer_no + 1].i4_inp_stride;
        block_wd = ps_ctxt->as_layers[i4_layer_no].i4_decomp_blk_wd;
        block_ht = ps_ctxt->as_layers[i4_layer_no].i4_decomp_blk_ht;
        num_col_blks = ps_ctxt->as_layers[i4_layer_no].i4_num_col_blks;
        num_row_blocks = ps_ctxt->as_layers[i4_layer_no].i4_num_row_blks;
        i4_cu_aligned_pic_wd = ps_frm_ctb_prms->i4_cu_aligned_pic_wd;
        i4_cu_aligned_pic_ht = ps_frm_ctb_prms->i4_cu_aligned_pic_ht;

        /* register ed_ctxt buffer pointer */
        ps_ed_ctxt = ps_ctxt->ps_ed_ctxt;

        /* initialize ed_ctxt here:
         * only one ed_ctxt instance is allocated, so it is re-initialized and
         * re-used for each layer */
        ps_ed_ctxt->lambda = ps_ctxt->ai4_lambda[i4_layer_no];
        ps_ed_ctxt->i4_slice_type = ps_ctxt->i4_slice_type;
        ps_ed_ctxt->level = ps_ctxt->i4_codec_level;
        if(0 == i4_layer_no)
        {
            /* No early-decision output for L0 */
            ps_ed_ctxt->ps_ed_pic = NULL;
            ps_ed_ctxt->ps_ed = NULL;
            ps_ed_ctxt->ps_ed_ctb_l1_pic = NULL;
            ps_ed_ctxt->ps_ed_ctb_l1 = NULL;
        }
        else if(1 == i4_layer_no)
        {
            /* L1 early-decision results go to layer1 buffer + ctb-l1 records */
            ps_ed_ctxt->ps_ed_pic = ps_ctxt->ps_layer1_buf;
            ps_ed_ctxt->ps_ed = ps_ctxt->ps_layer1_buf;
            ps_ed_ctxt->ps_ed_ctb_l1_pic = ps_ctxt->ps_ed_ctb_l1;
            ps_ed_ctxt->ps_ed_ctb_l1 = ps_ctxt->ps_ed_ctb_l1;
            ps_ctxt->ps_layer0_cur_satd = NULL;
            ps_ctxt->ps_layer0_cur_mean = NULL;
        }
        else if(2 == i4_layer_no)
        {
            /* L2 early-decision results go to layer2 buffer only */
            ps_ed_ctxt->ps_ed_pic = ps_ctxt->ps_layer2_buf;
            ps_ed_ctxt->ps_ed = ps_ctxt->ps_layer2_buf;
            ps_ed_ctxt->ps_ed_ctb_l1_pic = NULL;
            ps_ed_ctxt->ps_ed_ctb_l1 = NULL;
            ps_ctxt->ps_layer0_cur_satd = NULL;
            ps_ctxt->ps_layer0_cur_mean = NULL;
        }

        /* Calculate the number of 4x4 blocks in a CTB in that layer */
        /* Divide block_wd by 4 to get no of 4x4 blks */
        num_4x4_blks_lyr = block_wd >> 2;
        inc_ctb = num_4x4_blks_lyr * num_4x4_blks_lyr;

        ps_ed = ps_ed_ctxt->ps_ed;
        ps_ed_ctb_l1 = ps_ed_ctxt->ps_ed_ctb_l1;

        end_of_layer = 0;
        skip_decomp = 0;
        skip_pre_intra = 1; /* pre-intra runs in the second pass below */
        //if( i4_layer_no >= ps_ctxt->i4_num_layers)
        if(i4_layer_no >= (ps_ctxt->i4_num_layers - 1))
        {
            /* layer was forced in (num_layers bumped from 3 to 4):
             * there is no coarser destination layer to write */
            skip_decomp = 1;
        }
        /* ------------ Loop over all the CTB rows --------------- */
        while(0 == end_of_layer)
        {
            job_queue_t *ps_pre_enc_job;
            WORD32 num_4x4_blks_ctb_y = 0;
            WORD32 num_4x4_blks_last_ctb_x = 0;

            /* Get the current row from the job queue */
            ps_pre_enc_job = (job_queue_t *)ihevce_pre_enc_grp_get_next_job(
                pv_multi_thrd_ctxt, (DECOMP_JOB_LYR0 + i4_layer_no), 1, i4_ping_pong);

            pu1_wkg_mem = ps_ctxt->pu1_wkg_mem;

            /* If all rows are done, set the end of layer flag to 1 */
            if(NULL == ps_pre_enc_job)
            {
                end_of_layer = 1;
            }
            else
            {
                /* Obtain the current row's details from the job */
                row_block_no = ps_pre_enc_job->s_job_info.s_decomp_job_info.i4_vert_unit_row_no;
                ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx] = row_block_no;
                ht_offset = row_block_no * block_ht;

                if(row_block_no < (num_row_blocks))
                {
                    /* destination is half-height: decomposition is 2:1 */
                    pu1_dst = ps_ctxt->as_layers[i4_layer_no + 1].pu1_inp +
                              ((block_ht >> 1) * dst_stride * row_block_no);

                    /* L0 8x8 curr satd for qp mod */
                    if(i4_layer_no == 0)
                    {
                        ps_ctxt->ps_layer0_cur_satd =
                            ps_layer0_cur_satd + (row_block_no * num_col_blks /*num ctbs*/ *
                                                  (block_wd >> 3) * (block_ht >> 3));
                        ps_ctxt->ps_layer0_cur_mean =
                            ps_layer0_cur_mean + (row_block_no * num_col_blks /*num ctbs*/ *
                                                  (block_wd >> 3) * (block_ht >> 3));
                    }

                    /* call the row level processing function */
                    ihevce_decomp_pre_intra_process_row(
                        pu1_src,
                        src_stride,
                        pu1_dst,
                        dst_stride,
                        i4_layer_wd,
                        i4_layer_ht,
                        pu1_wkg_mem,
                        ht_offset,
                        block_ht,
                        block_wd,
                        i4_cu_aligned_pic_wd,
                        i4_cu_aligned_pic_ht,
                        num_col_blks,
                        i4_layer_no,
                        ps_ed_ctxt,
                        ps_ed,
                        ps_ed_ctb_l1,
                        ps_ctxt->ps_layer0_cur_satd,
                        ps_ctxt->ps_layer0_cur_mean,
                        num_4x4_blks_ctb_y,
                        num_4x4_blks_last_ctb_x,
                        skip_decomp,
                        skip_pre_intra,
                        row_block_no,
                        ps_ctxt->i4_enable_noise_detection,
                        ps_ctxt->ps_ctb_analyse,
                        &ps_ctxt->s_ipe_optimised_function_list,
                        &ps_ctxt->s_cmn_opt_func);

                    /* When decomposition is done from L1 to L2,
                     * pre intra analysis is done on L1 */
                    if(i4_layer_no == 1 || i4_layer_no == 2)
                    {
                        // ps_ed = ps_ed_ctxt->ps_ed +
                        //     (row_block_no * inc_ctb * (num_col_blks));
                    }
                }
                idx++;
                /* set the output dependency */
                ihevce_pre_enc_grp_job_set_out_dep(
                    pv_multi_thrd_ctxt, ps_pre_enc_job, i4_ping_pong);
            }
        }
        ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed = idx;

        ihevce_ed_frame_init(ps_ed_ctxt, i4_layer_no);

        /* For the fastest preset on non-base temporal layers, L1 pre-intra is
         * skipped; seed the per-CTB best-SAD fields with INT32_MAX instead */
        if((1 == i4_layer_no) && (IHEVCE_QUALITY_P6 == ps_ctxt->i4_quality_preset))
        {
            WORD32 vert_ctr, ctb_ctr, i;
            WORD32 ctb_ctr_blks = ps_ctxt->as_layers[1].i4_num_col_blks;
            WORD32 vert_ctr_blks = ps_ctxt->as_layers[1].i4_num_row_blks;

            if((ps_ctxt->i4_quality_preset == IHEVCE_QUALITY_P6) &&
               (ps_lap_out_prms->i4_temporal_lyr_id > TEMPORAL_LAYER_DISABLE))
            {
                for(vert_ctr = 0; vert_ctr < vert_ctr_blks; vert_ctr++)
                {
                    ihevce_ed_ctb_l1_t *ps_ed_ctb_row_l1 =
                        ps_ctxt->ps_ed_ctb_l1 + vert_ctr * ps_frm_ctb_prms->i4_num_ctbs_horz;

                    for(ctb_ctr = 0; ctb_ctr < ctb_ctr_blks; ctb_ctr++)
                    {
                        ihevce_ed_ctb_l1_t *ps_ed_ctb_curr_l1 = ps_ed_ctb_row_l1 + ctb_ctr;
                        for(i = 0; i < 16; i++)
                        {
                            ps_ed_ctb_curr_l1->i4_best_sad_cost_8x8_l1_ipe[i] = 0x7fffffff;
                            ps_ed_ctb_curr_l1->i4_best_sad_8x8_l1_ipe[i] = 0x7fffffff;
                        }
                    }
                }
            }
        }

        /* -------- Second pass: pre-intra analysis on the rows decomposed above ------- */
#if DISABLE_L2_IPE_IN_PB_L1_IN_B
        if(((2 == i4_layer_no) && (ps_lap_out_prms->i4_pic_type == IV_I_FRAME ||
                                   ps_lap_out_prms->i4_pic_type == IV_IDR_FRAME)) ||
           ((1 == i4_layer_no) &&
            (ps_lap_out_prms->i4_temporal_lyr_id <= TEMPORAL_LAYER_DISABLE)) ||
           ((IHEVCE_QUALITY_P6 != ps_ctxt->i4_quality_preset) && (0 != i4_layer_no)))
#else
        if((0 != i4_layer_no) &&
           (1 != ((IHEVCE_QUALITY_P6 == ps_ctxt->i4_quality_preset) &&
                  (ps_lap_out_prms->i4_temporal_lyr_id > TEMPORAL_LAYER_DISABLE))))
#endif
        {
            WORD32 i4_num_rows = ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed;

            src_stride = ps_ctxt->as_layers[i4_layer_no].i4_inp_stride;
            pu1_src = ps_ctxt->as_layers[i4_layer_no].pu1_inp;
            i4_layer_wd = ps_ctxt->as_layers[i4_layer_no].i4_actual_wd;
            i4_layer_ht = ps_ctxt->as_layers[i4_layer_no].i4_actual_ht;
            pu1_dst = ps_ctxt->as_layers[i4_layer_no + 1].pu1_inp;
            dst_stride = ps_ctxt->as_layers[i4_layer_no + 1].i4_inp_stride;
            block_wd = ps_ctxt->as_layers[i4_layer_no].i4_decomp_blk_wd;
            block_ht = ps_ctxt->as_layers[i4_layer_no].i4_decomp_blk_ht;
            num_col_blks = ps_ctxt->as_layers[i4_layer_no].i4_num_col_blks;
            num_row_blocks = ps_ctxt->as_layers[i4_layer_no].i4_num_row_blks;
            i4_cu_aligned_pic_wd = ps_frm_ctb_prms->i4_cu_aligned_pic_wd;
            i4_cu_aligned_pic_ht = ps_frm_ctb_prms->i4_cu_aligned_pic_ht;

            /* register ed_ctxt buffer pointer */
            ps_ed_ctxt = ps_ctxt->ps_ed_ctxt;

            /* re-initialize the shared ed_ctxt for this layer */
            ps_ed_ctxt->lambda = ps_ctxt->ai4_lambda[i4_layer_no];
            ps_ed_ctxt->i4_slice_type = ps_ctxt->i4_slice_type;
            ps_ed_ctxt->level = ps_ctxt->i4_codec_level;
            if(1 == i4_layer_no)
            {
                ps_ed_ctxt->ps_ed_pic = ps_ctxt->ps_layer1_buf;
                ps_ed_ctxt->ps_ed = ps_ctxt->ps_layer1_buf;
                ps_ed_ctxt->ps_ed_ctb_l1_pic = ps_ctxt->ps_ed_ctb_l1;
                ps_ed_ctxt->ps_ed_ctb_l1 = ps_ctxt->ps_ed_ctb_l1;
                ps_ctxt->ps_layer0_cur_satd = NULL;
                ps_ctxt->ps_layer0_cur_mean = NULL;
            }
            else if(2 == i4_layer_no)
            {
                ps_ed_ctxt->ps_ed_pic = ps_ctxt->ps_layer2_buf;
                ps_ed_ctxt->ps_ed = ps_ctxt->ps_layer2_buf;
                ps_ed_ctxt->ps_ed_ctb_l1_pic = NULL;
                ps_ed_ctxt->ps_ed_ctb_l1 = NULL;
                ps_ctxt->ps_layer0_cur_satd = NULL;
                ps_ctxt->ps_layer0_cur_mean = NULL;
            }

            /* Calculate the number of 4x4 blocks in a CTB in that layer */
            /* Divide block_wd by 4 to get no of 4x4 blks */
            num_4x4_blks_lyr = block_wd >> 2;
            inc_ctb = num_4x4_blks_lyr * num_4x4_blks_lyr;

            ps_ed = ps_ed_ctxt->ps_ed;
            ps_ed_ctb_l1 = ps_ed_ctxt->ps_ed_ctb_l1;
            skip_decomp = 1; /* decomposition already done in the first pass */
            skip_pre_intra = 0;
            for(idx = 0; idx < i4_num_rows; idx++)
            {
                WORD32 num_4x4_blks_ctb_y = 0;
                WORD32 num_4x4_blks_last_ctb_x = 0;

                pu1_wkg_mem = ps_ctxt->pu1_wkg_mem;

                {
                    /* Obtain the current row's details from the job */
                    row_block_no = ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx];
                    ht_offset = row_block_no * block_ht;

                    if(row_block_no < (num_row_blocks))
                    {
                        pu1_dst = ps_ctxt->as_layers[i4_layer_no + 1].pu1_inp +
                                  ((block_ht >> 1) * dst_stride * row_block_no);

                        if(i4_layer_no == 1 || i4_layer_no == 2)
                        {
                            ps_ed = ps_ed_ctxt->ps_ed + (row_block_no * inc_ctb * (num_col_blks));
                            ps_ed_ctb_l1 = ps_ed_ctxt->ps_ed_ctb_l1 + (row_block_no * num_col_blks);

                            ps_ed_ctxt->i4_quality_preset = ps_ctxt->i4_quality_preset;
                            num_4x4_blks_ctb_y = block_ht >> 2;
                            num_4x4_blks_last_ctb_x = block_wd >> 2;

                            /* clip the 4x4 block counts for the partial CTBs at
                             * the bottom and right picture boundaries */
                            if(row_block_no == num_row_blocks - 1)
                            {
                                if(i4_layer_ht % block_ht)
                                {
                                    num_4x4_blks_ctb_y = ((i4_layer_ht % block_ht) + 3) >> 2;
                                }
                            }

                            if(i4_layer_wd % block_wd)
                            {
                                num_4x4_blks_last_ctb_x = ((i4_layer_wd % block_wd) + 3) >> 2;
                            }
                        }

                        /* call the row level processing function */
                        ihevce_decomp_pre_intra_process_row(
                            pu1_src,
                            src_stride,
                            pu1_dst,
                            dst_stride,
                            i4_layer_wd,
                            i4_layer_ht,
                            pu1_wkg_mem,
                            ht_offset,
                            block_ht,
                            block_wd,
                            i4_cu_aligned_pic_wd,
                            i4_cu_aligned_pic_ht,
                            num_col_blks,
                            i4_layer_no,
                            ps_ed_ctxt,
                            ps_ed,
                            ps_ed_ctb_l1,
                            ps_ctxt->ps_layer0_cur_satd,
                            ps_ctxt->ps_layer0_cur_mean,
                            num_4x4_blks_ctb_y,
                            num_4x4_blks_last_ctb_x,
                            skip_decomp,
                            skip_pre_intra,
                            row_block_no,
                            0,
                            NULL,
                            &ps_ctxt->s_ipe_optimised_function_list,
                            &ps_ctxt->s_cmn_opt_func);
                    }
                }
                if(1 == i4_layer_no)
                {
                    /* signal row completion to threads waiting on L1 pre-intra */
                    ps_multi_thrd->aai4_l1_pre_intra_done[i4_ping_pong][row_block_no] = 1;
                }
            }
            /* reset the per-layer row bookkeeping for the next frame */
            for(idx = 0; idx < MAX_NUM_CTB_ROWS_FRM; idx++)
            {
                ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx] = -1;
            }
            ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed = 0;
        }

        /* When pre-intra was skipped for this layer, still publish the L1 row
         * done flags and reset the row bookkeeping */
#if DISABLE_L2_IPE_IN_PB_L1_IN_B
        if((IHEVCE_QUALITY_P6 == ps_ctxt->i4_quality_preset) &&
           (((i4_layer_no == 2) && (ps_lap_out_prms->i4_pic_type == ISLICE)) ||
            ((i4_layer_no == 1) && (ps_lap_out_prms->i4_temporal_lyr_id > TEMPORAL_LAYER_DISABLE))))
        {
            WORD32 i4_num_rows = ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed;
            if(1 == i4_layer_no)
            {
                for(idx = 0; idx < i4_num_rows; idx++)
                {
                    row_block_no = ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx];

                    {
                        ps_multi_thrd->aai4_l1_pre_intra_done[i4_ping_pong][row_block_no] = 1;
                    }
                }
            }
            for(idx = 0; idx < MAX_NUM_CTB_ROWS_FRM; idx++)
            {
                ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx] = -1;
            }
            ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed = 0;
        }
#else
        if((i4_layer_no != 0) && ((IHEVCE_QUALITY_P6 == ps_ctxt->i4_quality_preset) &&
                                  (ps_lap_out_prms->i4_temporal_lyr_id > TEMPORAL_LAYER_DISABLE)))
        {
            WORD32 i4_num_rows = ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed;
            for(idx = 0; idx < i4_num_rows; idx++)
            {
                row_block_no = ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx];
                if(1 == i4_layer_no)
                {
                    ps_multi_thrd->aai4_l1_pre_intra_done[i4_ping_pong][row_block_no] = 1;
                }
            }
            for(idx = 0; idx < MAX_NUM_CTB_ROWS_FRM; idx++)
            {
                ps_ctxt->as_layers[i4_layer_no].ai4_curr_row_no[idx] = -1;
            }
            ps_ctxt->as_layers[i4_layer_no].i4_num_rows_processed = 0;
        }
#endif
    }
}

/*!
************************************************************************
* \brief
*    return number of memory records used by decomp pre intra module
*
************************************************************************
*/
WORD32 ihevce_decomp_pre_intra_get_num_mem_recs(void)
{
    return (NUM_DECOMP_PRE_INTRA_MEM_RECS);
}

/*!
2668 ************************************************************************ 2669 * @brief 2670 * return each record attributes of decomp pre intra 2671 ************************************************************************ 2672 */ 2673 WORD32 ihevce_decomp_pre_intra_get_mem_recs( 2674 iv_mem_rec_t *ps_mem_tab, WORD32 i4_num_proc_thrds, WORD32 i4_mem_space) 2675 { 2676 /* memories should be requested assuming worst case requirememnts */ 2677 2678 /* Module context structure */ 2679 ps_mem_tab[DECOMP_PRE_INTRA_CTXT].i4_mem_size = sizeof(ihevce_decomp_pre_intra_master_ctxt_t); 2680 ps_mem_tab[DECOMP_PRE_INTRA_CTXT].e_mem_type = (IV_MEM_TYPE_T)i4_mem_space; 2681 ps_mem_tab[DECOMP_PRE_INTRA_CTXT].i4_mem_alignment = 8; 2682 2683 /* Thread context structure */ 2684 ps_mem_tab[DECOMP_PRE_INTRA_THRDS_CTXT].i4_mem_size = 2685 i4_num_proc_thrds * sizeof(ihevce_decomp_pre_intra_ctxt_t); 2686 ps_mem_tab[DECOMP_PRE_INTRA_THRDS_CTXT].e_mem_type = (IV_MEM_TYPE_T)i4_mem_space; 2687 ps_mem_tab[DECOMP_PRE_INTRA_THRDS_CTXT].i4_mem_alignment = 8; 2688 2689 /* early decision context structure */ 2690 ps_mem_tab[DECOMP_PRE_INTRA_ED_CTXT].i4_mem_size = i4_num_proc_thrds * sizeof(ihevce_ed_ctxt_t); 2691 ps_mem_tab[DECOMP_PRE_INTRA_ED_CTXT].e_mem_type = (IV_MEM_TYPE_T)i4_mem_space; 2692 ps_mem_tab[DECOMP_PRE_INTRA_ED_CTXT].i4_mem_alignment = 8; 2693 2694 return (NUM_DECOMP_PRE_INTRA_MEM_RECS); 2695 } 2696 2697 /*! 
************************************************************************
* @brief
*    Init decomp pre intra context: derives the number of decomposition
*    layers from the CTB-aligned target resolution and initializes every
*    per-thread context (layer geometry, quality preset, noise detection
*    flag, optimised function pointers).
************************************************************************
*/
void *ihevce_decomp_pre_intra_init(
    iv_mem_rec_t *ps_mem_tab,
    ihevce_static_cfg_params_t *ps_init_prms,
    WORD32 i4_num_proc_thrds,
    func_selector_t *ps_func_selector,
    WORD32 i4_resolution_id,
    UWORD8 u1_is_popcnt_available)
{
    ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt;
    ihevce_decomp_pre_intra_ctxt_t *ps_ctxt;
    WORD32 thread_no;
    WORD32 n_tot_layers;
    WORD32 count;
    WORD32 a_wd[MAX_NUM_HME_LAYERS], a_ht[MAX_NUM_HME_LAYERS], layer_no;
    WORD32 a_disp_wd[MAX_NUM_LAYERS], a_disp_ht[MAX_NUM_LAYERS];
    ihevce_ed_ctxt_t *ps_ed_ctxt;
    WORD32 min_cu_size;

    /* get the min cu size from config params (log2 -> pixels) */
    min_cu_size = ps_init_prms->s_config_prms.i4_min_log2_cu_size;

    min_cu_size = 1 << min_cu_size;

    /* Get the height and width of layer 0, aligned up to min CU size;
     * hme_derive_num_layers fills the rest of a_wd/a_ht */
    *a_wd = ps_init_prms->s_tgt_lyr_prms.as_tgt_params[i4_resolution_id].i4_width +
            SET_CTB_ALIGN(
                ps_init_prms->s_tgt_lyr_prms.as_tgt_params[i4_resolution_id].i4_width, min_cu_size);
    *a_ht =
        ps_init_prms->s_tgt_lyr_prms.as_tgt_params[i4_resolution_id].i4_height +
        SET_CTB_ALIGN(
            ps_init_prms->s_tgt_lyr_prms.as_tgt_params[i4_resolution_id].i4_height, min_cu_size);

    n_tot_layers = hme_derive_num_layers(1, a_wd, a_ht, a_disp_wd, a_disp_ht);

    /* Decomp state structure */
    ps_master_ctxt =
        (ihevce_decomp_pre_intra_master_ctxt_t *)ps_mem_tab[DECOMP_PRE_INTRA_CTXT].pv_base;
    ps_master_ctxt->i4_num_proc_thrds = i4_num_proc_thrds;

    ps_ctxt = (ihevce_decomp_pre_intra_ctxt_t *)ps_mem_tab[DECOMP_PRE_INTRA_THRDS_CTXT].pv_base;
    ps_ed_ctxt = (ihevce_ed_ctxt_t *)ps_mem_tab[DECOMP_PRE_INTRA_ED_CTXT].pv_base;

    /* initialize one thread context (and one ed context) per thread */
    for(thread_no = 0; thread_no < ps_master_ctxt->i4_num_proc_thrds; thread_no++)
    {
        ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thread_no] = ps_ctxt;

        ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thread_no]->i4_num_layers = n_tot_layers;

        ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thread_no]->pu1_wkg_mem =
            &ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thread_no]->au1_wkg_mem[0];

        ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thread_no]->ps_ed_ctxt = ps_ed_ctxt;

        for(layer_no = 0; layer_no < n_tot_layers; layer_no++)
        {
            WORD32 max_ctb_size;
            WORD32 decomp_blk_ht, decomp_blk_wd;

            ps_ctxt->as_layers[layer_no].i4_actual_wd = a_wd[layer_no];
            ps_ctxt->as_layers[layer_no].i4_actual_ht = a_ht[layer_no];
            /* stride/buffer of coarser layers are registered at frame init */
            ps_ctxt->as_layers[layer_no].i4_inp_stride = 0;
            ps_ctxt->as_layers[layer_no].pu1_inp = NULL;
            ps_ctxt->as_layers[layer_no].i4_num_rows_processed = 0;

            for(count = 0; count < MAX_NUM_CTB_ROWS_FRM; count++)
            {
                ps_ctxt->as_layers[layer_no].ai4_curr_row_no[count] = -1;
            }
            if(0 == layer_no)
            {
                /* L0 is the caller-provided frame: no padding added here */
                ps_ctxt->as_layers[layer_no].i4_padded_ht = a_ht[layer_no];
                ps_ctxt->as_layers[layer_no].i4_padded_wd = a_wd[layer_no];
            }
            else
            {
                /* coarser layers carry 32+4 pels of padding on each dimension */
                ps_ctxt->as_layers[layer_no].i4_padded_ht = a_ht[layer_no] + 32 + 4;
                ps_ctxt->as_layers[layer_no].i4_padded_wd = a_wd[layer_no] + 32 + 4;
            }

            /** If CTB size = 64, decomp_blk_wd = 64 for L0, 32 for L1, 16 for L2, 8 for L3 */
            max_ctb_size = 1 << ps_init_prms->s_config_prms.i4_max_log2_cu_size;

            ps_ctxt->as_layers[layer_no].i4_decomp_blk_ht = max_ctb_size >> layer_no;
            ps_ctxt->as_layers[layer_no].i4_decomp_blk_wd = max_ctb_size >> layer_no;

            decomp_blk_ht = ps_ctxt->as_layers[layer_no].i4_decomp_blk_ht;
            decomp_blk_wd = ps_ctxt->as_layers[layer_no].i4_decomp_blk_wd;

            /* number of blocks, rounding partial blocks up */
            ps_ctxt->as_layers[layer_no].i4_num_row_blks =
                ((a_ht[layer_no] + (decomp_blk_ht - 1)) / decomp_blk_ht);

            ps_ctxt->as_layers[layer_no].i4_num_col_blks =
                ((a_wd[layer_no] + (decomp_blk_wd - 1)) / decomp_blk_wd);
        }
        ps_ed_ctxt->ps_func_selector = ps_func_selector;

        ps_ctxt->i4_quality_preset =
            ps_init_prms->s_tgt_lyr_prms.as_tgt_params[i4_resolution_id].i4_quality_preset;

        /* P7 is treated identically to P6 inside this module */
        if(ps_ctxt->i4_quality_preset == IHEVCE_QUALITY_P7)
        {
            ps_ctxt->i4_quality_preset = IHEVCE_QUALITY_P6;
        }

        /* noise detection is enabled only when both the VQ control toggle and
         * the noise preservation bit are set in the coding tools config */
        if(ps_init_prms->s_coding_tools_prms.i4_vqet &
           (1 << BITPOS_IN_VQ_TOGGLE_FOR_CONTROL_TOGGLER))
        {
            if(ps_init_prms->s_coding_tools_prms.i4_vqet &
               (1 << BITPOS_IN_VQ_TOGGLE_FOR_ENABLING_NOISE_PRESERVATION))
            {
                ps_ctxt->i4_enable_noise_detection = 1;
            }
            else
            {
                ps_ctxt->i4_enable_noise_detection = 0;
            }
        }
        else
        {
            ps_ctxt->i4_enable_noise_detection = 0;
        }

        /* select SIMD/arch-specific implementations of common utils and IPE */
        ihevce_cmn_utils_instr_set_router(
            &ps_ctxt->s_cmn_opt_func, u1_is_popcnt_available, ps_init_prms->e_arch_type);

        ihevce_ipe_instr_set_router(
            &ps_ctxt->s_ipe_optimised_function_list, ps_init_prms->e_arch_type);

        ps_ctxt++;
        ps_ed_ctxt++;
    }
    /* return the handle to caller */
    return ((void *)ps_master_ctxt);
}

/*!
******************************************************************************
* \if Function name : ihevce_decomp_pre_intra_frame_init \endif
*
* \brief
*    Frame Initialization for Decomp intra pre analysis.
*
* \param[in] pv_ctxt : pointer to module ctxt
* \param[in] ppu1_decomp_lyr_bufs : pointer to array of layer buffer pointers
* \param[in] pi4_lyr_buf_stride : pointer to array of layer buffer strides
* \param[in] ps_layer1_buf : L1 early-decision 4x4 block result buffer
* \param[in] ps_layer2_buf : L2 early-decision 4x4 block result buffer
* \param[in] ps_ed_ctb_l1 : per-CTB L1 early-decision records
* \param[in] i4_ol_sad_lambda_qf : open-loop SAD lambda (Q format)
* \param[in] i4_slice_type : slice type of the current frame
* \param[in] ps_ctb_analyse : frame level CTB analyse array
*
* \return
*    None
*
* \author
*    Ittiam
*
*****************************************************************************
*/
void ihevce_decomp_pre_intra_frame_init(
    void *pv_ctxt,
    UWORD8 **ppu1_decomp_lyr_bufs,
    WORD32 *pi4_lyr_buf_stride,
    ihevce_ed_blk_t *ps_layer1_buf,
    ihevce_ed_blk_t *ps_layer2_buf,
    ihevce_ed_ctb_l1_t *ps_ed_ctb_l1,
    WORD32 i4_ol_sad_lambda_qf,
    WORD32 i4_slice_type,
    ctb_analyse_t *ps_ctb_analyse)
{
    ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt;
    ihevce_decomp_pre_intra_ctxt_t *ps_ctxt;
    WORD32 thread_no;

    /* Decomp state structure */
    ps_master_ctxt = (ihevce_decomp_pre_intra_master_ctxt_t *)pv_ctxt;

    for(thread_no = 0; thread_no < ps_master_ctxt->i4_num_proc_thrds; thread_no++)
    {
        WORD32 layer_no;

        ps_ctxt = ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[thread_no];

        /* L0 layer (actual input) is registered in process call */
        for(layer_no = 1; layer_no < ps_ctxt->i4_num_layers; layer_no++)
        {
            ps_ctxt->as_layers[layer_no].i4_inp_stride = pi4_lyr_buf_stride[layer_no - 1];
            ps_ctxt->as_layers[layer_no].pu1_inp = ppu1_decomp_lyr_bufs[layer_no - 1];

            /* Populating the buffer pointers for layer1 and layer2 buffers to
             * store the per-4x4-block results of pre intra analysis on their
             * respective layers; lambda is scaled per layer and floored at
             * 1 << LAMBDA_Q_SHIFT */
            if(layer_no == 1)
            {
                WORD32 sad_lambda_l1 = (3 * i4_ol_sad_lambda_qf >> 2);
                WORD32 temp = 1 << LAMBDA_Q_SHIFT;
                WORD32 lambda = ((temp) > sad_lambda_l1) ? temp : sad_lambda_l1;

                ps_ctxt->ps_layer1_buf = ps_layer1_buf;
                ps_ctxt->ps_ed_ctb_l1 = ps_ed_ctb_l1;
                ps_ctxt->ai4_lambda[layer_no] = lambda;
                ps_ctxt->i4_codec_level = 0;
                ps_ctxt->i4_slice_type = i4_slice_type;
            }
            else if(layer_no == 2)
            {
                WORD32 sad_lambda_l2 = i4_ol_sad_lambda_qf >> 1;
                WORD32 temp = 1 << LAMBDA_Q_SHIFT;
                WORD32 lambda = ((temp) > sad_lambda_l2) ? temp : sad_lambda_l2;

                ps_ctxt->ps_layer2_buf = ps_layer2_buf;
                ps_ctxt->ai4_lambda[layer_no] = lambda;
                ps_ctxt->i4_codec_level = 0;
                ps_ctxt->i4_slice_type = i4_slice_type;
            }
            else
            {
                /* layers beyond L2 carry no early-decision analysis */
                ps_ctxt->ai4_lambda[layer_no] = -1;
                ps_ctxt->i4_codec_level = 0;
                ps_ctxt->i4_slice_type = i4_slice_type;
            }
        }

        /* make the ps_ctb_analyse reference a part of the private context */
        ps_ctxt->ps_ctb_analyse = ps_ctb_analyse;
    }
}

/**
*******************************************************************************
*
* @brief
*    Merge Sort function.
*
* @par Description:
*    This function sorts the data in the input array in ascending
*    order using merge sort algorithm. Intermediate data obtained in
*    merge sort are stored in output 2-D array.
*
* @param[in]
*    pi4_input_val : Input 1-D array
*    aai4_output_val: Output 2-D array containing elements sorted in sets of
*    4,16,64 etc.
*    i4_length : length of the array
*    i4_ip_sort_level: Input sort level. Specifies the level upto which array is sorted.
*    It should be 1 if the array is unsorted. Should be 4 if array is sorted
*    in sets of 4.
*    i4_op_sort_level: Output sort level. Specify the level upto which sorting is required.
*    If it is given as length of array it sorts for whole array.
*
* @returns
*
* @remarks
*    None
*
*******************************************************************************
*/
void ihevce_merge_sort(
    WORD32 *pi4_input_val,
    WORD32 aai4_output_val[][64],
    WORD32 i4_length,
    WORD32 i4_ip_sort_level,
    WORD32 i4_op_sort_level)
{
    WORD32 i, j, k;
    WORD32 count, level;
    WORD32 temp[64]; /* scratch buffer for merge passes whose output is discarded */
    WORD32 *pi4_temp_buf_cpy;
    WORD32 *pi4_temp = &temp[0];
    WORD32 calc_level;

    pi4_temp_buf_cpy = pi4_temp;

    /* number of doubling merge passes needed to go from the input sort level
     * to the requested output sort level (GETRANGE gives a log2-style range) */
    GETRANGE(calc_level, i4_op_sort_level / i4_ip_sort_level);

    calc_level = calc_level - 1;

    /*** This function is written under the assumption that we need only intermediate values of
    sort in the range of 4,16,64 etc. ***/
    ASSERT((calc_level % 2) == 0);

    /** One iteration of this for loop does 1 sets of sort and produces one intermediate value in 2 iterations **/
    for(level = 0; level < calc_level; level++)
    {
        /** Merges adjacent sets of elements based on current sort level **/
        for(count = 0; count < i4_length; (count = count + (i4_ip_sort_level * 2)))
        {
            i = 0;
            j = 0;
            /* indices below are relative to the current pair of runs:
             * pi4_input_val is advanced past each merged pair at loop end */
            if(pi4_input_val[i4_ip_sort_level - 1] < pi4_input_val[i4_ip_sort_level])
            {
                /*** Condition for early exit: the two runs are already in
                 * order, so merging reduces to a straight copy ***/
                memcpy(&pi4_temp[0], pi4_input_val, sizeof(WORD32) * i4_ip_sort_level * 2);
            }
            else
            {
                for(k = 0; k < (i4_ip_sort_level * 2); k++)
                {
                    if((i < i4_ip_sort_level) && (j < i4_ip_sort_level))
                    {
                        if(pi4_input_val[i] > pi4_input_val[j + i4_ip_sort_level])
                        {
                            /** copy smaller head (from second run) to output array **/
                            pi4_temp[k] = pi4_input_val[j + i4_ip_sort_level];
                            j++;
                        }
                        else
                        {
                            /** copy smaller-or-equal head (from first run) to output array **/
                            pi4_temp[k] = pi4_input_val[i];
                            i++;
                        }
                    }
                    else if(i == i4_ip_sort_level)
                    {
                        /** first run exhausted: copy the remaining data of the second run **/
                        pi4_temp[k] = pi4_input_val[j + i4_ip_sort_level];
                        j++;
                    }
                    else
                    {
                        /** second run exhausted: copy the remaining data of the first run **/
                        pi4_temp[k] = pi4_input_val[i];
                        i++;
                    }
                }
            }
            pi4_input_val += (i4_ip_sort_level * 2);
            pi4_temp += (i4_ip_sort_level * 2);
        }
        /* next pass reads what this pass just wrote */
        pi4_input_val = pi4_temp - i4_length;

        if(level % 2)
        {
            /** Assign a temp address for storing next sort level output as we will not need this data as output **/
            pi4_temp = pi4_temp_buf_cpy;
        }
        else
        {
            /** Assign address for storing the intermediate data into output 2-D array **/
            pi4_temp = aai4_output_val[level / 2];
        }
        i4_ip_sort_level *= 2;
    }
}

void ihevce_decomp_pre_intra_curr_frame_pre_intra_deinit(
    void *pv_pre_intra_ctxt,
    pre_enc_me_ctxt_t
    *ps_curr_out,
    WORD32 i4_is_last_thread,
    frm_ctb_ctxt_t *ps_frm_ctb_prms,
    WORD32 i4_temporal_lyr_id,
    WORD32 i4_enable_noise_detection)
{
    /* Only the last thread does any work here (see i4_is_last_thread check):
       it derives a noise floor from a histogram of all L1 4x4 SATDs, clips the
       per-4x4 SATDs against it, then fills per-CTB 8x8/16x16/32x32 sum/min
       activity fields and frame-level activity accumulators. */
    ihevce_decomp_pre_intra_master_ctxt_t *ps_pre_intra_master_ctxt =
        (ihevce_decomp_pre_intra_master_ctxt_t *)pv_pre_intra_ctxt;
    ihevce_decomp_pre_intra_ctxt_t *ps_pre_intra_ctxt =
        ps_pre_intra_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[0];

    WORD32 i4_k;
    WORD32 ctb_ctr, vert_ctr;

    /* Frame-level accumulators; index [0] = sum-based activity, [1] = min-based */
    WORD32 ai4_curr_frame_8x8_sum_act[2] = { 0, 0 };
    LWORD64 ai8_curr_frame_8x8_sum_act_sqr[2] = { 0, 0 };
    WORD32 ai4_curr_frame_8x8_sum_blks[2] = { 0, 0 };
    ULWORD64 u8_curr_frame_8x8_sum_act_sqr = 0;

    LWORD64 ai8_curr_frame_16x16_sum_act_sqr[3] = { 0, 0, 0 };
    WORD32 ai4_curr_frame_16x16_sum_act[3] = { 0, 0, 0 };
    WORD32 ai4_curr_frame_16x16_sum_blks[3] = { 0, 0, 0 };

    LWORD64 ai8_curr_frame_32x32_sum_act_sqr[3] = { 0, 0, 0 };
    WORD32 ai4_curr_frame_32x32_sum_act[3] = { 0, 0, 0 };
    WORD32 ai4_curr_frame_32x32_sum_blks[3] = { 0, 0, 0 };

    (void)i4_temporal_lyr_id;
    (void)i4_enable_noise_detection;

    if(i4_is_last_thread == 1)
    {
        WORD32 i4_slice_type = ps_curr_out->s_slice_hdr.i1_slice_type;
        //ps_pre_intra_ctxt->i4_slice_type;
        WORD32 ctb_ctr_blks = ps_pre_intra_ctxt->as_layers[1].i4_num_col_blks;
        WORD32 vert_ctr_blks = ps_pre_intra_ctxt->as_layers[1].i4_num_row_blks;
        ihevce_ed_ctb_l1_t *ps_ed_ctb_pic_l1 = ps_curr_out->ps_ed_ctb_l1;
        WORD32 block_wd = ps_pre_intra_ctxt->as_layers[1].i4_decomp_blk_wd;
        /* Number of 4x4 early-decision entries per CTB at layer 1 */
        WORD32 inc_ctb = ((block_wd >> 2) * (block_wd >> 2));
        ihevce_ed_blk_t *ps_ed_blk_l1 = ps_curr_out->ps_layer1_buf;
        ihevce_ed_blk_t *ps_ed;
        WORD32 i, j;
        WORD32 i4_avg_noise_satd;
        WORD32 k;
        WORD32 i4_layer_wd = ps_pre_intra_ctxt->as_layers[1].i4_actual_wd;
        WORD32 i4_layer_ht = ps_pre_intra_ctxt->as_layers[1].i4_actual_ht;

        /*Calculate min noise threshold */
        /*Min noise threshold is calculated by taking average of lowest 1% satd val in the complete 4x4 frame satds*/
        //ihevce_ed_ctxt_t *ps_ed_ctxt = ps_pre_intra_ctxt->ps_ed_ctxt;
        /* Number of blocks making up the lowest MIN_BLKS percent of L1 4x4 blocks */
        WORD32 i4_min_blk = ((MIN_BLKS * (i4_layer_wd >> 1) * (i4_layer_ht >> 1)) / 100);
        WORD32 ai4_noise_thr_hstrgm[MAX_SATD_THRSHLD];
        memset(&ai4_noise_thr_hstrgm[0], 0, (sizeof(WORD32) * MAX_SATD_THRSHLD));
        ASSERT(!(USE_CUR_L0_SATD && USE_CUR_SATD));
        for(vert_ctr = 0; vert_ctr < vert_ctr_blks; vert_ctr++)
        {
            ps_ed = ps_ed_blk_l1 + (vert_ctr * inc_ctb * (ctb_ctr_blks));
            for(ctb_ctr = 0; ctb_ctr < ctb_ctr_blks; ctb_ctr++)
            {
                /* Populate avg satd to calculate MI and activity factors */
                for(i = 0; i < 4; i++)
                {
                    for(j = 0; j < 4; j++)
                    {
                        for(k = 0; k < 4; k++)
                        {
                            /* -1 marks a 4x4 block with no valid SATD (e.g. outside
                               the picture in an incomplete CTB) */
                            if(-1 != (ps_ed + j * 4 + i * 16 + k)->i4_4x4_satd)
                            {
                                WORD32 i4_satd_lim;
                                i4_satd_lim = (ps_ed + j * 4 + i * 16 + k)->i4_4x4_satd;
                                /* Histogram creation for Noise threshold */
                                if(i4_satd_lim < MAX_SATD_THRSHLD)
                                {
                                    ai4_noise_thr_hstrgm[i4_satd_lim]++;
                                }
                            }
                        }
                    }
                }
                ps_ed += inc_ctb;
            }
        }
        {
            /* Walk the histogram from the lowest bin until the lowest-1% block
               budget is reached; the rounded mean of those bins is the noise
               floor. Fall back to a fixed floor when too few blocks were binned. */
            WORD32 i4_total_blks = 0;
            LWORD64 i8_acc_satd = 0;
            for(i = MIN_SATD_THRSHLD; i < MAX_SATD_THRSHLD; i++)
            {
                i4_total_blks += ai4_noise_thr_hstrgm[i];
                i8_acc_satd += (i * ai4_noise_thr_hstrgm[i]);

                if(i4_total_blks > i4_min_blk)
                    break;
            }
            if(i4_total_blks < i4_min_blk)
            {
                i4_avg_noise_satd = SATD_NOISE_FLOOR_THRESHOLD;
            }
            else
            {
                i4_avg_noise_satd = (WORD32)(i8_acc_satd + (i4_total_blks >> 1)) / i4_total_blks;
            }
        }

        ps_curr_out->i4_avg_noise_thrshld_4x4 = i4_avg_noise_satd;

        for(vert_ctr = 0; vert_ctr < vert_ctr_blks; vert_ctr++)
        {
            ihevce_ed_ctb_l1_t *ps_ed_ctb_row_l1 =
                ps_ed_ctb_pic_l1 + vert_ctr * ps_frm_ctb_prms->i4_num_ctbs_horz;
            ps_ed = ps_ed_blk_l1 + (vert_ctr * inc_ctb * (ctb_ctr_blks));

            for(ctb_ctr = 0; ctb_ctr < ctb_ctr_blks; ctb_ctr++)
            {
                /*sum of (sum of L1_4x4 @ L1_8x8) @ L1_16x16 level */
                WORD32 ai4_sum_sum_4x4_satd_16x16[4] = { 0, 0, 0, 0 };
                /*min of (sum of L1_4x4 @ L1_8x8) @ L1_16x16 level */
                WORD32 ai4_min_sum_4x4_satd_16x16[4] = {
                    MAX_32BIT_VAL, MAX_32BIT_VAL, MAX_32BIT_VAL, MAX_32BIT_VAL
                };
                /*min of (min of L1_4x4 @ L1_8x8) @ L1_16x16 level */
                WORD32 ai4_min_min_4x4_satd_16x16[4] = {
                    MAX_32BIT_VAL, MAX_32BIT_VAL, MAX_32BIT_VAL, MAX_32BIT_VAL
                };
                WORD32 i4_sum_4x4_satd, i4_min_4x4_satd;
                ihevce_ed_ctb_l1_t *ps_ed_ctb_curr_l1 = ps_ed_ctb_row_l1 + ctb_ctr;

                WORD32 is_min_block_uncompensated_in_l32x32 = 0;

                /*min of L1_4x4 @ L1_8x8*/
                WORD32 ai4_min_satd_ctb[MAX_CTB_SIZE];
                /*** This 2-D array will contain 4x4 satds sorted in ascending order in sets of 4,16,64 ***/
                /*** For example : '5 10 2 7 6 12 3 1' array input will return '2 5 7 10 1 3 6 12' if sorted in sets of 4 ***/
                WORD32 aai4_min_4_16_64_satd[3][MAX_CTB_SIZE];

                /*sum of L1_4x4 @ L1_8x8*/
                WORD32 ai4_sum_satd_ctb[MAX_CTB_SIZE >> 2];
                /*** This 2-D array will contain 4x4 satds sorted in ascending order in sets of 4,16***/
                WORD32 aai4_sum_4_16_satd_ctb[2][MAX_CTB_SIZE];

                /* sum of (sum of L1_4x4 @ L1_8x8) @ L1_16x16 */
                WORD32 ai4_sum_sum_satd_ctb[(MAX_CTB_SIZE >> 2) >> 2];
                /*L1_32x32 = L0_64x64
                so in L1_32x32 there are 64 L1_4x4blocks*/
                for(i = 0; i < MAX_CTB_SIZE; i++)
                {
                    ai4_min_satd_ctb[i] = -1;
                }
                for(j = 0; j < 3; j++)
                {
                    for(i = 0; i < MAX_CTB_SIZE; i++)
                    {
                        aai4_min_4_16_64_satd[j][i] = -1;
                    }
                }
                /*L1_32x32 = L0_64x64
                so in L1_32x32 there are 16 L1_8x8blocks*/
                for(i = 0; i < (MAX_CTB_SIZE >> 2); i++)
                {
                    ai4_sum_satd_ctb[i] = -1;
                }
                for(j = 0; j < 2; j++)
                {
                    for(i = 0; i < (MAX_CTB_SIZE >> 2); i++)
                    {
                        aai4_sum_4_16_satd_ctb[j][i] = -1;
                    }
                }
                /*L1_32x32 = L0_64x64
                so in L1_32x32 there are 16 L1_16x16blocks*/
                for(i = 0; i < ((MAX_CTB_SIZE >> 2) >> 2); i++)
                {
                    ai4_sum_sum_satd_ctb[i] = 0;
                }
                /*Populate sum min 4x4 activty */
                /*loop for L1_32x32 block*/
                for(i = 0; i < 4; i++)
                {
                    /*loop for L1_16x16 block*/
                    for(j = 0; j < 4; j++)
                    {
                        WORD32 i4_sum_satd_dumyy = 0;
                        WORD32 i4_num_satd_blks = 0;
                        /* loop for L1_8x8 block*/
                        for(k = 0; k < 4; k++)
                        {
                            WORD32 i4_satd_lim;
                            i4_satd_lim = (ps_ed + j * 4 + i * 16 + k)->i4_4x4_satd;

                            /*complete ctb will not have i4_4x4_satd = -1*/
                            if(-1 != i4_satd_lim)
                            {
#if SUB_NOISE_THRSHLD
                                /* Subtract the noise floor, clamping at zero */
                                i4_satd_lim = i4_satd_lim - i4_avg_noise_satd;
                                if(i4_satd_lim < 0)
                                {
                                    i4_satd_lim = 0;
                                }
#else
                                /* Clip the SATD from below to the noise floor */
                                if(i4_satd_lim < i4_avg_noise_satd)
                                {
                                    i4_satd_lim = i4_avg_noise_satd;
                                }
#endif
                                i4_num_satd_blks++;
                                /*populate 4x4 data to calculate modulation index */
                                (ps_ed + j * 4 + i * 16 + k)->i4_4x4_satd = i4_satd_lim;

                                i4_sum_satd_dumyy += i4_satd_lim;
                                ai4_min_satd_ctb[j * 4 + i * 16 + k] = i4_satd_lim;
                            }
                        }
                        if(i4_num_satd_blks != 0)
                        {
                            /*make the sum of satd always for 4 blocks even it is incomplete ctb */
                            i4_sum_satd_dumyy = i4_sum_satd_dumyy * 4 / i4_num_satd_blks;
                        }
                        else
                        {
                            i4_sum_satd_dumyy = -1;
                        }
                        /*sum of L1_4x4 @ L1_8x8block level*/
                        ai4_sum_satd_ctb[j + i * 4] = i4_sum_satd_dumyy;
                        /*sum of L1_8x8 @ L1_16x16block level*/
                        ai4_sum_sum_satd_ctb[i] += i4_sum_satd_dumyy;
                        /*store sum of 4x4 @ L1_8x8block level*/
                        ps_ed_ctb_curr_l1->i4_sum_4x4_satd[i * 4 + j] = i4_sum_satd_dumyy;
                        /*store min of 4x4 @ L1_8x8block level */
                        //ps_ed_ctb_curr_l1->i4_min_4x4_satd[i * 4 + j] = i4_min_satd_dumyy;
                    }
                }
                {
                    WORD32 i4_array_length = sizeof(ai4_min_satd_ctb) / sizeof(WORD32);

                    /*** This function will sort 64 elements in array ai4_min_satd_ctb in ascending order to ***/
                    /*** 3 arrays in sets of 4,16,64 into the 2-D array aai4_min_4_16_64_satd ***/
                    ihevce_merge_sort(
                        &ai4_min_satd_ctb[0], aai4_min_4_16_64_satd, i4_array_length, 1, 64);

                    i4_array_length = sizeof(ai4_sum_satd_ctb) / sizeof(WORD32);

                    /*** This function will sort 16 elements in array ai4_sum_satd_ctb in ascending order to ***/
                    /*** 2 arrays in sets of 4,16 into the 2-D array aai4_sum_4_16_satd_ctb ***/
                    ihevce_merge_sort(
                        &ai4_sum_satd_ctb[0], aai4_sum_4_16_satd_ctb, i4_array_length, 1, 16);
                }

                /*Populate avg satd to calculate MI and activity factors*/
                for(i = 0; i < 4; i++)
                {
                    WORD32 is_min_block_uncompensated_in_l116x16 = 0;
                    ps_ed_ctb_curr_l1->i4_16x16_satd[i][0] = -1;
                    ps_ed_ctb_curr_l1->i4_16x16_satd[i][1] = -1;
                    ps_ed_ctb_curr_l1->i4_16x16_satd[i][2] = -1;

                    for(j = 0; j < 4; j++)
                    {
                        /* "min" here is the MEDIAN_CU_TU-th smallest of the sorted
                           set of 4, not the absolute minimum */
                        ps_ed_ctb_curr_l1->i4_min_4x4_satd[i * 4 + j] =
                            aai4_min_4_16_64_satd[0][i * 16 + j * 4 + MEDIAN_CU_TU];
                        /*Accumulate the sum of 8*8 activities in the current layer (16*16 CU in L0)*/
                        i4_sum_4x4_satd = ps_ed_ctb_curr_l1->i4_sum_4x4_satd[i * 4 + j];
                        i4_min_4x4_satd = ps_ed_ctb_curr_l1->i4_min_4x4_satd[i * 4 + j];
                        ps_ed_ctb_curr_l1->i4_8x8_satd[i * 4 + j][0] = -1;
                        ps_ed_ctb_curr_l1->i4_8x8_satd[i * 4 + j][1] = -1;
                        ASSERT(-2 != i4_sum_4x4_satd);

                        if((-1 != i4_sum_4x4_satd))
                        {
                            WORD32 not_skipped = 1;

                            if((i4_slice_type == ISLICE) || (1 == not_skipped))
                            {
                                is_min_block_uncompensated_in_l116x16 = 1;
                                is_min_block_uncompensated_in_l32x32 = 1;

                                /* NOTE(review): the products below are computed in
                                   32-bit before widening to the 64-bit accumulators;
                                   they wrap for SATD > ~46340 — confirm intended */
                                u8_curr_frame_8x8_sum_act_sqr +=
                                    (i4_sum_4x4_satd * i4_sum_4x4_satd);

                                ai4_curr_frame_8x8_sum_act[0] += i4_sum_4x4_satd;
                                ai8_curr_frame_8x8_sum_act_sqr[0] +=
                                    (i4_sum_4x4_satd * i4_sum_4x4_satd);
                                ai4_curr_frame_8x8_sum_blks[0] += 1;
                                ai4_curr_frame_8x8_sum_act[1] += i4_min_4x4_satd;
                                ai8_curr_frame_8x8_sum_act_sqr[1] +=
                                    (i4_min_4x4_satd * i4_min_4x4_satd);
                                ai4_curr_frame_8x8_sum_blks[1] += 1;
                            }

                            ps_ed_ctb_curr_l1->i4_8x8_satd[i * 4 + j][0] = i4_sum_4x4_satd;
                            ps_ed_ctb_curr_l1->i4_8x8_satd[i * 4 + j][1] = i4_min_4x4_satd;
                        }
                        else
                        {
                            /* Invalid 8x8 invalidates the whole 16x16 entry */
                            ai4_sum_sum_4x4_satd_16x16[i] = MAX_32BIT_VAL;
                            ai4_min_sum_4x4_satd_16x16[i] = MAX_32BIT_VAL;
                            ai4_min_min_4x4_satd_16x16[i] = MAX_32BIT_VAL;
                        }
                    }

                    //if(1 == is_min_block_comensated_in_l116x16)
                    {
                        ai4_min_sum_4x4_satd_16x16[i] =
                            aai4_sum_4_16_satd_ctb[0][i * 4 + MEDIAN_CU_TU];
                        ai4_min_min_4x4_satd_16x16[i] =
                            aai4_min_4_16_64_satd[1][i * 16 + MEDIAN_CU_TU_BY_2];

                        if(ai4_sum_sum_4x4_satd_16x16[i] != MAX_32BIT_VAL)
                        {
                            ai4_sum_sum_4x4_satd_16x16[i] = 0;
                            for(j = 0; j < 4; j++)
                            {
                                ai4_sum_sum_4x4_satd_16x16[i] +=
                                    ps_ed_ctb_curr_l1->i4_sum_4x4_satd[i * 4 + j];
                            }
                            ps_ed_ctb_curr_l1->i4_16x16_satd[i][0] = ai4_sum_sum_4x4_satd_16x16[i];
                            ps_ed_ctb_curr_l1->i4_16x16_satd[i][1] = ai4_min_sum_4x4_satd_16x16[i];
                            ps_ed_ctb_curr_l1->i4_16x16_satd[i][2] = ai4_min_min_4x4_satd_16x16[i];
                        }
                    }
                    if(1 == is_min_block_uncompensated_in_l116x16)
                    {
                        if(MAX_32BIT_VAL != ai4_sum_sum_4x4_satd_16x16[i])
                        {
                            ai4_curr_frame_16x16_sum_act[0] += ai4_sum_sum_4x4_satd_16x16[i];
                            ai8_curr_frame_16x16_sum_act_sqr[0] +=
                                (ai4_sum_sum_4x4_satd_16x16[i] * ai4_sum_sum_4x4_satd_16x16[i]);
                            ai4_curr_frame_16x16_sum_blks[0] += 1;
                        }
                        if(MAX_32BIT_VAL != ai4_min_sum_4x4_satd_16x16[i])
                        {
                            ai4_curr_frame_16x16_sum_act[1] += ai4_min_sum_4x4_satd_16x16[i];
                            ai8_curr_frame_16x16_sum_act_sqr[1] +=
                                (ai4_min_sum_4x4_satd_16x16[i] * ai4_min_sum_4x4_satd_16x16[i]);
                            ai4_curr_frame_16x16_sum_blks[1] += 1;
                            ai4_curr_frame_16x16_sum_act[2] += ai4_min_min_4x4_satd_16x16[i];
                            ai8_curr_frame_16x16_sum_act_sqr[2] +=
                                (ai4_min_min_4x4_satd_16x16[i] * ai4_min_min_4x4_satd_16x16[i]);
                            ai4_curr_frame_16x16_sum_blks[2] += 1;
                        }
                    }
                }
                /*32x32*/
                {
                    ps_ed_ctb_curr_l1->i4_32x32_satd[0][0] = -1;
                    ps_ed_ctb_curr_l1->i4_32x32_satd[0][1] = -1;
                    ps_ed_ctb_curr_l1->i4_32x32_satd[0][2] = -1;
                    ps_ed_ctb_curr_l1->i4_32x32_satd[0][3] = -1;

                    /* At least one valid 16x16 entry is needed */
                    if((MAX_32BIT_VAL != ai4_sum_sum_4x4_satd_16x16[0]) ||
                       (MAX_32BIT_VAL != ai4_sum_sum_4x4_satd_16x16[2]) ||
                       (MAX_32BIT_VAL != ai4_sum_sum_4x4_satd_16x16[1]) ||
                       (MAX_32BIT_VAL != ai4_sum_sum_4x4_satd_16x16[3]))
                    {
                        //if(1 == is_min_block_comensated_in_l32x32)
                        {
                            {
                                WORD32 aai4_min_sum_sum_4x4_satd_16x16[1][64];
                                WORD32 i4_array_length =
                                    sizeof(ai4_sum_sum_4x4_satd_16x16) / sizeof(WORD32);
                                /*** Sort 4 elements in ascending order ***/
                                ihevce_merge_sort(
                                    &ai4_sum_sum_4x4_satd_16x16[0],
                                    aai4_min_sum_sum_4x4_satd_16x16,
                                    i4_array_length,
                                    1,
                                    4);

                                ps_ed_ctb_curr_l1->i4_32x32_satd[0][0] =
                                    aai4_min_sum_sum_4x4_satd_16x16[0][MEDIAN_CU_TU];
                            }
                            {
                                ps_ed_ctb_curr_l1->i4_32x32_satd[0][1] =
                                    aai4_sum_4_16_satd_ctb[1][MEDIAN_CU_TU_BY_2];
                            }
                            {
                                ps_ed_ctb_curr_l1->i4_32x32_satd[0][2] =
                                    aai4_min_4_16_64_satd[2][MEDIAN_CU_TU_BY_4];
                            }

                            /*Sum of all 32x32 activity */
                            ps_ed_ctb_curr_l1->i4_32x32_satd[0][3] = 0;
                            for(j = 0; j < 4; j++)
                            {
                                if(MAX_32BIT_VAL != ai4_sum_sum_4x4_satd_16x16[j])
                                    ps_ed_ctb_curr_l1->i4_32x32_satd[0][3] +=
                                        ai4_sum_sum_4x4_satd_16x16[j];
                            }

                            if(1 == is_min_block_uncompensated_in_l32x32)
                            {
                                /*Accumulate the sum of 32*32 activities in the current layer (64*64 CU in L0)*/
                                if(MAX_32BIT_VAL != ps_ed_ctb_curr_l1->i4_32x32_satd[0][0])
                                {
                                    ai4_curr_frame_32x32_sum_act[0] +=
                                        ps_ed_ctb_curr_l1->i4_32x32_satd[0][0];
                                    ai8_curr_frame_32x32_sum_act_sqr[0] +=
                                        (ps_ed_ctb_curr_l1->i4_32x32_satd[0][0] *
                                         ps_ed_ctb_curr_l1->i4_32x32_satd[0][0]);
                                    ai4_curr_frame_32x32_sum_blks[0] += 1;
                                }

                                if(MAX_32BIT_VAL != ps_ed_ctb_curr_l1->i4_32x32_satd[0][1])
                                {
                                    ai4_curr_frame_32x32_sum_act[1] +=
                                        ps_ed_ctb_curr_l1->i4_32x32_satd[0][1];
                                    ai8_curr_frame_32x32_sum_act_sqr[1] +=
                                        (ps_ed_ctb_curr_l1->i4_32x32_satd[0][1] *
                                         ps_ed_ctb_curr_l1->i4_32x32_satd[0][1]);
                                    ai4_curr_frame_32x32_sum_blks[1] += 1;
                                }

                                if(MAX_32BIT_VAL != ps_ed_ctb_curr_l1->i4_32x32_satd[0][2])
                                {
                                    ai4_curr_frame_32x32_sum_act[2] +=
                                        ps_ed_ctb_curr_l1->i4_32x32_satd[0][2];
                                    ai8_curr_frame_32x32_sum_act_sqr[2] +=
                                        (ps_ed_ctb_curr_l1->i4_32x32_satd[0][2] *
                                         ps_ed_ctb_curr_l1->i4_32x32_satd[0][2]);
                                    ai4_curr_frame_32x32_sum_blks[2] += 1;
                                }
                            }
                        }
                    }
                }
                /*Increment ctb count*/
                ps_ed += inc_ctb;
            }
        }

        /* Spatial Variation and modulation index calculated for the frame */
        {
            for(i4_k = 0; i4_k < 2; i4_k++)
            {
                /*8x8*/
#if USE_SQRT_AVG_OF_SATD_SQR
                ps_curr_out->i8_curr_frame_8x8_sum_act[i4_k] = ai8_curr_frame_8x8_sum_act_sqr[i4_k];
#else
                ps_curr_out->i8_curr_frame_8x8_sum_act[i4_k] = ai4_curr_frame_8x8_sum_act[i4_k];
#endif
                ps_curr_out->i4_curr_frame_8x8_sum_act_for_strength[i4_k] =
                    ai4_curr_frame_8x8_sum_act[i4_k];
                ps_curr_out->i4_curr_frame_8x8_num_blks[i4_k] = ai4_curr_frame_8x8_sum_blks[i4_k];
                ps_curr_out->u8_curr_frame_8x8_sum_act_sqr = u8_curr_frame_8x8_sum_act_sqr;

                /*16x16*/
#if USE_SQRT_AVG_OF_SATD_SQR
                ps_curr_out->i8_curr_frame_16x16_sum_act[i4_k] =
                    ai8_curr_frame_16x16_sum_act_sqr[i4_k];
#else
                ps_curr_out->i8_curr_frame_16x16_sum_act[i4_k] = ai4_curr_frame_16x16_sum_act[i4_k];
#endif
                ps_curr_out->i4_curr_frame_16x16_num_blks[i4_k] =
                    ai4_curr_frame_16x16_sum_blks[i4_k];

                /*32x32*/
#if USE_SQRT_AVG_OF_SATD_SQR
                ps_curr_out->i8_curr_frame_32x32_sum_act[i4_k] =
                    ai8_curr_frame_32x32_sum_act_sqr[i4_k];
#else
                ps_curr_out->i8_curr_frame_32x32_sum_act[i4_k] = ai4_curr_frame_32x32_sum_act[i4_k];
#endif
                ps_curr_out->i4_curr_frame_32x32_num_blks[i4_k] =
                    ai4_curr_frame_32x32_sum_blks[i4_k];
            }

            /*16x16*/
#if USE_SQRT_AVG_OF_SATD_SQR
            ps_curr_out->i8_curr_frame_16x16_sum_act[2] = ai8_curr_frame_16x16_sum_act_sqr[2];
#else
            ps_curr_out->i8_curr_frame_16x16_sum_act[2] = ai4_curr_frame_16x16_sum_act[2];
#endif

            ps_curr_out->i4_curr_frame_16x16_num_blks[2] = ai4_curr_frame_16x16_sum_blks[2];

            /*32x32*/
#if USE_SQRT_AVG_OF_SATD_SQR
            ps_curr_out->i8_curr_frame_32x32_sum_act[2] = ai8_curr_frame_32x32_sum_act_sqr[2];
#else
            ps_curr_out->i8_curr_frame_32x32_sum_act[2] = ai4_curr_frame_32x32_sum_act[2];
#endif
            ps_curr_out->i4_curr_frame_32x32_num_blks[2] = ai4_curr_frame_32x32_sum_blks[2];
        }
    }
}

/*!
******************************************************************************
* \if Function name : ihevce_decomp_pre_intra_get_frame_satd \endif
*
* \brief
*    Accumulates the frame-level best-SATD of the early decision pass across
*    all thread contexts; also reports the layer-1 actual width and height.
*
* \return
*    Accumulated best SATD of the frame
*
* \author
*    Ittiam
*
*****************************************************************************
*/
LWORD64 ihevce_decomp_pre_intra_get_frame_satd(void *pv_ctxt, WORD32 *i4_width, WORD32 *i4_hieght)
{
    ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt =
        (ihevce_decomp_pre_intra_master_ctxt_t *)pv_ctxt;
    WORD32 i4_i;
    LWORD64 i8_tot_satd = 0;

    /*accumulate SATD acorss all thread.
note that every thread will enter this function, 3561 hence it must be guranteed that all thread must have completed preintra pass by now*/ 3562 for(i4_i = 0; i4_i < ps_master_ctxt->i4_num_proc_thrds; i4_i++) 3563 { 3564 ihevce_decomp_pre_intra_ctxt_t *ps_ctxt = 3565 ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[i4_i]; 3566 3567 //i8_tot_satd += ps_ctxt->as_layers[1].s_early_decision.i8_sum_best_satd; 3568 i8_tot_satd += ps_ctxt->ps_ed_ctxt->i8_sum_best_satd; 3569 3570 *i4_width = ps_ctxt->as_layers[1].i4_actual_wd; 3571 *i4_hieght = ps_ctxt->as_layers[1].i4_actual_ht; 3572 } 3573 3574 return i8_tot_satd; 3575 } 3576 3577 LWORD64 ihevce_decomp_pre_intra_get_frame_satd_squared( 3578 void *pv_ctxt, WORD32 *i4_width, WORD32 *i4_hieght) 3579 { 3580 ihevce_decomp_pre_intra_master_ctxt_t *ps_master_ctxt = 3581 (ihevce_decomp_pre_intra_master_ctxt_t *)pv_ctxt; 3582 WORD32 i4_i; 3583 LWORD64 i8_tot_satd = 0; 3584 3585 /*accumulate SATD acorss all thread. note that every thread will enter this function, 3586 hence it must be guranteed that all thread must have completed preintra pass by now*/ 3587 for(i4_i = 0; i4_i < ps_master_ctxt->i4_num_proc_thrds; i4_i++) 3588 { 3589 ihevce_decomp_pre_intra_ctxt_t *ps_ctxt = 3590 ps_master_ctxt->aps_decomp_pre_intra_thrd_ctxt[i4_i]; 3591 3592 //i8_tot_satd += ps_ctxt->as_layers[1].s_early_decision.i8_sum_best_satd; 3593 i8_tot_satd += (ps_ctxt->ps_ed_ctxt->i8_sum_sq_best_satd); 3594 3595 *i4_width = ps_ctxt->as_layers[1].i4_actual_wd; 3596 *i4_hieght = ps_ctxt->as_layers[1].i4_actual_ht; 3597 } 3598 3599 return i8_tot_satd; 3600 } 3601