1 /****************************************************************************** 2 * 3 * Copyright (C) 2018 The Android Open Source Project 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ***************************************************************************** 18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore 19 */ 20 21 /** 22 ****************************************************************************** 23 * @file ihevce_had_satd.c 24 * 25 * @brief 26 * This file contains functions of Hadamard SAD and SATD 27 * 28 * @author 29 * Ittiam 30 * 31 * List of Functions 32 * <TODO: TO BE ADDED> 33 * 34 ****************************************************************************** 35 */ 36 37 /*****************************************************************************/ 38 /* File Includes */ 39 /*****************************************************************************/ 40 /* System include files */ 41 #include <stdio.h> 42 #include <string.h> 43 #include <stdlib.h> 44 #include <assert.h> 45 #include <stdarg.h> 46 #include <math.h> 47 48 /* User include files */ 49 #include "ihevc_typedefs.h" 50 #include "itt_video_api.h" 51 #include "ihevce_api.h" 52 53 #include "rc_cntrl_param.h" 54 #include "rc_frame_info_collector.h" 55 #include "rc_look_ahead_params.h" 56 57 #include "ihevc_defs.h" 58 #include "ihevc_structs.h" 59 #include "ihevc_platform_macros.h" 60 #include "ihevc_deblk.h" 61 #include "ihevc_itrans_recon.h" 62 #include "ihevc_chroma_itrans_recon.h" 63 #include "ihevc_chroma_intra_pred.h" 64 #include "ihevc_intra_pred.h" 65 #include "ihevc_inter_pred.h" 66 #include "ihevc_mem_fns.h" 67 #include "ihevc_padding.h" 68 #include "ihevc_weighted_pred.h" 69 #include "ihevc_sao.h" 70 #include "ihevc_resi_trans.h" 71 #include "ihevc_quant_iquant_ssd.h" 72 #include "ihevc_cabac_tables.h" 73 74 #include "ihevce_defs.h" 75 #include "ihevce_lap_enc_structs.h" 76 #include "ihevce_multi_thrd_structs.h" 77 #include "ihevce_multi_thrd_funcs.h" 78 #include "ihevce_me_common_defs.h" 79 #include "ihevce_had_satd.h" 80 #include "ihevce_error_codes.h" 81 #include "ihevce_bitstream.h" 82 #include "ihevce_cabac.h" 83 #include "ihevce_rdoq_macros.h" 84 #include "ihevce_function_selector.h" 85 #include "ihevce_enc_structs.h" 86 #include "ihevce_cmn_utils_instr_set_router.h" 87 #include "hme_datatype.h" 88 #include "hme_interface.h" 89 #include "hme_common_defs.h" 90 #include "hme_defs.h" 91 92 /*****************************************************************************/ 93 /* Function Definitions */ 94 /*****************************************************************************/ 95 96 static void ihevce_hadamard_4x4_8bit( 97 UWORD8 *pu1_src, 98 WORD32 src_strd, 99 UWORD8 *pu1_pred, 100 WORD32 pred_strd, 101 WORD16 *pi2_dst, 102 WORD32 dst_strd) 103 { 104 WORD32 k; 105 WORD16 m[16]; 106 107 /*===== hadamard horz transform =====*/ 108 for(k = 0; k < 4; k++) 109 { 110 WORD32 r0, r1, r2, r3; 111 WORD32 h0, h1, h2, h3; 112 113 /* Compute the residue block */ 114 r0 = pu1_src[0] - pu1_pred[0]; 115 r1 = pu1_src[1] - pu1_pred[1]; 116 r2 = pu1_src[2] - pu1_pred[2]; 117 r3 = pu1_src[3] - pu1_pred[3]; 118 119 h0 = r0 + r1; 120 h1 = r0 - r1; 121 h2 = r2 + r3; 122 h3 = r2 - r3; 123 124 m[k * 4 + 0] = h0 + h2; 125 m[k * 4 + 1] = h1 + h3; 126 m[k * 4 + 2] = h0 - h2; 127 m[k * 4 + 3] = h1 - h3; 128 129 pu1_pred += pred_strd; 130 pu1_src += src_strd; 131 } 132 133 /*===== hadamard vert transform =====*/ 134 for(k = 0; k < 4; k++) 135 { 136 WORD32 v0, v1, v2, v3; 137 138 v0 = m[0 + k] + m[4 + k]; 139 v1 = m[0 + k] - m[4 + k]; 140 v2 = m[8 + k] + m[12 + k]; 141 v3 = m[8 + k] - m[12 + k]; 142 143 pi2_dst[0 * dst_strd + k] = v0 + v2; 144 pi2_dst[1 * dst_strd + k] = v1 + v3; 145 pi2_dst[2 * dst_strd + k] = v0 - v2; 146 pi2_dst[3 * dst_strd + k] = v1 - v3; 147 } 148 } 149 150 static void ihevce_hadamard_8x8_8bit( 151 UWORD8 *pu1_src, 152 WORD32 src_strd, 153 UWORD8 *pu1_pred, 154 WORD32 pred_strd, 155 WORD16 *pi2_dst, 156 WORD32 dst_strd) 157 { 158 WORD32 i; 159 160 // y0 161 ihevce_hadamard_4x4_8bit(pu1_src, src_strd, pu1_pred, pred_strd, pi2_dst, dst_strd); 162 // y1 163 ihevce_hadamard_4x4_8bit(pu1_src + 4, src_strd, pu1_pred + 4, pred_strd, pi2_dst + 4, dst_strd); 164 // y2 165 ihevce_hadamard_4x4_8bit( 166 pu1_src + 4 * src_strd, 167 src_strd, 168 pu1_pred + 4 * pred_strd, 169 pred_strd, 170 pi2_dst + (4 * dst_strd), 171 dst_strd); 172 // y3 173 ihevce_hadamard_4x4_8bit( 174 pu1_src + 4 + 4 * src_strd, 175 src_strd, 176 pu1_pred + 4 + 4 * pred_strd, 177 pred_strd, 178 pi2_dst + (4 * dst_strd) + 4, 179 dst_strd); 180 181 /* Child HAD results combined as follows to get Parent result */ 182 /* _ _ */ 183 /* | (y0 + y1) + (y2 + y3) (y0 - y1) + (y2 - y3) | */ 184 /* | (y0 + y1) - (y2 + y3) (y0 - y1) - (y2 - y3) | */ 185 /* \- -/ */ 186 for(i = 0; i < 16; i++) 187 { 188 WORD32 idx = (i >> 2) * dst_strd + (i % 4); 189 WORD16 a0 = pi2_dst[idx]; 190 WORD16 a1 = pi2_dst[4 + idx]; 191 WORD16 a2 = pi2_dst[(4 * dst_strd) + idx]; 192 WORD16 a3 = pi2_dst[(4 * dst_strd) + 4 + idx]; 193 194 WORD16 b0 = (a0 + a1); 195 WORD16 b1 = (a0 - a1); 196 WORD16 b2 = (a2 + a3); 197 WORD16 b3 = (a2 - a3); 198 199 pi2_dst[idx] = b0 + b2; 200 pi2_dst[4 + idx] = b1 + b3; 201 pi2_dst[(4 * dst_strd) + idx] = b0 - b2; 202 pi2_dst[(4 * dst_strd) + 4 + idx] = b1 - b3; 203 } 204 } 205 206 static void ihevce_hadamard_16x16_8bit( 207 UWORD8 *pu1_src, 208 WORD32 src_strd, 209 UWORD8 *pu1_pred, 210 WORD32 pred_strd, 211 WORD16 *pi2_dst, 212 WORD32 dst_strd) 213 { 214 WORD32 i; 215 216 // y0 217 ihevce_hadamard_8x8_8bit(pu1_src, src_strd, pu1_pred, pred_strd, pi2_dst, dst_strd); 218 // y1 219 ihevce_hadamard_8x8_8bit(pu1_src + 8, src_strd, pu1_pred + 8, pred_strd, pi2_dst + 8, dst_strd); 220 // y2 221 ihevce_hadamard_8x8_8bit( 222 pu1_src + 8 * src_strd, 223 src_strd, 224 pu1_pred + 8 * pred_strd, 225 pred_strd, 226 pi2_dst + (8 * dst_strd), 227 dst_strd); 228 // y3 229 ihevce_hadamard_8x8_8bit( 230 pu1_src + 8 + 8 * src_strd, 231 src_strd, 232 pu1_pred + 8 + 8 * pred_strd, 233 pred_strd, 234 pi2_dst + (8 * dst_strd) + 8, 235 dst_strd); 236 237 /* Child HAD results combined as follows to get Parent result */ 238 /* _ _ */ 239 /* | (y0 + y1) + (y2 + y3) (y0 - y1) + (y2 - y3) | */ 240 /* | (y0 + y1) - (y2 + y3) (y0 - y1) - (y2 - y3) | */ 241 /* \- -/ */ 242 for(i = 0; i < 64; i++) 243 { 244 WORD32 idx = (i >> 3) * dst_strd + (i % 8); 245 WORD16 a0 = pi2_dst[idx]; 246 WORD16 a1 = pi2_dst[8 + idx]; 247 WORD16 a2 = pi2_dst[(8 * dst_strd) + idx]; 248 WORD16 a3 = pi2_dst[(8 * dst_strd) + 8 + idx]; 249 250 WORD16 b0 = (a0 + a1) >> 1; 251 WORD16 b1 = (a0 - a1) >> 1; 252 WORD16 b2 = (a2 + a3) >> 1; 253 WORD16 b3 = (a2 - a3) >> 1; 254 255 pi2_dst[idx] = b0 + b2; 256 pi2_dst[8 + idx] = b1 + b3; 257 pi2_dst[(8 * dst_strd) + idx] = b0 - b2; 258 pi2_dst[(8 * dst_strd) + 8 + idx] = b1 - b3; 259 } 260 } 261 262 static void ihevce_hadamard_32x32_8bit( 263 UWORD8 *pu1_src, 264 WORD32 src_strd, 265 UWORD8 *pu1_pred, 266 WORD32 pred_strd, 267 WORD16 *pi2_dst, 268 WORD32 dst_strd) 269 { 270 WORD32 i; 271 272 // y0 273 ihevce_hadamard_16x16_8bit(pu1_src, src_strd, pu1_pred, pred_strd, pi2_dst, dst_strd); 274 // y1 275 ihevce_hadamard_16x16_8bit( 276 pu1_src + 16, src_strd, pu1_pred + 16, pred_strd, pi2_dst + 16, dst_strd); 277 // y2 278 ihevce_hadamard_16x16_8bit( 279 pu1_src + 16 * src_strd, 280 src_strd, 281 pu1_pred + 16 * pred_strd, 282 pred_strd, 283 pi2_dst + (16 * dst_strd), 284 dst_strd); 285 // y3 286 ihevce_hadamard_16x16_8bit( 287 pu1_src + 16 + 16 * src_strd, 288 src_strd, 289 pu1_pred + 16 + 16 * pred_strd, 290 pred_strd, 291 pi2_dst + (16 * dst_strd) + 16, 292 dst_strd); 293 294 /* Child HAD results combined as follows to get Parent result */ 295 /* _ _ */ 296 /* | (y0 + y1) + (y2 + y3) (y0 - y1) + (y2 - y3) | */ 297 /* | (y0 + y1) - (y2 + y3) (y0 - y1) - (y2 - y3) | */ 298 /* \- -/ */ 299 for(i = 0; i < 256; i++) 300 { 301 WORD32 idx = (i >> 4) * dst_strd + (i % 16); 302 WORD16 a0 = pi2_dst[idx] >> 2; 303 WORD16 a1 = pi2_dst[16 + idx] >> 2; 304 WORD16 a2 = pi2_dst[(16 * dst_strd) + idx] >> 2; 305 WORD16 a3 = pi2_dst[(16 * dst_strd) + 16 + idx] >> 2; 306 307 WORD16 b0 = (a0 + a1); 308 WORD16 b1 = (a0 - a1); 309 WORD16 b2 = (a2 + a3); 310 WORD16 b3 = (a2 - a3); 311 312 pi2_dst[idx] = b0 + b2; 313 pi2_dst[16 + idx] = b1 + b3; 314 pi2_dst[(16 * dst_strd) + idx] = b0 - b2; 315 pi2_dst[(16 * dst_strd) + 16 + idx] = b1 - b3; 316 } 317 } 318 319 /** 320 ******************************************************************************* 321 * 322 * @brief 323 * Compute Hadamard sad for 4x4 block with 8-bit input 324 * 325 * @par Description: 326 * 327 * @param[in] pu1_origin 328 * UWORD8 pointer to the current block 329 * 330 * @param[in] src_strd 331 * WORD32 Source stride 332 * 333 * @param[in] pu1_pred_buf 334 * UWORD8 pointer to the prediction block 335 * 336 * @param[in] pred_strd 337 * WORD32 Pred stride 338 * 339 * @param[in] pi2_dst 340 * WORD16 pointer to the transform block 341 * 342 * @param[in] dst_strd 343 * WORD32 Destination stride 344 * 345 * @param[in] size 346 * WORD32 transform Block size 347 * 348 * @returns hadamard SAD 349 * 350 * @remarks 351 * Not updating the transform destination now. Only returning the SATD 352 * 353 ******************************************************************************* 354 */ 355 UWORD32 ihevce_HAD_4x4_8bit( 356 UWORD8 *pu1_origin, 357 WORD32 src_strd, 358 UWORD8 *pu1_pred_buf, 359 WORD32 pred_strd, 360 WORD16 *pi2_dst, 361 WORD32 dst_strd) 362 { 363 WORD32 k; 364 WORD16 v[16]; 365 UWORD32 u4_sad = 0; 366 367 (void)pi2_dst; 368 (void)dst_strd; 369 ihevce_hadamard_4x4_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 4); 370 371 for(k = 0; k < 16; ++k) 372 u4_sad += abs(v[k]); 373 u4_sad = ((u4_sad + 2) >> 2); 374 375 return u4_sad; 376 } 377 378 /** 379 ******************************************************************************* 380 * 381 * @brief 382 * Computes Hadamard Sad for 8x8 block with 8-bit input 383 * 384 * @par Description: 385 * 386 * @param[in] pu1_origin 387 * UWORD8 pointer to the current block 388 * 389 * @param[in] src_strd 390 * WORD32 Source stride 391 * 392 * @param[in] pu1_pred_buf 393 * UWORD8 pointer to the prediction block 394 * 395 * @param[in] pred_strd 396 * WORD32 Pred stride 397 * 398 * @param[in] pi2_dst 399 * WORD16 pointer to the transform block 400 * 401 * @param[in] dst_strd 402 * WORD32 Destination stride 403 * 404 * @param[in] size 405 * WORD32 transform Block size 406 * 407 * @returns Hadamard SAD 408 * 409 * @remarks 410 * Not updating the transform destination now. Only returning the SATD 411 * 412 ******************************************************************************* 413 */ 414 UWORD32 ihevce_HAD_8x8_8bit( 415 UWORD8 *pu1_origin, 416 WORD32 src_strd, 417 UWORD8 *pu1_pred_buf, 418 WORD32 pred_strd, 419 WORD16 *pi2_dst, 420 WORD32 dst_strd) 421 { 422 WORD32 k; 423 UWORD32 u4_sad = 0; 424 WORD16 v[64]; 425 426 (void)pi2_dst; 427 (void)dst_strd; 428 ihevce_hadamard_8x8_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 8); 429 430 for(k = 0; k < 64; ++k) 431 u4_sad += abs(v[k]); 432 u4_sad = ((u4_sad + 4) >> 3); 433 434 return u4_sad; 435 } 436 437 /** 438 ******************************************************************************* 439 * 440 * @brief 441 * Compute dc suppressed hadamard sad for 8x8 block with 8-bit input 442 * 443 * @par Description: 444 * 445 * @param[in] pu1_origin 446 * UWORD8 pointer to the current block 447 * 448 * @param[in] src_strd 449 * WORD32 Source stride 450 * 451 * @param[in] pu1_pred_buf 452 * UWORD8 pointer to the prediction block 453 * 454 * @param[in] pred_strd 455 * WORD32 Pred stride 456 * 457 * @param[in] pi2_dst 458 * WORD16 pointer to the transform block 459 * 460 * @param[in] dst_strd 461 * WORD32 Destination stride 462 * 463 * @param[in] size 464 * WORD32 transform Block size 465 * 466 * @returns Hadamard SAD with DC Suppressed 467 * 468 * @remarks 469 * Not updating the transform destination now. Only returning the SATD 470 * 471 ******************************************************************************* 472 */ 473 UWORD32 ihevce_compute_ac_had_8x8_8bit( 474 UWORD8 *pu1_origin, 475 WORD32 src_strd, 476 UWORD8 *pu1_pred_buf, 477 WORD32 pred_strd, 478 WORD16 *pi2_dst, 479 WORD32 dst_strd) 480 { 481 WORD32 k; 482 UWORD32 u4_sad = 0; 483 WORD16 v[64]; 484 485 (void)pi2_dst; 486 (void)dst_strd; 487 ihevce_hadamard_8x8_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 8); 488 489 v[0] = 0; 490 for(k = 0; k < 64; ++k) 491 u4_sad += abs(v[k]); 492 u4_sad = ((u4_sad + 4) >> 3); 493 494 return u4_sad; 495 } 496 497 /** 498 ******************************************************************************* 499 * 500 * @brief 501 * Computes Hadamard Sad for 16x16 block with 8-bit input 502 * 503 * @par Description: 504 * 505 * @param[in] pu1_origin 506 * UWORD8 pointer to the current block 507 * 508 * @param[in] src_strd 509 * WORD32 Source stride 510 * 511 * @param[in] pu1_pred_buf 512 * UWORD8 pointer to the prediction block 513 * 514 * @param[in] pred_strd 515 * WORD32 Pred stride 516 * 517 * @param[in] pi2_dst 518 * WORD16 pointer to the transform block 519 * 520 * @param[in] dst_strd 521 * WORD32 Destination stride 522 * 523 * @param[in] size 524 * WORD32 transform Block size 525 * 526 * @returns Hadamard SAD 527 * 528 * @remarks 529 * Not updating the transform destination now. Only returning the SATD 530 * 531 ******************************************************************************* 532 */ 533 UWORD32 ihevce_HAD_16x16_8bit( 534 UWORD8 *pu1_origin, 535 WORD32 src_strd, 536 UWORD8 *pu1_pred_buf, 537 WORD32 pred_strd, 538 WORD16 *pi2_dst, 539 WORD32 dst_strd) 540 { 541 WORD32 k; 542 UWORD32 u4_sad = 0; 543 WORD16 v[256]; 544 545 (void)pi2_dst; 546 (void)dst_strd; 547 ihevce_hadamard_16x16_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 16); 548 549 for(k = 0; k < 256; ++k) 550 u4_sad += abs(v[k]); 551 u4_sad = ((u4_sad + 4) >> 3); 552 553 return u4_sad; 554 } 555 556 /** 557 ******************************************************************************* 558 * 559 * @brief 560 * Computes Hadamard Sad for 32x32 block with 8-bit input 561 * 562 * @par Description: 563 * 564 * @param[in] pu1_origin 565 * UWORD8 pointer to the current block 566 * 567 * @param[in] src_strd 568 * WORD32 Source stride 569 * 570 * @param[in] pu1_pred_buf 571 * UWORD8 pointer to the prediction block 572 * 573 * @param[in] pred_strd 574 * WORD32 Pred stride 575 * 576 * @param[in] pi2_dst 577 * WORD16 pointer to the transform block 578 * 579 * @param[in] dst_strd 580 * WORD32 Destination stride 581 * 582 * @param[in] size 583 * WORD32 transform Block size 584 * 585 * @returns Hadamard SAD 586 * 587 * @remarks 588 * Not updating the transform destination now. Only returning the SATD 589 * 590 ******************************************************************************* 591 */ 592 UWORD32 ihevce_HAD_32x32_8bit( 593 UWORD8 *pu1_origin, 594 WORD32 src_strd, 595 UWORD8 *pu1_pred_buf, 596 WORD32 pred_strd, 597 WORD16 *pi2_dst, 598 WORD32 dst_strd) 599 { 600 WORD32 k; 601 UWORD32 u4_sad = 0; 602 WORD16 v[32 * 32]; 603 604 (void)pi2_dst; 605 (void)dst_strd; 606 ihevce_hadamard_32x32_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 32); 607 608 for(k = 0; k < 32 * 32; ++k) 609 u4_sad += abs(v[k]); 610 u4_sad = ((u4_sad + 2) >> 2); 611 612 return u4_sad; 613 } 614 615 //#if COMPUTE_16x16_R == C 616 /** 617 ******************************************************************************* 618 * 619 * @brief 620 * Computes 8x8 transform using children 4x4 hadamard results 621 * 622 * @par Description: 623 * 624 * @param[in] pi2_4x4_had 625 * WORD16 pointer to 4x4 hadamard buffer(y0, y1, y2, y3 hadmard in Zscan order) 626 * 627 * @param[in] had4_strd 628 * stride of 4x4 hadmard buffer pi2_y0, pi2_y1, pi2_y2, pi2_y3 629 * 630 * @param[out] pi2_dst 631 * destination buffer where 8x8 hadamard result is stored 632 * 633 * @param[in] dst_stride 634 * stride of destination block 635 * 636 * @param[in] i4_frm_qstep 637 * frm_qstep value based on the which the threshold value is calculated 638 * 639 * @returns 640 * 8x8 Hadamard SATD 641 * @remarks 642 * 643 ******************************************************************************* 644 */ 645 static UWORD32 ihevce_compute_8x8HAD_using_4x4( 646 WORD16 *pi2_4x4_had, 647 WORD32 had4_strd, 648 WORD16 *pi2_dst, 649 WORD32 dst_strd, 650 WORD32 i4_frm_qstep, 651 WORD32 *pi4_cbf) 652 { 653 /* Qstep value is right shifted by 8 */ 654 WORD32 threshold = (i4_frm_qstep >> 8); 655 656 /* Initialize pointers to 4 subblocks of 4x4 HAD buffer */ 657 WORD16 *pi2_y0 = pi2_4x4_had; 658 WORD16 *pi2_y1 = pi2_4x4_had + 4; 659 WORD16 *pi2_y2 = pi2_4x4_had + had4_strd * 4; 660 WORD16 *pi2_y3 = pi2_4x4_had + had4_strd * 4 + 4; 661 662 /* Initialize pointers to store 8x8 HAD output */ 663 WORD16 *pi2_dst0 = pi2_dst; 664 WORD16 *pi2_dst1 = pi2_dst + 4; 665 WORD16 *pi2_dst2 = pi2_dst + dst_strd * 4; 666 WORD16 *pi2_dst3 = pi2_dst + dst_strd * 4 + 4; 667 668 UWORD32 u4_satd = 0; 669 WORD32 i; 670 671 /* Child HAD results combined as follows to get Parent result */ 672 /* _ _ */ 673 /* | (y0 + y1) + (y2 + y3) (y0 - y1) + (y2 - y3) | */ 674 /* | (y0 + y1) - (y2 + y3) (y0 - y1) - (y2 - y3) | */ 675 /* \- -/ */ 676 for(i = 0; i < 16; i++) 677 { 678 WORD32 src_idx = (i >> 2) * had4_strd + (i % 4); 679 WORD32 dst_idx = (i >> 2) * dst_strd + (i % 4); 680 681 WORD16 a0 = pi2_y0[src_idx]; 682 WORD16 a1 = pi2_y1[src_idx]; 683 WORD16 a2 = pi2_y2[src_idx]; 684 WORD16 a3 = pi2_y3[src_idx]; 685 686 WORD16 b0 = (a0 + a1); 687 WORD16 b1 = (a0 - a1); 688 WORD16 b2 = (a2 + a3); 689 WORD16 b3 = (a2 - a3); 690 691 pi2_dst0[dst_idx] = b0 + b2; 692 pi2_dst1[dst_idx] = b1 + b3; 693 pi2_dst2[dst_idx] = b0 - b2; 694 pi2_dst3[dst_idx] = b1 - b3; 695 696 if(ABS(pi2_dst0[dst_idx]) > threshold) 697 *pi4_cbf = 1; 698 if(ABS(pi2_dst1[dst_idx]) > threshold) 699 *pi4_cbf = 1; 700 if(ABS(pi2_dst2[dst_idx]) > threshold) 701 *pi4_cbf = 1; 702 if(ABS(pi2_dst3[dst_idx]) > threshold) 703 *pi4_cbf = 1; 704 705 u4_satd += ABS(pi2_dst0[dst_idx]); 706 u4_satd += ABS(pi2_dst1[dst_idx]); 707 u4_satd += ABS(pi2_dst2[dst_idx]); 708 u4_satd += ABS(pi2_dst3[dst_idx]); 709 } 710 711 /* return the 8x8 satd */ 712 return (u4_satd); 713 } 714 715 /** 716 ******************************************************************************* 717 * 718 * @brief 719 * Computes Residue and Hadamard Transform for four 4x4 blocks (Z scan) of 720 * a 8x8 block (Residue is computed for 8-bit src and prediction buffers) 721 * Modified to incorporate the dead-zone implementation - Lokesh 722 * 723 * @par Description: 724 * 725 * @param[in] pu1_origin 726 * UWORD8 pointer to the current block 727 * 728 * @param[in] src_strd 729 * WORD32 Source stride 730 * 731 * @param[in] pu1_pred 732 * UWORD8 pointer to the prediction block 733 * 734 * @param[in] pred_strd 735 * WORD32 Pred stride 736 * 737 * @param[out] pi2_dst 738 * WORD16 pointer to the transform block 739 * 740 * @param[in] dst_strd 741 * WORD32 Destination stride 742 * 743 * @param[out] pi4_hsad 744 * array for storing hadmard sad of each 4x4 block 745 * 746 * @param[in] hsad_stride 747 * stride of hadmard sad destination buffer (for Zscan order of storing sads) 748 * 749 * @param[in] i4_frm_qstep 750 * frm_qstep value based on the which the threshold value is calculated 751 * 752 * @returns 753 * 754 * @remarks 755 * 756 ******************************************************************************* 757 */ 758 static WORD32 ihevce_had4_4x4( 759 UWORD8 *pu1_src, 760 WORD32 src_strd, 761 UWORD8 *pu1_pred, 762 WORD32 pred_strd, 763 WORD16 *pi2_dst4x4, 764 WORD32 dst_strd, 765 WORD32 *pi4_hsad, 766 WORD32 hsad_stride, 767 WORD32 i4_frm_qstep) 768 { 769 WORD32 i, k; 770 WORD32 i4_child_total_sad = 0; 771 772 (void)i4_frm_qstep; 773 /* -------- Compute four 4x4 HAD Transforms ---------*/ 774 for(i = 0; i < 4; i++) 775 { 776 UWORD8 *pu1_pi0, *pu1_pi1; 777 WORD16 *pi2_dst; 778 WORD32 blkx, blky; 779 UWORD32 u4_hsad = 0; 780 // TODO: choose deadzone as f(qstep) 781 WORD32 threshold = 0; 782 783 /*****************************************************/ 784 /* Assuming the looping structure of the four */ 785 /* blocks is in Z scan order of 4x4s in a 8x8 */ 786 /* block instead of raster scan */ 787 /*****************************************************/ 788 blkx = (i & 0x1); 789 blky = (i >> 1); 790 791 pu1_pi0 = pu1_src + (blkx * 4) + (blky * 4 * src_strd); 792 pu1_pi1 = pu1_pred + (blkx * 4) + (blky * 4 * pred_strd); 793 pi2_dst = pi2_dst4x4 + (blkx * 4) + (blky * 4 * dst_strd); 794 795 ihevce_hadamard_4x4_8bit(pu1_pi0, src_strd, pu1_pi1, pred_strd, pi2_dst, dst_strd); 796 797 for(k = 0; k < 4; k++) 798 { 799 if(ABS(pi2_dst[0 * dst_strd + k]) < threshold) 800 pi2_dst[0 * dst_strd + k] = 0; 801 802 if(ABS(pi2_dst[1 * dst_strd + k]) < threshold) 803 pi2_dst[1 * dst_strd + k] = 0; 804 805 if(ABS(pi2_dst[2 * dst_strd + k]) < threshold) 806 pi2_dst[2 * dst_strd + k] = 0; 807 808 if(ABS(pi2_dst[3 * dst_strd + k]) < threshold) 809 pi2_dst[3 * dst_strd + k] = 0; 810 811 /* Accumulate the SATD */ 812 u4_hsad += ABS(pi2_dst[0 * dst_strd + k]); 813 u4_hsad += ABS(pi2_dst[1 * dst_strd + k]); 814 u4_hsad += ABS(pi2_dst[2 * dst_strd + k]); 815 u4_hsad += ABS(pi2_dst[3 * dst_strd + k]); 816 } 817 818 /*===== Normalize the HSAD =====*/ 819 pi4_hsad[blkx + (blky * hsad_stride)] = ((u4_hsad + 2) >> 2); 820 i4_child_total_sad += ((u4_hsad + 2) >> 2); 821 } 822 return i4_child_total_sad; 823 } 824 825 /** 826 ******************************************************************************* 827 * 828 * @brief 829 * HSAD is returned for the 4, 4x4 in 8x8 830 * 831 * @par Description: 832 * 833 * @param[in] pu1_origin 834 * UWORD8 pointer to the current block 835 * 836 * @param[in] src_strd 837 * WORD32 Source stride 838 * 839 * @param[in] pu1_pred 840 * UWORD8 pointer to the prediction block 841 * 842 * @param[in] pred_strd 843 * WORD32 Pred stride 844 * 845 * @param[out] pi2_dst 846 * WORD16 pointer to the transform output block 847 * 848 * @param[out] dst_strd 849 * WORD32 Destination stride 850 * 851 * @param[out] ppi4_hsad 852 * pointer to base pointers for storing hadmard sads of various 853 * block sizes (4x4 to 32x32) 854 * 855 * @param[in] pos_x_y_4x4 856 * Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB 857 * Lower 16bits denote xpos and upper 16ypos of the 4x4block 858 * 859 * @param[in] num_4x4_in_row 860 * Denotes the number of current 4x4 blocks in a ctb/CU/MB 861 * 862 * @returns 863 * 864 * @remarks 865 * 866 ******************************************************************************* 867 */ 868 void ihevce_had_8x8_using_4_4x4( 869 UWORD8 *pu1_src, 870 WORD32 src_strd, 871 UWORD8 *pu1_pred, 872 WORD32 pred_strd, 873 WORD16 *pi2_dst, 874 WORD32 dst_strd, 875 WORD32 **ppi4_hsad, 876 WORD32 pos_x_y_4x4, 877 WORD32 num_4x4_in_row) 878 { 879 WORD16 ai2_4x4_had[64]; 880 WORD32 pos_x = pos_x_y_4x4 & 0xFFFF; 881 WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF; 882 WORD32 *pi4_4x4_hsad; 883 WORD32 *pi4_8x8_hsad; 884 885 (void)pi2_dst; 886 (void)dst_strd; 887 ASSERT(pos_x >= 0); 888 ASSERT(pos_y >= 0); 889 890 /* Initialize pointers to store 4x4 and 8x8 HAD SATDs */ 891 pi4_4x4_hsad = ppi4_hsad[HAD_4x4] + pos_x + pos_y * num_4x4_in_row; 892 pi4_8x8_hsad = ppi4_hsad[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1); 893 894 /* -------- Compute four 4x4 HAD Transforms of 8x8 in one call--------- */ 895 pi4_8x8_hsad[0] = ihevce_had4_4x4( 896 pu1_src, src_strd, pu1_pred, pred_strd, ai2_4x4_had, 8, pi4_4x4_hsad, num_4x4_in_row, 0); 897 } 898 899 /** 900 ******************************************************************************* 901 * 902 * @brief 903 * Reursive Hadamard Transform for 8x8 block. HSAD is returned for the 8x8 904 * block and its four subblocks(4x4). 905 * 906 * @par Description: 907 * 908 * @param[in] pu1_origin 909 * UWORD8 pointer to the current block 910 * 911 * @param[in] src_strd 912 * WORD32 Source stride 913 * 914 * @param[in] pu1_pred 915 * UWORD8 pointer to the prediction block 916 * 917 * @param[in] pred_strd 918 * WORD32 Pred stride 919 * 920 * @param[out] pi2_dst 921 * WORD16 pointer to the transform output block 922 * 923 * @param[out] dst_strd 924 * WORD32 Destination stride 925 * 926 * @param[out] ppi4_hsad 927 * pointer to base pointers for storing hadmard sads of various 928 * block sizes (4x4 to 32x32) 929 * 930 * @param[in] pos_x_y_4x4 931 * Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB 932 * Lower 16bits denote xpos and upper 16ypos of the 4x4block 933 * 934 * @param[in] num_4x4_in_row 935 * Denotes the number of current 4x4 blocks in a ctb/CU/MB 936 * 937 * @param[in] i4_frm_qstep 938 * frm_qstep value based on the which the threshold value is calculated 939 * 940 * @returns 941 * 942 * @remarks 943 * 944 ******************************************************************************* 945 */ 946 WORD32 ihevce_had_8x8_using_4_4x4_r( 947 UWORD8 *pu1_src, 948 WORD32 src_strd, 949 UWORD8 *pu1_pred, 950 WORD32 pred_strd, 951 WORD16 *pi2_dst, 952 WORD32 dst_strd, 953 WORD32 **ppi4_hsad, 954 WORD32 **ppi4_tu_split, 955 WORD32 **ppi4_tu_early_cbf, 956 WORD32 pos_x_y_4x4, 957 WORD32 num_4x4_in_row, 958 WORD32 lambda, 959 WORD32 lambda_q_shift, 960 WORD32 i4_frm_qstep, 961 WORD32 i4_cur_depth, 962 WORD32 i4_max_depth, 963 WORD32 i4_max_tr_size, 964 WORD32 *pi4_tu_split_cost, 965 void *pv_func_sel) 966 { 967 WORD16 ai2_4x4_had[64]; 968 WORD32 pos_x = pos_x_y_4x4 & 0xFFFF; 969 WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF; 970 WORD32 *pi4_4x4_hsad; 971 WORD32 *pi4_8x8_hsad; 972 WORD32 *pi4_8x8_tu_split; 973 974 WORD32 *pi4_8x8_tu_early_cbf; 975 976 UWORD32 u4_satd; 977 WORD32 cost_child = 0, cost_parent = 0; 978 WORD32 early_cbf = 0; 979 980 const UWORD8 u1_cur_tr_size = 8; 981 /* Stores the best cost for the Current 8x8: Lokesh */ 982 WORD32 best_cost = 0; 983 984 (void)pv_func_sel; 985 ASSERT(pos_x >= 0); 986 ASSERT(pos_y >= 0); 987 988 /* Initialize pointers to store 4x4 and 8x8 HAD SATDs */ 989 pi4_4x4_hsad = ppi4_hsad[HAD_4x4] + pos_x + pos_y * num_4x4_in_row; 990 pi4_8x8_hsad = ppi4_hsad[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1); 991 pi4_8x8_tu_split = ppi4_tu_split[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1); 992 pi4_8x8_tu_early_cbf = 993 ppi4_tu_early_cbf[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1); 994 995 /* -------- Compute four 4x4 HAD Transforms of 8x8 in one call--------- */ 996 cost_child = ihevce_had4_4x4( 997 pu1_src, src_strd, pu1_pred, pred_strd, ai2_4x4_had, 8, pi4_4x4_hsad, num_4x4_in_row, 0); 998 999 /* -------- Compute 8x8 HAD Transform using 4x4 results ------------- */ 1000 u4_satd = ihevce_compute_8x8HAD_using_4x4( 1001 ai2_4x4_had, 8, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf); 1002 1003 /* store the normalized 8x8 satd */ 1004 cost_parent = ((u4_satd + 4) >> 3); 1005 1006 /* 4 CBF Flags, extra 1 becoz of the 0.5 bits per bin is assumed */ 1007 cost_child += ((4) * lambda) >> (lambda_q_shift + 1); 1008 1009 if(i4_cur_depth < i4_max_depth) 1010 { 1011 if((cost_child < cost_parent) || (i4_max_tr_size < u1_cur_tr_size)) 1012 { 1013 //cost_child -= ((4) * lambda) >> (lambda_q_shift + 1); 1014 *pi4_tu_split_cost += (4 * lambda) >> (lambda_q_shift + 1); 1015 best_cost = cost_child; 1016 best_cost <<= 1; 1017 best_cost++; 1018 pi4_8x8_tu_split[0] = 1; 1019 pi4_8x8_hsad[0] = cost_child; 1020 } 1021 else 1022 { 1023 //cost_parent -= ((1) * lambda) >> (lambda_q_shift + 1); 1024 best_cost = cost_parent; 1025 best_cost <<= 1; 1026 pi4_8x8_tu_split[0] = 0; 1027 pi4_8x8_hsad[0] = cost_parent; 1028 } 1029 } 1030 else 1031 { 1032 //cost_parent -= ((1) * lambda) >> (lambda_q_shift + 1); 1033 best_cost = cost_parent; 1034 best_cost <<= 1; 1035 pi4_8x8_tu_split[0] = 0; 1036 pi4_8x8_hsad[0] = cost_parent; 1037 } 1038 1039 pi4_8x8_tu_early_cbf[0] = early_cbf; 1040 1041 /* best cost has tu_split_flag at LSB(Least significant bit) */ 1042 return ((best_cost << 1) + early_cbf); 1043 } 1044 1045 /** 1046 ******************************************************************************* 1047 * 1048 * @brief 1049 * Computes 16x16 transform using children 8x8 hadamard results 1050 * Modified to incorporate the dead-zone implementation - Lokesh 1051 * 1052 * @par Description: 1053 * 1054 * @param[in] pi2_8x8_had 1055 * WORD16 pointer to 8x8 hadamard buffer(y0, y1, y2, y3 hadmard in Zscan order) 1056 * 1057 * @param[in] had8_strd 1058 * stride of 8x8 hadmard buffer pi2_y0, pi2_y1, pi2_y2, pi2_y3 1059 * 1060 * @param[out] pi2_dst 1061 * destination buffer where 8x8 hadamard result is stored 1062 * 1063 * @param[in] dst_stride 1064 * stride of destination block 1065 * 1066 * @param[in] i4_frm_qstep 1067 * frm_qstep value based on the which the threshold value is calculated 1068 * 1069 * @returns 1070 * 16x16 Hadamard SATD 1071 * @remarks 1072 * 1073 ******************************************************************************* 1074 */ 1075 static UWORD32 ihevce_compute_16x16HAD_using_8x8( 1076 WORD16 *pi2_8x8_had, 1077 WORD32 had8_strd, 1078 WORD16 *pi2_dst, 1079 WORD32 dst_strd, 1080 WORD32 i4_frm_qstep, 1081 WORD32 *pi4_cbf) 1082 { 1083 /* Qstep value is right shifted by 8 */ 1084 WORD32 threshold = (i4_frm_qstep >> 8); 1085 1086 /* Initialize pointers to 4 subblocks of 8x8 HAD buffer */ 1087 WORD16 *pi2_y0 = pi2_8x8_had; 1088 WORD16 *pi2_y1 = pi2_8x8_had + 8; 1089 WORD16 *pi2_y2 = pi2_8x8_had + had8_strd * 8; 1090 WORD16 *pi2_y3 = pi2_8x8_had + had8_strd * 8 + 8; 1091 1092 /* Initialize pointers to store 8x8 HAD output */ 1093 WORD16 *pi2_dst0 = pi2_dst; 1094 WORD16 *pi2_dst1 = pi2_dst + 8; 1095 WORD16 *pi2_dst2 = pi2_dst + dst_strd * 8; 1096 WORD16 *pi2_dst3 = pi2_dst + dst_strd * 8 + 8; 1097 1098 UWORD32 u4_satd = 0; 1099 WORD32 i; 1100 1101 /* Child HAD results combined as follows to get Parent result */ 1102 /* _ _ */ 1103 /* | (y0 + y1) + (y2 + y3) (y0 - y1) + (y2 - y3) | */ 1104 /* | (y0 + y1) - (y2 + y3) (y0 - y1) - (y2 - y3) | */ 1105 /* \- -/ */ 1106 for(i = 0; i < 64; i++) 1107 { 1108 WORD32 src_idx = (i >> 3) * had8_strd + (i % 8); 1109 WORD32 dst_idx = (i >> 3) * dst_strd + (i % 8); 1110 1111 WORD16 a0 = pi2_y0[src_idx]; 1112 WORD16 a1 = pi2_y1[src_idx]; 1113 WORD16 a2 = pi2_y2[src_idx]; 1114 WORD16 a3 = pi2_y3[src_idx]; 1115 1116 WORD16 b0 = (a0 + a1) >> 1; 1117 WORD16 b1 = (a0 - a1) >> 1; 1118 WORD16 b2 = (a2 + a3) >> 1; 1119 WORD16 b3 = (a2 - a3) >> 1; 1120 1121 pi2_dst0[dst_idx] = b0 + b2; 1122 pi2_dst1[dst_idx] = b1 + b3; 1123 pi2_dst2[dst_idx] = b0 - b2; 1124 pi2_dst3[dst_idx] = b1 - b3; 1125 1126 /* Make the value of dst to zerp, if it falls below the dead-zone */ 1127 if(ABS(pi2_dst0[dst_idx]) > threshold) 1128 *pi4_cbf = 1; 1129 if(ABS(pi2_dst1[dst_idx]) > threshold) 1130 *pi4_cbf = 1; 1131 if(ABS(pi2_dst2[dst_idx]) > threshold) 1132 *pi4_cbf = 1; 1133 if(ABS(pi2_dst3[dst_idx]) > threshold) 1134 *pi4_cbf = 1; 1135 1136 u4_satd += ABS(pi2_dst0[dst_idx]); 1137 u4_satd += ABS(pi2_dst1[dst_idx]); 1138 u4_satd += ABS(pi2_dst2[dst_idx]); 1139 u4_satd += ABS(pi2_dst3[dst_idx]); 1140 } 1141 1142 /* return 16x16 satd */ 1143 return (u4_satd); 1144 } 1145 1146 /** 1147 ******************************************************************************* 1148 * 1149 * @brief 1150 * Hadamard Transform for 16x16 block with 8x8 and 4x4 SATD updates. 1151 * Uses recursive 8x8 had output to compute satd for 16x16 and its children 1152 * 1153 * @par Description: 1154 * 1155 * @param[in] pu1_origin 1156 * UWORD8 pointer to the current block 1157 * 1158 * @param[in] src_strd 1159 * WORD32 Source stride 1160 * 1161 * @param[in] pu1_pred 1162 * UWORD8 pointer to the prediction block 1163 * 1164 * @param[in] pred_strd 1165 * WORD32 Pred stride 1166 * 1167 * @param[out] pi2_dst 1168 * WORD16 pointer to the transform output block 1169 * 1170 * @param[out] dst_strd 1171 * WORD32 Destination stride 1172 * 1173 * @param[out] ppi4_hsad 1174 * pointer to base pointers for storing hadmard sads of various 1175 * block sizes (4x4 to 32x32) 1176 * 1177 * @param[in] pos_x_y_4x4 1178 * Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB 1179 * Lower 16bits denote xpos and upper 16ypos of the 4x4block 1180 * 1181 * @param[in] num_4x4_in_row 1182 * Denotes the number of current 4x4 blocks in a ctb/CU/MB 1183 * 1184 * @param[in] lambda 1185 * lambda values is the cost factor calculated based on QP 1186 * 1187 * @param[in] lambda_q_shift 1188 * lambda_q_shift used to reverse the lambda value back from q8 format 1189 * 1190 * @param[in] depth 1191 * depth gives the current TU depth with respect to the CU 1192 * 1193 * @param[in] i4_frm_qstep 1194 * frm_qstep value based on the which the threshold value is calculated 1195 * 1196 * @returns 1197 * 1198 * @remarks 1199 * 1200 ******************************************************************************* 1201 */ 1202 1203 WORD32 ihevce_had_16x16_r( 1204 UWORD8 *pu1_src, 1205 WORD32 src_strd, 1206 UWORD8 *pu1_pred, 1207 WORD32 pred_strd, 1208 WORD16 *pi2_dst, 1209 WORD32 dst_strd, 1210 WORD32 **ppi4_hsad, 1211 WORD32 **ppi4_tu_split, 1212 WORD32 **ppi4_tu_early_cbf, 1213 WORD32 pos_x_y_4x4, 1214 WORD32 num_4x4_in_row, 1215 WORD32 lambda, 1216 WORD32 lambda_q_shift, 1217 WORD32 i4_frm_qstep, 1218 WORD32 i4_cur_depth, 1219 WORD32 i4_max_depth, 1220 WORD32 i4_max_tr_size, 1221 WORD32 *pi4_tu_split_cost, 1222 void *pv_func_sel) 1223 { 1224 WORD16 ai2_8x8_had[256]; 1225 WORD32 *pi4_16x16_hsad; 1226 WORD32 *pi4_16x16_tu_split; 1227 1228 WORD32 *pi4_16x16_tu_early_cbf; 1229 1230 UWORD32 u4_satd = 0; 1231 WORD32 tu_split_flag = 0; 1232 WORD32 i4_early_cbf_flag = 0, early_cbf = 0; 1233 const UWORD8 u1_cur_tr_size = 16; 1234 1235 /* cost_parent : Stores the cost of the parent HAD transform (16x16) */ 1236 /* cost_child : Stores the cost of the child HAD transform (16x16) */ 1237 WORD32 cost_parent = 0, cost_child = 0; 1238 1239 /*best_cost returns the best cost at the end of the function */ 1240 /*tu_split denoes whether the TU (16x16)is split or not */ 1241 WORD32 best_cost = 0, best_cost_tu_split; 1242 WORD32 i; 1243 1244 WORD16 *pi2_y0; 1245 UWORD8 *pu1_src0; 1246 UWORD8 *pu1_pred0; 1247 WORD32 pos_x_y_4x4_0; 1248 1249 WORD32 pos_x = pos_x_y_4x4 & 0xFFFF; 1250 WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF; 1251 1252 ASSERT(pos_x >= 0); 1253 ASSERT(pos_y >= 0); 1254 1255 /* Initialize pointers to store 16x16 SATDs */ 1256 pi4_16x16_hsad = ppi4_hsad[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2); 1257 1258 pi4_16x16_tu_split = 1259 ppi4_tu_split[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2); 1260 1261 pi4_16x16_tu_early_cbf = 1262 ppi4_tu_early_cbf[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2); 1263 1264 /* -------- Compute four 8x8 HAD Transforms of 16x16 call--------- */ 1265 for(i = 0; i < 4; i++) 1266 { 1267 pu1_src0 = pu1_src + (i & 0x01) * 8 + (i >> 1) * src_strd * 8; 1268 pu1_pred0 = pu1_pred + (i & 0x01) * 8 + (i >> 1) * pred_strd * 8; 1269 pi2_y0 = ai2_8x8_had + (i & 0x01) * 8 + (i >> 1) * 16 * 8; 1270 pos_x_y_4x4_0 = pos_x_y_4x4 + (i & 0x01) * 2 + (i >> 1) * (2 << 16); 1271 1272 best_cost_tu_split = ihevce_had_8x8_using_4_4x4_r( 1273 pu1_src0, 1274 src_strd, 1275 pu1_pred0, 1276 pred_strd, 1277 pi2_y0, 1278 16, 1279 ppi4_hsad, 1280 ppi4_tu_split, 1281 ppi4_tu_early_cbf, 1282 pos_x_y_4x4_0, 1283 num_4x4_in_row, 1284 lambda, 1285 lambda_q_shift, 1286 i4_frm_qstep, 1287 i4_cur_depth + 1, 1288 i4_max_depth, 1289 i4_max_tr_size, 1290 pi4_tu_split_cost, 1291 pv_func_sel); 1292 1293 /* Cost is shifted by two bits for Tu_split_flag and early cbf flag */ 1294 best_cost = (best_cost_tu_split >> 2); 1295 1296 /* Last but one bit stores the information regarding the TU_Split */ 1297 tu_split_flag += (best_cost_tu_split & 0x3) >> 1; 1298 1299 /* Last bit stores the information regarding the early_cbf */ 1300 i4_early_cbf_flag += (best_cost_tu_split & 0x1); 1301 1302 cost_child += best_cost; 1303 1304 tu_split_flag <<= 1; 1305 i4_early_cbf_flag <<= 1; 1306 } 1307 1308 /* -------- Compute 16x16 HAD Transform using 8x8 results ------------- */ 1309 pi2_y0 = ai2_8x8_had; 1310 1311 /* Threshold currently passed as "0" */ 1312 u4_satd = 1313 ihevce_compute_16x16HAD_using_8x8(pi2_y0, 16, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf); 1314 1315 /* store the normalized satd */ 1316 cost_parent = ((u4_satd + 4) >> 3); 1317 1318 /* 4 TU_Split flags , 4 CBF Flags, extra 1 becoz of the 0.5 bits per bin is assumed */ 1319 cost_child += ((4 + 4) * lambda) >> (lambda_q_shift + 1); 1320 1321 i4_early_cbf_flag += early_cbf; 1322 1323 /* Right now the depth is hard-coded to 4: The depth can be modified from the config file 1324 which decides the extent to which TU_REC needs to be done */ 1325 if(i4_cur_depth < i4_max_depth) 1326 { 1327 if((cost_child < cost_parent) || (i4_max_tr_size < u1_cur_tr_size)) 1328 { 1329 //cost_child -= ((4 + 4) * lambda) >> (lambda_q_shift + 1); 1330 *pi4_tu_split_cost += ((4 + 4) * lambda) >> (lambda_q_shift + 1); 1331 tu_split_flag += 1; 1332 best_cost = cost_child; 1333 } 1334 else 1335 { 1336 //cost_parent -= ((1 + 1) * lambda) >> (lambda_q_shift + 1); 1337 tu_split_flag += 0; 1338 best_cost = cost_parent; 1339 } 1340 } 1341 else 1342 { 1343 //cost_parent -= ((1 + 1) * lambda) >> (lambda_q_shift + 1); 1344 tu_split_flag += 0; 1345 best_cost = cost_parent; 1346 } 1347 1348 pi4_16x16_hsad[0] = best_cost; 1349 pi4_16x16_tu_split[0] = tu_split_flag; 1350 pi4_16x16_tu_early_cbf[0] = i4_early_cbf_flag; 1351 1352 /*returning two values(best cost & tu_split_flag) as a single value*/ 1353 return ((best_cost << 10) + (tu_split_flag << 5) + i4_early_cbf_flag); 1354 } 1355 1356 //#endif 1357 /** 1358 ******************************************************************************* 1359 * 1360 * @brief 1361 * Computes 32x32 transform using children 16x16 hadamard results 1362 * 1363 * @par Description: 1364 * 1365 * @param[in] pi2_16x16_had 1366 * WORD16 pointer to 16x16 hadamard buffer(y0, y1, y2, y3 hadmard in Zscan order) 1367 * 1368 * @param[in] had16_strd 1369 * stride of 16x16 hadmard buffer pi2_y0, pi2_y1, pi2_y2, pi2_y3 1370 * 1371 * @param[out] pi2_dst 1372 * destination buffer where 16x16 hadamard result is stored 1373 * 1374 * @param[in] dst_stride 1375 * stride of destination block 1376 * 1377 * @param[in] i4_frm_qstep 1378 * frm_qstep value based on the which the threshold value is calculated 1379 * 1380 * @returns 1381 * 32x32 Hadamard SATD 1382 * @remarks 1383 * 1384 ******************************************************************************* 1385 */ 1386 //#if COMPUTE_32x32_USING_16X16 == C 1387 UWORD32 ihevce_compute_32x32HAD_using_16x16( 1388 WORD16 *pi2_16x16_had, 1389 WORD32 had16_strd, 1390 WORD16 *pi2_dst, 1391 WORD32 dst_strd, 1392 WORD32 i4_frm_qstep, 1393 WORD32 *pi4_cbf) 1394 { 1395 /* Qstep value is right shifted by 8 */ 1396 WORD32 threshold = (i4_frm_qstep >> 8); 1397 1398 /* Initialize pointers to 4 subblocks of 8x8 HAD buffer */ 1399 WORD16 *pi2_y0 = pi2_16x16_had; 1400 WORD16 *pi2_y1 = pi2_16x16_had + 16; 1401 WORD16 *pi2_y2 = pi2_16x16_had + had16_strd * 16; 1402 WORD16 *pi2_y3 = pi2_16x16_had + had16_strd * 16 + 16; 1403 1404 /* Initialize pointers to store 8x8 HAD output */ 1405 WORD16 *pi2_dst0 = pi2_dst; 1406 WORD16 *pi2_dst1 = pi2_dst + 16; 1407 WORD16 *pi2_dst2 = pi2_dst + dst_strd * 16; 1408 WORD16 *pi2_dst3 = pi2_dst + dst_strd * 16 + 16; 1409 1410 UWORD32 u4_satd = 0; 1411 WORD32 i; 1412 1413 /* Child HAD results combined as follows to get Parent result */ 1414 /* _ _ */ 1415 /* | (y0 + y1) + (y2 + y3) (y0 - y1) + (y2 - y3) | */ 1416 /* | (y0 + y1) - (y2 + y3) (y0 - y1) - (y2 - y3) | */ 1417 /* \- -/ */ 1418 for(i = 0; i < 256; i++) 1419 { 1420 WORD32 src_idx = (i >> 4) * had16_strd + (i % 16); 1421 WORD32 dst_idx = (i >> 4) * dst_strd + (i % 16); 1422 1423 WORD16 a0 = pi2_y0[src_idx] >> 2; 1424 WORD16 a1 = pi2_y1[src_idx] >> 2; 1425 WORD16 a2 = pi2_y2[src_idx] >> 2; 1426 WORD16 a3 = pi2_y3[src_idx] >> 2; 1427 1428 WORD16 b0 = (a0 + a1); 1429 WORD16 b1 = (a0 - a1); 1430 WORD16 b2 = (a2 + a3); 1431 WORD16 b3 = (a2 - a3); 1432 1433 pi2_dst0[dst_idx] = b0 + b2; 1434 pi2_dst1[dst_idx] = b1 + b3; 1435 pi2_dst2[dst_idx] = b0 - b2; 1436 pi2_dst3[dst_idx] = b1 - b3; 1437 1438 /* Make the value of dst to zerp, if it falls below the dead-zone */ 1439 if(ABS(pi2_dst0[dst_idx]) > threshold) 1440 *pi4_cbf = 1; 1441 if(ABS(pi2_dst1[dst_idx]) > threshold) 1442 *pi4_cbf = 1; 1443 if(ABS(pi2_dst2[dst_idx]) > threshold) 1444 *pi4_cbf = 1; 1445 if(ABS(pi2_dst3[dst_idx]) > threshold) 1446 *pi4_cbf = 1; 1447 1448 u4_satd += ABS(pi2_dst0[dst_idx]); 1449 u4_satd += ABS(pi2_dst1[dst_idx]); 1450 u4_satd += ABS(pi2_dst2[dst_idx]); 1451 u4_satd += ABS(pi2_dst3[dst_idx]); 1452 } 1453 1454 /* return 32x32 satd */ 1455 return (u4_satd); 1456 } 1457 //#endif 1458 1459 /** 1460 ******************************************************************************* 1461 * 1462 * @brief 1463 * Hadamard Transform for 32x32 block with 16x6, 8x8 and 4x4 SATD updates. 1464 * Uses recursive 16x16 had output to compute satd for 32x32 and its children 1465 * 1466 * @par Description: 1467 * 1468 * @param[in] pu1_origin 1469 * UWORD8 pointer to the current block 1470 * 1471 * @param[in] src_strd 1472 * WORD32 Source stride 1473 * 1474 * @param[in] pu1_pred 1475 * UWORD8 pointer to the prediction block 1476 * 1477 * @param[in] pred_strd 1478 * WORD32 Pred stride 1479 * 1480 * @param[out] pi2_dst 1481 * WORD16 pointer to the transform output block 1482 * 1483 * @param[out] dst_strd 1484 * WORD32 Destination stride 1485 * 1486 * @param[out] ppi4_hsad 1487 * pointer to base pointers for storing hadmard sads of various 1488 * block sizes (4x4 to 32x32) 1489 * 1490 * @param[in] pos_x_y_4x4 1491 * Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB 1492 * Lower 16bits denote xpos and upper 16ypos of the 4x4block 1493 * 1494 * @param[in] num_4x4_in_row 1495 * Denotes the number of current 4x4 blocks in a ctb/CU/MB 1496 * 1497 * @param[in] lambda 1498 * lambda values is the cost factor calculated based on QP 1499 * 1500 * @param[in] lambda_q_shift 1501 * lambda_q_shift used to reverse the lambda value back from q8 format 1502 * 1503 * @param[in] depth 1504 * depth gives the current TU depth with respect to the CU 1505 * 1506 * @param[in] i4_frm_qstep 1507 * frm_qstep value based on the which the threshold value is calculated 1508 * 1509 * 1510 * @returns 1511 * 1512 * @remarks 1513 * 1514 ******************************************************************************* 1515 */ 1516 void ihevce_had_32x32_r( 1517 UWORD8 *pu1_src, 1518 WORD32 src_strd, 1519 UWORD8 *pu1_pred, 1520 WORD32 pred_strd, 1521 WORD16 *pi2_dst, 1522 WORD32 dst_strd, 1523 WORD32 **ppi4_hsad, 1524 WORD32 **ppi4_tu_split, 1525 WORD32 **ppi4_tu_early_cbf, 1526 WORD32 pos_x_y_4x4, 1527 WORD32 num_4x4_in_row, 1528 WORD32 lambda, 1529 WORD32 lambda_q_shift, 1530 WORD32 i4_frm_qstep, 1531 WORD32 i4_cur_depth, 1532 WORD32 i4_max_depth, 1533 WORD32 i4_max_tr_size, 1534 WORD32 *pi4_tu_split_cost, 1535 me_func_selector_t *ps_func_selector) 1536 1537 { 1538 WORD16 ai2_16x16_had[1024]; 1539 WORD32 *pi4_32x32_hsad; 1540 WORD32 *pi4_32x32_tu_split; 1541 WORD32 *pi4_32x32_tu_early_cbf; 1542 1543 WORD32 pos_x = pos_x_y_4x4 & 0xFFFF; 1544 WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF; 1545 WORD32 tu_split_flag = 0; 1546 const UWORD8 u1_cur_tr_size = 32; 1547 WORD32 i4_early_cbf_flag = 0, early_cbf = 0; 1548 1549 /* cost_parent : Stores the cost of the parent HAD transform (16x16) */ 1550 /* cost_child : Stores the cost of the child HAD transform (16x16) */ 1551 WORD32 cost_child = 0, cost_parent = 0; 1552 1553 /*retuned as the best cost for the entire TU (32x32) */ 1554 WORD32 best_cost = 0; 1555 /*captures the best cost and tu_split at child level */ 1556 WORD32 best_cost_tu_split; 1557 1558 /* Initialize pointers to 4 8x8 blocks in 16x16 */ 1559 WORD16 *pi2_y0 = ai2_16x16_had; 1560 WORD16 *pi2_y1 = ai2_16x16_had + 16; 1561 WORD16 *pi2_y2 = ai2_16x16_had + 32 * 16; 1562 WORD16 *pi2_y3 = ai2_16x16_had + 32 * 16 + 16; 1563 1564 UWORD8 *pu1_src0 = pu1_src; 1565 UWORD8 *pu1_src1 = pu1_src + 16; 1566 UWORD8 *pu1_src2 = pu1_src + src_strd * 16; 1567 UWORD8 *pu1_src3 = pu1_src + src_strd * 16 + 16; 1568 1569 UWORD8 *pu1_pred0 = pu1_pred; 1570 UWORD8 *pu1_pred1 = pu1_pred + 16; 1571 UWORD8 *pu1_pred2 = pu1_pred + pred_strd * 16; 1572 UWORD8 *pu1_pred3 = pu1_pred + pred_strd * 16 + 16; 1573 1574 ASSERT(pos_x >= 0); 1575 ASSERT(pos_y >= 0); 1576 1577 /* Initialize pointers to store 32x32 SATDs */ 1578 pi4_32x32_hsad = ppi4_hsad[HAD_32x32] + (pos_x >> 3) + (pos_y >> 3) * (num_4x4_in_row >> 3); 1579 1580 pi4_32x32_tu_split = 1581 ppi4_tu_split[HAD_32x32] + (pos_x >> 3) + (pos_y >> 3) * (num_4x4_in_row >> 3); 1582 1583 pi4_32x32_tu_early_cbf = 1584 ppi4_tu_early_cbf[HAD_32x32] + (pos_x >> 3) + (pos_y >> 3) * (num_4x4_in_row >> 3); 1585 1586 /* -------- Compute four 8x8 HAD Transforms of 16x16 call--------- */ 1587 best_cost_tu_split = ps_func_selector->pf_had_16x16_r( 1588 pu1_src0, 1589 src_strd, 1590 pu1_pred0, 1591 pred_strd, 1592 pi2_y0, 1593 32, 1594 ppi4_hsad, 1595 ppi4_tu_split, 1596 ppi4_tu_early_cbf, 1597 pos_x_y_4x4, 1598 num_4x4_in_row, 1599 lambda, 1600 lambda_q_shift, 1601 i4_frm_qstep, 1602 i4_cur_depth + 1, 1603 i4_max_depth, 1604 i4_max_tr_size, 1605 pi4_tu_split_cost, 1606 NULL); 1607 1608 /* cost is shifted by 10bits */ 1609 best_cost = best_cost_tu_split >> 10; 1610 1611 /* Tu split is present in the 6-10 bits */ 1612 tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5; 1613 1614 /*Early CBF info is present in the last 5 bits */ 1615 i4_early_cbf_flag += best_cost_tu_split & 0x1F; 1616 1617 tu_split_flag <<= 5; 1618 i4_early_cbf_flag <<= 5; 1619 1620 cost_child += best_cost; 1621 1622 best_cost_tu_split = ps_func_selector->pf_had_16x16_r( 1623 pu1_src1, 1624 src_strd, 1625 pu1_pred1, 1626 pred_strd, 1627 pi2_y1, 1628 32, 1629 ppi4_hsad, 1630 ppi4_tu_split, 1631 ppi4_tu_early_cbf, 1632 pos_x_y_4x4 + 4, 1633 num_4x4_in_row, 1634 lambda, 1635 lambda_q_shift, 1636 i4_frm_qstep, 1637 i4_cur_depth + 1, 1638 i4_max_depth, 1639 i4_max_tr_size, 1640 pi4_tu_split_cost, 1641 NULL); 1642 1643 /* cost is shifted by 10bits */ 1644 best_cost = best_cost_tu_split >> 10; 1645 1646 /* Tu split is present in the 6-10 bits */ 1647 tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5; 1648 1649 /*Early CBF info is present in the last 5 bits */ 1650 i4_early_cbf_flag += best_cost_tu_split & 0x1F; 1651 1652 tu_split_flag <<= 5; 1653 i4_early_cbf_flag <<= 5; 1654 1655 cost_child += best_cost; 1656 1657 best_cost_tu_split = ps_func_selector->pf_had_16x16_r( 1658 pu1_src2, 1659 src_strd, 1660 pu1_pred2, 1661 pred_strd, 1662 pi2_y2, 1663 32, 1664 ppi4_hsad, 1665 ppi4_tu_split, 1666 ppi4_tu_early_cbf, 1667 pos_x_y_4x4 + (4 << 16), 1668 num_4x4_in_row, 1669 lambda, 1670 lambda_q_shift, 1671 i4_frm_qstep, 1672 i4_cur_depth + 1, 1673 i4_max_depth, 1674 i4_max_tr_size, 1675 pi4_tu_split_cost, 1676 NULL); 1677 1678 /* cost is shifted by 10bits */ 1679 best_cost = best_cost_tu_split >> 10; 1680 1681 /* Tu split is present in the 6-10 bits */ 1682 tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5; 1683 1684 /*Early CBF info is present in the last 5 bits */ 1685 i4_early_cbf_flag += best_cost_tu_split & 0x1F; 1686 1687 tu_split_flag <<= 5; 1688 i4_early_cbf_flag <<= 5; 1689 1690 cost_child += best_cost; 1691 1692 best_cost_tu_split = ps_func_selector->pf_had_16x16_r( 1693 pu1_src3, 1694 src_strd, 1695 pu1_pred3, 1696 pred_strd, 1697 pi2_y3, 1698 32, 1699 ppi4_hsad, 1700 ppi4_tu_split, 1701 ppi4_tu_early_cbf, 1702 pos_x_y_4x4 + (4 << 16) + 4, 1703 num_4x4_in_row, 1704 lambda, 1705 lambda_q_shift, 1706 i4_frm_qstep, 1707 i4_cur_depth + 1, 1708 i4_max_depth, 1709 i4_max_tr_size, 1710 pi4_tu_split_cost, 1711 NULL); 1712 1713 /* cost is shifted by 10bits */ 1714 best_cost = best_cost_tu_split >> 10; 1715 1716 /* Tu split is present in the 6-10 bits */ 1717 tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5; 1718 1719 /*Early CBF info is present in the last 5 bits */ 1720 i4_early_cbf_flag += best_cost_tu_split & 0x1F; 1721 1722 tu_split_flag <<= 1; 1723 i4_early_cbf_flag <<= 1; 1724 1725 cost_child += best_cost; 1726 1727 { 1728 UWORD32 u4_satd = 0; 1729 1730 u4_satd = ps_func_selector->pf_compute_32x32HAD_using_16x16( 1731 pi2_y0, 32, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf); 1732 1733 cost_parent = ((u4_satd + 2) >> 2); 1734 } 1735 1736 /* 4 TU_Split flags , 4 CBF Flags*/ 1737 cost_child += ((4 + 4) * lambda) >> (lambda_q_shift + 1); 1738 1739 i4_early_cbf_flag += early_cbf; 1740 1741 /* 1 TU_SPlit flag, 1 CBF flag */ 1742 //cost_parent += ((1 + 1)* lambda) >> (lambda_q_shift + 1); 1743 1744 if(i4_cur_depth < i4_max_depth) 1745 { 1746 if((cost_child < cost_parent) || (u1_cur_tr_size > i4_max_tr_size)) 1747 { 1748 *pi4_tu_split_cost += ((4 + 4) * lambda) >> (lambda_q_shift + 1); 1749 best_cost = cost_child; 1750 tu_split_flag++; 1751 } 1752 else 1753 { 1754 tu_split_flag = 0; 1755 best_cost = cost_parent; 1756 } 1757 } 1758 else 1759 { 1760 tu_split_flag = 0; 1761 best_cost = cost_parent; 1762 } 1763 1764 pi4_32x32_tu_split[0] = tu_split_flag; 1765 1766 pi4_32x32_hsad[0] = best_cost; 1767 1768 pi4_32x32_tu_early_cbf[0] = i4_early_cbf_flag; 1769 } 1770