Home | History | Annotate | Download | only in encoder
      1 /******************************************************************************
      2  *
      3  * Copyright (C) 2018 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at:
      8  *
      9  * http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  *****************************************************************************
     18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 */
     20 
     21 /**
     22 ******************************************************************************
     23 * @file ihevce_had_satd.c
     24 *
     25 * @brief
     26 *    This file contains functions of Hadamard SAD and SATD
     27 *
     28 * @author
     29 *    Ittiam
     30 *
     31 * List of Functions
     32 *   <TODO: TO BE ADDED>
     33 *
     34 ******************************************************************************
     35 */
     36 
     37 /*****************************************************************************/
     38 /* File Includes                                                             */
     39 /*****************************************************************************/
     40 /* System include files */
     41 #include <stdio.h>
     42 #include <string.h>
     43 #include <stdlib.h>
     44 #include <assert.h>
     45 #include <stdarg.h>
     46 #include <math.h>
     47 
     48 /* User include files */
     49 #include "ihevc_typedefs.h"
     50 #include "itt_video_api.h"
     51 #include "ihevce_api.h"
     52 
     53 #include "rc_cntrl_param.h"
     54 #include "rc_frame_info_collector.h"
     55 #include "rc_look_ahead_params.h"
     56 
     57 #include "ihevc_defs.h"
     58 #include "ihevc_structs.h"
     59 #include "ihevc_platform_macros.h"
     60 #include "ihevc_deblk.h"
     61 #include "ihevc_itrans_recon.h"
     62 #include "ihevc_chroma_itrans_recon.h"
     63 #include "ihevc_chroma_intra_pred.h"
     64 #include "ihevc_intra_pred.h"
     65 #include "ihevc_inter_pred.h"
     66 #include "ihevc_mem_fns.h"
     67 #include "ihevc_padding.h"
     68 #include "ihevc_weighted_pred.h"
     69 #include "ihevc_sao.h"
     70 #include "ihevc_resi_trans.h"
     71 #include "ihevc_quant_iquant_ssd.h"
     72 #include "ihevc_cabac_tables.h"
     73 
     74 #include "ihevce_defs.h"
     75 #include "ihevce_lap_enc_structs.h"
     76 #include "ihevce_multi_thrd_structs.h"
     77 #include "ihevce_multi_thrd_funcs.h"
     78 #include "ihevce_me_common_defs.h"
     79 #include "ihevce_had_satd.h"
     80 #include "ihevce_error_codes.h"
     81 #include "ihevce_bitstream.h"
     82 #include "ihevce_cabac.h"
     83 #include "ihevce_rdoq_macros.h"
     84 #include "ihevce_function_selector.h"
     85 #include "ihevce_enc_structs.h"
     86 #include "ihevce_cmn_utils_instr_set_router.h"
     87 #include "hme_datatype.h"
     88 #include "hme_interface.h"
     89 #include "hme_common_defs.h"
     90 #include "hme_defs.h"
     91 
     92 /*****************************************************************************/
     93 /* Function Definitions                                                      */
     94 /*****************************************************************************/
     95 
     96 static void ihevce_hadamard_4x4_8bit(
     97     UWORD8 *pu1_src,
     98     WORD32 src_strd,
     99     UWORD8 *pu1_pred,
    100     WORD32 pred_strd,
    101     WORD16 *pi2_dst,
    102     WORD32 dst_strd)
    103 {
    104     WORD32 k;
    105     WORD16 m[16];
    106 
    107     /*===== hadamard horz transform =====*/
    108     for(k = 0; k < 4; k++)
    109     {
    110         WORD32 r0, r1, r2, r3;
    111         WORD32 h0, h1, h2, h3;
    112 
    113         /* Compute the residue block */
    114         r0 = pu1_src[0] - pu1_pred[0];
    115         r1 = pu1_src[1] - pu1_pred[1];
    116         r2 = pu1_src[2] - pu1_pred[2];
    117         r3 = pu1_src[3] - pu1_pred[3];
    118 
    119         h0 = r0 + r1;
    120         h1 = r0 - r1;
    121         h2 = r2 + r3;
    122         h3 = r2 - r3;
    123 
    124         m[k * 4 + 0] = h0 + h2;
    125         m[k * 4 + 1] = h1 + h3;
    126         m[k * 4 + 2] = h0 - h2;
    127         m[k * 4 + 3] = h1 - h3;
    128 
    129         pu1_pred += pred_strd;
    130         pu1_src += src_strd;
    131     }
    132 
    133     /*===== hadamard vert transform =====*/
    134     for(k = 0; k < 4; k++)
    135     {
    136         WORD32 v0, v1, v2, v3;
    137 
    138         v0 = m[0 + k] + m[4 + k];
    139         v1 = m[0 + k] - m[4 + k];
    140         v2 = m[8 + k] + m[12 + k];
    141         v3 = m[8 + k] - m[12 + k];
    142 
    143         pi2_dst[0 * dst_strd + k] = v0 + v2;
    144         pi2_dst[1 * dst_strd + k] = v1 + v3;
    145         pi2_dst[2 * dst_strd + k] = v0 - v2;
    146         pi2_dst[3 * dst_strd + k] = v1 - v3;
    147     }
    148 }
    149 
    150 static void ihevce_hadamard_8x8_8bit(
    151     UWORD8 *pu1_src,
    152     WORD32 src_strd,
    153     UWORD8 *pu1_pred,
    154     WORD32 pred_strd,
    155     WORD16 *pi2_dst,
    156     WORD32 dst_strd)
    157 {
    158     WORD32 i;
    159 
    160     // y0
    161     ihevce_hadamard_4x4_8bit(pu1_src, src_strd, pu1_pred, pred_strd, pi2_dst, dst_strd);
    162     // y1
    163     ihevce_hadamard_4x4_8bit(pu1_src + 4, src_strd, pu1_pred + 4, pred_strd, pi2_dst + 4, dst_strd);
    164     // y2
    165     ihevce_hadamard_4x4_8bit(
    166         pu1_src + 4 * src_strd,
    167         src_strd,
    168         pu1_pred + 4 * pred_strd,
    169         pred_strd,
    170         pi2_dst + (4 * dst_strd),
    171         dst_strd);
    172     // y3
    173     ihevce_hadamard_4x4_8bit(
    174         pu1_src + 4 + 4 * src_strd,
    175         src_strd,
    176         pu1_pred + 4 + 4 * pred_strd,
    177         pred_strd,
    178         pi2_dst + (4 * dst_strd) + 4,
    179         dst_strd);
    180 
    181     /*   Child HAD results combined as follows to get Parent result */
    182     /*  _                                                 _         */
    183     /* |  (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3)   |        */
    184     /* |  (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3)   |        */
    185     /* \-                                                 -/        */
    186     for(i = 0; i < 16; i++)
    187     {
    188         WORD32 idx = (i >> 2) * dst_strd + (i % 4);
    189         WORD16 a0 = pi2_dst[idx];
    190         WORD16 a1 = pi2_dst[4 + idx];
    191         WORD16 a2 = pi2_dst[(4 * dst_strd) + idx];
    192         WORD16 a3 = pi2_dst[(4 * dst_strd) + 4 + idx];
    193 
    194         WORD16 b0 = (a0 + a1);
    195         WORD16 b1 = (a0 - a1);
    196         WORD16 b2 = (a2 + a3);
    197         WORD16 b3 = (a2 - a3);
    198 
    199         pi2_dst[idx] = b0 + b2;
    200         pi2_dst[4 + idx] = b1 + b3;
    201         pi2_dst[(4 * dst_strd) + idx] = b0 - b2;
    202         pi2_dst[(4 * dst_strd) + 4 + idx] = b1 - b3;
    203     }
    204 }
    205 
    206 static void ihevce_hadamard_16x16_8bit(
    207     UWORD8 *pu1_src,
    208     WORD32 src_strd,
    209     UWORD8 *pu1_pred,
    210     WORD32 pred_strd,
    211     WORD16 *pi2_dst,
    212     WORD32 dst_strd)
    213 {
    214     WORD32 i;
    215 
    216     // y0
    217     ihevce_hadamard_8x8_8bit(pu1_src, src_strd, pu1_pred, pred_strd, pi2_dst, dst_strd);
    218     // y1
    219     ihevce_hadamard_8x8_8bit(pu1_src + 8, src_strd, pu1_pred + 8, pred_strd, pi2_dst + 8, dst_strd);
    220     // y2
    221     ihevce_hadamard_8x8_8bit(
    222         pu1_src + 8 * src_strd,
    223         src_strd,
    224         pu1_pred + 8 * pred_strd,
    225         pred_strd,
    226         pi2_dst + (8 * dst_strd),
    227         dst_strd);
    228     // y3
    229     ihevce_hadamard_8x8_8bit(
    230         pu1_src + 8 + 8 * src_strd,
    231         src_strd,
    232         pu1_pred + 8 + 8 * pred_strd,
    233         pred_strd,
    234         pi2_dst + (8 * dst_strd) + 8,
    235         dst_strd);
    236 
    237     /*   Child HAD results combined as follows to get Parent result */
    238     /*  _                                                 _         */
    239     /* |  (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3)   |        */
    240     /* |  (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3)   |        */
    241     /* \-                                                 -/        */
    242     for(i = 0; i < 64; i++)
    243     {
    244         WORD32 idx = (i >> 3) * dst_strd + (i % 8);
    245         WORD16 a0 = pi2_dst[idx];
    246         WORD16 a1 = pi2_dst[8 + idx];
    247         WORD16 a2 = pi2_dst[(8 * dst_strd) + idx];
    248         WORD16 a3 = pi2_dst[(8 * dst_strd) + 8 + idx];
    249 
    250         WORD16 b0 = (a0 + a1) >> 1;
    251         WORD16 b1 = (a0 - a1) >> 1;
    252         WORD16 b2 = (a2 + a3) >> 1;
    253         WORD16 b3 = (a2 - a3) >> 1;
    254 
    255         pi2_dst[idx] = b0 + b2;
    256         pi2_dst[8 + idx] = b1 + b3;
    257         pi2_dst[(8 * dst_strd) + idx] = b0 - b2;
    258         pi2_dst[(8 * dst_strd) + 8 + idx] = b1 - b3;
    259     }
    260 }
    261 
    262 static void ihevce_hadamard_32x32_8bit(
    263     UWORD8 *pu1_src,
    264     WORD32 src_strd,
    265     UWORD8 *pu1_pred,
    266     WORD32 pred_strd,
    267     WORD16 *pi2_dst,
    268     WORD32 dst_strd)
    269 {
    270     WORD32 i;
    271 
    272     // y0
    273     ihevce_hadamard_16x16_8bit(pu1_src, src_strd, pu1_pred, pred_strd, pi2_dst, dst_strd);
    274     // y1
    275     ihevce_hadamard_16x16_8bit(
    276         pu1_src + 16, src_strd, pu1_pred + 16, pred_strd, pi2_dst + 16, dst_strd);
    277     // y2
    278     ihevce_hadamard_16x16_8bit(
    279         pu1_src + 16 * src_strd,
    280         src_strd,
    281         pu1_pred + 16 * pred_strd,
    282         pred_strd,
    283         pi2_dst + (16 * dst_strd),
    284         dst_strd);
    285     // y3
    286     ihevce_hadamard_16x16_8bit(
    287         pu1_src + 16 + 16 * src_strd,
    288         src_strd,
    289         pu1_pred + 16 + 16 * pred_strd,
    290         pred_strd,
    291         pi2_dst + (16 * dst_strd) + 16,
    292         dst_strd);
    293 
    294     /*   Child HAD results combined as follows to get Parent result */
    295     /*  _                                                 _         */
    296     /* |  (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3)   |        */
    297     /* |  (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3)   |        */
    298     /* \-                                                 -/        */
    299     for(i = 0; i < 256; i++)
    300     {
    301         WORD32 idx = (i >> 4) * dst_strd + (i % 16);
    302         WORD16 a0 = pi2_dst[idx] >> 2;
    303         WORD16 a1 = pi2_dst[16 + idx] >> 2;
    304         WORD16 a2 = pi2_dst[(16 * dst_strd) + idx] >> 2;
    305         WORD16 a3 = pi2_dst[(16 * dst_strd) + 16 + idx] >> 2;
    306 
    307         WORD16 b0 = (a0 + a1);
    308         WORD16 b1 = (a0 - a1);
    309         WORD16 b2 = (a2 + a3);
    310         WORD16 b3 = (a2 - a3);
    311 
    312         pi2_dst[idx] = b0 + b2;
    313         pi2_dst[16 + idx] = b1 + b3;
    314         pi2_dst[(16 * dst_strd) + idx] = b0 - b2;
    315         pi2_dst[(16 * dst_strd) + 16 + idx] = b1 - b3;
    316     }
    317 }
    318 
    319 /**
    320 *******************************************************************************
    321 *
    322 * @brief
    323 *  Compute Hadamard sad for 4x4 block with 8-bit input
    324 *
    325 * @par Description:
    326 *
    327 * @param[in] pu1_origin
    328 *  UWORD8 pointer to the current block
    329 *
    330 * @param[in] src_strd
    331 *  WORD32 Source stride
    332 *
    333 * @param[in] pu1_pred_buf
    334 *  UWORD8 pointer to the prediction block
    335 *
    336 * @param[in] pred_strd
    337 *  WORD32 Pred stride
    338 *
    339 * @param[in] pi2_dst
    340 *  WORD16 pointer to the transform block
    341 *
    342 * @param[in] dst_strd
    343 *  WORD32 Destination stride
    344 *
    345 * @param[in] size
    346 *  WORD32 transform Block size
    347 *
    348 * @returns hadamard SAD
    349 *
    350 * @remarks
    351 *  Not updating the transform destination now. Only returning the SATD
    352 *
    353 *******************************************************************************
    354 */
    355 UWORD32 ihevce_HAD_4x4_8bit(
    356     UWORD8 *pu1_origin,
    357     WORD32 src_strd,
    358     UWORD8 *pu1_pred_buf,
    359     WORD32 pred_strd,
    360     WORD16 *pi2_dst,
    361     WORD32 dst_strd)
    362 {
    363     WORD32 k;
    364     WORD16 v[16];
    365     UWORD32 u4_sad = 0;
    366 
    367     (void)pi2_dst;
    368     (void)dst_strd;
    369     ihevce_hadamard_4x4_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 4);
    370 
    371     for(k = 0; k < 16; ++k)
    372         u4_sad += abs(v[k]);
    373     u4_sad = ((u4_sad + 2) >> 2);
    374 
    375     return u4_sad;
    376 }
    377 
    378 /**
    379 *******************************************************************************
    380 *
    381 * @brief
    382 *  Computes Hadamard Sad for 8x8 block with 8-bit input
    383 *
    384 * @par Description:
    385 *
    386 * @param[in] pu1_origin
    387 *  UWORD8 pointer to the current block
    388 *
    389 * @param[in] src_strd
    390 *  WORD32 Source stride
    391 *
    392 * @param[in] pu1_pred_buf
    393 *  UWORD8 pointer to the prediction block
    394 *
    395 * @param[in] pred_strd
    396 *  WORD32 Pred stride
    397 *
    398 * @param[in] pi2_dst
    399 *  WORD16 pointer to the transform block
    400 *
    401 * @param[in] dst_strd
    402 *  WORD32 Destination stride
    403 *
    404 * @param[in] size
    405 *  WORD32 transform Block size
    406 *
    407 * @returns Hadamard SAD
    408 *
    409 * @remarks
    410 *  Not updating the transform destination now. Only returning the SATD
    411 *
    412 *******************************************************************************
    413 */
    414 UWORD32 ihevce_HAD_8x8_8bit(
    415     UWORD8 *pu1_origin,
    416     WORD32 src_strd,
    417     UWORD8 *pu1_pred_buf,
    418     WORD32 pred_strd,
    419     WORD16 *pi2_dst,
    420     WORD32 dst_strd)
    421 {
    422     WORD32 k;
    423     UWORD32 u4_sad = 0;
    424     WORD16 v[64];
    425 
    426     (void)pi2_dst;
    427     (void)dst_strd;
    428     ihevce_hadamard_8x8_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 8);
    429 
    430     for(k = 0; k < 64; ++k)
    431         u4_sad += abs(v[k]);
    432     u4_sad = ((u4_sad + 4) >> 3);
    433 
    434     return u4_sad;
    435 }
    436 
    437 /**
    438 *******************************************************************************
    439 *
    440 * @brief
    441 *  Compute dc suppressed hadamard sad for 8x8 block with 8-bit input
    442 *
    443 * @par Description:
    444 *
    445 * @param[in] pu1_origin
    446 *  UWORD8 pointer to the current block
    447 *
    448 * @param[in] src_strd
    449 *  WORD32 Source stride
    450 *
    451 * @param[in] pu1_pred_buf
    452 *  UWORD8 pointer to the prediction block
    453 *
    454 * @param[in] pred_strd
    455 *  WORD32 Pred stride
    456 *
    457 * @param[in] pi2_dst
    458 *  WORD16 pointer to the transform block
    459 *
    460 * @param[in] dst_strd
    461 *  WORD32 Destination stride
    462 *
    463 * @param[in] size
    464 *  WORD32 transform Block size
    465 *
    466 * @returns Hadamard SAD with DC Suppressed
    467 *
    468 * @remarks
    469 *  Not updating the transform destination now. Only returning the SATD
    470 *
    471 *******************************************************************************
    472 */
    473 UWORD32 ihevce_compute_ac_had_8x8_8bit(
    474     UWORD8 *pu1_origin,
    475     WORD32 src_strd,
    476     UWORD8 *pu1_pred_buf,
    477     WORD32 pred_strd,
    478     WORD16 *pi2_dst,
    479     WORD32 dst_strd)
    480 {
    481     WORD32 k;
    482     UWORD32 u4_sad = 0;
    483     WORD16 v[64];
    484 
    485     (void)pi2_dst;
    486     (void)dst_strd;
    487     ihevce_hadamard_8x8_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 8);
    488 
    489     v[0] = 0;
    490     for(k = 0; k < 64; ++k)
    491         u4_sad += abs(v[k]);
    492     u4_sad = ((u4_sad + 4) >> 3);
    493 
    494     return u4_sad;
    495 }
    496 
    497 /**
    498 *******************************************************************************
    499 *
    500 * @brief
    501 *  Computes Hadamard Sad for 16x16 block with 8-bit input
    502 *
    503 * @par Description:
    504 *
    505 * @param[in] pu1_origin
    506 *  UWORD8 pointer to the current block
    507 *
    508 * @param[in] src_strd
    509 *  WORD32 Source stride
    510 *
    511 * @param[in] pu1_pred_buf
    512 *  UWORD8 pointer to the prediction block
    513 *
    514 * @param[in] pred_strd
    515 *  WORD32 Pred stride
    516 *
    517 * @param[in] pi2_dst
    518 *  WORD16 pointer to the transform block
    519 *
    520 * @param[in] dst_strd
    521 *  WORD32 Destination stride
    522 *
    523 * @param[in] size
    524 *  WORD32 transform Block size
    525 *
    526 * @returns Hadamard SAD
    527 *
    528 * @remarks
    529 *  Not updating the transform destination now. Only returning the SATD
    530 *
    531 *******************************************************************************
    532 */
    533 UWORD32 ihevce_HAD_16x16_8bit(
    534     UWORD8 *pu1_origin,
    535     WORD32 src_strd,
    536     UWORD8 *pu1_pred_buf,
    537     WORD32 pred_strd,
    538     WORD16 *pi2_dst,
    539     WORD32 dst_strd)
    540 {
    541     WORD32 k;
    542     UWORD32 u4_sad = 0;
    543     WORD16 v[256];
    544 
    545     (void)pi2_dst;
    546     (void)dst_strd;
    547     ihevce_hadamard_16x16_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 16);
    548 
    549     for(k = 0; k < 256; ++k)
    550         u4_sad += abs(v[k]);
    551     u4_sad = ((u4_sad + 4) >> 3);
    552 
    553     return u4_sad;
    554 }
    555 
    556 /**
    557 *******************************************************************************
    558 *
    559 * @brief
    560 *  Computes Hadamard Sad for 32x32 block with 8-bit input
    561 *
    562 * @par Description:
    563 *
    564 * @param[in] pu1_origin
    565 *  UWORD8 pointer to the current block
    566 *
    567 * @param[in] src_strd
    568 *  WORD32 Source stride
    569 *
    570 * @param[in] pu1_pred_buf
    571 *  UWORD8 pointer to the prediction block
    572 *
    573 * @param[in] pred_strd
    574 *  WORD32 Pred stride
    575 *
    576 * @param[in] pi2_dst
    577 *  WORD16 pointer to the transform block
    578 *
    579 * @param[in] dst_strd
    580 *  WORD32 Destination stride
    581 *
    582 * @param[in] size
    583 *  WORD32 transform Block size
    584 *
    585 * @returns Hadamard SAD
    586 *
    587 * @remarks
    588 *  Not updating the transform destination now. Only returning the SATD
    589 *
    590 *******************************************************************************
    591 */
    592 UWORD32 ihevce_HAD_32x32_8bit(
    593     UWORD8 *pu1_origin,
    594     WORD32 src_strd,
    595     UWORD8 *pu1_pred_buf,
    596     WORD32 pred_strd,
    597     WORD16 *pi2_dst,
    598     WORD32 dst_strd)
    599 {
    600     WORD32 k;
    601     UWORD32 u4_sad = 0;
    602     WORD16 v[32 * 32];
    603 
    604     (void)pi2_dst;
    605     (void)dst_strd;
    606     ihevce_hadamard_32x32_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 32);
    607 
    608     for(k = 0; k < 32 * 32; ++k)
    609         u4_sad += abs(v[k]);
    610     u4_sad = ((u4_sad + 2) >> 2);
    611 
    612     return u4_sad;
    613 }
    614 
    615 //#if COMPUTE_16x16_R == C
    616 /**
    617 *******************************************************************************
    618 *
    619 * @brief
    620 *   Computes 8x8 transform using children 4x4 hadamard results
    621 *
    622 * @par Description:
    623 *
    624 * @param[in] pi2_4x4_had
    625 *  WORD16 pointer to 4x4 hadamard buffer(y0, y1, y2, y3 hadmard in Zscan order)
    626 *
    627 * @param[in] had4_strd
    628 *  stride of 4x4 hadmard buffer pi2_y0, pi2_y1, pi2_y2, pi2_y3
    629 *
    630 * @param[out] pi2_dst
    631 *  destination buffer where 8x8 hadamard result is stored
    632 *
    633 * @param[in] dst_stride
    634 *  stride of destination block
    635 *
    636 * @param[in] i4_frm_qstep
    637 *  frm_qstep value based on the which the threshold value is calculated
    638 *
    639 * @returns
    640 *  8x8 Hadamard SATD
    641 * @remarks
    642 *
    643 *******************************************************************************
    644 */
    645 static UWORD32 ihevce_compute_8x8HAD_using_4x4(
    646     WORD16 *pi2_4x4_had,
    647     WORD32 had4_strd,
    648     WORD16 *pi2_dst,
    649     WORD32 dst_strd,
    650     WORD32 i4_frm_qstep,
    651     WORD32 *pi4_cbf)
    652 {
    653     /* Qstep value is right shifted by 8 */
    654     WORD32 threshold = (i4_frm_qstep >> 8);
    655 
    656     /* Initialize pointers to 4 subblocks of 4x4 HAD buffer */
    657     WORD16 *pi2_y0 = pi2_4x4_had;
    658     WORD16 *pi2_y1 = pi2_4x4_had + 4;
    659     WORD16 *pi2_y2 = pi2_4x4_had + had4_strd * 4;
    660     WORD16 *pi2_y3 = pi2_4x4_had + had4_strd * 4 + 4;
    661 
    662     /* Initialize pointers to store 8x8 HAD output */
    663     WORD16 *pi2_dst0 = pi2_dst;
    664     WORD16 *pi2_dst1 = pi2_dst + 4;
    665     WORD16 *pi2_dst2 = pi2_dst + dst_strd * 4;
    666     WORD16 *pi2_dst3 = pi2_dst + dst_strd * 4 + 4;
    667 
    668     UWORD32 u4_satd = 0;
    669     WORD32 i;
    670 
    671     /*   Child HAD results combined as follows to get Parent result */
    672     /*  _                                                 _         */
    673     /* |  (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3)   |        */
    674     /* |  (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3)   |        */
    675     /* \-                                                 -/        */
    676     for(i = 0; i < 16; i++)
    677     {
    678         WORD32 src_idx = (i >> 2) * had4_strd + (i % 4);
    679         WORD32 dst_idx = (i >> 2) * dst_strd + (i % 4);
    680 
    681         WORD16 a0 = pi2_y0[src_idx];
    682         WORD16 a1 = pi2_y1[src_idx];
    683         WORD16 a2 = pi2_y2[src_idx];
    684         WORD16 a3 = pi2_y3[src_idx];
    685 
    686         WORD16 b0 = (a0 + a1);
    687         WORD16 b1 = (a0 - a1);
    688         WORD16 b2 = (a2 + a3);
    689         WORD16 b3 = (a2 - a3);
    690 
    691         pi2_dst0[dst_idx] = b0 + b2;
    692         pi2_dst1[dst_idx] = b1 + b3;
    693         pi2_dst2[dst_idx] = b0 - b2;
    694         pi2_dst3[dst_idx] = b1 - b3;
    695 
    696         if(ABS(pi2_dst0[dst_idx]) > threshold)
    697             *pi4_cbf = 1;
    698         if(ABS(pi2_dst1[dst_idx]) > threshold)
    699             *pi4_cbf = 1;
    700         if(ABS(pi2_dst2[dst_idx]) > threshold)
    701             *pi4_cbf = 1;
    702         if(ABS(pi2_dst3[dst_idx]) > threshold)
    703             *pi4_cbf = 1;
    704 
    705         u4_satd += ABS(pi2_dst0[dst_idx]);
    706         u4_satd += ABS(pi2_dst1[dst_idx]);
    707         u4_satd += ABS(pi2_dst2[dst_idx]);
    708         u4_satd += ABS(pi2_dst3[dst_idx]);
    709     }
    710 
    711     /* return the 8x8 satd */
    712     return (u4_satd);
    713 }
    714 
    715 /**
    716 *******************************************************************************
    717 *
    718 * @brief
    719 *    Computes Residue and Hadamard Transform for four 4x4 blocks (Z scan) of
    720 *    a 8x8 block (Residue is computed for 8-bit src and prediction buffers)
    721 *    Modified to incorporate the dead-zone implementation - Lokesh
    722 *
    723 * @par Description:
    724 *
    725 * @param[in] pu1_origin
    726 *  UWORD8 pointer to the current block
    727 *
    728 * @param[in] src_strd
    729 *  WORD32 Source stride
    730 *
    731 * @param[in] pu1_pred
    732 *  UWORD8 pointer to the prediction block
    733 *
    734 * @param[in] pred_strd
    735 *  WORD32 Pred stride
    736 *
    737 * @param[out] pi2_dst
    738 *  WORD16 pointer to the transform block
    739 *
    740 * @param[in] dst_strd
    741 *  WORD32 Destination stride
    742 *
    743 * @param[out] pi4_hsad
    744 *  array for storing hadmard sad of each 4x4 block
    745 *
    746 * @param[in] hsad_stride
    747 *  stride of hadmard sad destination buffer (for Zscan order of storing sads)
    748 *
    749 * @param[in] i4_frm_qstep
    750 *  frm_qstep value based on the which the threshold value is calculated
    751 *
    752 * @returns
    753 *
    754 * @remarks
    755 *
    756 *******************************************************************************
    757 */
    758 static WORD32 ihevce_had4_4x4(
    759     UWORD8 *pu1_src,
    760     WORD32 src_strd,
    761     UWORD8 *pu1_pred,
    762     WORD32 pred_strd,
    763     WORD16 *pi2_dst4x4,
    764     WORD32 dst_strd,
    765     WORD32 *pi4_hsad,
    766     WORD32 hsad_stride,
    767     WORD32 i4_frm_qstep)
    768 {
    769     WORD32 i, k;
    770     WORD32 i4_child_total_sad = 0;
    771 
    772     (void)i4_frm_qstep;
    773     /* -------- Compute four 4x4 HAD Transforms ---------*/
    774     for(i = 0; i < 4; i++)
    775     {
    776         UWORD8 *pu1_pi0, *pu1_pi1;
    777         WORD16 *pi2_dst;
    778         WORD32 blkx, blky;
    779         UWORD32 u4_hsad = 0;
    780         // TODO: choose deadzone as f(qstep)
    781         WORD32 threshold = 0;
    782 
    783         /*****************************************************/
    784         /*    Assuming the looping structure of the four     */
    785         /*    blocks is in Z scan order of 4x4s in a 8x8     */
    786         /*    block instead of raster scan                   */
    787         /*****************************************************/
    788         blkx = (i & 0x1);
    789         blky = (i >> 1);
    790 
    791         pu1_pi0 = pu1_src + (blkx * 4) + (blky * 4 * src_strd);
    792         pu1_pi1 = pu1_pred + (blkx * 4) + (blky * 4 * pred_strd);
    793         pi2_dst = pi2_dst4x4 + (blkx * 4) + (blky * 4 * dst_strd);
    794 
    795         ihevce_hadamard_4x4_8bit(pu1_pi0, src_strd, pu1_pi1, pred_strd, pi2_dst, dst_strd);
    796 
    797         for(k = 0; k < 4; k++)
    798         {
    799             if(ABS(pi2_dst[0 * dst_strd + k]) < threshold)
    800                 pi2_dst[0 * dst_strd + k] = 0;
    801 
    802             if(ABS(pi2_dst[1 * dst_strd + k]) < threshold)
    803                 pi2_dst[1 * dst_strd + k] = 0;
    804 
    805             if(ABS(pi2_dst[2 * dst_strd + k]) < threshold)
    806                 pi2_dst[2 * dst_strd + k] = 0;
    807 
    808             if(ABS(pi2_dst[3 * dst_strd + k]) < threshold)
    809                 pi2_dst[3 * dst_strd + k] = 0;
    810 
    811             /* Accumulate the SATD */
    812             u4_hsad += ABS(pi2_dst[0 * dst_strd + k]);
    813             u4_hsad += ABS(pi2_dst[1 * dst_strd + k]);
    814             u4_hsad += ABS(pi2_dst[2 * dst_strd + k]);
    815             u4_hsad += ABS(pi2_dst[3 * dst_strd + k]);
    816         }
    817 
    818         /*===== Normalize the HSAD =====*/
    819         pi4_hsad[blkx + (blky * hsad_stride)] = ((u4_hsad + 2) >> 2);
    820         i4_child_total_sad += ((u4_hsad + 2) >> 2);
    821     }
    822     return i4_child_total_sad;
    823 }
    824 
    825 /**
    826 *******************************************************************************
    827 *
    828 * @brief
    829 *    HSAD is returned for the 4, 4x4 in 8x8
    830 *
    831 * @par Description:
    832 *
    833 * @param[in] pu1_origin
    834 *  UWORD8 pointer to the current block
    835 *
    836 * @param[in] src_strd
    837 *  WORD32 Source stride
    838 *
    839 * @param[in] pu1_pred
    840 *  UWORD8 pointer to the prediction block
    841 *
    842 * @param[in] pred_strd
    843 *  WORD32 Pred stride
    844 *
    845 * @param[out] pi2_dst
    846 *  WORD16 pointer to the transform output block
    847 *
    848 * @param[out] dst_strd
    849 *  WORD32 Destination stride
    850 *
    851 * @param[out] ppi4_hsad
    852 *   pointer to base pointers for storing hadmard sads of various
    853 *   block sizes (4x4 to 32x32)
    854 *
    855 * @param[in] pos_x_y_4x4
    856 *   Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB
    857 *   Lower 16bits denote xpos and upper 16ypos of the 4x4block
    858 *
    859 * @param[in] num_4x4_in_row
    860 *   Denotes the number of current 4x4 blocks in a ctb/CU/MB
    861 *
    862 * @returns
    863 *
    864 * @remarks
    865 *
    866 *******************************************************************************
    867 */
    868 void ihevce_had_8x8_using_4_4x4(
    869     UWORD8 *pu1_src,
    870     WORD32 src_strd,
    871     UWORD8 *pu1_pred,
    872     WORD32 pred_strd,
    873     WORD16 *pi2_dst,
    874     WORD32 dst_strd,
    875     WORD32 **ppi4_hsad,
    876     WORD32 pos_x_y_4x4,
    877     WORD32 num_4x4_in_row)
    878 {
    879     WORD16 ai2_4x4_had[64];
    880     WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
    881     WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
    882     WORD32 *pi4_4x4_hsad;
    883     WORD32 *pi4_8x8_hsad;
    884 
    885     (void)pi2_dst;
    886     (void)dst_strd;
    887     ASSERT(pos_x >= 0);
    888     ASSERT(pos_y >= 0);
    889 
    890     /* Initialize pointers to  store 4x4 and 8x8 HAD SATDs */
    891     pi4_4x4_hsad = ppi4_hsad[HAD_4x4] + pos_x + pos_y * num_4x4_in_row;
    892     pi4_8x8_hsad = ppi4_hsad[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
    893 
    894     /* -------- Compute four 4x4 HAD Transforms of 8x8 in one call--------- */
    895     pi4_8x8_hsad[0] = ihevce_had4_4x4(
    896         pu1_src, src_strd, pu1_pred, pred_strd, ai2_4x4_had, 8, pi4_4x4_hsad, num_4x4_in_row, 0);
    897 }
    898 
    899 /**
    900 *******************************************************************************
    901 *
    902 * @brief
    903 *    Recursive Hadamard Transform for 8x8 block. HSAD is returned for the 8x8
    904 *    block and its four subblocks(4x4).
    905 *
    906 * @par Description:
    907 *
    908 * @param[in] pu1_origin
    909 *  UWORD8 pointer to the current block
    910 *
    911 * @param[in] src_strd
    912 *  WORD32 Source stride
    913 *
    914 * @param[in] pu1_pred
    915 *  UWORD8 pointer to the prediction block
    916 *
    917 * @param[in] pred_strd
    918 *  WORD32 Pred stride
    919 *
    920 * @param[out] pi2_dst
    921 *  WORD16 pointer to the transform output block
    922 *
    923 * @param[out] dst_strd
    924 *  WORD32 Destination stride
    925 *
    926 * @param[out] ppi4_hsad
    927 *   pointer to base pointers for storing hadmard sads of various
    928 *   block sizes (4x4 to 32x32)
    929 *
    930 * @param[in] pos_x_y_4x4
    931 *   Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB
    932 *   Lower 16bits denote xpos and upper 16ypos of the 4x4block
    933 *
    934 * @param[in] num_4x4_in_row
    935 *   Denotes the number of current 4x4 blocks in a ctb/CU/MB
    936 *
    937 * @param[in] i4_frm_qstep
    938 *  frm_qstep value based on the which the threshold value is calculated
    939 *
    940 * @returns
    941 *
    942 * @remarks
    943 *
    944 *******************************************************************************
    945 */
    946 WORD32 ihevce_had_8x8_using_4_4x4_r(
    947     UWORD8 *pu1_src,
    948     WORD32 src_strd,
    949     UWORD8 *pu1_pred,
    950     WORD32 pred_strd,
    951     WORD16 *pi2_dst,
    952     WORD32 dst_strd,
    953     WORD32 **ppi4_hsad,
    954     WORD32 **ppi4_tu_split,
    955     WORD32 **ppi4_tu_early_cbf,
    956     WORD32 pos_x_y_4x4,
    957     WORD32 num_4x4_in_row,
    958     WORD32 lambda,
    959     WORD32 lambda_q_shift,
    960     WORD32 i4_frm_qstep,
    961     WORD32 i4_cur_depth,
    962     WORD32 i4_max_depth,
    963     WORD32 i4_max_tr_size,
    964     WORD32 *pi4_tu_split_cost,
    965     void *pv_func_sel)
    966 {
    967     WORD16 ai2_4x4_had[64];
    968     WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
    969     WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
    970     WORD32 *pi4_4x4_hsad;
    971     WORD32 *pi4_8x8_hsad;
    972     WORD32 *pi4_8x8_tu_split;
    973 
    974     WORD32 *pi4_8x8_tu_early_cbf;
    975 
    976     UWORD32 u4_satd;
    977     WORD32 cost_child = 0, cost_parent = 0;
    978     WORD32 early_cbf = 0;
    979 
    980     const UWORD8 u1_cur_tr_size = 8;
    981     /* Stores the best cost for the Current 8x8: Lokesh */
    982     WORD32 best_cost = 0;
    983 
    984     (void)pv_func_sel;
    985     ASSERT(pos_x >= 0);
    986     ASSERT(pos_y >= 0);
    987 
    988     /* Initialize pointers to  store 4x4 and 8x8 HAD SATDs */
    989     pi4_4x4_hsad = ppi4_hsad[HAD_4x4] + pos_x + pos_y * num_4x4_in_row;
    990     pi4_8x8_hsad = ppi4_hsad[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
    991     pi4_8x8_tu_split = ppi4_tu_split[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
    992     pi4_8x8_tu_early_cbf =
    993         ppi4_tu_early_cbf[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
    994 
    995     /* -------- Compute four 4x4 HAD Transforms of 8x8 in one call--------- */
    996     cost_child = ihevce_had4_4x4(
    997         pu1_src, src_strd, pu1_pred, pred_strd, ai2_4x4_had, 8, pi4_4x4_hsad, num_4x4_in_row, 0);
    998 
    999     /* -------- Compute 8x8 HAD Transform using 4x4 results ------------- */
   1000     u4_satd = ihevce_compute_8x8HAD_using_4x4(
   1001         ai2_4x4_had, 8, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf);
   1002 
   1003     /* store the normalized 8x8 satd */
   1004     cost_parent = ((u4_satd + 4) >> 3);
   1005 
   1006     /* 4 CBF Flags, extra 1 becoz of the 0.5 bits per bin is assumed */
   1007     cost_child += ((4) * lambda) >> (lambda_q_shift + 1);
   1008 
   1009     if(i4_cur_depth < i4_max_depth)
   1010     {
   1011         if((cost_child < cost_parent) || (i4_max_tr_size < u1_cur_tr_size))
   1012         {
   1013             //cost_child -= ((4) * lambda) >> (lambda_q_shift + 1);
   1014             *pi4_tu_split_cost += (4 * lambda) >> (lambda_q_shift + 1);
   1015             best_cost = cost_child;
   1016             best_cost <<= 1;
   1017             best_cost++;
   1018             pi4_8x8_tu_split[0] = 1;
   1019             pi4_8x8_hsad[0] = cost_child;
   1020         }
   1021         else
   1022         {
   1023             //cost_parent -= ((1) * lambda) >>  (lambda_q_shift + 1);
   1024             best_cost = cost_parent;
   1025             best_cost <<= 1;
   1026             pi4_8x8_tu_split[0] = 0;
   1027             pi4_8x8_hsad[0] = cost_parent;
   1028         }
   1029     }
   1030     else
   1031     {
   1032         //cost_parent -= ((1) * lambda) >>  (lambda_q_shift + 1);
   1033         best_cost = cost_parent;
   1034         best_cost <<= 1;
   1035         pi4_8x8_tu_split[0] = 0;
   1036         pi4_8x8_hsad[0] = cost_parent;
   1037     }
   1038 
   1039     pi4_8x8_tu_early_cbf[0] = early_cbf;
   1040 
   1041     /* best cost has tu_split_flag at LSB(Least significant bit) */
   1042     return ((best_cost << 1) + early_cbf);
   1043 }
   1044 
   1045 /**
   1046 *******************************************************************************
   1047 *
   1048 * @brief
   1049 *   Computes 16x16 transform using children 8x8 hadamard results
   1050 *    Modified to incorporate the dead-zone implementation - Lokesh
   1051 *
   1052 * @par Description:
   1053 *
   1054 * @param[in] pi2_8x8_had
   1055 *  WORD16 pointer to 8x8 hadamard buffer(y0, y1, y2, y3 hadmard in Zscan order)
   1056 *
   1057 * @param[in] had8_strd
   1058 *  stride of 8x8 hadmard buffer pi2_y0, pi2_y1, pi2_y2, pi2_y3
   1059 *
   1060 * @param[out] pi2_dst
   1061 *  destination buffer where 8x8 hadamard result is stored
   1062 *
   1063 * @param[in] dst_stride
   1064 *  stride of destination block
   1065 *
   1066 * @param[in] i4_frm_qstep
   1067 *  frm_qstep value based on the which the threshold value is calculated
   1068 *
   1069 * @returns
   1070 *  16x16 Hadamard SATD
   1071 * @remarks
   1072 *
   1073 *******************************************************************************
   1074 */
   1075 static UWORD32 ihevce_compute_16x16HAD_using_8x8(
   1076     WORD16 *pi2_8x8_had,
   1077     WORD32 had8_strd,
   1078     WORD16 *pi2_dst,
   1079     WORD32 dst_strd,
   1080     WORD32 i4_frm_qstep,
   1081     WORD32 *pi4_cbf)
   1082 {
   1083     /* Qstep value is right shifted by 8 */
   1084     WORD32 threshold = (i4_frm_qstep >> 8);
   1085 
   1086     /* Initialize pointers to 4 subblocks of 8x8 HAD buffer */
   1087     WORD16 *pi2_y0 = pi2_8x8_had;
   1088     WORD16 *pi2_y1 = pi2_8x8_had + 8;
   1089     WORD16 *pi2_y2 = pi2_8x8_had + had8_strd * 8;
   1090     WORD16 *pi2_y3 = pi2_8x8_had + had8_strd * 8 + 8;
   1091 
   1092     /* Initialize pointers to store 8x8 HAD output */
   1093     WORD16 *pi2_dst0 = pi2_dst;
   1094     WORD16 *pi2_dst1 = pi2_dst + 8;
   1095     WORD16 *pi2_dst2 = pi2_dst + dst_strd * 8;
   1096     WORD16 *pi2_dst3 = pi2_dst + dst_strd * 8 + 8;
   1097 
   1098     UWORD32 u4_satd = 0;
   1099     WORD32 i;
   1100 
   1101     /*   Child HAD results combined as follows to get Parent result */
   1102     /*  _                                                 _         */
   1103     /* |  (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3)   |        */
   1104     /* |  (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3)   |        */
   1105     /* \-                                                 -/        */
   1106     for(i = 0; i < 64; i++)
   1107     {
   1108         WORD32 src_idx = (i >> 3) * had8_strd + (i % 8);
   1109         WORD32 dst_idx = (i >> 3) * dst_strd + (i % 8);
   1110 
   1111         WORD16 a0 = pi2_y0[src_idx];
   1112         WORD16 a1 = pi2_y1[src_idx];
   1113         WORD16 a2 = pi2_y2[src_idx];
   1114         WORD16 a3 = pi2_y3[src_idx];
   1115 
   1116         WORD16 b0 = (a0 + a1) >> 1;
   1117         WORD16 b1 = (a0 - a1) >> 1;
   1118         WORD16 b2 = (a2 + a3) >> 1;
   1119         WORD16 b3 = (a2 - a3) >> 1;
   1120 
   1121         pi2_dst0[dst_idx] = b0 + b2;
   1122         pi2_dst1[dst_idx] = b1 + b3;
   1123         pi2_dst2[dst_idx] = b0 - b2;
   1124         pi2_dst3[dst_idx] = b1 - b3;
   1125 
   1126         /* Make the value of dst to zerp, if it falls below the dead-zone */
   1127         if(ABS(pi2_dst0[dst_idx]) > threshold)
   1128             *pi4_cbf = 1;
   1129         if(ABS(pi2_dst1[dst_idx]) > threshold)
   1130             *pi4_cbf = 1;
   1131         if(ABS(pi2_dst2[dst_idx]) > threshold)
   1132             *pi4_cbf = 1;
   1133         if(ABS(pi2_dst3[dst_idx]) > threshold)
   1134             *pi4_cbf = 1;
   1135 
   1136         u4_satd += ABS(pi2_dst0[dst_idx]);
   1137         u4_satd += ABS(pi2_dst1[dst_idx]);
   1138         u4_satd += ABS(pi2_dst2[dst_idx]);
   1139         u4_satd += ABS(pi2_dst3[dst_idx]);
   1140     }
   1141 
   1142     /* return 16x16 satd */
   1143     return (u4_satd);
   1144 }
   1145 
   1146 /**
   1147 *******************************************************************************
   1148 *
   1149 * @brief
   1150 *    Hadamard Transform for 16x16 block with 8x8 and 4x4 SATD updates.
   1151 *    Uses recursive 8x8 had output to compute satd for 16x16 and its children
   1152 *
   1153 * @par Description:
   1154 *
   1155 * @param[in] pu1_origin
   1156 *  UWORD8 pointer to the current block
   1157 *
   1158 * @param[in] src_strd
   1159 *  WORD32 Source stride
   1160 *
   1161 * @param[in] pu1_pred
   1162 *  UWORD8 pointer to the prediction block
   1163 *
   1164 * @param[in] pred_strd
   1165 *  WORD32 Pred stride
   1166 *
   1167 * @param[out] pi2_dst
   1168 *  WORD16 pointer to the transform output block
   1169 *
   1170 * @param[out] dst_strd
   1171 *  WORD32 Destination stride
   1172 *
   1173 * @param[out] ppi4_hsad
   1174 *   pointer to base pointers for storing hadmard sads of various
   1175 *   block sizes (4x4 to 32x32)
   1176 *
   1177 * @param[in] pos_x_y_4x4
   1178 *   Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB
   1179 *   Lower 16bits denote xpos and upper 16ypos of the 4x4block
   1180 *
   1181 * @param[in] num_4x4_in_row
   1182 *   Denotes the number of current 4x4 blocks in a ctb/CU/MB
   1183 *
   1184 * @param[in] lambda
   1185 *  lambda values is the cost factor calculated based on QP
   1186 *
   1187 * @param[in] lambda_q_shift
   1188 *  lambda_q_shift used to reverse the lambda value back from q8 format
   1189 *
   1190 * @param[in] depth
   1191 *  depth gives the current TU depth with respect to the CU
   1192 *
   1193 * @param[in] i4_frm_qstep
   1194 *  frm_qstep value based on the which the threshold value is calculated
   1195 *
   1196 * @returns
   1197 *
   1198 * @remarks
   1199 *
   1200 *******************************************************************************
   1201 */
   1202 
   1203 WORD32 ihevce_had_16x16_r(
   1204     UWORD8 *pu1_src,
   1205     WORD32 src_strd,
   1206     UWORD8 *pu1_pred,
   1207     WORD32 pred_strd,
   1208     WORD16 *pi2_dst,
   1209     WORD32 dst_strd,
   1210     WORD32 **ppi4_hsad,
   1211     WORD32 **ppi4_tu_split,
   1212     WORD32 **ppi4_tu_early_cbf,
   1213     WORD32 pos_x_y_4x4,
   1214     WORD32 num_4x4_in_row,
   1215     WORD32 lambda,
   1216     WORD32 lambda_q_shift,
   1217     WORD32 i4_frm_qstep,
   1218     WORD32 i4_cur_depth,
   1219     WORD32 i4_max_depth,
   1220     WORD32 i4_max_tr_size,
   1221     WORD32 *pi4_tu_split_cost,
   1222     void *pv_func_sel)
   1223 {
   1224     WORD16 ai2_8x8_had[256];
   1225     WORD32 *pi4_16x16_hsad;
   1226     WORD32 *pi4_16x16_tu_split;
   1227 
   1228     WORD32 *pi4_16x16_tu_early_cbf;
   1229 
   1230     UWORD32 u4_satd = 0;
   1231     WORD32 tu_split_flag = 0;
   1232     WORD32 i4_early_cbf_flag = 0, early_cbf = 0;
   1233     const UWORD8 u1_cur_tr_size = 16;
   1234 
   1235     /* cost_parent : Stores the cost of the parent HAD transform (16x16) */
   1236     /* cost_child : Stores the cost of the child HAD transform (16x16) */
   1237     WORD32 cost_parent = 0, cost_child = 0;
   1238 
   1239     /*best_cost returns the best cost at the end of the function */
   1240     /*tu_split denoes whether the TU (16x16)is split or not */
   1241     WORD32 best_cost = 0, best_cost_tu_split;
   1242     WORD32 i;
   1243 
   1244     WORD16 *pi2_y0;
   1245     UWORD8 *pu1_src0;
   1246     UWORD8 *pu1_pred0;
   1247     WORD32 pos_x_y_4x4_0;
   1248 
   1249     WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
   1250     WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
   1251 
   1252     ASSERT(pos_x >= 0);
   1253     ASSERT(pos_y >= 0);
   1254 
   1255     /* Initialize pointers to  store 16x16 SATDs */
   1256     pi4_16x16_hsad = ppi4_hsad[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);
   1257 
   1258     pi4_16x16_tu_split =
   1259         ppi4_tu_split[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);
   1260 
   1261     pi4_16x16_tu_early_cbf =
   1262         ppi4_tu_early_cbf[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);
   1263 
   1264     /* -------- Compute four 8x8 HAD Transforms of 16x16 call--------- */
   1265     for(i = 0; i < 4; i++)
   1266     {
   1267         pu1_src0 = pu1_src + (i & 0x01) * 8 + (i >> 1) * src_strd * 8;
   1268         pu1_pred0 = pu1_pred + (i & 0x01) * 8 + (i >> 1) * pred_strd * 8;
   1269         pi2_y0 = ai2_8x8_had + (i & 0x01) * 8 + (i >> 1) * 16 * 8;
   1270         pos_x_y_4x4_0 = pos_x_y_4x4 + (i & 0x01) * 2 + (i >> 1) * (2 << 16);
   1271 
   1272         best_cost_tu_split = ihevce_had_8x8_using_4_4x4_r(
   1273             pu1_src0,
   1274             src_strd,
   1275             pu1_pred0,
   1276             pred_strd,
   1277             pi2_y0,
   1278             16,
   1279             ppi4_hsad,
   1280             ppi4_tu_split,
   1281             ppi4_tu_early_cbf,
   1282             pos_x_y_4x4_0,
   1283             num_4x4_in_row,
   1284             lambda,
   1285             lambda_q_shift,
   1286             i4_frm_qstep,
   1287             i4_cur_depth + 1,
   1288             i4_max_depth,
   1289             i4_max_tr_size,
   1290             pi4_tu_split_cost,
   1291             pv_func_sel);
   1292 
   1293         /* Cost is shifted by two bits for Tu_split_flag and early cbf flag */
   1294         best_cost = (best_cost_tu_split >> 2);
   1295 
   1296         /* Last but one bit stores the information regarding the TU_Split */
   1297         tu_split_flag += (best_cost_tu_split & 0x3) >> 1;
   1298 
   1299         /* Last bit stores the information regarding the early_cbf */
   1300         i4_early_cbf_flag += (best_cost_tu_split & 0x1);
   1301 
   1302         cost_child += best_cost;
   1303 
   1304         tu_split_flag <<= 1;
   1305         i4_early_cbf_flag <<= 1;
   1306     }
   1307 
   1308     /* -------- Compute 16x16 HAD Transform using 8x8 results ------------- */
   1309     pi2_y0 = ai2_8x8_had;
   1310 
   1311     /* Threshold currently passed as "0" */
   1312     u4_satd =
   1313         ihevce_compute_16x16HAD_using_8x8(pi2_y0, 16, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf);
   1314 
   1315     /* store the normalized satd */
   1316     cost_parent = ((u4_satd + 4) >> 3);
   1317 
   1318     /* 4 TU_Split flags , 4 CBF Flags, extra 1 becoz of the 0.5 bits per bin is assumed */
   1319     cost_child += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
   1320 
   1321     i4_early_cbf_flag += early_cbf;
   1322 
   1323     /* Right now the depth is hard-coded to 4: The depth can be modified from the config file
   1324     which decides the extent to which TU_REC needs to be done */
   1325     if(i4_cur_depth < i4_max_depth)
   1326     {
   1327         if((cost_child < cost_parent) || (i4_max_tr_size < u1_cur_tr_size))
   1328         {
   1329             //cost_child -= ((4 + 4)  * lambda) >> (lambda_q_shift + 1);
   1330             *pi4_tu_split_cost += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
   1331             tu_split_flag += 1;
   1332             best_cost = cost_child;
   1333         }
   1334         else
   1335         {
   1336             //cost_parent -= ((1 + 1) * lambda) >>  (lambda_q_shift + 1);
   1337             tu_split_flag += 0;
   1338             best_cost = cost_parent;
   1339         }
   1340     }
   1341     else
   1342     {
   1343         //cost_parent -= ((1 + 1) * lambda) >>  (lambda_q_shift + 1);
   1344         tu_split_flag += 0;
   1345         best_cost = cost_parent;
   1346     }
   1347 
   1348     pi4_16x16_hsad[0] = best_cost;
   1349     pi4_16x16_tu_split[0] = tu_split_flag;
   1350     pi4_16x16_tu_early_cbf[0] = i4_early_cbf_flag;
   1351 
   1352     /*returning two values(best cost & tu_split_flag) as a single value*/
   1353     return ((best_cost << 10) + (tu_split_flag << 5) + i4_early_cbf_flag);
   1354 }
   1355 
   1356 //#endif
   1357 /**
   1358 *******************************************************************************
   1359 *
   1360 * @brief
   1361 *   Computes 32x32 transform using children 16x16 hadamard results
   1362 *
   1363 * @par Description:
   1364 *
   1365 * @param[in] pi2_16x16_had
   1366 *  WORD16 pointer to 16x16 hadamard buffer(y0, y1, y2, y3 hadmard in Zscan order)
   1367 *
   1368 * @param[in] had16_strd
   1369 *  stride of 16x16 hadmard buffer pi2_y0, pi2_y1, pi2_y2, pi2_y3
   1370 *
   1371 * @param[out] pi2_dst
   1372 *  destination buffer where 16x16 hadamard result is stored
   1373 *
   1374 * @param[in] dst_stride
   1375 *  stride of destination block
   1376 *
   1377 * @param[in] i4_frm_qstep
   1378 *  frm_qstep value based on the which the threshold value is calculated
   1379 *
   1380 * @returns
   1381 *  32x32 Hadamard SATD
   1382 * @remarks
   1383 *
   1384 *******************************************************************************
   1385 */
   1386 //#if COMPUTE_32x32_USING_16X16 == C
   1387 UWORD32 ihevce_compute_32x32HAD_using_16x16(
   1388     WORD16 *pi2_16x16_had,
   1389     WORD32 had16_strd,
   1390     WORD16 *pi2_dst,
   1391     WORD32 dst_strd,
   1392     WORD32 i4_frm_qstep,
   1393     WORD32 *pi4_cbf)
   1394 {
   1395     /* Qstep value is right shifted by 8 */
   1396     WORD32 threshold = (i4_frm_qstep >> 8);
   1397 
   1398     /* Initialize pointers to 4 subblocks of 8x8 HAD buffer */
   1399     WORD16 *pi2_y0 = pi2_16x16_had;
   1400     WORD16 *pi2_y1 = pi2_16x16_had + 16;
   1401     WORD16 *pi2_y2 = pi2_16x16_had + had16_strd * 16;
   1402     WORD16 *pi2_y3 = pi2_16x16_had + had16_strd * 16 + 16;
   1403 
   1404     /* Initialize pointers to store 8x8 HAD output */
   1405     WORD16 *pi2_dst0 = pi2_dst;
   1406     WORD16 *pi2_dst1 = pi2_dst + 16;
   1407     WORD16 *pi2_dst2 = pi2_dst + dst_strd * 16;
   1408     WORD16 *pi2_dst3 = pi2_dst + dst_strd * 16 + 16;
   1409 
   1410     UWORD32 u4_satd = 0;
   1411     WORD32 i;
   1412 
   1413     /*   Child HAD results combined as follows to get Parent result */
   1414     /*  _                                                 _         */
   1415     /* |  (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3)   |        */
   1416     /* |  (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3)   |        */
   1417     /* \-                                                 -/        */
   1418     for(i = 0; i < 256; i++)
   1419     {
   1420         WORD32 src_idx = (i >> 4) * had16_strd + (i % 16);
   1421         WORD32 dst_idx = (i >> 4) * dst_strd + (i % 16);
   1422 
   1423         WORD16 a0 = pi2_y0[src_idx] >> 2;
   1424         WORD16 a1 = pi2_y1[src_idx] >> 2;
   1425         WORD16 a2 = pi2_y2[src_idx] >> 2;
   1426         WORD16 a3 = pi2_y3[src_idx] >> 2;
   1427 
   1428         WORD16 b0 = (a0 + a1);
   1429         WORD16 b1 = (a0 - a1);
   1430         WORD16 b2 = (a2 + a3);
   1431         WORD16 b3 = (a2 - a3);
   1432 
   1433         pi2_dst0[dst_idx] = b0 + b2;
   1434         pi2_dst1[dst_idx] = b1 + b3;
   1435         pi2_dst2[dst_idx] = b0 - b2;
   1436         pi2_dst3[dst_idx] = b1 - b3;
   1437 
   1438         /* Make the value of dst to zerp, if it falls below the dead-zone */
   1439         if(ABS(pi2_dst0[dst_idx]) > threshold)
   1440             *pi4_cbf = 1;
   1441         if(ABS(pi2_dst1[dst_idx]) > threshold)
   1442             *pi4_cbf = 1;
   1443         if(ABS(pi2_dst2[dst_idx]) > threshold)
   1444             *pi4_cbf = 1;
   1445         if(ABS(pi2_dst3[dst_idx]) > threshold)
   1446             *pi4_cbf = 1;
   1447 
   1448         u4_satd += ABS(pi2_dst0[dst_idx]);
   1449         u4_satd += ABS(pi2_dst1[dst_idx]);
   1450         u4_satd += ABS(pi2_dst2[dst_idx]);
   1451         u4_satd += ABS(pi2_dst3[dst_idx]);
   1452     }
   1453 
   1454     /* return 32x32 satd */
   1455     return (u4_satd);
   1456 }
   1457 //#endif
   1458 
   1459 /**
   1460 *******************************************************************************
   1461 *
   1462 * @brief
   1463 *    Hadamard Transform for 32x32 block with 16x6, 8x8 and 4x4 SATD updates.
   1464 *    Uses recursive 16x16 had output to compute satd for 32x32 and its children
   1465 *
   1466 * @par Description:
   1467 *
   1468 * @param[in] pu1_origin
   1469 *  UWORD8 pointer to the current block
   1470 *
   1471 * @param[in] src_strd
   1472 *  WORD32 Source stride
   1473 *
   1474 * @param[in] pu1_pred
   1475 *  UWORD8 pointer to the prediction block
   1476 *
   1477 * @param[in] pred_strd
   1478 *  WORD32 Pred stride
   1479 *
   1480 * @param[out] pi2_dst
   1481 *  WORD16 pointer to the transform output block
   1482 *
   1483 * @param[out] dst_strd
   1484 *  WORD32 Destination stride
   1485 *
   1486 * @param[out] ppi4_hsad
   1487 *   pointer to base pointers for storing hadmard sads of various
   1488 *   block sizes (4x4 to 32x32)
   1489 *
   1490 * @param[in] pos_x_y_4x4
   1491 *   Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB
   1492 *   Lower 16bits denote xpos and upper 16ypos of the 4x4block
   1493 *
   1494 * @param[in] num_4x4_in_row
   1495 *   Denotes the number of current 4x4 blocks in a ctb/CU/MB
   1496 *
   1497 * @param[in] lambda
   1498 *  lambda values is the cost factor calculated based on QP
   1499 *
   1500 * @param[in] lambda_q_shift
   1501 *  lambda_q_shift used to reverse the lambda value back from q8 format
   1502 *
   1503 * @param[in] depth
   1504 *  depth gives the current TU depth with respect to the CU
   1505 *
   1506 * @param[in] i4_frm_qstep
   1507 *  frm_qstep value based on the which the threshold value is calculated
   1508 *
   1509 *
   1510 * @returns
   1511 *
   1512 * @remarks
   1513 *
   1514 *******************************************************************************
   1515 */
   1516 void ihevce_had_32x32_r(
   1517     UWORD8 *pu1_src,
   1518     WORD32 src_strd,
   1519     UWORD8 *pu1_pred,
   1520     WORD32 pred_strd,
   1521     WORD16 *pi2_dst,
   1522     WORD32 dst_strd,
   1523     WORD32 **ppi4_hsad,
   1524     WORD32 **ppi4_tu_split,
   1525     WORD32 **ppi4_tu_early_cbf,
   1526     WORD32 pos_x_y_4x4,
   1527     WORD32 num_4x4_in_row,
   1528     WORD32 lambda,
   1529     WORD32 lambda_q_shift,
   1530     WORD32 i4_frm_qstep,
   1531     WORD32 i4_cur_depth,
   1532     WORD32 i4_max_depth,
   1533     WORD32 i4_max_tr_size,
   1534     WORD32 *pi4_tu_split_cost,
   1535     me_func_selector_t *ps_func_selector)
   1536 
   1537 {
   1538     WORD16 ai2_16x16_had[1024];
   1539     WORD32 *pi4_32x32_hsad;
   1540     WORD32 *pi4_32x32_tu_split;
   1541     WORD32 *pi4_32x32_tu_early_cbf;
   1542 
   1543     WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
   1544     WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
   1545     WORD32 tu_split_flag = 0;
   1546     const UWORD8 u1_cur_tr_size = 32;
   1547     WORD32 i4_early_cbf_flag = 0, early_cbf = 0;
   1548 
   1549     /* cost_parent : Stores the cost of the parent HAD transform (16x16) */
   1550     /* cost_child : Stores the cost of the child HAD transform (16x16) */
   1551     WORD32 cost_child = 0, cost_parent = 0;
   1552 
   1553     /*retuned as the best cost for the entire TU (32x32) */
   1554     WORD32 best_cost = 0;
   1555     /*captures the best cost and tu_split at child level */
   1556     WORD32 best_cost_tu_split;
   1557 
   1558     /* Initialize pointers to 4 8x8 blocks in 16x16 */
   1559     WORD16 *pi2_y0 = ai2_16x16_had;
   1560     WORD16 *pi2_y1 = ai2_16x16_had + 16;
   1561     WORD16 *pi2_y2 = ai2_16x16_had + 32 * 16;
   1562     WORD16 *pi2_y3 = ai2_16x16_had + 32 * 16 + 16;
   1563 
   1564     UWORD8 *pu1_src0 = pu1_src;
   1565     UWORD8 *pu1_src1 = pu1_src + 16;
   1566     UWORD8 *pu1_src2 = pu1_src + src_strd * 16;
   1567     UWORD8 *pu1_src3 = pu1_src + src_strd * 16 + 16;
   1568 
   1569     UWORD8 *pu1_pred0 = pu1_pred;
   1570     UWORD8 *pu1_pred1 = pu1_pred + 16;
   1571     UWORD8 *pu1_pred2 = pu1_pred + pred_strd * 16;
   1572     UWORD8 *pu1_pred3 = pu1_pred + pred_strd * 16 + 16;
   1573 
   1574     ASSERT(pos_x >= 0);
   1575     ASSERT(pos_y >= 0);
   1576 
   1577     /* Initialize pointers to store 32x32 SATDs */
   1578     pi4_32x32_hsad = ppi4_hsad[HAD_32x32] + (pos_x >> 3) + (pos_y >> 3) * (num_4x4_in_row >> 3);
   1579 
   1580     pi4_32x32_tu_split =
   1581         ppi4_tu_split[HAD_32x32] + (pos_x >> 3) + (pos_y >> 3) * (num_4x4_in_row >> 3);
   1582 
   1583     pi4_32x32_tu_early_cbf =
   1584         ppi4_tu_early_cbf[HAD_32x32] + (pos_x >> 3) + (pos_y >> 3) * (num_4x4_in_row >> 3);
   1585 
   1586     /* -------- Compute four 8x8 HAD Transforms of 16x16 call--------- */
   1587     best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
   1588         pu1_src0,
   1589         src_strd,
   1590         pu1_pred0,
   1591         pred_strd,
   1592         pi2_y0,
   1593         32,
   1594         ppi4_hsad,
   1595         ppi4_tu_split,
   1596         ppi4_tu_early_cbf,
   1597         pos_x_y_4x4,
   1598         num_4x4_in_row,
   1599         lambda,
   1600         lambda_q_shift,
   1601         i4_frm_qstep,
   1602         i4_cur_depth + 1,
   1603         i4_max_depth,
   1604         i4_max_tr_size,
   1605         pi4_tu_split_cost,
   1606         NULL);
   1607 
   1608     /* cost is shifted by 10bits */
   1609     best_cost = best_cost_tu_split >> 10;
   1610 
   1611     /* Tu split is present in the 6-10 bits */
   1612     tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
   1613 
   1614     /*Early CBF info is present in the last 5 bits */
   1615     i4_early_cbf_flag += best_cost_tu_split & 0x1F;
   1616 
   1617     tu_split_flag <<= 5;
   1618     i4_early_cbf_flag <<= 5;
   1619 
   1620     cost_child += best_cost;
   1621 
   1622     best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
   1623         pu1_src1,
   1624         src_strd,
   1625         pu1_pred1,
   1626         pred_strd,
   1627         pi2_y1,
   1628         32,
   1629         ppi4_hsad,
   1630         ppi4_tu_split,
   1631         ppi4_tu_early_cbf,
   1632         pos_x_y_4x4 + 4,
   1633         num_4x4_in_row,
   1634         lambda,
   1635         lambda_q_shift,
   1636         i4_frm_qstep,
   1637         i4_cur_depth + 1,
   1638         i4_max_depth,
   1639         i4_max_tr_size,
   1640         pi4_tu_split_cost,
   1641         NULL);
   1642 
   1643     /* cost is shifted by 10bits */
   1644     best_cost = best_cost_tu_split >> 10;
   1645 
   1646     /* Tu split is present in the 6-10 bits */
   1647     tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
   1648 
   1649     /*Early CBF info is present in the last 5 bits */
   1650     i4_early_cbf_flag += best_cost_tu_split & 0x1F;
   1651 
   1652     tu_split_flag <<= 5;
   1653     i4_early_cbf_flag <<= 5;
   1654 
   1655     cost_child += best_cost;
   1656 
   1657     best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
   1658         pu1_src2,
   1659         src_strd,
   1660         pu1_pred2,
   1661         pred_strd,
   1662         pi2_y2,
   1663         32,
   1664         ppi4_hsad,
   1665         ppi4_tu_split,
   1666         ppi4_tu_early_cbf,
   1667         pos_x_y_4x4 + (4 << 16),
   1668         num_4x4_in_row,
   1669         lambda,
   1670         lambda_q_shift,
   1671         i4_frm_qstep,
   1672         i4_cur_depth + 1,
   1673         i4_max_depth,
   1674         i4_max_tr_size,
   1675         pi4_tu_split_cost,
   1676         NULL);
   1677 
   1678     /* cost is shifted by 10bits */
   1679     best_cost = best_cost_tu_split >> 10;
   1680 
   1681     /* Tu split is present in the 6-10 bits */
   1682     tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
   1683 
   1684     /*Early CBF info is present in the last 5 bits */
   1685     i4_early_cbf_flag += best_cost_tu_split & 0x1F;
   1686 
   1687     tu_split_flag <<= 5;
   1688     i4_early_cbf_flag <<= 5;
   1689 
   1690     cost_child += best_cost;
   1691 
   1692     best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
   1693         pu1_src3,
   1694         src_strd,
   1695         pu1_pred3,
   1696         pred_strd,
   1697         pi2_y3,
   1698         32,
   1699         ppi4_hsad,
   1700         ppi4_tu_split,
   1701         ppi4_tu_early_cbf,
   1702         pos_x_y_4x4 + (4 << 16) + 4,
   1703         num_4x4_in_row,
   1704         lambda,
   1705         lambda_q_shift,
   1706         i4_frm_qstep,
   1707         i4_cur_depth + 1,
   1708         i4_max_depth,
   1709         i4_max_tr_size,
   1710         pi4_tu_split_cost,
   1711         NULL);
   1712 
   1713     /* cost is shifted by 10bits */
   1714     best_cost = best_cost_tu_split >> 10;
   1715 
   1716     /* Tu split is present in the 6-10 bits */
   1717     tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
   1718 
   1719     /*Early CBF info is present in the last 5 bits */
   1720     i4_early_cbf_flag += best_cost_tu_split & 0x1F;
   1721 
   1722     tu_split_flag <<= 1;
   1723     i4_early_cbf_flag <<= 1;
   1724 
   1725     cost_child += best_cost;
   1726 
   1727     {
   1728         UWORD32 u4_satd = 0;
   1729 
   1730         u4_satd = ps_func_selector->pf_compute_32x32HAD_using_16x16(
   1731             pi2_y0, 32, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf);
   1732 
   1733         cost_parent = ((u4_satd + 2) >> 2);
   1734     }
   1735 
   1736     /* 4 TU_Split flags , 4 CBF Flags*/
   1737     cost_child += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
   1738 
   1739     i4_early_cbf_flag += early_cbf;
   1740 
   1741     /* 1 TU_SPlit flag, 1 CBF flag */
   1742     //cost_parent += ((1 + 1)* lambda) >>  (lambda_q_shift + 1);
   1743 
   1744     if(i4_cur_depth < i4_max_depth)
   1745     {
   1746         if((cost_child < cost_parent) || (u1_cur_tr_size > i4_max_tr_size))
   1747         {
   1748             *pi4_tu_split_cost += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
   1749             best_cost = cost_child;
   1750             tu_split_flag++;
   1751         }
   1752         else
   1753         {
   1754             tu_split_flag = 0;
   1755             best_cost = cost_parent;
   1756         }
   1757     }
   1758     else
   1759     {
   1760         tu_split_flag = 0;
   1761         best_cost = cost_parent;
   1762     }
   1763 
   1764     pi4_32x32_tu_split[0] = tu_split_flag;
   1765 
   1766     pi4_32x32_hsad[0] = best_cost;
   1767 
   1768     pi4_32x32_tu_early_cbf[0] = i4_early_cbf_flag;
   1769 }
   1770