Home | History | Annotate | Download | only in decoder
      1 /******************************************************************************
      2 *
      3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 *
      5 * Licensed under the Apache License, Version 2.0 (the "License");
      6 * you may not use this file except in compliance with the License.
      7 * You may obtain a copy of the License at:
      8 *
      9 * http://www.apache.org/licenses/LICENSE-2.0
     10 *
     11 * Unless required by applicable law or agreed to in writing, software
     12 * distributed under the License is distributed on an "AS IS" BASIS,
     13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 * See the License for the specific language governing permissions and
     15 * limitations under the License.
     16 *
     17 ******************************************************************************/
     18 /**
     19  *******************************************************************************
     20  * @file
     21  *  ihevcd_iquant_itrans_recon_ctb.c
     22  *
     23  * @brief
     24  *  Contains functions for inverse quantization, inverse transform and recon
     25  *
     26  * @author
     27  *  Ittiam
     28  *
     29  * @par List of Functions:
     30  * - ihevcd_iquant_itrans_recon_ctb()
     31  *
     32  * @remarks
     33  *  None
     34  *
     35  *******************************************************************************
     36  */
     37 /*****************************************************************************/
     38 /* File Includes                                                             */
     39 /*****************************************************************************/
     40 #include <stdio.h>
     41 #include <stddef.h>
     42 #include <stdlib.h>
     43 #include <string.h>
     44 
     45 #include "ihevc_typedefs.h"
     46 #include "iv.h"
     47 #include "ivd.h"
     48 #include "ihevcd_cxa.h"
     49 
     50 #include "ihevc_defs.h"
     51 #include "ihevc_debug.h"
     52 #include "ihevc_structs.h"
     53 #include "ihevc_cabac_tables.h"
     54 #include "ihevc_macros.h"
     55 #include "ihevc_platform_macros.h"
     56 
     57 #include "ihevcd_defs.h"
     58 #include "ihevcd_function_selector.h"
     59 #include "ihevcd_structs.h"
     60 #include "ihevcd_error.h"
     61 #include "ihevcd_bitstream.h"
     62 #include "ihevc_common_tables.h"
     63 
     64 /* Intra pred includes */
     65 #include "ihevc_intra_pred.h"
     66 
     67 /* Inverse transform common module includes */
     68 #include "ihevc_trans_tables.h"
     69 #include "ihevc_trans_macros.h"
     70 #include "ihevc_itrans_recon.h"
     71 #include "ihevc_recon.h"
     72 #include "ihevc_chroma_itrans_recon.h"
     73 #include "ihevc_chroma_recon.h"
     74 
     75 /* Decoder includes */
     76 #include "ihevcd_common_tables.h"
     77 #include "ihevcd_iquant_itrans_recon_ctb.h"
     78 #include "ihevcd_debug.h"
     79 #include "ihevcd_profile.h"
     80 #include "ihevcd_statistics.h"
     81 #include "ihevcd_itrans_recon_dc.h"
     82 
     83 static const UWORD32 gau4_ihevcd_4_bit_reverse[] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 };
     84 
     85 
     86 /* Globals */
     87 static const WORD32 g_i4_ip_funcs[MAX_NUM_IP_MODES] =
     88   { IP_FUNC_MODE_0, /* Mode 0 */
     89     IP_FUNC_MODE_1, /* Mode 1 */
     90     IP_FUNC_MODE_2, /* Mode 2 */
     91     IP_FUNC_MODE_3TO9, /* Mode 3 */
     92     IP_FUNC_MODE_3TO9, /* Mode 4 */
     93     IP_FUNC_MODE_3TO9, /* Mode 5 */
     94     IP_FUNC_MODE_3TO9, /* Mode 6 */
     95     IP_FUNC_MODE_3TO9, /* Mode 7 */
     96     IP_FUNC_MODE_3TO9, /* Mode 8 */
     97     IP_FUNC_MODE_3TO9, /* Mode 9 */
     98     IP_FUNC_MODE_10, /* Mode 10 */
     99     IP_FUNC_MODE_11TO17, /* Mode 11 */
    100     IP_FUNC_MODE_11TO17, /* Mode 12 */
    101     IP_FUNC_MODE_11TO17, /* Mode 13 */
    102     IP_FUNC_MODE_11TO17, /* Mode 14 */
    103     IP_FUNC_MODE_11TO17, /* Mode 15 */
    104     IP_FUNC_MODE_11TO17, /* Mode 16 */
    105     IP_FUNC_MODE_11TO17, /* Mode 17 */
    106     IP_FUNC_MODE_18_34, /* Mode 18 */
    107     IP_FUNC_MODE_19TO25, /* Mode 19 */
    108     IP_FUNC_MODE_19TO25, /* Mode 20 */
    109     IP_FUNC_MODE_19TO25, /* Mode 21 */
    110     IP_FUNC_MODE_19TO25, /* Mode 22 */
    111     IP_FUNC_MODE_19TO25, /* Mode 23 */
    112     IP_FUNC_MODE_19TO25, /* Mode 24 */
    113     IP_FUNC_MODE_19TO25, /* Mode 25 */
    114     IP_FUNC_MODE_26, /* Mode 26 */
    115     IP_FUNC_MODE_27TO33, /* Mode 27 */
    116     IP_FUNC_MODE_27TO33, /* Mode 26 */
    117     IP_FUNC_MODE_27TO33, /* Mode 29 */
    118     IP_FUNC_MODE_27TO33, /* Mode 30 */
    119     IP_FUNC_MODE_27TO33, /* Mode 31 */
    120     IP_FUNC_MODE_27TO33, /* Mode 32 */
    121     IP_FUNC_MODE_27TO33, /* Mode 33 */
    122     IP_FUNC_MODE_18_34, /* Mode 34 */
    123 };
    124 
    125 
    126 const WORD16 *g_ai2_ihevc_trans_tables[] =
    127   { &g_ai2_ihevc_trans_dst_4[0][0],
    128     &g_ai2_ihevc_trans_4[0][0],
    129     &g_ai2_ihevc_trans_8[0][0],
    130     &g_ai2_ihevc_trans_16[0][0],
    131     &g_ai2_ihevc_trans_32[0][0]
    132 };
    133 
    134 
    135 /*****************************************************************************/
    136 /* Function Prototypes                                                       */
    137 /*****************************************************************************/
    138 /* Returns number of ai2_level read from ps_sblk_coeff */
    139 UWORD8* ihevcd_unpack_coeffs(WORD16 *pi2_tu_coeff,
    140                              WORD32 log2_trans_size,
    141                              UWORD8 *pu1_tu_coeff_data,
    142                              WORD16 *pi2_dequant_matrix,
    143                              WORD32 qp_rem,
    144                              WORD32 qp_div,
    145                              TRANSFORM_TYPE e_trans_type,
    146                              WORD32 trans_quant_bypass,
    147                              UWORD32 *pu4_zero_cols,
    148                              UWORD32 *pu4_zero_rows,
    149                              UWORD32 *pu4_coeff_type,
    150                              WORD16 *pi2_coeff_value)
    151 {
    152     /* Generating coeffs from coeff-map */
    153     WORD32 i;
    154     WORD16 *pi2_sblk_ptr;
    155     WORD32 subblk_pos_x, subblk_pos_y;
    156     WORD32 sblk_scan_idx, coeff_raster_idx;
    157     WORD32 sblk_non_zero_coeff_idx;
    158     tu_sblk_coeff_data_t *ps_tu_sblk_coeff_data;
    159     UWORD8 u1_num_coded_sblks, u1_scan_type;
    160     UWORD8 *pu1_new_tu_coeff_data;
    161     WORD32 trans_size;
    162     WORD32 xs, ys;
    163     WORD32 trans_skip;
    164     WORD16 iquant_out;
    165     WORD32 shift_iq;
    166     {
    167         WORD32 bit_depth;
    168 
    169         bit_depth = 8 + 0;
    170         shift_iq = bit_depth + log2_trans_size - 5;
    171     }
    172     trans_size = (1 << log2_trans_size);
    173 
    174     /* First byte points to number of coded blocks */
    175     u1_num_coded_sblks = *pu1_tu_coeff_data++;
    176 
    177     /* Next byte points to scan type */
    178     u1_scan_type = *pu1_tu_coeff_data++;
    179     /* 0th bit has trans_skip */
    180     trans_skip = u1_scan_type & 1;
    181     u1_scan_type >>= 1;
    182 
    183     pi2_sblk_ptr = pi2_tu_coeff;
    184 
    185     /* Initially all columns are assumed to be zero */
    186     *pu4_zero_cols = 0xFFFFFFFF;
    187     /* Initially all rows are assumed to be zero */
    188     *pu4_zero_rows = 0xFFFFFFFF;
    189 
    190     ps_tu_sblk_coeff_data = (tu_sblk_coeff_data_t *)(pu1_tu_coeff_data);
    191 
    192     if(trans_skip)
    193         memset(pi2_tu_coeff, 0, trans_size * trans_size * sizeof(WORD16));
    194 
    195     STATS_INIT_SBLK_AND_COEFF_POS();
    196 
    197     /* DC only case */
    198     if((e_trans_type != DST_4x4) && (1 == u1_num_coded_sblks)
    199                     && (0 == ps_tu_sblk_coeff_data->u2_subblk_pos)
    200                     && (1 == ps_tu_sblk_coeff_data->u2_sig_coeff_map))
    201     {
    202         *pu4_coeff_type = 1;
    203 
    204         if(!trans_quant_bypass)
    205         {
    206             if(4 == trans_size)
    207             {
    208                 IQUANT_4x4(iquant_out,
    209                            ps_tu_sblk_coeff_data->ai2_level[0],
    210                            pi2_dequant_matrix[0]
    211                                            * g_ihevc_iquant_scales[qp_rem],
    212                            shift_iq, qp_div);
    213             }
    214             else
    215             {
    216                 IQUANT(iquant_out, ps_tu_sblk_coeff_data->ai2_level[0],
    217                        pi2_dequant_matrix[0] * g_ihevc_iquant_scales[qp_rem],
    218                        shift_iq, qp_div);
    219             }
    220             if(trans_skip)
    221                 iquant_out = (iquant_out + 16) >> 5;
    222         }
    223         else
    224         {
    225             /* setting the column to zero */
    226             for(i = 0; i < trans_size; i++)
    227                 *(pi2_tu_coeff + i * trans_size) = 0;
    228 
    229             iquant_out = ps_tu_sblk_coeff_data->ai2_level[0];
    230         }
    231         *pi2_coeff_value = iquant_out;
    232         *pi2_tu_coeff = iquant_out;
    233         *pu4_zero_cols &= ~0x1;
    234         *pu4_zero_rows &= ~0x1;
    235         ps_tu_sblk_coeff_data =
    236                         (void *)&ps_tu_sblk_coeff_data->ai2_level[1];
    237 
    238         STATS_UPDATE_COEFF_COUNT();
    239         STATS_LAST_SBLK_POS_UPDATE(e_trans_type, (trans_skip || trans_quant_bypass),  0, 0);
    240         STATS_UPDATE_SBLK_AND_COEFF_HISTOGRAM(e_trans_type, (trans_quant_bypass || trans_skip));
    241         return ((UWORD8 *)ps_tu_sblk_coeff_data);
    242     }
    243     else
    244     {
    245         *pu4_coeff_type = 0;
    246         /* In case of trans skip, memset has already happened */
    247         if(!trans_skip)
    248             memset(pi2_tu_coeff, 0, trans_size * trans_size * sizeof(WORD16));
    249     }
    250 
    251     for(i = 0; i < u1_num_coded_sblks; i++)
    252     {
    253         UWORD32 u4_sig_coeff_map;
    254         subblk_pos_x = ps_tu_sblk_coeff_data->u2_subblk_pos & 0x00FF;
    255         subblk_pos_y = (ps_tu_sblk_coeff_data->u2_subblk_pos & 0xFF00) >> 8;
    256 
    257         STATS_LAST_SBLK_POS_UPDATE(e_trans_type, (trans_skip || trans_quant_bypass), subblk_pos_x, subblk_pos_y);
    258 
    259         subblk_pos_x = subblk_pos_x * MIN_TU_SIZE;
    260         subblk_pos_y = subblk_pos_y * MIN_TU_SIZE;
    261 
    262         pi2_sblk_ptr = pi2_tu_coeff + subblk_pos_y * trans_size
    263                         + subblk_pos_x;
    264 
    265         //*pu4_zero_cols &= ~(0xF << subblk_pos_x);
    266 
    267         sblk_non_zero_coeff_idx = 0;
    268         u4_sig_coeff_map = ps_tu_sblk_coeff_data->u2_sig_coeff_map;
    269         //for(sblk_scan_idx = (31 - CLZ(u4_sig_coeff_map)); sblk_scan_idx >= 0; sblk_scan_idx--)
    270         sblk_scan_idx = 31;
    271         do
    272         {
    273             WORD32 clz = CLZ(u4_sig_coeff_map);
    274 
    275             sblk_scan_idx -= clz;
    276             /* when clz is 31, u4_sig_coeff_map << (clz+1) might result in unknown behaviour in some cases */
    277             /* Hence either use SHL which takes care of handling these issues based on platform or shift in two stages */
    278             u4_sig_coeff_map = u4_sig_coeff_map << clz;
    279             /* Copying coeffs and storing in reverse order */
    280             {
    281                 STATS_UPDATE_COEFF_COUNT();
    282                 coeff_raster_idx =
    283                                 gau1_ihevc_invscan4x4[u1_scan_type][sblk_scan_idx];
    284 
    285                 xs = coeff_raster_idx & 0x3;
    286                 ys = coeff_raster_idx >> 2;
    287 
    288                 if(!trans_quant_bypass)
    289                 {
    290                     if(4 == trans_size)
    291                     {
    292                         IQUANT_4x4(iquant_out,
    293                                    ps_tu_sblk_coeff_data->ai2_level[sblk_non_zero_coeff_idx],
    294                                    pi2_dequant_matrix[(subblk_pos_x + xs)
    295                                                    + (subblk_pos_y + ys)
    296                                                    * trans_size]
    297                                    * g_ihevc_iquant_scales[qp_rem],
    298                                    shift_iq, qp_div);
    299                         sblk_non_zero_coeff_idx++;
    300                     }
    301                     else
    302                     {
    303                         IQUANT(iquant_out,
    304                                ps_tu_sblk_coeff_data->ai2_level[sblk_non_zero_coeff_idx],
    305                                pi2_dequant_matrix[(subblk_pos_x + xs)
    306                                                + (subblk_pos_y + ys)
    307                                                * trans_size]
    308                                * g_ihevc_iquant_scales[qp_rem],
    309                                shift_iq, qp_div);
    310                         sblk_non_zero_coeff_idx++;
    311                     }
    312 
    313                     if(trans_skip)
    314                         iquant_out = (iquant_out + 16) >> 5;
    315                 }
    316                 else
    317                 {
    318                     iquant_out = ps_tu_sblk_coeff_data->ai2_level[sblk_non_zero_coeff_idx++];
    319                 }
    320                 *pu4_zero_cols &= ~(0x1 << (subblk_pos_x + xs));
    321                 *pu4_zero_rows &= ~(0x1 << (subblk_pos_y + ys));
    322                 *(pi2_sblk_ptr + xs + ys * trans_size) = iquant_out;
    323             }
    324             sblk_scan_idx--;
    325             u4_sig_coeff_map <<= 1;
    326 
    327         }while(u4_sig_coeff_map);
    328         /* Updating the sblk pointer */
    329         ps_tu_sblk_coeff_data =
    330                         (void *)&ps_tu_sblk_coeff_data->ai2_level[sblk_non_zero_coeff_idx];
    331     }
    332 
    333     STATS_UPDATE_SBLK_AND_COEFF_HISTOGRAM(e_trans_type, (trans_quant_bypass || trans_skip));
    334 
    335     pu1_new_tu_coeff_data = (UWORD8 *)ps_tu_sblk_coeff_data;
    336 
    337     return pu1_new_tu_coeff_data;
    338 }
    339 
    340 WORD32 ihevcd_get_intra_nbr_flag(process_ctxt_t *ps_proc,
    341                                  tu_t *ps_tu,
    342                                  UWORD32 *pu4_intra_nbr_avail,
    343                                  WORD16 i2_pic_width_in_luma_samples,
    344                                  UWORD8 i1_constrained_intra_pred_flag,
    345                                  WORD32 trans_size,
    346                                  WORD32 ctb_size)
    347 {
    348     sps_t *ps_sps;
    349     UWORD8 u1_bot_lt_avail, u1_left_avail, u1_top_avail, u1_top_rt_avail,
    350                     u1_top_lt_avail;
    351     WORD32 x_cur, y_cur, x_nbr, y_nbr;
    352     UWORD8 *pu1_nbr_intra_flag;
    353     UWORD8 *pu1_pic_intra_flag;
    354     UWORD8 top_right, top, top_left, left, bot_left;
    355     WORD32 intra_pos;
    356     WORD32 num_8_blks, num_8_blks_in_bits;
    357     WORD32 numbytes_row = (i2_pic_width_in_luma_samples + 63) / 64;
    358     WORD32 cur_x, cur_y;
    359     WORD32 i;
    360     WORD32 nbr_flags;
    361 
    362     ps_sps = ps_proc->ps_sps;
    363     cur_x = ps_tu->b4_pos_x;
    364     cur_y = ps_tu->b4_pos_y;
    365 
    366     u1_bot_lt_avail = (pu4_intra_nbr_avail[1 + cur_y + trans_size / MIN_TU_SIZE]
    367                     >> (31 - (1 + cur_x - 1))) & 1;
    368     u1_left_avail = (pu4_intra_nbr_avail[1 + cur_y] >> (31 - (1 + cur_x - 1)))
    369                     & 1;
    370     u1_top_avail = (pu4_intra_nbr_avail[1 + cur_y - 1] >> (31 - (1 + cur_x)))
    371                     & 1;
    372     u1_top_rt_avail = (pu4_intra_nbr_avail[1 + cur_y - 1]
    373                     >> (31 - (1 + cur_x + trans_size / MIN_TU_SIZE))) & 1;
    374     u1_top_lt_avail = (pu4_intra_nbr_avail[1 + cur_y - 1]
    375                     >> (31 - (1 + cur_x - 1))) & 1;
    376 
    377     x_cur = ps_proc->i4_ctb_x * ctb_size + cur_x * MIN_TU_SIZE;
    378     y_cur = ps_proc->i4_ctb_y * ctb_size + cur_y * MIN_TU_SIZE;
    379 
    380     pu1_pic_intra_flag = ps_proc->pu1_pic_intra_flag;
    381 
    382     /* WORD32 nbr_flags as below  MSB --> LSB */
    383     /*    Top-Left | Top-Right | Top | Left | Bottom-Left
    384      *       1         4         4     4         4
    385      */
    386     bot_left = 0;
    387     left = 0;
    388     top_right = 0;
    389     top = 0;
    390     top_left = 0;
    391 
    392     num_8_blks = trans_size > 4 ? trans_size / 8 : 1;
    393     num_8_blks_in_bits = ((1 << num_8_blks) - 1);
    394 
    395     if(i1_constrained_intra_pred_flag)
    396     {
    397         /* TODO: constrained intra pred not tested */
    398         if(u1_bot_lt_avail)
    399         {
    400             x_nbr = x_cur - 1;
    401             y_nbr = y_cur + trans_size;
    402 
    403             pu1_nbr_intra_flag = pu1_pic_intra_flag + y_nbr / 8 * numbytes_row
    404                             + x_nbr / 64;
    405             intra_pos = ((x_nbr / 8) % 8);
    406             for(i = 0; i < num_8_blks; i++)
    407             {
    408                 bot_left |= ((*(pu1_nbr_intra_flag + i * numbytes_row)
    409                                 >> intra_pos) & 1) << i;
    410             }
    411             bot_left &= num_8_blks_in_bits;
    412         }
    413         if(u1_left_avail)
    414         {
    415             x_nbr = x_cur - 1;
    416             y_nbr = y_cur;
    417 
    418             pu1_nbr_intra_flag = pu1_pic_intra_flag + y_nbr / 8 * numbytes_row
    419                             + x_nbr / 64;
    420             intra_pos = ((x_nbr / 8) % 8);
    421 
    422             for(i = 0; i < num_8_blks; i++)
    423             {
    424                 left |= ((*(pu1_nbr_intra_flag + i * numbytes_row) >> intra_pos)
    425                                 & 1) << i;
    426             }
    427             left &= num_8_blks_in_bits;
    428         }
    429         if(u1_top_avail)
    430         {
    431             x_nbr = x_cur;
    432             y_nbr = y_cur - 1;
    433 
    434             pu1_nbr_intra_flag = pu1_pic_intra_flag + y_nbr / 8 * numbytes_row
    435                             + x_nbr / 64;
    436             intra_pos = ((x_nbr / 8) % 8);
    437 
    438             top = (*pu1_nbr_intra_flag >> intra_pos);
    439             top &= num_8_blks_in_bits;
    440             /*
    441              for(i=0;i<num_8_blks;i++)
    442              {
    443              top |= ( (*pu1_nbr_intra_flag >> (intra_pos+i)) & 1) << i;
    444              }
    445              */
    446         }
    447         if(u1_top_rt_avail)
    448         {
    449             x_nbr = x_cur + trans_size;
    450             y_nbr = y_cur - 1;
    451 
    452             pu1_nbr_intra_flag = pu1_pic_intra_flag + y_nbr / 8 * numbytes_row
    453                             + x_nbr / 64;
    454             intra_pos = ((x_nbr / 8) % 8);
    455 
    456             top_right = (*pu1_nbr_intra_flag >> intra_pos);
    457             top_right &= num_8_blks_in_bits;
    458             /*
    459              for(i=0;i<num_8_blks;i++)
    460              {
    461              top_right |= ( (*pu1_nbr_intra_flag >> (intra_pos+i)) & 1) << i;
    462              }
    463              */
    464         }
    465         if(u1_top_lt_avail)
    466         {
    467             x_nbr = x_cur - 1;
    468             y_nbr = y_cur - 1;
    469 
    470             pu1_nbr_intra_flag = pu1_pic_intra_flag + y_nbr / 8 * numbytes_row
    471                             + x_nbr / 64;
    472             intra_pos = ((x_nbr / 8) % 8);
    473 
    474             top_left = (*pu1_nbr_intra_flag >> intra_pos) & 1;
    475         }
    476     }
    477     else
    478     {
    479         if(u1_top_avail)
    480             top = 0xF;
    481         if(u1_top_rt_avail)
    482             top_right = 0xF;
    483         if(u1_bot_lt_avail)
    484             bot_left = 0xF;
    485         if(u1_left_avail)
    486             left = 0xF;
    487         if(u1_top_lt_avail)
    488             top_left = 0x1;
    489     }
    490 
    491     /* Handling incomplete CTBs */
    492     {
    493         WORD32 pu_size_limit = MIN(trans_size, 8);
    494         WORD32 cols_remaining = ps_sps->i2_pic_width_in_luma_samples
    495                         - (ps_proc->i4_ctb_x << ps_sps->i1_log2_ctb_size)
    496                         - (ps_tu->b4_pos_x * MIN_TU_SIZE)
    497                         - (1 << (ps_tu->b3_size + 2));
    498         /* ctb_size_top gives number of valid pixels remaining in the current row */
    499         WORD32 ctb_size_top = MIN(ctb_size, cols_remaining);
    500         WORD32 ctb_size_top_bits = (1 << (ctb_size_top / pu_size_limit)) - 1;
    501 
    502         WORD32 rows_remaining = ps_sps->i2_pic_height_in_luma_samples
    503                         - (ps_proc->i4_ctb_y << ps_sps->i1_log2_ctb_size)
    504                         - (ps_tu->b4_pos_y * MIN_TU_SIZE)
    505                         - (1 << (ps_tu->b3_size + 2));
    506         /* ctb_size_bot gives number of valid pixels remaining in the current column */
    507         WORD32 ctb_size_bot = MIN(ctb_size, rows_remaining);
    508         WORD32 ctb_size_bot_bits = (1 << (ctb_size_bot / pu_size_limit)) - 1;
    509 
    510         top_right &= ctb_size_top_bits;
    511         bot_left &= ctb_size_bot_bits;
    512     }
    513 
    514     /*    Top-Left | Top-Right | Top | Left | Bottom-Left
    515      *      1         4         4     4         4
    516      */
    517 
    518     /*
    519      nbr_flags = (top_left << 16) | (gau4_ihevcd_4_bit_reverse[top_right] << 12) | (gau4_ihevcd_4_bit_reverse[top] << 8) | (gau4_ihevcd_4_bit_reverse[left] << 4)
    520      | gau4_ihevcd_4_bit_reverse[bot_left];
    521      */
    522     nbr_flags = (top_left << 16) | (top_right << 12) | (top << 8) | (gau4_ihevcd_4_bit_reverse[left] << 4)
    523                     | gau4_ihevcd_4_bit_reverse[bot_left];
    524 
    525 
    526     return nbr_flags;
    527 
    528 }
    529 
    530 WORD32 ihevcd_iquant_itrans_recon_ctb(process_ctxt_t *ps_proc)
    531 {
    532     WORD16 *pi2_scaling_mat;
    533     UWORD8 *pu1_y_dst_ctb;
    534     UWORD8 *pu1_uv_dst_ctb;
    535     WORD32 ctb_size;
    536     codec_t *ps_codec;
    537     slice_header_t *ps_slice_hdr;
    538     tu_t *ps_tu;
    539     WORD16 *pi2_ctb_coeff;
    540     WORD32 tu_cnt;
    541     WORD16 *pi2_tu_coeff;
    542     WORD16 *pi2_tmp;
    543     WORD32 pic_strd;
    544     WORD32 luma_nbr_flags;
    545     WORD32 chroma_nbr_flags = 0;
    546     UWORD8 u1_luma_pred_mode_first_tu = 0;
    547     /* Pointers for generating 2d coeffs from coeff-map */
    548     UWORD8 *pu1_tu_coeff_data;
    549     /* nbr avail map for CTB */
    550     /* 1st bit points to neighbor (left/top_left/bot_left) */
    551     /* 1Tb starts at 2nd bit from msb of 2nd value in array, followed by number of min_tu's in that ctb */
    552     UWORD32 au4_intra_nbr_avail[MAX_CTB_SIZE / MIN_TU_SIZE
    553                     + 2 /* Top nbr + bot nbr */]; UWORD32
    554                     top_avail_bits;
    555     sps_t *ps_sps;
    556     pps_t *ps_pps;
    557     WORD32 intra_flag;
    558     UWORD8 *pu1_pic_intra_flag;
    559     /*************************************************************************/
    560     /* Contanis scaling matrix offset in the following order in a 1D buffer  */
    561     /* Intra 4 x 4 Y, 4 x 4 U, 4 x 4 V                                       */
    562     /* Inter 4 x 4 Y, 4 x 4 U, 4 x 4 V                                       */
    563     /* Intra 8 x 8 Y, 8 x 8 U, 8 x 8 V                                       */
    564     /* Inter 8 x 8 Y, 8 x 8 U, 8 x 8 V                                       */
    565     /* Intra 16x16 Y, 16x16 U, 16x16 V                                       */
    566     /* Inter 16x16 Y, 16x16 U, 16x16 V                                       */
    567     /* Intra 32x32 Y                                                         */
    568     /* Inter 32x32 Y                                                         */
    569     /*************************************************************************/
    570     /* Only first 20 entries are used. Array is extended to avoid out of bound
    571        reads. Skip CUs (64x64) read this table, but don't really use the value */
    572     static const WORD32 scaling_mat_offset[] =
    573       { 0, 16, 32, 48, 64, 80, 96, 160, 224, 288, 352, 416, 480, 736, 992,
    574         1248, 1504, 1760, 2016, 3040, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
    575 
    576     PROFILE_DISABLE_IQ_IT_RECON_INTRA_PRED();
    577 
    578     ps_sps = ps_proc->ps_sps;
    579     ps_pps = ps_proc->ps_pps;
    580     ps_slice_hdr = ps_proc->ps_slice_hdr;
    581     ps_codec = ps_proc->ps_codec;
    582 
    583     pu1_y_dst_ctb = ps_proc->pu1_cur_ctb_luma;
    584     pu1_uv_dst_ctb = ps_proc->pu1_cur_ctb_chroma;
    585 
    586     pi2_ctb_coeff = ps_proc->pi2_invscan_out;
    587 
    588     ctb_size = (1 << ps_sps->i1_log2_ctb_size);
    589     pu1_tu_coeff_data = (UWORD8 *)ps_proc->pv_tu_coeff_data;
    590 
    591     pic_strd = ps_codec->i4_strd;
    592 
    593     pi2_tmp = ps_proc->pi2_itrans_intrmd_buf;
    594 
    595     pi2_tu_coeff = pi2_ctb_coeff;
    596 
    597     ps_tu = ps_proc->ps_tu;
    598 
    599     if((1 == ps_sps->i1_scaling_list_enable_flag) && (1 == ps_pps->i1_pps_scaling_list_data_present_flag))
    600     {
    601         pi2_scaling_mat = ps_pps->pi2_scaling_mat;
    602     }
    603     else
    604     {
    605         pi2_scaling_mat = ps_sps->pi2_scaling_mat;
    606     }
    607 
    608     {
    609         /* Updating the initial availability map */
    610         WORD32 i;
    611         UWORD8 u1_left_ctb_avail, u1_top_lt_ctb_avail, u1_top_rt_ctb_avail,
    612                         u1_top_ctb_avail;
    613 
    614         u1_left_ctb_avail = ps_proc->u1_left_ctb_avail;
    615         u1_top_lt_ctb_avail = ps_proc->u1_top_lt_ctb_avail;
    616         u1_top_ctb_avail = ps_proc->u1_top_ctb_avail;
    617         u1_top_rt_ctb_avail = ps_proc->u1_top_rt_ctb_avail;
    618 
    619         /* Initializing the availability array */
    620         memset(au4_intra_nbr_avail, 0,
    621                (MAX_CTB_SIZE / MIN_TU_SIZE + 2) * sizeof(UWORD32));
    622         /* Initializing the availability array with CTB level availability flags */
    623         {
    624             WORD32 rows_remaining = ps_sps->i2_pic_height_in_luma_samples - (ps_proc->i4_ctb_y << ps_sps->i1_log2_ctb_size);
    625             WORD32 ctb_size_left = MIN(ctb_size, rows_remaining);
    626             for(i = 0; i < ctb_size_left / MIN_TU_SIZE; i++)
    627             {
    628                 au4_intra_nbr_avail[i + 1] = ((UWORD32)u1_left_ctb_avail << 31);
    629             }
    630         }
    631         au4_intra_nbr_avail[0] |= (((UWORD32)u1_top_rt_ctb_avail << 31)
    632                         >> (1 + ctb_size / MIN_TU_SIZE)); /* 1+ctb_size/4 position bit pos from msb */
    633 
    634         au4_intra_nbr_avail[0] |= ((UWORD32)u1_top_lt_ctb_avail << 31);
    635 
    636         {
    637             WORD32 cols_remaining = ps_sps->i2_pic_width_in_luma_samples - (ps_proc->i4_ctb_x << ps_sps->i1_log2_ctb_size);
    638             WORD32 ctb_size_top = MIN(ctb_size, cols_remaining);
    639             WORD32 shift = (31 - (ctb_size / MIN_TU_SIZE));
    640 
    641             /* ctb_size_top gives number of valid pixels remaining in the current row */
    642             /* Since we need pattern of 1's starting from the MSB, an additional shift */
    643             /* is needed */
    644             shift += ((ctb_size - ctb_size_top) / MIN_TU_SIZE);
    645 
    646             top_avail_bits = ((1 << (ctb_size_top / MIN_TU_SIZE)) - 1)
    647                             << shift;
    648         }
    649         au4_intra_nbr_avail[0] |= (
    650                         (u1_top_ctb_avail == 1) ? top_avail_bits : 0x0);
    651         /* Starting from msb 2nd bit to (1+ctb_size/4) bit, set 1 if top avail,or 0 */
    652 
    653     }
    654 
    655     /* Applying Inverse transform on all the TU's in CTB */
    656     for(tu_cnt = 0; tu_cnt < ps_proc->i4_ctb_tu_cnt; tu_cnt++, ps_tu++)
    657     {
    658         WORD32 transform_skip_flag = 0;
    659         WORD32 transform_skip_flag_v = 0;
    660         WORD32 num_comp, c_idx, func_idx;
    661         WORD32 src_strd, pred_strd, dst_strd;
    662         WORD32 qp_div = 0, qp_rem = 0;
    663         WORD32 qp_div_v = 0, qp_rem_v = 0;
    664         UWORD32 zero_cols = 0, zero_cols_v = 0;
    665         UWORD32 zero_rows = 0, zero_rows_v = 0;
    666         UWORD32 coeff_type = 0, coeff_type_v = 0;
    667         WORD16 i2_coeff_value, i2_coeff_value_v;
    668         WORD32 trans_size = 0;
    669         TRANSFORM_TYPE e_trans_type;
    670         WORD32 log2_y_trans_size_minus_2, log2_uv_trans_size_minus_2;
    671         WORD32 log2_trans_size;
    672         WORD32 chroma_qp_idx;
    673         WORD16 *pi2_src = NULL, *pi2_src_v = NULL;
    674         UWORD8 *pu1_pred = NULL, *pu1_pred_v = NULL;
    675         UWORD8 *pu1_dst = NULL, *pu1_dst_v = NULL;
    676         WORD16 *pi2_dequant_matrix = NULL, *pi2_dequant_matrix_v = NULL;
    677         WORD32 tu_x, tu_y;
    678         WORD32 tu_y_offset, tu_uv_offset;
    679         WORD8 i1_chroma_pic_qp_offset, i1_chroma_slice_qp_offset;
    680         UWORD8 u1_cbf = 0, u1_cbf_v = 0, u1_luma_pred_mode, u1_chroma_pred_mode;
    681         WORD32 luma_nbr_flags_4x4[4];
    682         WORD32 offset;
    683         WORD32 pcm_flag;
    684         WORD32  chroma_yuv420sp_vu = (ps_codec->e_ref_chroma_fmt == IV_YUV_420SP_VU);
    685         /* If 420SP_VU is chroma format, pred and dst pointer   */
    686         /* will be added +1 to point to U                       */
    687         WORD32 chroma_yuv420sp_vu_u_offset = 1 * chroma_yuv420sp_vu;
    688         /* If 420SP_VU is chroma format, pred and dst pointer   */
    689         /* will be added U offset of +1 and subtracted 2        */
    690         /* to point to V                                        */
    691         WORD32 chroma_yuv420sp_vu_v_offset = -2 * chroma_yuv420sp_vu;
    692 
    693         tu_x = ps_tu->b4_pos_x * 4; /* Converting minTU unit to pixel unit */
    694         tu_y = ps_tu->b4_pos_y * 4; /* Converting minTU unit to pixel unit */
    695         {
    696             WORD32 tu_abs_x = (ps_proc->i4_ctb_x << ps_sps->i1_log2_ctb_size) + (tu_x);
    697             WORD32 tu_abs_y = (ps_proc->i4_ctb_y << ps_sps->i1_log2_ctb_size) + (tu_y);
    698 
    699             WORD32 numbytes_row =  (ps_sps->i2_pic_width_in_luma_samples + 63) / 64;
    700 
    701             pu1_pic_intra_flag = ps_proc->pu1_pic_intra_flag;
    702             pu1_pic_intra_flag += (tu_abs_y >> 3) * numbytes_row;
    703             pu1_pic_intra_flag += (tu_abs_x >> 6);
    704 
    705             intra_flag = *pu1_pic_intra_flag;
    706             intra_flag &= (1 << ((tu_abs_x >> 3) % 8));
    707         }
    708 
    709         u1_luma_pred_mode = ps_tu->b6_luma_intra_mode;
    710         u1_chroma_pred_mode = ps_tu->b3_chroma_intra_mode_idx;
    711 
    712         if(u1_chroma_pred_mode != 7)
    713             num_comp = 2; /* Y and UV */
    714         else
    715             num_comp = 1; /* Y */
    716 
    717 
    718         pcm_flag = 0;
    719 
    720         if((intra_flag) && (u1_luma_pred_mode == INTRA_PRED_NONE))
    721         {
    722             UWORD8 *pu1_buf;
    723             UWORD8 *pu1_y_dst = pu1_y_dst_ctb;
    724             UWORD8 *pu1_uv_dst = pu1_uv_dst_ctb;
    725             WORD32 i, j;
    726             tu_sblk_coeff_data_t *ps_tu_sblk_coeff_data;
    727             WORD32 cb_size = 1 << (ps_tu->b3_size + 2);
    728 
    729             /* trans_size is used to update availability after reconstruction */
    730             trans_size = cb_size;
    731 
    732             pcm_flag = 1;
    733 
    734             tu_y_offset = tu_x + tu_y * pic_strd;
    735             pu1_y_dst += tu_x + tu_y * pic_strd;
    736             pu1_uv_dst += tu_x + (tu_y >> 1) * pic_strd;
    737 
    738             /* First byte points to number of coded blocks */
    739             pu1_tu_coeff_data++;
    740 
    741             /* Next byte points to scan type */
    742             pu1_tu_coeff_data++;
    743 
    744             ps_tu_sblk_coeff_data = (tu_sblk_coeff_data_t *)pu1_tu_coeff_data;
    745 
    746             pu1_buf = (UWORD8 *)&ps_tu_sblk_coeff_data->ai2_level[0];
    747             {
    748 
    749                 for(i = 0; i < cb_size; i++)
    750                 {
    751                     //pu1_y_dst[i * pic_strd + j] = *pu1_buf++;
    752                     memcpy(&pu1_y_dst[i * pic_strd], pu1_buf, cb_size);
    753                     pu1_buf += cb_size;
    754                 }
    755 
    756                 pu1_uv_dst = pu1_uv_dst + chroma_yuv420sp_vu_u_offset;
    757 
    758                 /* U */
    759                 for(i = 0; i < cb_size / 2; i++)
    760                 {
    761                     for(j = 0; j < cb_size / 2; j++)
    762                     {
    763                         pu1_uv_dst[i * pic_strd + 2 * j] = *pu1_buf++;
    764                     }
    765                 }
    766 
    767                 pu1_uv_dst = pu1_uv_dst + 1 + chroma_yuv420sp_vu_v_offset;
    768 
    769                 /* V */
    770                 for(i = 0; i < cb_size / 2; i++)
    771                 {
    772                     for(j = 0; j < cb_size / 2; j++)
    773                     {
    774                         pu1_uv_dst[i * pic_strd + 2 * j] = *pu1_buf++;
    775                     }
    776                 }
    777             }
    778 
    779             pu1_tu_coeff_data = pu1_buf;
    780 
    781         }
    782 
    783 
    784 
    785 
    786 
    787         for(c_idx = 0; c_idx < num_comp; c_idx++)
    788         {
    789             if(0 == pcm_flag)
    790             {
    791                 /* Initializing variables */
    792                 pred_strd = pic_strd;
    793                 dst_strd = pic_strd;
    794 
    795                 if(c_idx == 0) /* Y */
    796                 {
    797                     log2_y_trans_size_minus_2 = ps_tu->b3_size;
    798                     trans_size = 1 << (log2_y_trans_size_minus_2 + 2);
    799                     log2_trans_size = log2_y_trans_size_minus_2 + 2;
    800 
    801                     tu_y_offset = tu_x + tu_y * pic_strd;
    802 
    803                     pi2_src = pi2_tu_coeff;
    804                     pu1_pred = pu1_y_dst_ctb + tu_y_offset;
    805                     pu1_dst = pu1_y_dst_ctb + tu_y_offset;
    806 
    807                     /* Calculating scaling matrix offset */
    808                     offset = log2_y_trans_size_minus_2 * 6
    809                                     + (!intra_flag)
    810                                     * ((log2_y_trans_size_minus_2
    811                                                     == 3) ? 1 : 3)
    812                                     + c_idx;
    813                     pi2_dequant_matrix = pi2_scaling_mat
    814                                     + scaling_mat_offset[offset];
    815 
    816                     src_strd = trans_size;
    817 
    818                     /* 4x4 transform Luma in INTRA mode is DST */
    819                     if(log2_y_trans_size_minus_2 == 0 && intra_flag)
    820                     {
    821                         func_idx = log2_y_trans_size_minus_2;
    822                         e_trans_type = DST_4x4;
    823                     }
    824                     else
    825                     {
    826                         func_idx = log2_y_trans_size_minus_2 + 1;
    827                         e_trans_type = (TRANSFORM_TYPE)(log2_y_trans_size_minus_2 + 1);
    828                     }
    829 
    830                     qp_div = ps_tu->b7_qp / 6;
    831                     qp_rem = ps_tu->b7_qp % 6;
    832 
    833                     u1_cbf = ps_tu->b1_y_cbf;
    834 
    835                     transform_skip_flag = pu1_tu_coeff_data[1] & 1;
    836                     /* Unpacking coeffs */
    837                     if(1 == u1_cbf)
    838                     {
    839                         pu1_tu_coeff_data = ihevcd_unpack_coeffs(
    840                                         pi2_src, log2_y_trans_size_minus_2 + 2,
    841                                         pu1_tu_coeff_data, pi2_dequant_matrix,
    842                                         qp_rem, qp_div, e_trans_type,
    843                                         ps_tu->b1_transquant_bypass, &zero_cols,
    844                                         &zero_rows, &coeff_type,
    845                                         &i2_coeff_value);
    846                     }
    847                 }
    848                 else /* UV interleaved */
    849                 {
    850                     /* Chroma :If Transform size is 4x4, keep 4x4 else do transform on (trans_size/2 x trans_size/2) */
    851                     if(ps_tu->b3_size == 0)
    852                     {
    853                         /* Chroma 4x4 is present with 4th luma 4x4 block. For this case chroma position has to be (luma pos x - 4, luma pos y - 4) */
    854                         log2_uv_trans_size_minus_2 = ps_tu->b3_size;
    855                         tu_uv_offset = (tu_x - 4) + ((tu_y - 4) / 2) * pic_strd;
    856                     }
    857                     else
    858                     {
    859                         log2_uv_trans_size_minus_2 = ps_tu->b3_size - 1;
    860                         tu_uv_offset = tu_x + (tu_y >> 1) * pic_strd;
    861                     }
    862                     trans_size = 1 << (log2_uv_trans_size_minus_2 + 2);
    863                     log2_trans_size = log2_uv_trans_size_minus_2 + 2;
    864 
    865                     pi2_src = pi2_tu_coeff;
    866                     pi2_src_v = pi2_tu_coeff + trans_size * trans_size;
    867                     pu1_pred = pu1_uv_dst_ctb + tu_uv_offset + chroma_yuv420sp_vu_u_offset; /* Pointing to start byte of U*/
    868                     pu1_pred_v = pu1_pred + 1 + chroma_yuv420sp_vu_v_offset; /* Pointing to start byte of V*/
    869                     pu1_dst = pu1_uv_dst_ctb + tu_uv_offset + chroma_yuv420sp_vu_u_offset; /* Pointing to start byte of U*/
    870                     pu1_dst_v = pu1_dst + 1 + chroma_yuv420sp_vu_v_offset; /* Pointing to start byte of V*/
    871 
    872                     /*TODO: Add support for choosing different tables for U and V,
    873                      * change this to a single array to handle flat/default/custom, intra/inter, luma/chroma and various sizes
    874                      */
    875                     /* Calculating scaling matrix offset */
    876                     /* ((log2_uv_trans_size_minus_2 == 3) ? 1:3) condition check is not needed, since
    877                      * max uv trans size is 16x16
    878                      */
    879                     offset = log2_uv_trans_size_minus_2 * 6
    880                                     + (!intra_flag) * 3 + c_idx;
    881                     pi2_dequant_matrix = pi2_scaling_mat
    882                                     + scaling_mat_offset[offset];
    883                     pi2_dequant_matrix_v = pi2_scaling_mat
    884                                     + scaling_mat_offset[offset + 1];
    885 
    886                     src_strd = trans_size;
    887 
    888                     func_idx = 1 + 4 + log2_uv_trans_size_minus_2; /* DST func + Y funcs + cur func index*/
    889 
    890                     /* Handle error cases where 64x64 TU is signalled which results in 32x32 chroma.
    891                      * By limiting func_idx to 7, max of 16x16 chroma is called */
    892                     func_idx = MIN(func_idx, 7);
    893 
    894                     e_trans_type = (TRANSFORM_TYPE)(log2_uv_trans_size_minus_2 + 1);
    895                     /* QP for U */
    896                     i1_chroma_pic_qp_offset = ps_pps->i1_pic_cb_qp_offset;
    897                     i1_chroma_slice_qp_offset = ps_slice_hdr->i1_slice_cb_qp_offset;
    898                     u1_cbf = ps_tu->b1_cb_cbf;
    899 
    900                     chroma_qp_idx = ps_tu->b7_qp + i1_chroma_pic_qp_offset
    901                                     + i1_chroma_slice_qp_offset;
    902                     chroma_qp_idx = CLIP3(chroma_qp_idx, 0, 57);
    903                     qp_div = gai2_ihevcd_chroma_qp[chroma_qp_idx] / 6;
    904                     qp_rem = gai2_ihevcd_chroma_qp[chroma_qp_idx] % 6;
    905 
    906                     /* QP for V */
    907                     i1_chroma_pic_qp_offset = ps_pps->i1_pic_cr_qp_offset;
    908                     i1_chroma_slice_qp_offset = ps_slice_hdr->i1_slice_cr_qp_offset;
    909                     u1_cbf_v = ps_tu->b1_cr_cbf;
    910 
    911                     chroma_qp_idx = ps_tu->b7_qp + i1_chroma_pic_qp_offset
    912                                     + i1_chroma_slice_qp_offset;
    913                     chroma_qp_idx = CLIP3(chroma_qp_idx, 0, 57);
    914                     qp_div_v = gai2_ihevcd_chroma_qp[chroma_qp_idx] / 6;
    915                     qp_rem_v = gai2_ihevcd_chroma_qp[chroma_qp_idx] % 6;
    916 
    917                     /* Unpacking coeffs */
    918                     transform_skip_flag = pu1_tu_coeff_data[1] & 1;
    919                     if(1 == u1_cbf)
    920                     {
    921                         pu1_tu_coeff_data = ihevcd_unpack_coeffs(
    922                                         pi2_src, log2_uv_trans_size_minus_2 + 2,
    923                                         pu1_tu_coeff_data, pi2_dequant_matrix,
    924                                         qp_rem, qp_div, e_trans_type,
    925                                         ps_tu->b1_transquant_bypass, &zero_cols,
    926                                         &zero_rows, &coeff_type,
    927                                         &i2_coeff_value);
    928                     }
    929 
    930                     transform_skip_flag_v = pu1_tu_coeff_data[1] & 1;
    931                     if(1 == u1_cbf_v)
    932                     {
    933                         pu1_tu_coeff_data = ihevcd_unpack_coeffs(
    934                                         pi2_src_v, log2_uv_trans_size_minus_2 + 2,
    935                                         pu1_tu_coeff_data, pi2_dequant_matrix_v,
    936                                         qp_rem_v, qp_div_v, e_trans_type,
    937                                         ps_tu->b1_transquant_bypass, &zero_cols_v,
    938                                         &zero_rows_v, &coeff_type_v, &i2_coeff_value_v);
    939                     }
    940                 }
    941                 /***************************************************************/
    942                 /******************  Intra Prediction **************************/
    943                 /***************************************************************/
    944                 if(intra_flag) /* Intra */
    945                 {
    946                     /* While (MAX_TU_SIZE * 2 * 2) + 1 is the actual size needed,
    947                        au1_ref_sub_out size is kept as multiple of 8,
    948                        so that SIMD functions can load 64 bits */
    949                     UWORD8 au1_ref_sub_out[(MAX_TU_SIZE * 2 * 2) + 8];
    950                     UWORD8 *pu1_top_left, *pu1_top, *pu1_left;
    951                     WORD32 luma_pred_func_idx, chroma_pred_func_idx;
    952 
    953                     /* Get the neighbour availability flags */
    954                     /* Done for only Y */
    955                     if(c_idx == 0)
    956                     {
    957                         /* Get neighbor availability for Y only */
    958                         luma_nbr_flags = ihevcd_get_intra_nbr_flag(ps_proc,
    959                                                                    ps_tu,
    960                                                                    au4_intra_nbr_avail,
    961                                                                    ps_sps->i2_pic_width_in_luma_samples,
    962                                                                    ps_pps->i1_constrained_intra_pred_flag,
    963                                                                    trans_size,
    964                                                                    ctb_size);
    965 
    966                         if(trans_size == 4)
    967                             luma_nbr_flags_4x4[(ps_tu->b4_pos_x % 2) + (ps_tu->b4_pos_y % 2) * 2] = luma_nbr_flags;
    968 
    969                         if((ps_tu->b4_pos_x % 2 == 0) && (ps_tu->b4_pos_y % 2 == 0))
    970                         {
    971                             chroma_nbr_flags = luma_nbr_flags;
    972                         }
    973 
    974                         /* Initializing nbr pointers */
    975                         pu1_top = pu1_pred - pic_strd;
    976                         pu1_left = pu1_pred - 1;
    977                         pu1_top_left = pu1_pred - pic_strd - 1;
    978 
    979                         /* call reference array substitution */
    980                         if(luma_nbr_flags == 0x1ffff)
    981                             ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_subst_all_avlble_fptr(
    982                                             pu1_top_left,
    983                                             pu1_top, pu1_left, pred_strd, trans_size, luma_nbr_flags, au1_ref_sub_out, 1);
    984                         else
    985                             ps_codec->s_func_selector.ihevc_intra_pred_luma_ref_substitution_fptr(
    986                                             pu1_top_left,
    987                                             pu1_top, pu1_left, pred_strd, trans_size, luma_nbr_flags, au1_ref_sub_out, 1);
    988 
    989                         /* call reference filtering */
    990                         ps_codec->s_func_selector.ihevc_intra_pred_ref_filtering_fptr(
    991                                         au1_ref_sub_out, trans_size,
    992                                         au1_ref_sub_out,
    993                                         u1_luma_pred_mode, ps_sps->i1_strong_intra_smoothing_enable_flag);
    994 
    995                         /* use the look up to get the function idx */
    996                         luma_pred_func_idx = g_i4_ip_funcs[u1_luma_pred_mode];
    997 
    998                         /* call the intra prediction function */
    999                         ps_codec->apf_intra_pred_luma[luma_pred_func_idx](au1_ref_sub_out, 1, pu1_pred, pred_strd, trans_size, u1_luma_pred_mode);
   1000                     }
   1001                     else
   1002                     {
   1003                         /* In case of yuv420sp_vu, prediction happens as usual.         */
   1004                         /* So point the pu1_pred pointer to original prediction pointer */
   1005                         UWORD8 *pu1_pred_orig = pu1_pred - chroma_yuv420sp_vu_u_offset;
   1006 
   1007                         /*    Top-Left | Top-Right | Top | Left | Bottom-Left
   1008                          *      1         4         4     4         4
   1009                          *
   1010                          * Generating chroma_nbr_flags depending upon the transform size */
   1011                         if(ps_tu->b3_size == 0)
   1012                         {
   1013                             /* Take TL,T,L flags of First luma 4x4 block */
   1014                             chroma_nbr_flags = (luma_nbr_flags_4x4[0] & 0x10FF0);
   1015                             /* Take TR flags of Second luma 4x4 block */
   1016                             chroma_nbr_flags |= (luma_nbr_flags_4x4[1] & 0x0F000);
   1017                             /* Take BL flags of Third luma 4x4 block */
   1018                             chroma_nbr_flags |= (luma_nbr_flags_4x4[2] & 0x0000F);
   1019                         }
   1020 
   1021                         /* Initializing nbr pointers */
   1022                         pu1_top = pu1_pred_orig - pic_strd;
   1023                         pu1_left = pu1_pred_orig - 2;
   1024                         pu1_top_left = pu1_pred_orig - pic_strd - 2;
   1025 
   1026                         /* Chroma pred  mode derivation from luma pred mode */
   1027                         {
   1028                             tu_t *ps_tu_tmp = ps_tu;
   1029                             while(!ps_tu_tmp->b1_first_tu_in_cu)
   1030                             {
   1031                                 ps_tu_tmp--;
   1032                             }
   1033                             u1_luma_pred_mode_first_tu = ps_tu_tmp->b6_luma_intra_mode;
   1034                         }
   1035                         if(4 == u1_chroma_pred_mode)
   1036                             u1_chroma_pred_mode = u1_luma_pred_mode_first_tu;
   1037                         else
   1038                         {
   1039                             u1_chroma_pred_mode = gau1_intra_pred_chroma_modes[u1_chroma_pred_mode];
   1040 
   1041                             if(u1_chroma_pred_mode ==
   1042                                                             u1_luma_pred_mode_first_tu)
   1043                             {
   1044                                 u1_chroma_pred_mode = INTRA_ANGULAR(34);
   1045                             }
   1046                         }
   1047 
   1048                         /* call the chroma reference array substitution */
   1049                         ps_codec->s_func_selector.ihevc_intra_pred_chroma_ref_substitution_fptr(
   1050                                         pu1_top_left,
   1051                                         pu1_top, pu1_left, pic_strd, trans_size, chroma_nbr_flags, au1_ref_sub_out, 1);
   1052 
   1053                         /* use the look up to get the function idx */
   1054                         chroma_pred_func_idx =
   1055                                         g_i4_ip_funcs[u1_chroma_pred_mode];
   1056 
   1057                         /* call the intra prediction function */
   1058                         ps_codec->apf_intra_pred_chroma[chroma_pred_func_idx](au1_ref_sub_out, 1, pu1_pred_orig, pred_strd, trans_size, u1_chroma_pred_mode);
   1059                     }
   1060                 }
   1061 
   1062                 /* Updating number of transform types */
   1063                 STATS_UPDATE_ALL_TRANS(e_trans_type, c_idx);
   1064 
   1065                 /* IQ, IT and Recon for Y if c_idx == 0, and U if c_idx !=0 */
   1066                 if(1 == u1_cbf)
   1067                 {
   1068                     if(ps_tu->b1_transquant_bypass || transform_skip_flag)
   1069                     {
   1070                         /* Recon */
   1071                         ps_codec->apf_recon[func_idx](pi2_src, pu1_pred, pu1_dst,
   1072                                                       src_strd, pred_strd, dst_strd,
   1073                                                       zero_cols);
   1074                     }
   1075                     else
   1076                     {
   1077 
   1078                         /* Updating coded number of transform types(excluding trans skip and trans quant skip) */
   1079                         STATS_UPDATE_CODED_TRANS(e_trans_type, c_idx, 0);
   1080 
   1081                         /* iQuant , iTrans and Recon */
   1082                         if((0 == coeff_type))
   1083                         {
   1084                             ps_codec->apf_itrans_recon[func_idx](pi2_src, pi2_tmp,
   1085                                                                  pu1_pred, pu1_dst,
   1086                                                                  src_strd, pred_strd,
   1087                                                                  dst_strd, zero_cols,
   1088                                                                  zero_rows);
   1089                         }
   1090                         else /* DC only */
   1091                         {
   1092                             STATS_UPDATE_CODED_TRANS(e_trans_type, c_idx, 1);
   1093                             ps_codec->apf_itrans_recon_dc[c_idx](pu1_pred, pu1_dst,
   1094                                                                  pred_strd, dst_strd,
   1095                                                                  log2_trans_size,
   1096                                                                  i2_coeff_value);
   1097                         }
   1098                     }
   1099                 }
   1100                 /* IQ, IT and Recon for V */
   1101                 if(c_idx != 0)
   1102                 {
   1103                     if(1 == u1_cbf_v)
   1104                     {
   1105                         if(ps_tu->b1_transquant_bypass || transform_skip_flag_v)
   1106                         {
   1107                             /* Recon */
   1108                             ps_codec->apf_recon[func_idx](pi2_src_v, pu1_pred_v,
   1109                                                           pu1_dst_v, src_strd,
   1110                                                           pred_strd, dst_strd,
   1111                                                           zero_cols_v);
   1112                         }
   1113                         else
   1114                         {
   1115                             /* Updating number of transform types */
   1116                             STATS_UPDATE_CODED_TRANS(e_trans_type, c_idx, 0);
   1117 
   1118                             /* iQuant , iTrans and Recon */
   1119                             if((0 == coeff_type_v))
   1120                             {
   1121                                 ps_codec->apf_itrans_recon[func_idx](pi2_src_v,
   1122                                                                      pi2_tmp,
   1123                                                                      pu1_pred_v,
   1124                                                                      pu1_dst_v,
   1125                                                                      src_strd,
   1126                                                                      pred_strd,
   1127                                                                      dst_strd,
   1128                                                                      zero_cols_v,
   1129                                                                      zero_rows_v);
   1130                             }
   1131                             else  /* DC only */
   1132                             {
   1133                                 STATS_UPDATE_CODED_TRANS(e_trans_type, c_idx, 1);
   1134                                 ps_codec->apf_itrans_recon_dc[c_idx](pu1_pred_v, pu1_dst_v,
   1135                                                                      pred_strd, dst_strd,
   1136                                                                      log2_trans_size,
   1137                                                                      i2_coeff_value_v);
   1138                             }
   1139                         }
   1140                     }
   1141                 }
   1142             }
   1143 
   1144             /* Neighbor availability inside CTB */
   1145             /* 1 bit per 4x4. Indicates whether that 4x4 block has been reconstructed (available) */
   1146             /* Used for neighbor availability in intra pred */
   1147             if(c_idx == 0)
   1148             {
   1149                 WORD32 i;
   1150                 WORD32 trans_in_min_tu;
   1151                 UWORD32 cur_tu_in_bits;
   1152                 UWORD32 cur_tu_avail_flag;
   1153 
   1154                 trans_in_min_tu = trans_size / MIN_TU_SIZE;
   1155                 cur_tu_in_bits = (1 << trans_in_min_tu) - 1;
   1156                 cur_tu_in_bits = cur_tu_in_bits << (32 - trans_in_min_tu);
   1157 
   1158                 cur_tu_avail_flag = cur_tu_in_bits >> (ps_tu->b4_pos_x + 1);
   1159 
   1160                 for(i = 0; i < trans_in_min_tu; i++)
   1161                     au4_intra_nbr_avail[1 + ps_tu->b4_pos_y + i] |=
   1162                                     cur_tu_avail_flag;
   1163             }
   1164         }
   1165     }
   1166     ps_proc->pv_tu_coeff_data = pu1_tu_coeff_data;
   1167 
   1168     return ps_proc->i4_ctb_tu_cnt;
   1169 }
   1170 
   1171