Home | History | Annotate | Download | only in encoder
      1 /******************************************************************************
      2  *
      3  * Copyright (C) 2015 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at:
      8  *
      9  * http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  *****************************************************************************
     18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 */
     20 
     21 /**
     22 *******************************************************************************
     23 * @file
     24 *  ih264e_intra_modes_eval.c
     25 *
     26 * @brief
     27 *  This file contains definitions of routines that perform rate distortion
     28 *  analysis on a macroblock if they are to be coded as intra.
     29 *
     30 * @author
     31 *  ittiam
     32 *
     33 * @par List of Functions:
     34 *  - ih264e_derive_nghbr_avbl_of_mbs()
     35 *  - ih264e_derive_ngbr_avbl_of_mb_partitions()
     36 *  - ih264e_evaluate_intra16x16_modes_for_least_cost_rdoptoff()
     37 *  - ih264e_evaluate_intra8x8_modes_for_least_cost_rdoptoff()
     38 *  - ih264e_evaluate_intra4x4_modes_for_least_cost_rdoptoff()
     39 *  - ih264e_evaluate_intra4x4_modes_for_least_cost_rdopton()
     40 *  - ih264e_evaluate_chroma_intra8x8_modes_for_least_cost_rdoptoff()
     41 *  - ih264e_evaluate_intra16x16_modes()
     42 *  - ih264e_evaluate_intra4x4_modes()
     43 *  - ih264e_evaluate_intra_chroma_modes()
     44 *
     45 * @remarks
     46 *  None
     47 *
     48 *******************************************************************************
     49 */
     50 
     51 /*****************************************************************************/
     52 /* File Includes                                                             */
     53 /*****************************************************************************/
     54 
     55 /* System include files */
     56 #include <stdio.h>
     57 #include <string.h>
     58 #include <limits.h>
     59 #include <assert.h>
     60 
     61 /* User include files */
     62 #include "ih264e_config.h"
     63 #include "ih264_typedefs.h"
     64 #include "ih264e_defs.h"
     65 #include "iv2.h"
     66 #include "ive2.h"
     67 #include "ih264_debug.h"
     68 #include "ih264_defs.h"
     69 #include "ih264_macros.h"
     70 #include "ih264_intra_pred_filters.h"
     71 #include "ih264_structs.h"
     72 #include "ih264_common_tables.h"
     73 #include "ih264_trans_quant_itrans_iquant.h"
     74 #include "ih264_inter_pred_filters.h"
     75 #include "ih264_mem_fns.h"
     76 #include "ih264_padding.h"
     77 #include "ih264_deblk_edge_filters.h"
     78 #include "ih264_cabac_tables.h"
     79 #include "ime_distortion_metrics.h"
     80 #include "ih264e_error.h"
     81 #include "ih264e_bitstream.h"
     82 #include "ime_defs.h"
     83 #include "ime_structs.h"
     84 #include "irc_cntrl_param.h"
     85 #include "irc_frame_info_collector.h"
     86 #include "ih264e_rate_control.h"
     87 #include "ih264e_cabac_structs.h"
     88 #include "ih264e_structs.h"
     89 #include "ih264e_intra_modes_eval.h"
     90 #include "ih264e_globals.h"
     91 #include "ime_platform_macros.h"
     92 
     93 
     94 /*****************************************************************************/
     95 /* Function Definitions                                                      */
     96 /*****************************************************************************/
     97 
     98 /**
     99 ******************************************************************************
    100 *
    101 * @brief
    102 *  derivation process for macroblock availability
    103 *
    104 * @par   Description
    105 *  Calculates the availability of the left, top, topright and topleft macroblocks.
    106 *
    107 * @param[in] ps_proc_ctxt
    108 *  pointer to proc context (handle)
    109 *
    110 * @remarks Based on section 6.4.5 in H264 spec
    111 *
    112 * @return  none
    113 *
    114 ******************************************************************************
    115 */
    116 void ih264e_derive_nghbr_avbl_of_mbs(process_ctxt_t *ps_proc)
    117 {
    118     UWORD8 *pu1_slice_idx_curr = ps_proc->pu1_slice_idx;
    119     UWORD8 *pu1_slice_idx_b;
    120     UWORD8 *pu1_slice_idx_a;
    121     UWORD8 *pu1_slice_idx_c;
    122     UWORD8 *pu1_slice_idx_d;
    123     block_neighbors_t *ps_ngbr_avbl;
    124     WORD32 i4_mb_x, i4_mb_y;
    125     WORD32 i4_wd_mbs;
    126 
    127     i4_mb_x = ps_proc->i4_mb_x;
    128     i4_mb_y = ps_proc->i4_mb_y;
    129 
    130     i4_wd_mbs = ps_proc->i4_wd_mbs;
    131 
    132     pu1_slice_idx_curr += (i4_mb_y * i4_wd_mbs) + i4_mb_x;
    133     pu1_slice_idx_a = pu1_slice_idx_curr - 1;
    134     pu1_slice_idx_b = pu1_slice_idx_curr - i4_wd_mbs;
    135     pu1_slice_idx_c = pu1_slice_idx_b + 1;
    136     pu1_slice_idx_d = pu1_slice_idx_b - 1;
    137     ps_ngbr_avbl = ps_proc->ps_ngbr_avbl;
    138 
    139     /**********************************************************************/
    140     /* The macroblock is marked as available, unless one of the following */
    141     /* conditions is true in which case the macroblock shall be marked as */
    142     /* not available.                                                     */
    143     /* 1. mbAddr < 0                                                      */
    144     /* 2  mbAddr > CurrMbAddr                                             */
    145     /* 3. the macroblock with address mbAddr belongs to a different slice */
    146     /* than the macroblock with address CurrMbAddr                        */
    147     /**********************************************************************/
    148 
    149     /* left macroblock availability */
    150     if (i4_mb_x == 0)
    151     { /* macroblocks along first column */
    152         ps_ngbr_avbl->u1_mb_a = 0;
    153     }
    154     else
    155     { /* macroblocks belong to same slice? */
    156         if (*pu1_slice_idx_a != *pu1_slice_idx_curr)
    157             ps_ngbr_avbl->u1_mb_a = 0;
    158         else
    159             ps_ngbr_avbl->u1_mb_a = 1;
    160     }
    161 
    162     /* top macroblock availability */
    163     if (i4_mb_y == 0)
    164     { /* macroblocks along first row */
    165         ps_ngbr_avbl->u1_mb_b = 0;
    166     }
    167     else
    168     { /* macroblocks belong to same slice? */
    169         if (*pu1_slice_idx_b != *pu1_slice_idx_curr)
    170             ps_ngbr_avbl->u1_mb_b = 0;
    171         else
    172             ps_ngbr_avbl->u1_mb_b = 1;
    173     }
    174 
    175     /* top right macroblock availability */
    176     if (i4_mb_x == i4_wd_mbs-1 || i4_mb_y == 0)
    177     { /* macroblocks along last column */
    178         ps_ngbr_avbl->u1_mb_c = 0;
    179     }
    180     else
    181     { /* macroblocks belong to same slice? */
    182         if (*pu1_slice_idx_c != *pu1_slice_idx_curr)
    183             ps_ngbr_avbl->u1_mb_c = 0;
    184         else
    185             ps_ngbr_avbl->u1_mb_c = 1;
    186     }
    187 
    188     /* top left macroblock availability */
    189     if (i4_mb_x == 0 || i4_mb_y == 0)
    190     { /* macroblocks along first column */
    191         ps_ngbr_avbl->u1_mb_d = 0;
    192     }
    193     else
    194     { /* macroblocks belong to same slice? */
    195         if (*pu1_slice_idx_d != *pu1_slice_idx_curr)
    196             ps_ngbr_avbl->u1_mb_d = 0;
    197         else
    198             ps_ngbr_avbl->u1_mb_d = 1;
    199     }
    200 }
    201 
    202 /**
    203 ******************************************************************************
    204 *
    205 * @brief
    206 *  derivation process for subblock/partition availability
    207 *
    208 * @par   Description
    209 *  Calculates the availability of the left, top, topright and topleft subblock
    210 *  or partitions.
    211 *
    212 * @param[in]    ps_proc_ctxt
    213 *  pointer to macroblock context (handle)
    214 *
    215 * @param[in]    i1_pel_pos_x
    216 *  column position of the pel wrt the current block
    217 *
    218 * @param[in]    i1_pel_pos_y
    219 *  row position of the pel in wrt current block
    220 *
    221 * @remarks     Assumptions: before calling this function it is assumed that
    222 *   the neighbor availability of the current macroblock is already derived.
    223 *   Based on table 6-3 of H264 specification
    224 *
    225 * @return      availability status (yes or no)
    226 *
    227 ******************************************************************************
    228 */
    229 UWORD8 ih264e_derive_ngbr_avbl_of_mb_partitions(block_neighbors_t *ps_ngbr_avbl,
    230                                                 WORD8 i1_pel_pos_x,
    231                                                 WORD8 i1_pel_pos_y)
    232 {
    233     UWORD8 u1_neighbor_avail=0;
    234 
    235     /**********************************************************************/
    236     /* values of i1_pel_pos_x in the range 0-15 inclusive correspond to   */
    237     /* various columns of a macroblock                                    */
    238     /*                                                                    */
    239     /* values of i1_pel_pos_y in the range 0-15 inclusive correspond to   */
    240     /* various rows of a macroblock                                       */
    241     /*                                                                    */
    242     /* other values of i1_pel_pos_x & i1_pel_pos_y represents elements    */
    243     /* outside the bound of an mb ie., represents its neighbors.          */
    244     /**********************************************************************/
    245     if (i1_pel_pos_x < 0)
    246     { /* column(-1) */
    247         if (i1_pel_pos_y < 0)
    248         { /* row(-1) */
    249             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_d; /* current mb topleft availability */
    250         }
    251         else if (i1_pel_pos_y >= 0 && i1_pel_pos_y < 16)
    252         { /* all rows of a macroblock */
    253             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_a; /* current mb left availability */
    254         }
    255         else /* if (i1_pel_pos_y >= 16) */
    256         { /* rows(+16) */
    257             u1_neighbor_avail = 0;  /* current mb bottom left availability */
    258         }
    259     }
    260     else if (i1_pel_pos_x >= 0 && i1_pel_pos_x < 16)
    261     { /* all columns of a macroblock */
    262         if (i1_pel_pos_y < 0)
    263         { /* row(-1) */
    264             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_b; /* current mb top availability */
    265         }
    266         else if (i1_pel_pos_y >= 0 && i1_pel_pos_y < 16)
    267         { /* all rows of a macroblock */
    268             u1_neighbor_avail = 1; /* current mb availability */
    269             /* availability of the partition is dependent on the position of the partition inside the mb */
    270             /* although the availability is declared as 1 in all cases these needs to be corrected somewhere else and this is not done in here */
    271         }
    272         else /* if (i1_pel_pos_y >= 16) */
    273         { /* rows(+16) */
    274             u1_neighbor_avail = 0;  /* current mb bottom availability */
    275         }
    276     }
    277     else if (i1_pel_pos_x >= 16)
    278     { /* column(+16) */
    279         if (i1_pel_pos_y < 0)
    280         { /* row(-1) */
    281             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_c; /* current mb top right availability */
    282         }
    283         else /* if (i1_pel_pos_y >= 0) */
    284         { /* all other rows */
    285             u1_neighbor_avail = 0;  /* current mb right & bottom right availability */
    286         }
    287     }
    288 
    289     return u1_neighbor_avail;
    290 }
    291 
    292 /**
    293 ******************************************************************************
    294 *
    295 * @brief
    296 *  evaluate best intra 16x16 mode (rate distortion opt off)
    297 *
    298 * @par Description
    299 *  This function evaluates all the possible intra 16x16 modes and finds the mode
    300 *  that best represents the macro-block (least distortion) and occupies fewer
    301 *  bits in the bit-stream.
    302 *
    303 * @param[in]   ps_proc_ctxt
    304 *  pointer to process context (handle)
    305 *
    306 * @remarks
    307 *  Ideally the cost of encoding a macroblock is calculated as
    308 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
    309 *  input block and the reconstructed block and rate is the number of bits taken
    310 *  to place the macroblock in the bit-stream. In this routine the rate does not
    311 *  exactly point to the total number of bits it takes, rather it points to header
    312 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
    313 *  and residual bits fall in to texture bits the number of bits taken to encoding
    314 *  mbtype is considered as rate, we compute cost. Further we will approximate
    315 *  the distortion as the deviation b/w input and the predicted block as opposed
    316 *  to input and reconstructed block.
    317 *
    318 *  NOTE: As per the Document JVT-O079, for intra 16x16 macroblock,
    319 *  the SAD and cost are one and the same.
    320 *
    321 * @return     none
    322 *
    323 ******************************************************************************
    324 */
    325 
    326 void ih264e_evaluate_intra16x16_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
    327 {
    328     /* Codec Context */
    329     codec_t *ps_codec = ps_proc->ps_codec;
    330 
    331     /* SAD(distortion metric) of an 8x8 block */
    332     WORD32 i4_mb_distortion = INT_MAX, i4_mb_distortion_least = INT_MAX;
    333 
    334     /* lambda */
    335     UWORD32 u4_lambda = ps_proc->u4_lambda;
    336 
    337     /* cost = distortion + lambda*rate */
    338     WORD32 i4_mb_cost= INT_MAX, i4_mb_cost_least = INT_MAX;
    339 
    340     /* intra mode */
    341     UWORD32 u4_intra_mode, u4_best_intra_16x16_mode = DC_I16x16;
    342 
    343     /* neighbor pels for intra prediction */
    344     UWORD8 *pu1_ngbr_pels_i16 = ps_proc->au1_ngbr_pels;
    345 
    346     /* neighbor availability */
    347     WORD32 i4_ngbr_avbl;
    348 
    349     /* pointer to src macro block */
    350     UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_luma;
    351     UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_luma;
    352 
    353     /* pointer to prediction macro block */
    354     UWORD8 *pu1_pred_mb_intra_16x16 = ps_proc->pu1_pred_mb_intra_16x16;
    355     UWORD8 *pu1_pred_mb_intra_16x16_plane = ps_proc->pu1_pred_mb_intra_16x16_plane;
    356 
    357     /* strides */
    358     WORD32 i4_src_strd = ps_proc->i4_src_strd;
    359     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
    360     WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
    361 
    362     /* pointer to neighbors left, top, topleft */
    363     UWORD8 *pu1_mb_a = pu1_ref_mb - 1;
    364     UWORD8 *pu1_mb_b = pu1_ref_mb - i4_rec_strd;
    365     UWORD8 *pu1_mb_d = pu1_mb_b - 1;
    366     UWORD8 u1_mb_a, u1_mb_b, u1_mb_d;
    367     /* valid intra modes map */
    368     UWORD32 u4_valid_intra_modes;
    369 
    370     /* lut for valid intra modes */
    371     const UWORD8 u1_valid_intra_modes[8] = {4, 6, 4, 6, 5, 7, 5, 15};
    372 
    373     /* temp var */
    374     UWORD32 i, u4_enable_fast_sad = 0, offset = 0;
    375     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
    376     UWORD32 u4_constrained_intra_pred = ps_proc->ps_codec->s_cfg.u4_constrained_intra_pred;
    377 
    378     /* init temp var */
    379     if (ps_proc->i4_slice_type != ISLICE)
    380     {
    381         /* Offset for MBtype */
    382         offset = (ps_proc->i4_slice_type == PSLICE) ? 5 : 23;
    383         u4_enable_fast_sad = ps_proc->s_me_ctxt.u4_enable_fast_sad;
    384     }
    385 
    386     /* locating neighbors that are available for prediction */
    387 
    388     /* gather prediction pels from the neighbors, if particular set is not available
    389      * it is set to zero*/
    390     /* left pels */
    391     u1_mb_a = ((ps_proc->ps_ngbr_avbl->u1_mb_a)
    392                     && (u4_constrained_intra_pred ? ps_proc->s_left_mb_syntax_ele.u2_is_intra : 1));
    393     if (u1_mb_a)
    394     {
    395         for(i = 0; i < 16; i++)
    396             pu1_ngbr_pels_i16[16-1-i] = pu1_mb_a[i * i4_rec_strd];
    397     }
    398     else
    399     {
    400         ps_codec->pf_mem_set_mul8(pu1_ngbr_pels_i16,0,MB_SIZE);
    401     }
    402     /* top pels */
    403     u1_mb_b = ((ps_proc->ps_ngbr_avbl->u1_mb_b)
    404                     && (u4_constrained_intra_pred ? ps_top_mb_syn_ele->u2_is_intra : 1));
    405     if (u1_mb_b)
    406     {
    407         ps_codec->pf_mem_cpy_mul8(pu1_ngbr_pels_i16+16+1,pu1_mb_b,16);
    408     }
    409     else
    410     {
    411         ps_codec->pf_mem_set_mul8(pu1_ngbr_pels_i16+16+1,0,MB_SIZE);
    412     }
    413     /* topleft pels */
    414     u1_mb_d = ((ps_proc->ps_ngbr_avbl->u1_mb_d)
    415                     && (u4_constrained_intra_pred ? ps_proc->s_top_left_mb_syntax_ele.u2_is_intra : 1));
    416     if (u1_mb_d)
    417     {
    418         pu1_ngbr_pels_i16[16] = *pu1_mb_d;
    419     }
    420     else
    421     {
    422         pu1_ngbr_pels_i16[16] = 0;
    423     }
    424 
    425     i4_ngbr_avbl = (u1_mb_a) + (u1_mb_b << 2) + (u1_mb_d << 1);
    426     ps_proc->i4_ngbr_avbl_16x16_mb = i4_ngbr_avbl;
    427 
    428     /* set valid intra modes for evaluation */
    429     u4_valid_intra_modes = u1_valid_intra_modes[i4_ngbr_avbl];
    430 
    431     if (ps_codec->s_cfg.u4_enc_speed_preset == IVE_FAST ||
    432                     ps_codec->s_cfg.u4_enc_speed_preset == IVE_FASTEST)
    433         u4_valid_intra_modes &= ~(1 << PLANE_I16x16);
    434 
    435     /* evaluate b/w HORZ_I16x16, VERT_I16x16 & DC_I16x16 */
    436     ps_codec->pf_ih264e_evaluate_intra16x16_modes(pu1_curr_mb, pu1_ngbr_pels_i16, pu1_pred_mb_intra_16x16,
    437                                                   i4_src_strd, i4_pred_strd,
    438                                                   i4_ngbr_avbl, &u4_intra_mode, &i4_mb_distortion_least,
    439                                                   u4_valid_intra_modes);
    440 
    441     /* cost = distortion + lambda*rate */
    442     i4_mb_cost_least = i4_mb_distortion_least;
    443 
    444     if (((u4_valid_intra_modes >> 3) & 1) != 0)
    445     {
    446         /* intra prediction for PLANE mode*/
    447         (ps_codec->apf_intra_pred_16_l)[PLANE_I16x16](pu1_ngbr_pels_i16, pu1_pred_mb_intra_16x16_plane, 0, i4_pred_strd, i4_ngbr_avbl);
    448 
    449         /* evaluate distortion between the actual blk and the estimated blk for the given mode */
    450         ps_codec->apf_compute_sad_16x16[u4_enable_fast_sad](pu1_curr_mb, pu1_pred_mb_intra_16x16_plane, i4_src_strd, i4_pred_strd, i4_mb_cost_least, &i4_mb_distortion);
    451 
    452         /* cost = distortion + lambda*rate */
    453         i4_mb_cost = i4_mb_distortion;
    454 
    455         /* update the least cost information if necessary */
    456         if(i4_mb_cost < i4_mb_distortion_least)
    457         {
    458             u4_intra_mode = PLANE_I16x16;
    459 
    460             i4_mb_cost_least = i4_mb_cost;
    461             i4_mb_distortion_least = i4_mb_distortion;
    462         }
    463     }
    464 
    465     u4_best_intra_16x16_mode = u4_intra_mode;
    466 
    467     DEBUG("%d partition cost, %d intra mode\n", i4_mb_cost_least * 32, u4_best_intra_16x16_mode);
    468 
    469     ps_proc->u1_l_i16_mode = u4_best_intra_16x16_mode;
    470 
    471     /* cost = distortion + lambda*rate */
    472     i4_mb_cost_least    = i4_mb_distortion_least + u4_lambda*u1_uev_codelength[offset + u4_best_intra_16x16_mode];
    473 
    474 
    475     /* update the type of the mb if necessary */
    476     if (i4_mb_cost_least < ps_proc->i4_mb_cost)
    477     {
    478         ps_proc->i4_mb_cost = i4_mb_cost_least;
    479         ps_proc->i4_mb_distortion = i4_mb_distortion_least;
    480         ps_proc->u4_mb_type = I16x16;
    481     }
    482 
    483     return ;
    484 }
    485 
    486 
    487 /**
    488 ******************************************************************************
    489 *
    490 * @brief
    491 *  evaluate best intra 8x8 mode (rate distortion opt off)
    492 *
    493 * @par Description
    494 *  This function evaluates all the possible intra 8x8 modes and finds the mode
    495 *  that best represents the macro-block (least distortion) and occupies fewer
    496 *  bits in the bit-stream.
    497 *
    498 * @param[in]    ps_proc_ctxt
    499 *  pointer to proc ctxt
    500 *
    501 * @remarks Ideally the cost of encoding a macroblock is calculated as
    502 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
    503 *  input block and the reconstructed block and rate is the number of bits taken
    504 *  to place the macroblock in the bit-stream. In this routine the rate does not
    505 *  exactly point to the total number of bits it takes, rather it points to header
    506 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
    507 *  and residual bits fall in to texture bits the number of bits taken to encoding
    508 *  mbtype is considered as rate, we compute cost. Further we will approximate
    509 *  the distortion as the deviation b/w input and the predicted block as opposed
    510 *  to input and reconstructed block.
    511 *
    512 *  NOTE: TODO: This function needs to be tested
    513 *
    514 *  @return      none
    515 *
    516 ******************************************************************************
    517 */
    518 void ih264e_evaluate_intra8x8_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
    519 {
    520     /* Codec Context */
    521     codec_t *ps_codec = ps_proc->ps_codec;
    522 
    523     /* SAD(distortion metric) of an 4x4 block */
    524     WORD32 i4_partition_distortion, i4_partition_distortion_least = INT_MAX, i4_total_distortion = 0;
    525 
    526     /* lambda */
    527     UWORD32 u4_lambda = ps_proc->u4_lambda;
    528 
    529     /* cost = distortion + lambda*rate */
    530     WORD32 i4_partition_cost, i4_partition_cost_least, i4_total_cost = u4_lambda;
    531 
    532     /* cost due to mbtype */
    533     UWORD32 u4_cost_one_bit = u4_lambda, u4_cost_four_bits = 4 * u4_lambda;
    534 
    535     /* intra mode */
    536     UWORD32 u4_intra_mode, u4_best_intra_8x8_mode = DC_I8x8, u4_estimated_intra_8x8_mode;
    537 
    538     /* neighbor pels for intra prediction */
    539     UWORD8 *pu1_ngbr_pels_i8 = ps_proc->au1_ngbr_pels;
    540 
    541     /* pointer to curr partition */
    542     UWORD8 *pu1_mb_curr;
    543 
    544     /* pointer to prediction macro block */
    545     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
    546 
    547     /* strides */
    548     WORD32 i4_src_strd = ps_proc->i4_src_strd;
    549     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
    550 
    551     /* neighbors left, top, top right, top left */
    552     UWORD8 *pu1_mb_a;
    553     UWORD8 *pu1_mb_b;
    554     UWORD8 *pu1_mb_d;
    555 
    556     /* neighbor availability */
    557     WORD32 i4_ngbr_avbl;
    558     block_neighbors_t s_ngbr_avbl;
    559 
    560     /* temp vars */
    561     UWORD32  b8, u4_pix_x, u4_pix_y;
    562     UWORD32 u4_constrained_intra_pred = ps_proc->ps_codec->s_cfg.u4_constrained_intra_pred;
    563     block_neighbors_t s_ngbr_avbl_MB;
    564 
    565     /* ngbr mb syntax information */
    566     UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (ps_proc->i4_mb_x << 4);
    567     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
    568     mb_info_t *ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
    569     /* valid intra modes map */
    570     UWORD32 u4_valid_intra_modes;
    571 
    572     if (ps_proc->ps_ngbr_avbl->u1_mb_c)
    573     {
    574         ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + (ps_proc->i4_mb_x + 1);
    575     }
    576     /* left pels */
    577     s_ngbr_avbl_MB.u1_mb_a = ((ps_proc->ps_ngbr_avbl->u1_mb_a)
    578                                   && (u4_constrained_intra_pred ? ps_proc->s_left_mb_syntax_ele.u2_is_intra : 1));
    579 
    580     /* top pels */
    581     s_ngbr_avbl_MB.u1_mb_b = ((ps_proc->ps_ngbr_avbl->u1_mb_b)
    582                                   && (u4_constrained_intra_pred ? ps_top_mb_syn_ele->u2_is_intra : 1));
    583 
    584     /* topleft pels */
    585     s_ngbr_avbl_MB.u1_mb_d = ((ps_proc->ps_ngbr_avbl->u1_mb_d)
    586                                   && (u4_constrained_intra_pred ? ps_proc->s_top_left_mb_syntax_ele.u2_is_intra : 1));
    587 
    588     /* top right */
    589     s_ngbr_avbl_MB.u1_mb_c = ((ps_proc->ps_ngbr_avbl->u1_mb_c)
    590                                   && (u4_constrained_intra_pred ? ps_top_right_mb_syn_ele->u2_is_intra : 1));
    591 
    592 
    593     for(b8 = 0; b8 < 4; b8++)
    594     {
    595         u4_pix_x = (b8 & 0x01) << 3;
    596         u4_pix_y = (b8 >> 1) << 3;
    597 
    598         pu1_mb_curr = ps_proc->pu1_src_buf_luma + u4_pix_x + (u4_pix_y * i4_src_strd);
    599         /* when rdopt is off, we use the input as reference for constructing prediction buffer */
    600         /* as opposed to using the recon pels. (open loop intra prediction) */
    601         pu1_mb_a = pu1_mb_curr - 1; /* pointer to left macro block */
    602         pu1_mb_b = pu1_mb_curr - i4_src_strd; /* pointer to top macro block */
    603         pu1_mb_d = pu1_mb_b - 1; /* pointer to top left macro block */
    604 
    605         /* locating neighbors that are available for prediction */
    606         /* TODO : update the neighbor availability information basing on constrained intra pred information */
    607         /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines */
    608         /* basing on neighbors available and hence evade the computation of neighbor availability totally. */
    609         s_ngbr_avbl.u1_mb_a = ih264e_derive_ngbr_avbl_of_mb_partitions(&s_ngbr_avbl_MB, u4_pix_x - 1, u4_pix_y); /* xD = -1, yD = 0 */
    610         s_ngbr_avbl.u1_mb_b = ih264e_derive_ngbr_avbl_of_mb_partitions(&s_ngbr_avbl_MB, u4_pix_x, u4_pix_y - 1); /* xD = 0, yD = -1 */
    611         s_ngbr_avbl.u1_mb_c = ih264e_derive_ngbr_avbl_of_mb_partitions(&s_ngbr_avbl_MB, u4_pix_x + 8, u4_pix_y - 1); /* xD = BLK_8x8_SIZE, yD = -1 */
    612         s_ngbr_avbl.u1_mb_d = ih264e_derive_ngbr_avbl_of_mb_partitions(&s_ngbr_avbl_MB, u4_pix_x - 1, u4_pix_y - 1); /* xD = -1, yD = -1 */
    613 
    614         /* i4_ngbr_avbl = blk_a * LEFT_MB_AVAILABLE_MASK + blk_b * TOP_MB_AVAILABLE_MASK + blk_c * TOP_RIGHT_MB_AVAILABLE_MASK + blk_d * TOP_LEFT_MB_AVAILABLE_MASK */
    615         i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) +  (s_ngbr_avbl.u1_mb_c << 3) +
    616                         (s_ngbr_avbl.u1_mb_a << 4);
    617         /* if top partition is available and top right is not available for intra prediction, then */
    618         /* padd top right samples using top sample and make top right also available */
    619         /* i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) +  ((s_ngbr_avbl.u1_mb_b | s_ngbr_avbl.u1_mb_c) << 3); */
    620         ps_proc->ai4_neighbor_avail_8x8_subblks[b8] = i4_ngbr_avbl;
    621 
    622 
    623         ih264_intra_pred_luma_8x8_mode_ref_filtering(pu1_mb_a, pu1_mb_b, pu1_mb_d, pu1_ngbr_pels_i8,
    624                                                      i4_src_strd, i4_ngbr_avbl);
    625 
    626         i4_partition_cost_least = INT_MAX;
    627         /* set valid intra modes for evaluation */
    628         u4_valid_intra_modes = 0x1ff;
    629 
    630         if (!s_ngbr_avbl.u1_mb_b)
    631         {
    632             u4_valid_intra_modes &= ~(1 << VERT_I4x4);
    633             u4_valid_intra_modes &= ~(1 << DIAG_DL_I4x4);
    634             u4_valid_intra_modes &= ~(1 << VERT_L_I4x4);
    635         }
    636         if (!s_ngbr_avbl.u1_mb_a)
    637         {
    638             u4_valid_intra_modes &= ~(1 << HORZ_I4x4);
    639             u4_valid_intra_modes &= ~(1 << HORZ_U_I4x4);
    640         }
    641         if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b || !s_ngbr_avbl.u1_mb_d)
    642         {
    643             u4_valid_intra_modes &= ~(1 << DIAG_DR_I4x4);
    644             u4_valid_intra_modes &= ~(1 << VERT_R_I4x4);
    645             u4_valid_intra_modes &= ~(1 << HORZ_D_I4x4);
    646         }
    647 
    648         /* estimate the intra 8x8 mode for the current partition (for evaluating cost) */
    649         if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b)
    650         {
    651             u4_estimated_intra_8x8_mode = DC_I8x8;
    652         }
    653         else
    654         {
    655             UWORD32 u4_left_intra_8x8_mode = DC_I8x8;
    656             UWORD32 u4_top_intra_8x8_mode = DC_I8x8;
    657 
    658             if (u4_pix_x == 0)
    659             {
    660                 if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I8x8)
    661                 {
    662                     u4_left_intra_8x8_mode = ps_proc->au1_left_mb_intra_modes[b8+1];
    663                 }
    664                 else if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I4x4)
    665                 {
    666                     u4_left_intra_8x8_mode = ps_proc->au1_left_mb_intra_modes[(b8+1)*4+2];
    667                 }
    668             }
    669             else
    670             {
    671                 u4_left_intra_8x8_mode = ps_proc->au1_intra_luma_mb_8x8_modes[b8-1];
    672             }
    673 
    674             if (u4_pix_y == 0)
    675             {
    676                 if (ps_top_mb_syn_ele->u2_mb_type == I8x8)
    677                 {
    678                     u4_top_intra_8x8_mode = pu1_top_mb_intra_modes[b8+2];
    679                 }
    680                 else if (ps_top_mb_syn_ele->u2_mb_type == I4x4)
    681                 {
    682                     u4_top_intra_8x8_mode = pu1_top_mb_intra_modes[(b8+2)*4+2];
    683                 }
    684             }
    685             else
    686             {
    687                 u4_top_intra_8x8_mode = ps_proc->au1_intra_luma_mb_8x8_modes[b8-2];
    688             }
    689 
    690             u4_estimated_intra_8x8_mode = MIN(u4_left_intra_8x8_mode, u4_top_intra_8x8_mode);
    691         }
    692 
    693         /* perform intra mode 8x8 evaluation */
    694         for (u4_intra_mode = VERT_I8x8; u4_valid_intra_modes != 0; u4_intra_mode++, u4_valid_intra_modes >>= 1)
    695         {
    696             if ( (u4_valid_intra_modes & 1) == 0)
    697                 continue;
    698 
    699             /* intra prediction */
    700             (ps_codec->apf_intra_pred_8_l)[u4_intra_mode](pu1_ngbr_pels_i8, pu1_pred_mb, 0, i4_pred_strd, i4_ngbr_avbl);
    701 
    702             /* evaluate distortion between the actual blk and the estimated blk for the given mode */
    703             ime_compute_sad_8x8(pu1_mb_curr, pu1_pred_mb, i4_src_strd, i4_pred_strd, i4_partition_cost_least, &i4_partition_distortion);
    704 
    705             i4_partition_cost = i4_partition_distortion + ((u4_estimated_intra_8x8_mode == u4_intra_mode)?u4_cost_one_bit:u4_cost_four_bits);
    706 
    707             /* update the least cost information if necessary */
    708             if (i4_partition_cost < i4_partition_cost_least)
    709             {
    710                 i4_partition_cost_least = i4_partition_cost;
    711                 i4_partition_distortion_least = i4_partition_distortion;
    712                 u4_best_intra_8x8_mode = u4_intra_mode;
    713             }
    714         }
    715         /* macroblock distortion */
    716         i4_total_cost += i4_partition_cost_least;
    717         i4_total_distortion += i4_partition_distortion_least;
    718         /* mb partition mode */
    719         ps_proc->au1_intra_luma_mb_8x8_modes[b8] = u4_best_intra_8x8_mode;
    720 
    721     }
    722 
    723     /* update the type of the mb if necessary */
    724     if (i4_total_cost < ps_proc->i4_mb_cost)
    725     {
    726         ps_proc->i4_mb_cost = i4_total_cost;
    727         ps_proc->i4_mb_distortion = i4_total_distortion;
    728         ps_proc->u4_mb_type = I8x8;
    729     }
    730 
    731     return ;
    732 }
    733 
    734 
     735 /**
     736 ******************************************************************************
     737 *
     738 * @brief
     739 *  evaluate best intra 4x4 mode of every 4x4 luma partition (rate distortion opt off)
     740 *
     741 * @par Description
     742 *  For each of the 16 4x4 luma partitions of the macroblock this function gathers
     743 *  neighbor pels from the source buffer (open loop), estimates the mode from the left
     744 *  and top partitions and calls the codec's 4x4 mode evaluation routine. Partition costs are summed.
     745 *
     746 * @param[in,out]    ps_proc
     747 *  pointer to proc ctxt; the best and estimated 4x4 modes are written to it, and its mb cost, distortion and type are updated when the I4x4 total is below the current mb cost
     748 *
     749 * @remarks
     750 *  Ideally the cost of encoding a macroblock is calculated as
     751 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
     752 *  input block and the reconstructed block and rate is the number of bits taken
     753 *  to place the macroblock in the bit-stream. In this routine the rate does not
     754 *  exactly point to the total number of bits it takes, rather it points to header
     755 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
     756 *  and residual bits fall in to texture bits the number of bits taken to encoding
     757 *  mbtype is considered as rate, we compute cost. Further we will approximate
     758 *  the distortion as the deviation b/w input and the predicted block as opposed
     759 *  to input and reconstructed block.
     760 *
     761 *  NOTE: As per the Document JVT-O079, for the whole intra 4x4 macroblock,
     762 *  24*lambda is added to the SAD before comparison with the best SAD for
     763 *  inter prediction. This is an empirical value to prevent using too many intra
     764 *  blocks. The code below initialises the total cost to (24 + 1) * lambda.
     765 *
     766 * @return      none
     767 *
     768 ******************************************************************************
     769 */
     770 void ih264e_evaluate_intra4x4_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
     771 {
     772     /* Codec Context */
     773     codec_t *ps_codec = ps_proc->ps_codec;
     774 
     775     /* distortion of a 4x4 block (its cost minus the mode signalling cost) and the sum over the mb */
     776     WORD32 i4_partition_distortion_least = INT_MAX, i4_total_distortion = 0;
     777 
     778     /* lambda */
     779     UWORD32 u4_lambda = ps_proc->u4_lambda;
     780 
     781     /* cost = distortion + lambda*rate; the mb total starts at (24 + 1) * lambda */
     782     WORD32 i4_partition_cost_least, i4_total_cost = (24 + 1) * u4_lambda;
     783 
     784     /* cost of signalling a 4x4 mode: one bit if it equals the estimated mode, four bits otherwise */
     785     UWORD32 u4_cost_one_bit = u4_lambda, u4_cost_four_bits = 4 * u4_lambda;
     786 
     787     /* intra mode */
     788     UWORD32 u4_best_intra_4x4_mode = DC_I4x4, u4_estimated_intra_4x4_mode;
     789 
     790     /* neighbor pels for intra prediction: [0..3] left (bottom to top), [4] top left, [5..8] top, [9..12] top right */
     791     UWORD8 *pu1_ngbr_pels_i4 = ps_proc->au1_ngbr_pels;
     792 
     793     /* pointer to curr partition */
     794     UWORD8 *pu1_mb_curr;
     795 
     796     /* pointer to prediction macro block */
     797     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
     798 
     799     /* strides */
     800     WORD32 i4_src_strd = ps_proc->i4_src_strd;
     801     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
     802 
     803     /* neighbors left, top, top right, top left */
     804     UWORD8 *pu1_mb_a;
     805     UWORD8 *pu1_mb_b;
     806     UWORD8 *pu1_mb_c;
     807     UWORD8 *pu1_mb_d;
     808 
     809     /* neighbor availability */
     810     WORD32 i4_ngbr_avbl;
     811     block_neighbors_t s_ngbr_avbl;
     812 
     813     /* temp vars */
     814     UWORD32 i, b8, b4, u4_blk_x, u4_blk_y, u4_pix_x, u4_pix_y;
     815 
     816     /* maps the raster index (column + 4 * row) of a 4x4 partition to its (b8 << 2) + b4 index */
     817     const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
     818 
     819     /* ngbr sub mb modes (the top row mode buffer is indexed with 16 entries per mb) */
     820     UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (ps_proc->i4_mb_x << 4);
     821     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
     822     mb_info_t *ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
     823 
     824     /* valid intra modes map, indexed by the left, top left and top availability bits (i4_ngbr_avbl & 0x7) */
     825     UWORD32 u4_valid_intra_modes;
     826     UWORD16 u2_valid_modes[8] = {4, 262, 4, 262, 141, 399, 141, 511};
     827 
     828     UWORD32 u4_constrained_intra_pred = ps_proc->ps_codec->s_cfg.u4_constrained_intra_pred;
     829     UWORD8 u1_mb_a, u1_mb_b, u1_mb_c, u1_mb_d;
     830     if (ps_proc->ps_ngbr_avbl->u1_mb_c) /* point at the top right mb only when it is available */
     831     {
     832         ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x + 1;
     833     }
     834     /* left pels: with constrained intra pred a neighbor mb is usable only if it is intra coded */
     835     u1_mb_a = ((ps_proc->ps_ngbr_avbl->u1_mb_a)
     836                     && (u4_constrained_intra_pred ? ps_proc->s_left_mb_syntax_ele.u2_is_intra : 1));
     837 
     838     /* top pels */
     839     u1_mb_b = ((ps_proc->ps_ngbr_avbl->u1_mb_b)
     840                     && (u4_constrained_intra_pred ? ps_top_mb_syn_ele->u2_is_intra : 1));
     841 
     842     /* topleft pels */
     843     u1_mb_d = ((ps_proc->ps_ngbr_avbl->u1_mb_d)
     844                     && (u4_constrained_intra_pred ? ps_proc->s_top_left_mb_syntax_ele.u2_is_intra : 1));
     845 
     846     /* top right pels */
     847     u1_mb_c = ((ps_proc->ps_ngbr_avbl->u1_mb_c)
     848                     && (u4_constrained_intra_pred ? ps_top_right_mb_syn_ele->u2_is_intra : 1));
     849     /* mb level availability: bit 0 left, bit 1 top left, bit 2 top, bit 3 top right */
     850     i4_ngbr_avbl = (u1_mb_a) + (u1_mb_d << 1) + (u1_mb_b << 2) + (u1_mb_c << 3);
     851     memcpy(ps_proc->au1_ngbr_avbl_4x4_subblks, gau1_ih264_4x4_ngbr_avbl[i4_ngbr_avbl], 16); /* one availability byte per 4x4 partition */
     852 
     853     for (b8 = 0; b8 < 4; b8++)
     854     {
     855         u4_blk_x = (b8 & 0x01) << 3; /* pixel offset of the 8x8 block inside the mb */
     856         u4_blk_y = (b8 >> 1) << 3;
     857         for (b4 = 0; b4 < 4; b4++)
     858         {
     859             u4_pix_x = u4_blk_x + ((b4 & 0x01) << 2); /* pixel offset of the 4x4 partition inside the mb */
     860             u4_pix_y = u4_blk_y + ((b4 >> 1) << 2);
     861 
     862             pu1_mb_curr = ps_proc->pu1_src_buf_luma + u4_pix_x + (u4_pix_y * i4_src_strd);
     863             /* when rdopt is off, we use the input as reference for constructing prediction buffer */
     864             /* as opposed to using the recon pels. (open loop intra prediction) */
     865             pu1_mb_a = pu1_mb_curr - 1; /* pointer to left neighbor pels */
     866             pu1_mb_b = pu1_mb_curr - i4_src_strd; /* pointer to top neighbor pels */
     867             pu1_mb_c = pu1_mb_b + 4; /* pointer to top right neighbor pels */
     868             pu1_mb_d = pu1_mb_b - 1; /* pointer to top left neighbor pel */
     869 
     870             /* locating neighbors that are available for prediction */
     871             /* TODO : update the neighbor availability information basing on constrained intra pred information */
     872             /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines */
     873             /* basing on neighbors available and hence evade the computation of neighbor availability totally. */
     874             /* per partition availability, same bit layout as the mb level value */
     875             i4_ngbr_avbl = ps_proc->au1_ngbr_avbl_4x4_subblks[(b8 << 2) + b4];
     876             s_ngbr_avbl.u1_mb_a = (i4_ngbr_avbl & 0x1);
     877             s_ngbr_avbl.u1_mb_d = (i4_ngbr_avbl & 0x2) >> 1;
     878             s_ngbr_avbl.u1_mb_b = (i4_ngbr_avbl & 0x4) >> 2;
     879             s_ngbr_avbl.u1_mb_c = (i4_ngbr_avbl & 0x8) >> 3;
     880             /* set valid intra modes for evaluation */
     881             u4_valid_intra_modes = u2_valid_modes[i4_ngbr_avbl & 0x7];
     882 
     883             /* if top partition is available and top right is not available for intra prediction, then */
     884             /* pad top right samples using top sample and make top right also available */
     885             /* i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) + ((s_ngbr_avbl.u1_mb_b | s_ngbr_avbl.u1_mb_c) << 3); */
     886 
     887             /* gather prediction pels from the neighbors */
     888             if (s_ngbr_avbl.u1_mb_a)
     889             {
     890                 for(i = 0; i < 4; i++) /* left column is stored reversed: top most pel lands at index 3 */
     891                     pu1_ngbr_pels_i4[4 - 1 -i] = pu1_mb_a[i * i4_src_strd];
     892             }
     893             else
     894             {
     895                 memset(pu1_ngbr_pels_i4, 0, 4);
     896             }
     897 
     898             if (s_ngbr_avbl.u1_mb_b)
     899             {
     900                 memcpy(pu1_ngbr_pels_i4 + 4 + 1, pu1_mb_b, 4);
     901             }
     902             else
     903             {
     904                 memset(pu1_ngbr_pels_i4 + 5, 0, 4);
     905             }
     906 
     907             if (s_ngbr_avbl.u1_mb_d)
     908                 pu1_ngbr_pels_i4[4] = *pu1_mb_d;
     909             else
     910                 pu1_ngbr_pels_i4[4] = 0;
     911 
     912             if (s_ngbr_avbl.u1_mb_c)
     913             {
     914                 memcpy(pu1_ngbr_pels_i4 + 8 + 1, pu1_mb_c, 4);
     915             }
     916             else if (s_ngbr_avbl.u1_mb_b)
     917             {
     918                 memset(pu1_ngbr_pels_i4 + 8 + 1, pu1_ngbr_pels_i4[8], 4); /* replicate the last top pel into the top right slots */
     919                 s_ngbr_avbl.u1_mb_c = s_ngbr_avbl.u1_mb_b; /* only the local flag is set; i4_ngbr_avbl is left unchanged */
     920             }
     921 
     922             i4_partition_cost_least = INT_MAX;
     923 
     924             /* predict the intra 4x4 mode for the current partition (for evaluating cost) */
     925             if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b)
     926             {
     927                 u4_estimated_intra_4x4_mode = DC_I4x4;
     928             }
     929             else
     930             {
     931                 UWORD32 u4_left_intra_4x4_mode = DC_I4x4; /* DC is kept when the neighbor mb is neither I4x4 nor I8x8 */
     932                 UWORD32 u4_top_intra_4x4_mode = DC_I4x4;
     933 
     934                 if (u4_pix_x == 0) /* left partition lies in the left mb */
     935                 {
     936                     if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I4x4)
     937                     {
     938                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[u1_scan_order[3 + u4_pix_y]]; /* raster column 3 of the same row (u4_pix_y equals 4 * row) */
     939                     }
     940                     else if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I8x8)
     941                     {
     942                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[b8 + 1]; /* b8 is 0 or 2 here; presumably selects the adjacent 8x8 block of the left mb - TODO confirm storage layout */
     943                     }
     944                 }
     945                 else
     946                 {
     947                     u4_left_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 1]]; /* raster index minus 1: partition to the left in this mb */
     948                 }
     949 
     950                 if (u4_pix_y == 0) /* top partition lies in the top mb */
     951                 {
     952                     if (ps_top_mb_syn_ele->u2_mb_type == I4x4)
     953                     {
     954                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[u1_scan_order[12 + (u4_pix_x >> 2)]]; /* raster row 3 of the same column */
     955                     }
     956                     else if (ps_top_mb_syn_ele->u2_mb_type == I8x8)
     957                     {
     958                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[b8 + 2]; /* b8 is 0 or 1 here; presumably selects the adjacent 8x8 block of the top mb - TODO confirm storage layout */
     959                     }
     960                 }
     961                 else
     962                 {
     963                     u4_top_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 4]]; /* raster index minus 4: partition above in this mb */
     964                 }
     965                 /* the estimate is the smaller of the two neighbor modes */
     966                 u4_estimated_intra_4x4_mode = MIN(u4_left_intra_4x4_mode, u4_top_intra_4x4_mode);
     967             }
     968 
     969             ps_proc->au1_predicted_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_estimated_intra_4x4_mode;
     970 
     971             /* mode evaluation and prediction; best mode and least cost are passed by address and read back below */
     972             ps_codec->pf_ih264e_evaluate_intra_4x4_modes(pu1_mb_curr,
     973                                                          pu1_ngbr_pels_i4,
     974                                                          pu1_pred_mb, i4_src_strd,
     975                                                          i4_pred_strd, i4_ngbr_avbl,
     976                                                          &u4_best_intra_4x4_mode,
     977                                                          &i4_partition_cost_least,
     978                                                          u4_valid_intra_modes,
     979                                                          u4_lambda,
     980                                                          u4_estimated_intra_4x4_mode);
     981 
     982             /* distortion = least cost minus the mode signalling cost */
     983             i4_partition_distortion_least = i4_partition_cost_least - ((u4_estimated_intra_4x4_mode == u4_best_intra_4x4_mode) ? u4_cost_one_bit : u4_cost_four_bits);
     984 
     985             DEBUG("%d partition cost, %d intra mode\n", i4_partition_cost_least, u4_best_intra_4x4_mode);
     986             /* macroblock distortion */
     987             i4_total_distortion += i4_partition_distortion_least;
     988             i4_total_cost += i4_partition_cost_least;
     989             /* mb partition mode */
     990             ps_proc->au1_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_best_intra_4x4_mode;
     991         }
     992     }
     993 
     994     /* update the type of the mb if necessary (only when I4x4 is cheaper than the current best) */
     995     if (i4_total_cost < ps_proc->i4_mb_cost)
     996     {
     997         ps_proc->i4_mb_cost = i4_total_cost;
     998         ps_proc->i4_mb_distortion = i4_total_distortion;
     999         ps_proc->u4_mb_type = I4x4;
    1000     }
    1001 
    1002     return ;
    1003 }
   1004 
    1005 /**
    1006 ******************************************************************************
    1007 *
    1008 * @brief evaluate best intra 4x4 mode of every 4x4 luma partition (rate distortion opt on)
    1009 *
    1010 * @par Description
    1011 *  For each of the 16 4x4 luma partitions this function gathers neighbor pels from
    1012 *  the recon buffer (mb border) or from pu1_ref_mb_intra_4x4 (inside the mb), evaluates
    1013 *  the 4x4 modes and then runs the forward and inverse transform/quant routines on the partition.
    1014 *
    1015 * @param[in,out]    ps_proc
    1016 *  pointer to proc ctxt; the best and estimated 4x4 modes are written to it, and its mb cost, distortion and type are updated when the I4x4 total is below the current mb cost
    1017 *
    1018 * @remarks
    1019 *  Ideally the cost of encoding a macroblock is calculated as
    1020 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
    1021 *  input block and the reconstructed block and rate is the number of bits taken
    1022 *  to place the macroblock in the bit-stream. In this routine the rate does not
    1023 *  exactly point to the total number of bits it takes, rather it points to header
    1024 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
    1025 *  and residual bits fall in to texture bits the number of bits taken to encoding
    1026 *  mbtype is considered as rate, we compute cost. Further we will approximate
    1027 *  the distortion as the deviation b/w input and the predicted block as opposed
    1028 *  to input and reconstructed block.
    1029 *
    1030 *  NOTE: As per the Document JVT-O079, for the whole intra 4x4 macroblock,
    1031 *  24*lambda is added to the SAD before comparison with the best SAD for
    1032 *  inter prediction. This is an empirical value to prevent using too many intra
    1033 *  blocks. The code below initialises the total cost to (24 + 1) * lambda.
    1034 *
    1035 * @return      none
    1036 *
    1037 ******************************************************************************
    1038 */
    1039 void ih264e_evaluate_intra4x4_modes_for_least_cost_rdopton(process_ctxt_t *ps_proc)
    1040 {
    1041     /* Codec Context */
    1042     codec_t *ps_codec = ps_proc->ps_codec;
    1043 
    1044     /* distortion of a 4x4 block (its cost minus the mode signalling cost) and the sum over the mb */
    1045     WORD32 i4_partition_distortion_least = INT_MAX, i4_total_distortion = 0;
    1046 
    1047     /* lambda */
    1048     UWORD32 u4_lambda = ps_proc->u4_lambda;
    1049 
    1050     /* cost = distortion + lambda*rate; the mb total starts at (24 + 1) * lambda */
    1051     WORD32 i4_partition_cost_least, i4_total_cost = (24 + 1) * u4_lambda;
    1052 
    1053     /* cost of signalling a 4x4 mode: one bit if it equals the estimated mode, four bits otherwise */
    1054     UWORD32 u4_cost_one_bit = u4_lambda, u4_cost_four_bits = 4 * u4_lambda;
    1055 
    1056     /* intra mode */
    1057     UWORD32 u4_best_intra_4x4_mode = DC_I4x4, u4_estimated_intra_4x4_mode;
    1058 
    1059     /* neighbor pels for intra prediction: [0..3] left (bottom to top), [4] top left, [5..8] top, [9..12] top right */
    1060     UWORD8 *pu1_ngbr_pels_i4 = ps_proc->au1_ngbr_pels;
    1061 
    1062     /* pointer to curr partition, to the buffers the left/top neighbors are read from, and to this partition's slot in pu1_ref_mb_intra_4x4 */
    1063     UWORD8 *pu1_mb_curr;
    1064     UWORD8 *pu1_mb_ref_left, *pu1_mb_ref_top;
    1065     UWORD8 *pu1_ref_mb_intra_4x4;
    1066 
    1067     /* pointer to residual macro block (advanced by MB_SIZE per 4x4 partition) */
    1068     WORD16 *pi2_res_mb = ps_proc->pi2_res_buf_intra_4x4;
    1069 
    1070     /* pointer to prediction macro block */
    1071     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
    1072 
    1073     /* strides */
    1074     WORD32 i4_src_strd = ps_proc->i4_src_strd;
    1075     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
    1076     WORD32 i4_ref_strd_left, i4_ref_strd_top;
    1077 
    1078     /* neighbors left, top, top right, top left */
    1079     UWORD8 *pu1_mb_a;
    1080     UWORD8 *pu1_mb_b;
    1081     UWORD8 *pu1_mb_c;
    1082     UWORD8 *pu1_mb_d;
    1083 
    1084     /* number of non zero coeffs (byte pointer, advanced by one per 4x4 partition) */
    1085     UWORD8  *pu1_nnz = (UWORD8 *)ps_proc->au4_nnz_intra_4x4;
    1086 
    1087     /* quantization parameters (entry 0 of ps_qp_params) */
    1088     quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0];
    1089 
    1090     /* neighbor availability */
    1091     WORD32 i4_ngbr_avbl;
    1092     block_neighbors_t s_ngbr_avbl;
    1093 
    1094     /* temp vars */
    1095     UWORD32 i, b8, b4, u4_blk_x, u4_blk_y, u4_pix_x, u4_pix_y;
    1096 
    1097     /* maps the raster index (column + 4 * row) of a 4x4 partition to its (b8 << 2) + b4 index */
    1098     const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
    1099 
    1100     /* ngbr sub mb modes (the top row mode buffer is indexed with 16 entries per mb) */
    1101     UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (ps_proc->i4_mb_x << 4);
    1102     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
    1103     mb_info_t *ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
    1104 
    1105     /* valid intra modes map, indexed by the left, top left and top availability bits (i4_ngbr_avbl & 0x7) */
    1106     UWORD32 u4_valid_intra_modes;
    1107     UWORD16 u2_valid_modes[8] = {4, 262, 4, 262, 141, 399, 141, 511};
    1108 
    1109     /* Dummy variable for 4x4 trans function; its value is never read in this function */
    1110     WORD16 i2_dc_dummy;
    1111     UWORD8 u1_mb_a, u1_mb_b, u1_mb_c, u1_mb_d;
    1112     UWORD32 u4_constrained_intra_pred = ps_proc->ps_codec->s_cfg.u4_constrained_intra_pred;
    1113 
    1114     /* compute ngbr availability for sub blks; point at the top right mb only when it is available */
    1115     if (ps_proc->ps_ngbr_avbl->u1_mb_c)
    1116     {
    1117         ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + (ps_proc->i4_mb_x + 1);
    1118     }
    1119 
    1120     /* left pels: with constrained intra pred a neighbor mb is usable only if it is intra coded */
    1121     u1_mb_a = ((ps_proc->ps_ngbr_avbl->u1_mb_a)
    1122                     && (u4_constrained_intra_pred ? ps_proc->s_left_mb_syntax_ele.u2_is_intra : 1));
    1123 
    1124     /* top pels */
    1125     u1_mb_b = ((ps_proc->ps_ngbr_avbl->u1_mb_b)
    1126                     && (u4_constrained_intra_pred ? ps_top_mb_syn_ele->u2_is_intra : 1));
    1127 
    1128     /* topleft pels */
    1129     u1_mb_d = ((ps_proc->ps_ngbr_avbl->u1_mb_d)
    1130                     && (u4_constrained_intra_pred ? ps_proc->s_top_left_mb_syntax_ele.u2_is_intra : 1));
    1131 
    1132     /* top right pels */
    1133     u1_mb_c = ((ps_proc->ps_ngbr_avbl->u1_mb_c)
    1134                     && (u4_constrained_intra_pred ? ps_top_right_mb_syn_ele->u2_is_intra : 1));
    1135     /* mb level availability: bit 0 left, bit 1 top left, bit 2 top, bit 3 top right */
    1136     i4_ngbr_avbl = (u1_mb_a) + (u1_mb_d << 1) + (u1_mb_b << 2) + (u1_mb_c << 3);
    1137     memcpy(ps_proc->au1_ngbr_avbl_4x4_subblks, gau1_ih264_4x4_ngbr_avbl[i4_ngbr_avbl], 16); /* one availability byte per 4x4 partition */
    1138 
    1139     for(b8 = 0; b8 < 4; b8++)
    1140     {
    1141         u4_blk_x = (b8 & 0x01) << 3; /* pixel offset of the 8x8 block inside the mb */
    1142         u4_blk_y = (b8 >> 1) << 3;
    1143         for(b4 = 0; b4 < 4; b4++, pu1_nnz++, pi2_res_mb += MB_SIZE)
    1144         {
    1145             u4_pix_x = u4_blk_x + ((b4 & 0x01) << 2); /* pixel offset of the 4x4 partition inside the mb */
    1146             u4_pix_y = u4_blk_y + ((b4 >> 1) << 2);
    1147 
    1148             pu1_ref_mb_intra_4x4 = ps_proc->pu1_ref_mb_intra_4x4 + u4_pix_x + (u4_pix_y * i4_pred_strd);
    1149             pu1_mb_curr = ps_proc->pu1_src_buf_luma + u4_pix_x + (u4_pix_y * i4_src_strd);
    1150             if (u4_pix_x == 0) /* left neighbor is outside the mb: read it from the recon buffer */
    1151             {
    1152                 i4_ref_strd_left = ps_proc->i4_rec_strd;
    1153                 pu1_mb_ref_left = ps_proc->pu1_rec_buf_luma + u4_pix_x + (u4_pix_y * i4_ref_strd_left);
    1154             }
    1155             else /* left neighbor is inside the mb: read it from pu1_ref_mb_intra_4x4 */
    1156             {
    1157                 i4_ref_strd_left = i4_pred_strd;
    1158                 pu1_mb_ref_left = pu1_ref_mb_intra_4x4;
    1159             }
    1160             if (u4_pix_y == 0) /* top neighbor is outside the mb: read it from the recon buffer */
    1161             {
    1162                 i4_ref_strd_top = ps_proc->i4_rec_strd;
    1163                 pu1_mb_ref_top = ps_proc->pu1_rec_buf_luma + u4_pix_x + (u4_pix_y * i4_ref_strd_top);
    1164             }
    1165             else /* top neighbor is inside the mb: read it from pu1_ref_mb_intra_4x4 */
    1166             {
    1167                 i4_ref_strd_top = i4_pred_strd;
    1168                 pu1_mb_ref_top = pu1_ref_mb_intra_4x4;
    1169             }
    1170 
    1171             pu1_mb_a = pu1_mb_ref_left - 1; /* pointer to left neighbor pels */
    1172             pu1_mb_b = pu1_mb_ref_top - i4_ref_strd_top; /* pointer to top neighbor pels */
    1173             pu1_mb_c = pu1_mb_b + 4; /* pointer to top right neighbor pels */
    1174             if (u4_pix_y == 0) /* top left pel is taken from the buffer holding the top row, otherwise from the one holding the left column */
    1175                 pu1_mb_d = pu1_mb_b - 1;
    1176             else
    1177                 pu1_mb_d = pu1_mb_a - i4_ref_strd_left; /* pointer to top left neighbor pel */
    1178 
    1179             /* locating neighbors that are available for prediction */
    1180             /* TODO : update the neighbor availability information basing on constrained intra pred information */
    1181             /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines */
    1182             /* basing on neighbors available and hence evade the computation of neighbor availability totally. */
    1183             /* per partition availability, same bit layout as the mb level value */
    1184             i4_ngbr_avbl = ps_proc->au1_ngbr_avbl_4x4_subblks[(b8 << 2) + b4];
    1185             s_ngbr_avbl.u1_mb_a = (i4_ngbr_avbl & 0x1);
    1186             s_ngbr_avbl.u1_mb_d = (i4_ngbr_avbl & 0x2) >> 1;
    1187             s_ngbr_avbl.u1_mb_b = (i4_ngbr_avbl & 0x4) >> 2;
    1188             s_ngbr_avbl.u1_mb_c = (i4_ngbr_avbl & 0x8) >> 3;
    1189             /* set valid intra modes for evaluation */
    1190             u4_valid_intra_modes = u2_valid_modes[i4_ngbr_avbl & 0x7];
    1191 
    1192             /* if top partition is available and top right is not available for intra prediction, then */
    1193             /* pad top right samples using top sample and make top right also available */
    1194             /* i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) + ((s_ngbr_avbl.u1_mb_b | s_ngbr_avbl.u1_mb_c) << 3); */
    1195 
    1196             /* gather prediction pels from the neighbors */
    1197             if (s_ngbr_avbl.u1_mb_a)
    1198             {
    1199                 for(i = 0; i < 4; i++) /* left column is stored reversed: top most pel lands at index 3 */
    1200                     pu1_ngbr_pels_i4[4 - 1 -i] = pu1_mb_a[i * i4_ref_strd_left];
    1201             }
    1202             else
    1203             {
    1204                 memset(pu1_ngbr_pels_i4,0,4);
    1205             }
    1206             if(s_ngbr_avbl.u1_mb_b)
    1207             {
    1208                 memcpy(pu1_ngbr_pels_i4 + 4 + 1, pu1_mb_b, 4);
    1209             }
    1210             else
    1211             {
    1212                 memset(pu1_ngbr_pels_i4 + 4 + 1, 0, 4);
    1213             }
    1214             if (s_ngbr_avbl.u1_mb_d)
    1215                 pu1_ngbr_pels_i4[4] = *pu1_mb_d;
    1216             else
    1217                 pu1_ngbr_pels_i4[4] = 0;
    1218             if (s_ngbr_avbl.u1_mb_c)
    1219             {
    1220                 memcpy(pu1_ngbr_pels_i4 + 8 + 1, pu1_mb_c, 4);
    1221             }
    1222             else if (s_ngbr_avbl.u1_mb_b)
    1223             {
    1224                 memset(pu1_ngbr_pels_i4 + 8 + 1, pu1_ngbr_pels_i4[8], 4); /* replicate the last top pel into the top right slots */
    1225                 s_ngbr_avbl.u1_mb_c = s_ngbr_avbl.u1_mb_b; /* only the local flag is set; i4_ngbr_avbl is left unchanged */
    1226             }
    1227 
    1228             i4_partition_cost_least = INT_MAX;
    1229 
    1230             /* predict the intra 4x4 mode for the current partition (for evaluating cost) */
    1231             if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b)
    1232             {
    1233                 u4_estimated_intra_4x4_mode = DC_I4x4;
    1234             }
    1235             else
    1236             {
    1237                 UWORD32 u4_left_intra_4x4_mode = DC_I4x4; /* DC is kept when the neighbor mb is neither I4x4 nor I8x8 */
    1238                 UWORD32 u4_top_intra_4x4_mode = DC_I4x4;
    1239 
    1240                 if (u4_pix_x == 0) /* left partition lies in the left mb */
    1241                 {
    1242                     if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I4x4)
    1243                     {
    1244                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[u1_scan_order[3 + u4_pix_y]]; /* raster column 3 of the same row (u4_pix_y equals 4 * row) */
    1245                     }
    1246                     else if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I8x8)
    1247                     {
    1248                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[b8 + 1]; /* b8 is 0 or 2 here; presumably selects the adjacent 8x8 block of the left mb - TODO confirm storage layout */
    1249                     }
    1250                 }
    1251                 else
    1252                 {
    1253                     u4_left_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 1]]; /* raster index minus 1: partition to the left in this mb */
    1254                 }
    1255 
    1256                 if (u4_pix_y == 0) /* top partition lies in the top mb */
    1257                 {
    1258                     if (ps_top_mb_syn_ele->u2_mb_type == I4x4)
    1259                     {
    1260                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[u1_scan_order[12 + (u4_pix_x >> 2)]]; /* raster row 3 of the same column */
    1261                     }
    1262                     else if (ps_top_mb_syn_ele->u2_mb_type == I8x8)
    1263                     {
    1264                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[b8 + 2]; /* b8 is 0 or 1 here; presumably selects the adjacent 8x8 block of the top mb - TODO confirm storage layout */
    1265                     }
    1266                 }
    1267                 else
    1268                 {
    1269                     u4_top_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 4]]; /* raster index minus 4: partition above in this mb */
    1270                 }
    1271                 /* the estimate is the smaller of the two neighbor modes */
    1272                 u4_estimated_intra_4x4_mode = MIN(u4_left_intra_4x4_mode, u4_top_intra_4x4_mode);
    1273             }
    1274 
    1275             ps_proc->au1_predicted_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_estimated_intra_4x4_mode;
    1276 
    1277             /* mode evaluation and prediction; best mode and least cost are passed by address and read back below */
    1278             ps_codec->pf_ih264e_evaluate_intra_4x4_modes(pu1_mb_curr,
    1279                                                          pu1_ngbr_pels_i4,
    1280                                                          pu1_pred_mb, i4_src_strd,
    1281                                                          i4_pred_strd, i4_ngbr_avbl,
    1282                                                          &u4_best_intra_4x4_mode,
    1283                                                          &i4_partition_cost_least,
    1284                                                          u4_valid_intra_modes,
    1285                                                          u4_lambda,
    1286                                                          u4_estimated_intra_4x4_mode);
    1287 
    1288             /* distortion = least cost minus the mode signalling cost */
    1289             i4_partition_distortion_least = i4_partition_cost_least - ((u4_estimated_intra_4x4_mode == u4_best_intra_4x4_mode)?u4_cost_one_bit:u4_cost_four_bits);
    1290 
    1291             DEBUG("%d partition cost, %d intra mode\n", i4_partition_cost_least, u4_best_intra_4x4_mode);
    1292 
    1293             /* macroblock distortion */
    1294             i4_total_distortion += i4_partition_distortion_least;
    1295             i4_total_cost += i4_partition_cost_least;
    1296 
    1297             /* mb partition mode */
    1298             ps_proc->au1_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_best_intra_4x4_mode;
    1299 
    1300 
    1301             /********************************************************/
    1302             /*  error estimation,                                   */
    1303             /*  transform                                           */
    1304             /*  quantization                                        */
    1305             /********************************************************/
    1306             ps_codec->pf_resi_trans_quant_4x4(pu1_mb_curr, pu1_pred_mb,
    1307                                               pi2_res_mb, i4_src_strd,
    1308                                               i4_pred_strd,
    1309                                               /* No op stride, this implies a buff of length 1x16 */
    1310                                               ps_qp_params->pu2_scale_mat,
    1311                                               ps_qp_params->pu2_thres_mat,
    1312                                               ps_qp_params->u1_qbits,
    1313                                               ps_qp_params->u4_dead_zone,
    1314                                               pu1_nnz, &i2_dc_dummy);
    1315 
    1316             /********************************************************/
    1317             /*  ierror estimation,                                  */
    1318             /*  itransform                                          */
    1319             /*  iquantization                                       */
    1320             /*  NOTE(review): pu1_ref_mb_intra_4x4 is presumably the recon output that later partitions read as neighbors - confirm */
    1321             ps_codec->pf_iquant_itrans_recon_4x4(pi2_res_mb, pu1_pred_mb,
    1322                                                  pu1_ref_mb_intra_4x4,
    1323                                                  i4_pred_strd, i4_pred_strd,
    1324                                                  ps_qp_params->pu2_iscale_mat,
    1325                                                  ps_qp_params->pu2_weigh_mat,
    1326                                                  ps_qp_params->u1_qp_div,
    1327                                                  ps_proc->pv_scratch_buff, 0,
    1328                                                  NULL);
    1329         }
    1330     }
    1331 
    1332     /* update the type of the mb if necessary (only when I4x4 is cheaper than the current best) */
    1333     if (i4_total_cost < ps_proc->i4_mb_cost)
    1334     {
    1335         ps_proc->i4_mb_cost = i4_total_cost;
    1336         ps_proc->i4_mb_distortion = i4_total_distortion;
    1337         ps_proc->u4_mb_type = I4x4;
    1338     }
    1339 
    1340     return ;
    1341 }
   1342 
   1343 /**
   1344 ******************************************************************************
   1345 *
   1346 * @brief
   1347 *  evaluate best chroma intra 8x8 mode (rate distortion opt off)
   1348 *
   1349 * @par Description
   1350 *  This function evaluates all the possible chroma intra 8x8 modes and finds
   1351 *  the mode that best represents the macroblock (least distortion) and occupies
   1352 *  fewer bits in the bitstream.
   1353 *
   1354 * @param[in] ps_proc_ctxt
   1355 *  pointer to macroblock context (handle)
   1356 *
   1357 * @remarks
   1358 *  For chroma best intra pred mode is calculated based only on SAD
   1359 *
   1360 * @returns none
   1361 *
   1362 ******************************************************************************
   1363 */
   1364 
   1365 void ih264e_evaluate_chroma_intra8x8_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
   1366 {
   1367     /* Codec Context */
   1368     codec_t *ps_codec = ps_proc->ps_codec;
   1369 
   1370     /* SAD(distortion metric) of an 8x8 block */
   1371     WORD32 i4_mb_distortion, i4_chroma_mb_distortion;
   1372 
   1373     /* intra mode */
   1374     UWORD32  u4_best_chroma_intra_8x8_mode = DC_CH_I8x8;
   1375 
   1376     /* neighbor pels for intra prediction */
   1377     UWORD8 *pu1_ngbr_pels_c_i8x8 = ps_proc->au1_ngbr_pels;
   1378 
   1379     /* pointer to curr macro block */
   1380     UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_chroma;
   1381     UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_chroma;
   1382 
   1383     /* pointer to prediction macro block */
   1384     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb_intra_chroma;
   1385     UWORD8 *pu1_pred_mb_plane = ps_proc->pu1_pred_mb_intra_chroma_plane;
   1386 
   1387     /* strides */
   1388     WORD32 i4_src_strd_c = ps_proc->i4_src_chroma_strd;
   1389     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
   1390     WORD32 i4_rec_strd_c = ps_proc->i4_rec_strd;
   1391 
   1392     /* neighbors left, top, top left */
   1393     UWORD8 *pu1_mb_a = pu1_ref_mb - 2;
   1394     UWORD8 *pu1_mb_b = pu1_ref_mb - i4_rec_strd_c;
   1395     UWORD8 *pu1_mb_d = pu1_mb_b - 2;
   1396 
   1397     /* neighbor availability */
   1398     const UWORD8  u1_valid_intra_modes[8] = {1, 3, 1, 3, 5, 7, 5, 15};
   1399     WORD32 i4_ngbr_avbl;
   1400 
   1401     /* valid intra modes map */
   1402     UWORD32 u4_valid_intra_modes;
   1403     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
   1404 
   1405     /* temp var */
   1406     UWORD8 i;
   1407     UWORD32 u4_constrained_intra_pred = ps_proc->ps_codec->s_cfg.u4_constrained_intra_pred;
   1408     UWORD8 u1_mb_a, u1_mb_b, u1_mb_d;
   1409     /* locating neighbors that are available for prediction */
   1410 
   1411     /* gather prediction pels from the neighbors */
   1412     /* left pels */
   1413     u1_mb_a = ((ps_proc->ps_ngbr_avbl->u1_mb_a)
   1414                     && (u4_constrained_intra_pred ?  ps_proc->s_left_mb_syntax_ele.u2_is_intra : 1));
   1415     if (u1_mb_a)
   1416     {
   1417         for (i = 0; i < 16; i += 2)
   1418         {
   1419             pu1_ngbr_pels_c_i8x8[16 - 2 - i] = pu1_mb_a[(i / 2) * i4_rec_strd_c];
   1420             pu1_ngbr_pels_c_i8x8[16 - 1 - i] = pu1_mb_a[(i / 2) * i4_rec_strd_c + 1];
   1421         }
   1422     }
   1423     else
   1424     {
   1425         ps_codec->pf_mem_set_mul8(pu1_ngbr_pels_c_i8x8, 0, MB_SIZE);
   1426     }
   1427 
   1428     /* top pels */
   1429     u1_mb_b = ((ps_proc->ps_ngbr_avbl->u1_mb_b)
   1430                     && (u4_constrained_intra_pred ? ps_top_mb_syn_ele->u2_is_intra : 1));
   1431     if (u1_mb_b)
   1432     {
   1433         ps_codec->pf_mem_cpy_mul8(&pu1_ngbr_pels_c_i8x8[18], pu1_mb_b, 16);
   1434     }
   1435     else
   1436     {
   1437         ps_codec->pf_mem_set_mul8((pu1_ngbr_pels_c_i8x8 + 18), 0, MB_SIZE);
   1438     }
   1439 
   1440     /* top left pels */
   1441     u1_mb_d = ((ps_proc->ps_ngbr_avbl->u1_mb_d)
   1442                     && (u4_constrained_intra_pred ? ps_proc->s_top_left_mb_syntax_ele.u2_is_intra : 1));
   1443     if (u1_mb_d)
   1444     {
   1445         pu1_ngbr_pels_c_i8x8[16] = *pu1_mb_d;
   1446         pu1_ngbr_pels_c_i8x8[17] = *(pu1_mb_d + 1);
   1447     }
   1448     i4_ngbr_avbl = (u1_mb_a) + (u1_mb_b << 2) + (u1_mb_d << 1);
   1449     ps_proc->i4_chroma_neighbor_avail_8x8_mb = i4_ngbr_avbl;
   1450 
   1451     u4_valid_intra_modes = u1_valid_intra_modes[i4_ngbr_avbl];
   1452 
   1453     if (ps_codec->s_cfg.u4_enc_speed_preset == IVE_FAST ||
   1454                     ps_codec->s_cfg.u4_enc_speed_preset == IVE_FASTEST)
   1455         u4_valid_intra_modes &= ~(1 << PLANE_CH_I8x8);
   1456 
   1457     i4_chroma_mb_distortion = INT_MAX;
   1458 
   1459     /* perform intra mode chroma  8x8 evaluation */
   1460     /* intra prediction */
   1461     ps_codec->pf_ih264e_evaluate_intra_chroma_modes(pu1_curr_mb,
   1462                                                     pu1_ngbr_pels_c_i8x8,
   1463                                                     pu1_pred_mb,
   1464                                                     i4_src_strd_c,
   1465                                                     i4_pred_strd,
   1466                                                     i4_ngbr_avbl,
   1467                                                     &u4_best_chroma_intra_8x8_mode,
   1468                                                     &i4_chroma_mb_distortion,
   1469                                                     u4_valid_intra_modes);
   1470 
   1471     if (u4_valid_intra_modes & 8)/* if Chroma PLANE is valid*/
   1472     {
   1473         (ps_codec->apf_intra_pred_c)[PLANE_CH_I8x8](pu1_ngbr_pels_c_i8x8, pu1_pred_mb_plane, 0, i4_pred_strd, i4_ngbr_avbl);
   1474 
   1475         /* evaluate distortion(sad) */
   1476         ps_codec->pf_compute_sad_16x8(pu1_curr_mb, pu1_pred_mb_plane, i4_src_strd_c, i4_pred_strd, i4_chroma_mb_distortion, &i4_mb_distortion);
   1477 
   1478         /* update the least distortion information if necessary */
   1479         if(i4_mb_distortion < i4_chroma_mb_distortion)
   1480         {
   1481             i4_chroma_mb_distortion = i4_mb_distortion;
   1482             u4_best_chroma_intra_8x8_mode = PLANE_CH_I8x8;
   1483         }
   1484     }
   1485 
   1486     DEBUG("%d partition cost, %d intra mode\n", i4_chroma_mb_distortion, u4_best_chroma_intra_8x8_mode);
   1487 
   1488     ps_proc->u1_c_i8_mode = u4_best_chroma_intra_8x8_mode;
   1489 
   1490     return ;
   1491 }
   1492 
   1493 
   1494 /**
   1495 ******************************************************************************
   1496 *
   1497 * @brief
   1498 *  Evaluate best intra 16x16 mode (among VERT, HORZ and DC) and do the
   1499 *  prediction.
   1500 *
   1501 * @par Description
   1502 *  This function evaluates first three 16x16 modes and compute corresponding sad
   1503 *  and return the buffer predicted with best mode.
   1504 *
   1505 * @param[in] pu1_src
   1506 *  UWORD8 pointer to the source
   1507 *
   1508 * @param[in] pu1_ngbr_pels_i16
   1509 *  UWORD8 pointer to neighbouring pels
   1510 *
   1511 * @param[out] pu1_dst
   1512 *  UWORD8 pointer to the destination
   1513 *
   1514 * @param[in] src_strd
   1515 *  integer source stride
   1516 *
   1517 * @param[in] dst_strd
   1518 *  integer destination stride
   1519 *
   1520 * @param[in] u4_n_avblty
   1521 *  availability of neighbouring pixels
   1522 *
   1523 * @param[in] u4_intra_mode
   1524 *  Pointer to the variable in which best mode is returned
   1525 *
   1526 * @param[in] pu4_sadmin
   1527 *  Pointer to the variable in which minimum sad is returned
   1528 *
   1529 * @param[in] u4_valid_intra_modes
   1530 *  Says what all modes are valid
   1531 *
   1532 * @returns      none
   1533 *
   1534 ******************************************************************************
   1535 */
   1536 void ih264e_evaluate_intra16x16_modes(UWORD8 *pu1_src,
   1537                                       UWORD8 *pu1_ngbr_pels_i16,
   1538                                       UWORD8 *pu1_dst,
   1539                                       UWORD32 src_strd,
   1540                                       UWORD32 dst_strd,
   1541                                       WORD32 u4_n_avblty,
   1542                                       UWORD32 *u4_intra_mode,
   1543                                       WORD32 *pu4_sadmin,
   1544                                       UWORD32 u4_valid_intra_modes)
   1545 {
   1546     UWORD8 *pu1_neighbour;
   1547     UWORD8 *pu1_src_temp = pu1_src;
   1548     UWORD8 left = 0, top = 0;
   1549     WORD32 u4_dcval = 0;
   1550     WORD32 i, j;
   1551     WORD32 i4_sad_vert = INT_MAX, i4_sad_horz = INT_MAX, i4_sad_dc = INT_MAX,
   1552                     i4_min_sad = INT_MAX;
   1553     UWORD8 val;
   1554 
   1555     left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
   1556     top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;
   1557 
   1558     /* left available */
   1559     if (left)
   1560     {
   1561         i4_sad_horz = 0;
   1562 
   1563         for (i = 0; i < 16; i++)
   1564         {
   1565             val = pu1_ngbr_pels_i16[15 - i];
   1566 
   1567             u4_dcval += val;
   1568 
   1569             for (j = 0; j < 16; j++)
   1570             {
   1571                 i4_sad_horz += ABS(val - pu1_src_temp[j]);
   1572             }
   1573 
   1574             pu1_src_temp += src_strd;
   1575         }
   1576         u4_dcval += 8;
   1577     }
   1578 
   1579     pu1_src_temp = pu1_src;
   1580     /* top available */
   1581     if (top)
   1582     {
   1583         i4_sad_vert = 0;
   1584 
   1585         for (i = 0; i < 16; i++)
   1586         {
   1587             u4_dcval += pu1_ngbr_pels_i16[17 + i];
   1588 
   1589             for (j = 0; j < 16; j++)
   1590             {
   1591                 i4_sad_vert += ABS(pu1_ngbr_pels_i16[17 + j] - pu1_src_temp[j]);
   1592             }
   1593             pu1_src_temp += src_strd;
   1594 
   1595         }
   1596         u4_dcval += 8;
   1597     }
   1598 
   1599     u4_dcval = (u4_dcval) >> (3 + left + top);
   1600 
   1601     pu1_src_temp = pu1_src;
   1602 
   1603     /* none available */
   1604     u4_dcval += (left == 0) * (top == 0) * 128;
   1605 
   1606     i4_sad_dc = 0;
   1607 
   1608     for (i = 0; i < 16; i++)
   1609     {
   1610         for (j = 0; j < 16; j++)
   1611         {
   1612             i4_sad_dc += ABS(u4_dcval - pu1_src_temp[j]);
   1613         }
   1614         pu1_src_temp += src_strd;
   1615     }
   1616 
   1617     if ((u4_valid_intra_modes & 04) == 0)/* If DC is disabled */
   1618         i4_sad_dc = INT_MAX;
   1619 
   1620     if ((u4_valid_intra_modes & 01) == 0)/* If VERT is disabled */
   1621         i4_sad_vert = INT_MAX;
   1622 
   1623     if ((u4_valid_intra_modes & 02) == 0)/* If HORZ is disabled */
   1624         i4_sad_horz = INT_MAX;
   1625 
   1626     i4_min_sad = MIN3(i4_sad_horz, i4_sad_dc, i4_sad_vert);
   1627 
   1628     /* Finding Minimum sad and doing corresponding prediction */
   1629     if (i4_min_sad < *pu4_sadmin)
   1630     {
   1631         *pu4_sadmin = i4_min_sad;
   1632         if (i4_min_sad == i4_sad_vert)
   1633         {
   1634             *u4_intra_mode = VERT_I16x16;
   1635             pu1_neighbour = pu1_ngbr_pels_i16 + 17;
   1636             for (j = 0; j < 16; j++)
   1637             {
   1638                 memcpy(pu1_dst, pu1_neighbour, MB_SIZE);
   1639                 pu1_dst += dst_strd;
   1640             }
   1641         }
   1642         else if (i4_min_sad == i4_sad_horz)
   1643         {
   1644             *u4_intra_mode = HORZ_I16x16;
   1645             for (j = 0; j < 16; j++)
   1646             {
   1647                 val = pu1_ngbr_pels_i16[15 - j];
   1648                 memset(pu1_dst, val, MB_SIZE);
   1649                 pu1_dst += dst_strd;
   1650             }
   1651         }
   1652         else
   1653         {
   1654             *u4_intra_mode = DC_I16x16;
   1655             for (j = 0; j < 16; j++)
   1656             {
   1657                 memset(pu1_dst, u4_dcval, MB_SIZE);
   1658                 pu1_dst += dst_strd;
   1659             }
   1660         }
   1661     }
   1662     return;
   1663 }
   1664 
   1665 /**
   1666 ******************************************************************************
   1667 *
   1668 * @brief
   1669 *  Evaluate best intra 4x4 mode and perform prediction.
   1670 *
   1671 * @par Description
   1672 *  This function evaluates  4x4 modes and compute corresponding sad
   1673 *  and return the buffer predicted with best mode.
   1674 *
   1675 * @param[in] pu1_src
   1676 *  UWORD8 pointer to the source
   1677 *
   1678 * @param[in] pu1_ngbr_pels
   1679 *  UWORD8 pointer to neighbouring pels
   1680 *
   1681 * @param[out] pu1_dst
   1682 *  UWORD8 pointer to the destination
   1683 *
   1684 * @param[in] src_strd
   1685 *  integer source stride
   1686 *
   1687 * @param[in] dst_strd
   1688 *  integer destination stride
   1689 *
   1690 * @param[in] u4_n_avblty
   1691 *  availability of neighbouring pixels
   1692 *
   1693 * @param[out] u4_intra_mode
   1694 *  Pointer to the variable in which best mode is returned
   1695 *
   1696 * @param[out] pu4_sadmin
   1697 *  Pointer to the variable in which minimum cost is returned
   1698 *
   1699 * @param[in] u4_valid_intra_modes
   1700 *  Says what all modes are valid
   1701 *
   1702 * @param[in] u4_lambda
   1703 *  Lambda value for computing cost from SAD
   1704 *
   1705 * @param[in] u4_predictd_mode
   1706 *  Predicted mode for cost computation
   1707 *
   1708 * @returns      none
   1709 *
   1710 ******************************************************************************
   1711 */
   1712 void ih264e_evaluate_intra_4x4_modes(UWORD8 *pu1_src,
   1713                                      UWORD8 *pu1_ngbr_pels,
   1714                                      UWORD8 *pu1_dst,
   1715                                      UWORD32 src_strd,
   1716                                      UWORD32 dst_strd,
   1717                                      WORD32 u4_n_avblty,
   1718                                      UWORD32 *u4_intra_mode,
   1719                                      WORD32 *pu4_sadmin,
   1720                                      UWORD32 u4_valid_intra_modes,
   1721                                      UWORD32  u4_lambda,
   1722                                      UWORD32 u4_predictd_mode)
   1723 {
   1724     UWORD8 *pu1_src_temp = pu1_src;
   1725     UWORD8 *pu1_pred = pu1_ngbr_pels;
   1726     UWORD8 left = 0, top = 0;
   1727     UWORD8 u1_pred_val = 0;
   1728     UWORD8 u1_pred_vals[4] = {0};
   1729     UWORD8 *pu1_pred_val = NULL;
   1730     /* To store FILT121 operated values*/
   1731     UWORD8 u1_pred_vals_diag_121[15] = {0};
   1732     /* To store FILT11 operated values*/
   1733     UWORD8 u1_pred_vals_diag_11[15] = {0};
   1734     UWORD8 u1_pred_vals_vert_r[8] = {0};
   1735     UWORD8 u1_pred_vals_horz_d[10] = {0};
   1736     UWORD8 u1_pred_vals_horz_u[10] = {0};
   1737     WORD32 u4_dcval = 0;
   1738     WORD32 i4_sad[MAX_I4x4] = {INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX,
   1739                                INT_MAX, INT_MAX, INT_MAX, INT_MAX};
   1740 
   1741     WORD32 i4_cost[MAX_I4x4] = {INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX,
   1742                                 INT_MAX, INT_MAX, INT_MAX, INT_MAX};
   1743     WORD32 i, i4_min_cost = INT_MAX;
   1744 
   1745     left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
   1746     top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;
   1747 
   1748     /* Computing SAD */
   1749 
   1750     /* VERT mode valid */
   1751     if (u4_valid_intra_modes & 1)
   1752     {
   1753         pu1_pred = pu1_ngbr_pels + 5;
   1754         i4_sad[VERT_I4x4] = 0;
   1755         i4_cost[VERT_I4x4] = 0;
   1756 
   1757         USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]);
   1758         pu1_src_temp += src_strd;
   1759         USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]);
   1760         pu1_src_temp += src_strd;
   1761         USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]);
   1762         pu1_src_temp += src_strd;
   1763         USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]);
   1764 
   1765         i4_cost[VERT_I4x4] = i4_sad[VERT_I4x4] + ((u4_predictd_mode == VERT_I4x4) ?
   1766                                         u4_lambda : 4 * u4_lambda);
   1767     }
   1768 
   1769     /* HORZ mode valid */
   1770     if (u4_valid_intra_modes & 2)
   1771     {
   1772         i4_sad[HORZ_I4x4] = 0;
   1773         i4_cost[HORZ_I4x4] =0;
   1774         pu1_src_temp = pu1_src;
   1775 
   1776         u1_pred_val = pu1_ngbr_pels[3];
   1777 
   1778         i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val)
   1779                         + ABS(pu1_src_temp[1] - u1_pred_val)
   1780                         + ABS(pu1_src_temp[2] - u1_pred_val)
   1781                         + ABS(pu1_src_temp[3] - u1_pred_val);
   1782         pu1_src_temp += src_strd;
   1783 
   1784         u1_pred_val = pu1_ngbr_pels[2];
   1785 
   1786         i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val)
   1787                         + ABS(pu1_src_temp[1] - u1_pred_val)
   1788                         + ABS(pu1_src_temp[2] - u1_pred_val)
   1789                         + ABS(pu1_src_temp[3] - u1_pred_val);
   1790         pu1_src_temp += src_strd;
   1791 
   1792         u1_pred_val = pu1_ngbr_pels[1];
   1793 
   1794         i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val)
   1795                         + ABS(pu1_src_temp[1] - u1_pred_val)
   1796                         + ABS(pu1_src_temp[2] - u1_pred_val)
   1797                         + ABS(pu1_src_temp[3] - u1_pred_val);
   1798         pu1_src_temp += src_strd;
   1799 
   1800         u1_pred_val = pu1_ngbr_pels[0];
   1801 
   1802         i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val)
   1803                         + ABS(pu1_src_temp[1] - u1_pred_val)
   1804                         + ABS(pu1_src_temp[2] - u1_pred_val)
   1805                         + ABS(pu1_src_temp[3] - u1_pred_val);
   1806 
   1807         i4_cost[HORZ_I4x4] = i4_sad[HORZ_I4x4] + ((u4_predictd_mode == HORZ_I4x4) ?
   1808                                         u4_lambda : 4 * u4_lambda);
   1809     }
   1810 
   1811     /* DC mode valid */
   1812     if (u4_valid_intra_modes & 4)
   1813     {
   1814         i4_sad[DC_I4x4] = 0;
   1815         i4_cost[DC_I4x4] = 0;
   1816         pu1_src_temp = pu1_src;
   1817 
   1818         if (left)
   1819             u4_dcval = pu1_ngbr_pels[0] + pu1_ngbr_pels[1] + pu1_ngbr_pels[2]
   1820                             + pu1_ngbr_pels[3] + 2;
   1821         if (top)
   1822             u4_dcval += pu1_ngbr_pels[5] + pu1_ngbr_pels[6] + pu1_ngbr_pels[7]
   1823                             + pu1_ngbr_pels[8] + 2;
   1824 
   1825         u4_dcval = (u4_dcval) ? (u4_dcval >> (1 + left + top)) : 128;
   1826 
   1827         /* none available */
   1828         memset(u1_pred_vals, u4_dcval, 4);
   1829         USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]);
   1830         pu1_src_temp += src_strd;
   1831         USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]);
   1832         pu1_src_temp += src_strd;
   1833         USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]);
   1834         pu1_src_temp += src_strd;
   1835         USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]);
   1836         pu1_src_temp += src_strd;
   1837 
   1838         i4_cost[DC_I4x4] = i4_sad[DC_I4x4] + ((u4_predictd_mode == DC_I4x4) ?
   1839                                         u4_lambda : 4 * u4_lambda);
   1840     }
   1841 
   1842     /* if modes other than VERT, HORZ and DC are  valid */
   1843     if (u4_valid_intra_modes > 7)
   1844     {
   1845         pu1_pred = pu1_ngbr_pels;
   1846         pu1_pred[13] = pu1_pred[14] = pu1_pred[12];
   1847 
   1848         /* Performing FILT121 and FILT11 operation for all neighbour values*/
   1849         for (i = 0; i < 13; i++)
   1850         {
   1851             u1_pred_vals_diag_121[i] = FILT121(pu1_pred[0], pu1_pred[1], pu1_pred[2]);
   1852             u1_pred_vals_diag_11[i] = FILT11(pu1_pred[0], pu1_pred[1]);
   1853 
   1854             pu1_pred++;
   1855         }
   1856 
   1857         if (u4_valid_intra_modes & 8)/* DIAG_DL */
   1858         {
   1859             i4_sad[DIAG_DL_I4x4] = 0;
   1860             i4_cost[DIAG_DL_I4x4] = 0;
   1861             pu1_src_temp = pu1_src;
   1862             pu1_pred_val = u1_pred_vals_diag_121 + 5;
   1863 
   1864             USADA8(pu1_src_temp, pu1_pred_val, i4_sad[DIAG_DL_I4x4]);
   1865             pu1_src_temp += src_strd;
   1866             USADA8(pu1_src_temp, (pu1_pred_val + 1), i4_sad[DIAG_DL_I4x4]);
   1867             pu1_src_temp += src_strd;
   1868             USADA8(pu1_src_temp, (pu1_pred_val + 2), i4_sad[DIAG_DL_I4x4]);
   1869             pu1_src_temp += src_strd;
   1870             USADA8(pu1_src_temp, (pu1_pred_val + 3), i4_sad[DIAG_DL_I4x4]);
   1871             pu1_src_temp += src_strd;
   1872             i4_cost[DIAG_DL_I4x4] = i4_sad[DIAG_DL_I4x4] + ((u4_predictd_mode == DIAG_DL_I4x4) ?
   1873                                             u4_lambda : 4 * u4_lambda);
   1874         }
   1875 
   1876         if (u4_valid_intra_modes & 16)/* DIAG_DR */
   1877         {
   1878             i4_sad[DIAG_DR_I4x4] = 0;
   1879             i4_cost[DIAG_DR_I4x4] = 0;
   1880             pu1_src_temp = pu1_src;
   1881             pu1_pred_val = u1_pred_vals_diag_121 + 3;
   1882 
   1883             USADA8(pu1_src_temp, pu1_pred_val, i4_sad[DIAG_DR_I4x4]);
   1884             pu1_src_temp += src_strd;
   1885             USADA8(pu1_src_temp, (pu1_pred_val - 1), i4_sad[DIAG_DR_I4x4]);
   1886             pu1_src_temp += src_strd;
   1887             USADA8(pu1_src_temp, (pu1_pred_val - 2), i4_sad[DIAG_DR_I4x4]);
   1888             pu1_src_temp += src_strd;
   1889             USADA8(pu1_src_temp, (pu1_pred_val - 3), i4_sad[DIAG_DR_I4x4]);
   1890             pu1_src_temp += src_strd;
   1891             i4_cost[DIAG_DR_I4x4] = i4_sad[DIAG_DR_I4x4] + ((u4_predictd_mode == DIAG_DR_I4x4) ?
   1892                                             u4_lambda : 4 * u4_lambda);
   1893 
   1894         }
   1895 
   1896         if (u4_valid_intra_modes & 32)/* VERT_R mode valid ????*/
   1897         {
   1898             i4_sad[VERT_R_I4x4] = 0;
   1899 
   1900             pu1_src_temp = pu1_src;
   1901             u1_pred_vals_vert_r[0] = u1_pred_vals_diag_121[2];
   1902             memcpy((u1_pred_vals_vert_r + 1), (u1_pred_vals_diag_11 + 4), 3);
   1903             u1_pred_vals_vert_r[4] = u1_pred_vals_diag_121[1];
   1904             memcpy((u1_pred_vals_vert_r + 5), (u1_pred_vals_diag_121 + 3), 3);
   1905 
   1906             pu1_pred_val = u1_pred_vals_diag_11 + 4;
   1907             USADA8(pu1_src_temp, pu1_pred_val, i4_sad[VERT_R_I4x4]);
   1908             pu1_pred_val = u1_pred_vals_diag_121 + 3;
   1909             pu1_src_temp += src_strd;
   1910             USADA8(pu1_src_temp, pu1_pred_val, i4_sad[VERT_R_I4x4]);
   1911             pu1_src_temp += src_strd;
   1912             USADA8(pu1_src_temp, (u1_pred_vals_vert_r), i4_sad[VERT_R_I4x4]);
   1913             pu1_src_temp += src_strd;
   1914             USADA8(pu1_src_temp, (u1_pred_vals_vert_r + 4),
   1915                    i4_sad[VERT_R_I4x4]);
   1916 
   1917             i4_cost[VERT_R_I4x4] = i4_sad[VERT_R_I4x4] + ((u4_predictd_mode == VERT_R_I4x4) ?
   1918                                             u4_lambda : 4 * u4_lambda);
   1919         }
   1920 
   1921         if (u4_valid_intra_modes & 64)/* HORZ_D mode valid ????*/
   1922         {
   1923             i4_sad[HORZ_D_I4x4] = 0;
   1924 
   1925             pu1_src_temp = pu1_src;
   1926             u1_pred_vals_horz_d[6] = u1_pred_vals_diag_11[3];
   1927             memcpy((u1_pred_vals_horz_d + 7), (u1_pred_vals_diag_121 + 3), 3);
   1928             u1_pred_vals_horz_d[0] = u1_pred_vals_diag_11[0];
   1929             u1_pred_vals_horz_d[1] = u1_pred_vals_diag_121[0];
   1930             u1_pred_vals_horz_d[2] = u1_pred_vals_diag_11[1];
   1931             u1_pred_vals_horz_d[3] = u1_pred_vals_diag_121[1];
   1932             u1_pred_vals_horz_d[4] = u1_pred_vals_diag_11[2];
   1933             u1_pred_vals_horz_d[5] = u1_pred_vals_diag_121[2];
   1934 
   1935             pu1_pred_val = u1_pred_vals_horz_d;
   1936             USADA8(pu1_src_temp, (pu1_pred_val + 6), i4_sad[HORZ_D_I4x4]);
   1937             pu1_src_temp += src_strd;
   1938             USADA8(pu1_src_temp, (pu1_pred_val + 4), i4_sad[HORZ_D_I4x4]);
   1939             pu1_src_temp += src_strd;
   1940             USADA8(pu1_src_temp, (pu1_pred_val + 2), i4_sad[HORZ_D_I4x4]);
   1941             pu1_src_temp += src_strd;
   1942             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[HORZ_D_I4x4]);
   1943 
   1944             i4_cost[HORZ_D_I4x4] = i4_sad[HORZ_D_I4x4] + ((u4_predictd_mode == HORZ_D_I4x4) ?
   1945                                             u4_lambda : 4 * u4_lambda);
   1946         }
   1947 
   1948         if (u4_valid_intra_modes & 128)/* VERT_L mode valid ????*/
   1949         {
   1950             i4_sad[VERT_L_I4x4] = 0;
   1951             pu1_src_temp = pu1_src;
   1952             pu1_pred_val = u1_pred_vals_diag_11 + 5;
   1953             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]);
   1954             pu1_src_temp += src_strd;
   1955             pu1_pred_val = u1_pred_vals_diag_121 + 5;
   1956             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]);
   1957             pu1_src_temp += src_strd;
   1958             pu1_pred_val = u1_pred_vals_diag_11 + 6;
   1959             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]);
   1960             pu1_src_temp += src_strd;
   1961             pu1_pred_val = u1_pred_vals_diag_121 + 6;
   1962             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]);
   1963 
   1964             i4_cost[VERT_L_I4x4] = i4_sad[VERT_L_I4x4] + ((u4_predictd_mode == VERT_L_I4x4) ?
   1965                                             u4_lambda : 4 * u4_lambda);
   1966         }
   1967 
   1968         if (u4_valid_intra_modes & 256)/* HORZ_U mode valid ????*/
   1969         {
   1970             i4_sad[HORZ_U_I4x4] = 0;
   1971             pu1_src_temp = pu1_src;
   1972             u1_pred_vals_horz_u[0] = u1_pred_vals_diag_11[2];
   1973             u1_pred_vals_horz_u[1] = u1_pred_vals_diag_121[1];
   1974             u1_pred_vals_horz_u[2] = u1_pred_vals_diag_11[1];
   1975             u1_pred_vals_horz_u[3] = u1_pred_vals_diag_121[0];
   1976             u1_pred_vals_horz_u[4] = u1_pred_vals_diag_11[0];
   1977             u1_pred_vals_horz_u[5] = FILT121(pu1_ngbr_pels[0], pu1_ngbr_pels[0], pu1_ngbr_pels[1]);
   1978 
   1979             memset((u1_pred_vals_horz_u + 6), pu1_ngbr_pels[0], 4);
   1980 
   1981             pu1_pred_val = u1_pred_vals_horz_u;
   1982             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[HORZ_U_I4x4]);
   1983             pu1_src_temp += src_strd;
   1984             USADA8(pu1_src_temp, (pu1_pred_val + 2), i4_sad[HORZ_U_I4x4]);
   1985             pu1_src_temp += src_strd;
   1986             USADA8(pu1_src_temp, (pu1_pred_val + 4), i4_sad[HORZ_U_I4x4]);
   1987             pu1_src_temp += src_strd;
   1988             USADA8(pu1_src_temp, (pu1_pred_val + 6), i4_sad[HORZ_U_I4x4]);
   1989 
   1990             i4_cost[HORZ_U_I4x4] = i4_sad[HORZ_U_I4x4] + ((u4_predictd_mode == HORZ_U_I4x4) ?
   1991                                             u4_lambda : 4 * u4_lambda);
   1992         }
   1993 
   1994         i4_min_cost = MIN3(MIN3(i4_cost[0], i4_cost[1], i4_cost[2]),
   1995                         MIN3(i4_cost[3], i4_cost[4], i4_cost[5]),
   1996                         MIN3(i4_cost[6], i4_cost[7], i4_cost[8]));
   1997 
   1998     }
   1999     else
   2000     {
   2001         /* Only first three modes valid */
   2002         i4_min_cost = MIN3(i4_cost[0], i4_cost[1], i4_cost[2]);
   2003     }
   2004 
   2005     *pu4_sadmin = i4_min_cost;
   2006 
   2007     if (i4_min_cost == i4_cost[0])
   2008     {
   2009         *u4_intra_mode = VERT_I4x4;
   2010         pu1_pred_val = pu1_ngbr_pels + 5;
   2011         memcpy(pu1_dst, (pu1_pred_val), 4);
   2012         pu1_dst += dst_strd;
   2013         memcpy(pu1_dst, (pu1_pred_val), 4);
   2014         pu1_dst += dst_strd;
   2015         memcpy(pu1_dst, (pu1_pred_val), 4);
   2016         pu1_dst += dst_strd;
   2017         memcpy(pu1_dst, (pu1_pred_val), 4);
   2018     }
   2019     else if (i4_min_cost == i4_cost[1])
   2020     {
   2021         *u4_intra_mode = HORZ_I4x4;
   2022         memset(pu1_dst, pu1_ngbr_pels[3], 4);
   2023         pu1_dst += dst_strd;
   2024         memset(pu1_dst, pu1_ngbr_pels[2], 4);
   2025         pu1_dst += dst_strd;
   2026         memset(pu1_dst, pu1_ngbr_pels[1], 4);
   2027         pu1_dst += dst_strd;
   2028         memset(pu1_dst, pu1_ngbr_pels[0], 4);
   2029     }
   2030     else if (i4_min_cost == i4_cost[2])
   2031     {
   2032         *u4_intra_mode = DC_I4x4;
   2033         memset(pu1_dst, u4_dcval, 4);
   2034         pu1_dst += dst_strd;
   2035         memset(pu1_dst, u4_dcval, 4);
   2036         pu1_dst += dst_strd;
   2037         memset(pu1_dst, u4_dcval, 4);
   2038         pu1_dst += dst_strd;
   2039         memset(pu1_dst, u4_dcval, 4);
   2040     }
   2041 
   2042     else if (i4_min_cost == i4_cost[3])
   2043     {
   2044         *u4_intra_mode = DIAG_DL_I4x4;
   2045         pu1_pred_val = u1_pred_vals_diag_121 + 5;
   2046         memcpy(pu1_dst, (pu1_pred_val), 4);
   2047         pu1_dst += dst_strd;
   2048         memcpy(pu1_dst, (pu1_pred_val + 1), 4);
   2049         pu1_dst += dst_strd;
   2050         memcpy(pu1_dst, (pu1_pred_val + 2), 4);
   2051         pu1_dst += dst_strd;
   2052         memcpy(pu1_dst, (pu1_pred_val + 3), 4);
   2053     }
   2054     else if (i4_min_cost == i4_cost[4])
   2055     {
   2056         *u4_intra_mode = DIAG_DR_I4x4;
   2057         pu1_pred_val = u1_pred_vals_diag_121 + 3;
   2058 
   2059         memcpy(pu1_dst, (pu1_pred_val), 4);
   2060         pu1_dst += dst_strd;
   2061         memcpy(pu1_dst, (pu1_pred_val - 1), 4);
   2062         pu1_dst += dst_strd;
   2063         memcpy(pu1_dst, (pu1_pred_val - 2), 4);
   2064         pu1_dst += dst_strd;
   2065         memcpy(pu1_dst, (pu1_pred_val - 3), 4);
   2066     }
   2067 
   2068     else if (i4_min_cost == i4_cost[5])
   2069     {
   2070         *u4_intra_mode = VERT_R_I4x4;
   2071         pu1_pred_val = u1_pred_vals_diag_11 + 4;
   2072         memcpy(pu1_dst, (pu1_pred_val), 4);
   2073         pu1_dst += dst_strd;
   2074         pu1_pred_val = u1_pred_vals_diag_121 + 3;
   2075         memcpy(pu1_dst, (pu1_pred_val), 4);
   2076         pu1_dst += dst_strd;
   2077         memcpy(pu1_dst, (u1_pred_vals_vert_r), 4);
   2078         pu1_dst += dst_strd;
   2079         memcpy(pu1_dst, (u1_pred_vals_vert_r + 4), 4);
   2080     }
   2081     else if (i4_min_cost == i4_cost[6])
   2082     {
   2083         *u4_intra_mode = HORZ_D_I4x4;
   2084         pu1_pred_val = u1_pred_vals_horz_d;
   2085         memcpy(pu1_dst, (pu1_pred_val + 6), 4);
   2086         pu1_dst += dst_strd;
   2087         memcpy(pu1_dst, (pu1_pred_val + 4), 4);
   2088         pu1_dst += dst_strd;
   2089         memcpy(pu1_dst, (pu1_pred_val + 2), 4);
   2090         pu1_dst += dst_strd;
   2091         memcpy(pu1_dst, (pu1_pred_val), 4);
   2092         pu1_dst += dst_strd;
   2093     }
   2094     else if (i4_min_cost == i4_cost[7])
   2095     {
   2096         *u4_intra_mode = VERT_L_I4x4;
   2097         pu1_pred_val = u1_pred_vals_diag_11 + 5;
   2098         memcpy(pu1_dst, (pu1_pred_val), 4);
   2099         pu1_dst += dst_strd;
   2100         pu1_pred_val = u1_pred_vals_diag_121 + 5;
   2101         memcpy(pu1_dst, (pu1_pred_val), 4);
   2102         pu1_dst += dst_strd;
   2103         pu1_pred_val = u1_pred_vals_diag_11 + 6;
   2104         memcpy(pu1_dst, (pu1_pred_val), 4);
   2105         pu1_dst += dst_strd;
   2106         pu1_pred_val = u1_pred_vals_diag_121 + 6;
   2107         memcpy(pu1_dst, (pu1_pred_val), 4);
   2108     }
   2109     else if (i4_min_cost == i4_cost[8])
   2110     {
   2111         *u4_intra_mode = HORZ_U_I4x4;
   2112         pu1_pred_val = u1_pred_vals_horz_u;
   2113         memcpy(pu1_dst, (pu1_pred_val), 4);
   2114         pu1_dst += dst_strd;
   2115         memcpy(pu1_dst, (pu1_pred_val + 2), 4);
   2116         pu1_dst += dst_strd;
   2117         memcpy(pu1_dst, (pu1_pred_val + 4), 4);
   2118         pu1_dst += dst_strd;
   2119         memcpy(pu1_dst, (pu1_pred_val + 6), 4);
   2120         pu1_dst += dst_strd;
   2121     }
   2122 
   2123     return;
   2124 }
   2125 
   2126 /**
   2127 ******************************************************************************
   2128 *
   2129 * @brief:
   2130 *  Evaluate best intr chroma mode (among VERT, HORZ and DC ) and do the prediction.
   2131 *
   2132 * @par Description
   2133 *  This function evaluates  first three intra chroma modes and compute corresponding sad
   2134 *  and return the buffer predicted with best mode.
   2135 *
   2136 * @param[in] pu1_src
   2137 *  UWORD8 pointer to the source
   2138 *
   2139 * @param[in] pu1_ngbr_pels
   2140 *  UWORD8 pointer to neighbouring pels
   2141 *
   2142 * @param[out] pu1_dst
   2143 *  UWORD8 pointer to the destination
   2144 *
   2145 * @param[in] src_strd
   2146 *  integer source stride
   2147 *
   2148 * @param[in] dst_strd
   2149 *  integer destination stride
   2150 *
   2151 * @param[in] u4_n_avblty
   2152 *  availability of neighbouring pixels
   2153 *
   2154 * @param[in] u4_intra_mode
   2155 *  Pointer to the variable in which best mode is returned
   2156 *
   2157 * @param[in] pu4_sadmin
   2158 *  Pointer to the variable in which minimum sad is returned
   2159 *
   2160 * @param[in] u4_valid_intra_modes
   2161 *  Says what all modes are valid
   2162 *
   2163 * @return      none
   2164 *
   2165 ******************************************************************************
   2166 */
   2167 void ih264e_evaluate_intra_chroma_modes(UWORD8 *pu1_src,
   2168                                         UWORD8 *pu1_ngbr_pels,
   2169                                         UWORD8 *pu1_dst,
   2170                                         UWORD32 src_strd,
   2171                                         UWORD32 dst_strd,
   2172                                         WORD32 u4_n_avblty,
   2173                                         UWORD32 *u4_intra_mode,
   2174                                         WORD32 *pu4_sadmin,
   2175                                         UWORD32 u4_valid_intra_modes)
   2176 {
   2177     UWORD8 *pu1_neighbour;
   2178     UWORD8 *pu1_src_temp = pu1_src;
   2179     UWORD8 left = 0, top = 0;
   2180     WORD32 u4_dcval_u_l[2] = { 0, 0 }, /*sum left neighbours for 'U' ,two separate sets - sum of first four from top,and sum of four values from bottom */
   2181            u4_dcval_u_t[2] = { 0, 0 };  /*sum top neighbours for 'U'*/
   2182 
   2183     WORD32 u4_dcval_v_l[2] = { 0, 0 }, /*sum left neighbours for 'V'*/
   2184            u4_dcval_v_t[2] = { 0, 0 }; /*sum top neighbours for 'V'*/
   2185 
   2186     WORD32 i, j, row, col, i4_sad_vert = INT_MAX, i4_sad_horz = INT_MAX,
   2187                     i4_sad_dc = INT_MAX, i4_min_sad = INT_MAX;
   2188     UWORD8 val_u, val_v;
   2189 
   2190     WORD32 u4_dc_val[2][2][2];/*  -----------
   2191                                   |    |    |  Chroma can have four
   2192                                   | 00 | 01 |  separate dc value...
   2193                                   -----------  u4_dc_val corresponds to this dc values
   2194                                   |    |    |  with u4_dc_val[2][2][U] and u4_dc_val[2][2][V]
   2195                                   | 10 | 11 |
   2196                                   -----------                */
   2197     left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
   2198     top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;
   2199 
   2200     /*Evaluating HORZ*/
   2201     if (left)/* Ifleft available*/
   2202     {
   2203         i4_sad_horz = 0;
   2204 
   2205         for (i = 0; i < 8; i++)
   2206         {
   2207             val_v = pu1_ngbr_pels[15 - 2 * i];
   2208             val_u = pu1_ngbr_pels[15 - 2 * i - 1];
   2209             row = i / 4;
   2210             u4_dcval_u_l[row] += val_u;
   2211             u4_dcval_v_l[row] += val_v;
   2212             for (j = 0; j < 8; j++)
   2213             {
   2214                 i4_sad_horz += ABS(val_u - pu1_src_temp[2 * j]);/* Finding SAD for HORZ mode*/
   2215                 i4_sad_horz += ABS(val_v - pu1_src_temp[2 * j + 1]);
   2216             }
   2217 
   2218             pu1_src_temp += src_strd;
   2219         }
   2220         u4_dcval_u_l[0] += 2;
   2221         u4_dcval_u_l[1] += 2;
   2222         u4_dcval_v_l[0] += 2;
   2223         u4_dcval_v_l[1] += 2;
   2224     }
   2225 
   2226     /*Evaluating VERT**/
   2227     pu1_src_temp = pu1_src;
   2228     if (top) /* top available*/
   2229     {
   2230         i4_sad_vert = 0;
   2231 
   2232         for (i = 0; i < 8; i++)
   2233         {
   2234             col = i / 4;
   2235 
   2236             val_u = pu1_ngbr_pels[18 + i * 2];
   2237             val_v = pu1_ngbr_pels[18 + i * 2 + 1];
   2238             u4_dcval_u_t[col] += val_u;
   2239             u4_dcval_v_t[col] += val_v;
   2240 
   2241             for (j = 0; j < 16; j++)
   2242             {
   2243                 i4_sad_vert += ABS(pu1_ngbr_pels[18 + j] - pu1_src_temp[j]);/* Finding SAD for VERT mode*/
   2244             }
   2245             pu1_src_temp += src_strd;
   2246 
   2247         }
   2248         u4_dcval_u_t[0] += 2;
   2249         u4_dcval_u_t[1] += 2;
   2250         u4_dcval_v_t[0] += 2;
   2251         u4_dcval_v_t[1] += 2;
   2252     }
   2253 
   2254     /* computing DC value*/
   2255     /* Equation  8-128 in spec*/
   2256     u4_dc_val[0][0][0] = (u4_dcval_u_l[0] + u4_dcval_u_t[0]) >> (1 + left + top);
   2257     u4_dc_val[0][0][1] = (u4_dcval_v_l[0] + u4_dcval_v_t[0]) >> (1 + left + top);
   2258     u4_dc_val[1][1][0] = (u4_dcval_u_l[1] + u4_dcval_u_t[1]) >> (1 + left + top);
   2259     u4_dc_val[1][1][1] = (u4_dcval_v_l[1] + u4_dcval_v_t[1]) >> (1 + left + top);
   2260 
   2261     if (top)
   2262     {
   2263         /* Equation  8-132 in spec*/
   2264         u4_dc_val[0][1][0] = (u4_dcval_u_t[1]) >> (1 + top);
   2265         u4_dc_val[0][1][1] = (u4_dcval_v_t[1]) >> (1 + top);
   2266     }
   2267     else
   2268     {
   2269         u4_dc_val[0][1][0] = (u4_dcval_u_l[0]) >> (1 + left);
   2270         u4_dc_val[0][1][1] = (u4_dcval_v_l[0]) >> (1 + left);
   2271     }
   2272 
   2273     if (left)
   2274     {
   2275         u4_dc_val[1][0][0] = (u4_dcval_u_l[1]) >> (1 + left);
   2276         u4_dc_val[1][0][1] = (u4_dcval_v_l[1]) >> (1 + left);
   2277     }
   2278     else
   2279     {
   2280         u4_dc_val[1][0][0] = (u4_dcval_u_t[0]) >> (1 + top);
   2281         u4_dc_val[1][0][1] = (u4_dcval_v_t[0]) >> (1 + top);
   2282     }
   2283 
   2284     if (!(left || top))
   2285     {
   2286         /*none available*/
   2287         u4_dc_val[0][0][0] = u4_dc_val[0][0][1] =
   2288         u4_dc_val[0][1][0] = u4_dc_val[0][1][1] =
   2289         u4_dc_val[1][0][0] = u4_dc_val[1][0][1] =
   2290         u4_dc_val[1][1][0] = u4_dc_val[1][1][1] = 128;
   2291     }
   2292 
   2293     /* Evaluating DC */
   2294     pu1_src_temp = pu1_src;
   2295     i4_sad_dc = 0;
   2296     for (i = 0; i < 8; i++)
   2297     {
   2298         for (j = 0; j < 8; j++)
   2299         {
   2300             col = j / 4;
   2301             row = i / 4;
   2302             val_u = u4_dc_val[row][col][0];
   2303             val_v = u4_dc_val[row][col][1];
   2304 
   2305             i4_sad_dc += ABS(val_u - pu1_src_temp[2 * j]);/* Finding SAD for DC mode*/
   2306             i4_sad_dc += ABS(val_v - pu1_src_temp[2 * j + 1]);
   2307         }
   2308         pu1_src_temp += src_strd;
   2309     }
   2310 
   2311     if ((u4_valid_intra_modes & 01) == 0)/* If DC is disabled*/
   2312         i4_sad_dc = INT_MAX;
   2313     if ((u4_valid_intra_modes & 02) == 0)/* If HORZ is disabled*/
   2314         i4_sad_horz = INT_MAX;
   2315     if ((u4_valid_intra_modes & 04) == 0)/* If VERT is disabled*/
   2316         i4_sad_vert = INT_MAX;
   2317 
   2318     i4_min_sad = MIN3(i4_sad_horz, i4_sad_dc, i4_sad_vert);
   2319 
   2320     /* Finding Minimum sad and doing corresponding prediction*/
   2321     if (i4_min_sad < *pu4_sadmin)
   2322     {
   2323         *pu4_sadmin = i4_min_sad;
   2324 
   2325         if (i4_min_sad == i4_sad_dc)
   2326         {
   2327             *u4_intra_mode = DC_CH_I8x8;
   2328             for (i = 0; i < 8; i++)
   2329             {
   2330                 for (j = 0; j < 8; j++)
   2331                 {
   2332                     col = j / 4;
   2333                     row = i / 4;
   2334 
   2335                     pu1_dst[2 * j] = u4_dc_val[row][col][0];
   2336                     pu1_dst[2 * j + 1] = u4_dc_val[row][col][1];
   2337                 }
   2338                 pu1_dst += dst_strd;
   2339             }
   2340         }
   2341         else if (i4_min_sad == i4_sad_horz)
   2342         {
   2343             *u4_intra_mode = HORZ_CH_I8x8;
   2344             for (j = 0; j < 8; j++)
   2345             {
   2346                 val_v = pu1_ngbr_pels[15 - 2 * j];
   2347                 val_u = pu1_ngbr_pels[15 - 2 * j - 1];
   2348 
   2349                 for (i = 0; i < 8; i++)
   2350                 {
   2351                     pu1_dst[2 * i] = val_u;
   2352                     pu1_dst[2 * i + 1] = val_v;
   2353 
   2354                 }
   2355                 pu1_dst += dst_strd;
   2356             }
   2357         }
   2358         else
   2359         {
   2360             *u4_intra_mode = VERT_CH_I8x8;
   2361             pu1_neighbour = pu1_ngbr_pels + 18;
   2362             for (j = 0; j < 8; j++)
   2363             {
   2364                 memcpy(pu1_dst, pu1_neighbour, MB_SIZE);
   2365                 pu1_dst += dst_strd;
   2366             }
   2367         }
   2368     }
   2369 
   2370     return;
   2371 }
   2372