Home | History | Annotate | Download | only in encoder
      1 /******************************************************************************
      2  *
      3  * Copyright (C) 2015 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at:
      8  *
      9  * http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  *****************************************************************************
     18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 */
     20 
     21 /**
     22 *******************************************************************************
     23 * @file
     24 *  ih264e_intra_modes_eval.c
     25 *
     26 * @brief
     27 *  This file contains definitions of routines that perform rate distortion
     28 *  analysis on a macroblock if they are to be coded as intra.
     29 *
     30 * @author
     31 *  ittiam
     32 *
     33 * @par List of Functions:
*  - ih264e_derive_nghbr_avbl_of_mbs()
     35 *  - ih264e_derive_ngbr_avbl_of_mb_partitions()
     36 *  - ih264e_evaluate_intra16x16_modes_for_least_cost_rdoptoff()
     37 *  - ih264e_evaluate_intra8x8_modes_for_least_cost_rdoptoff()
     38 *  - ih264e_evaluate_intra4x4_modes_for_least_cost_rdoptoff()
     39 *  - ih264e_evaluate_intra4x4_modes_for_least_cost_rdopton()
     40 *  - ih264e_evaluate_chroma_intra8x8_modes_for_least_cost_rdoptoff()
     41 *  - ih264e_evaluate_intra16x16_modes()
     42 *  - ih264e_evaluate_intra4x4_modes()
     43 *  - ih264e_evaluate_intra_chroma_modes()
     44 *
     45 * @remarks
     46 *  None
     47 *
     48 *******************************************************************************
     49 */
     50 
     51 /*****************************************************************************/
     52 /* File Includes                                                             */
     53 /*****************************************************************************/
     54 
     55 /* System include files */
     56 #include <stdio.h>
     57 #include <string.h>
     58 #include <limits.h>
     59 #include <assert.h>
     60 
     61 /* User include files */
     62 #include "ih264e_config.h"
     63 #include "ih264_typedefs.h"
     64 #include "ih264e_defs.h"
     65 #include "iv2.h"
     66 #include "ive2.h"
     67 #include "ih264_debug.h"
     68 #include "ih264_defs.h"
     69 #include "ih264_macros.h"
     70 #include "ih264_intra_pred_filters.h"
     71 #include "ih264_structs.h"
     72 #include "ih264_common_tables.h"
     73 #include "ih264_trans_quant_itrans_iquant.h"
     74 #include "ih264_inter_pred_filters.h"
     75 #include "ih264_mem_fns.h"
     76 #include "ih264_padding.h"
     77 #include "ih264_deblk_edge_filters.h"
     78 #include "ih264_cabac_tables.h"
     79 #include "ime_distortion_metrics.h"
     80 #include "ih264e_error.h"
     81 #include "ih264e_bitstream.h"
     82 #include "ime_defs.h"
     83 #include "ime_structs.h"
     84 #include "irc_cntrl_param.h"
     85 #include "irc_frame_info_collector.h"
     86 #include "ih264e_rate_control.h"
     87 #include "ih264e_cabac_structs.h"
     88 #include "ih264e_structs.h"
     89 #include "ih264e_intra_modes_eval.h"
     90 #include "ih264e_globals.h"
     91 #include "ime_platform_macros.h"
     92 
     93 
     94 /*****************************************************************************/
     95 /* Function Definitions                                                      */
     96 /*****************************************************************************/
     97 
     98 /**
     99 ******************************************************************************
    100 *
    101 * @brief
    102 *  derivation process for macroblock availability
    103 *
    104 * @par   Description
    105 *  Calculates the availability of the left, top, topright and topleft macroblocks.
    106 *
    107 * @param[in] ps_proc_ctxt
    108 *  pointer to proc context (handle)
    109 *
    110 * @remarks Based on section 6.4.5 in H264 spec
    111 *
    112 * @return  none
    113 *
    114 ******************************************************************************
    115 */
    116 void ih264e_derive_nghbr_avbl_of_mbs(process_ctxt_t *ps_proc)
    117 {
    118     UWORD8 *pu1_slice_idx_curr = ps_proc->pu1_slice_idx;
    119     UWORD8 *pu1_slice_idx_b;
    120     UWORD8 *pu1_slice_idx_a;
    121     UWORD8 *pu1_slice_idx_c;
    122     UWORD8 *pu1_slice_idx_d;
    123     block_neighbors_t *ps_ngbr_avbl;
    124     WORD32 i4_mb_x, i4_mb_y;
    125     WORD32 i4_wd_mbs;
    126 
    127     i4_mb_x = ps_proc->i4_mb_x;
    128     i4_mb_y = ps_proc->i4_mb_y;
    129 
    130     i4_wd_mbs = ps_proc->i4_wd_mbs;
    131 
    132     pu1_slice_idx_curr += (i4_mb_y * i4_wd_mbs) + i4_mb_x;
    133     pu1_slice_idx_a = pu1_slice_idx_curr - 1;
    134     pu1_slice_idx_b = pu1_slice_idx_curr - i4_wd_mbs;
    135     pu1_slice_idx_c = pu1_slice_idx_b + 1;
    136     pu1_slice_idx_d = pu1_slice_idx_b - 1;
    137     ps_ngbr_avbl = ps_proc->ps_ngbr_avbl;
    138 
    139     /**********************************************************************/
    140     /* The macroblock is marked as available, unless one of the following */
    141     /* conditions is true in which case the macroblock shall be marked as */
    142     /* not available.                                                     */
    143     /* 1. mbAddr < 0                                                      */
    144     /* 2  mbAddr > CurrMbAddr                                             */
    145     /* 3. the macroblock with address mbAddr belongs to a different slice */
    146     /* than the macroblock with address CurrMbAddr                        */
    147     /**********************************************************************/
    148 
    149     /* left macroblock availability */
    150     if (i4_mb_x == 0)
    151     { /* macroblocks along first column */
    152         ps_ngbr_avbl->u1_mb_a = 0;
    153     }
    154     else
    155     { /* macroblocks belong to same slice? */
    156         if (*pu1_slice_idx_a != *pu1_slice_idx_curr)
    157             ps_ngbr_avbl->u1_mb_a = 0;
    158         else
    159             ps_ngbr_avbl->u1_mb_a = 1;
    160     }
    161 
    162     /* top macroblock availability */
    163     if (i4_mb_y == 0)
    164     { /* macroblocks along first row */
    165         ps_ngbr_avbl->u1_mb_b = 0;
    166     }
    167     else
    168     { /* macroblocks belong to same slice? */
    169         if (*pu1_slice_idx_b != *pu1_slice_idx_curr)
    170             ps_ngbr_avbl->u1_mb_b = 0;
    171         else
    172             ps_ngbr_avbl->u1_mb_b = 1;
    173     }
    174 
    175     /* top right macroblock availability */
    176     if (i4_mb_x == i4_wd_mbs-1 || i4_mb_y == 0)
    177     { /* macroblocks along last column */
    178         ps_ngbr_avbl->u1_mb_c = 0;
    179     }
    180     else
    181     { /* macroblocks belong to same slice? */
    182         if (*pu1_slice_idx_c != *pu1_slice_idx_curr)
    183             ps_ngbr_avbl->u1_mb_c = 0;
    184         else
    185             ps_ngbr_avbl->u1_mb_c = 1;
    186     }
    187 
    188     /* top left macroblock availability */
    189     if (i4_mb_x == 0 || i4_mb_y == 0)
    190     { /* macroblocks along first column */
    191         ps_ngbr_avbl->u1_mb_d = 0;
    192     }
    193     else
    194     { /* macroblocks belong to same slice? */
    195         if (*pu1_slice_idx_d != *pu1_slice_idx_curr)
    196             ps_ngbr_avbl->u1_mb_d = 0;
    197         else
    198             ps_ngbr_avbl->u1_mb_d = 1;
    199     }
    200 }
    201 
    202 /**
    203 ******************************************************************************
    204 *
    205 * @brief
    206 *  derivation process for subblock/partition availability
    207 *
    208 * @par   Description
    209 *  Calculates the availability of the left, top, topright and topleft subblock
    210 *  or partitions.
    211 *
    212 * @param[in]    ps_proc_ctxt
    213 *  pointer to macroblock context (handle)
    214 *
    215 * @param[in]    i1_pel_pos_x
    216 *  column position of the pel wrt the current block
    217 *
    218 * @param[in]    i1_pel_pos_y
    219 *  row position of the pel in wrt current block
    220 *
    221 * @remarks     Assumptions: before calling this function it is assumed that
    222 *   the neighbor availability of the current macroblock is already derived.
    223 *   Based on table 6-3 of H264 specification
    224 *
    225 * @return      availability status (yes or no)
    226 *
    227 ******************************************************************************
    228 */
    229 UWORD8 ih264e_derive_ngbr_avbl_of_mb_partitions(block_neighbors_t *ps_ngbr_avbl,
    230                                                 WORD8 i1_pel_pos_x,
    231                                                 WORD8 i1_pel_pos_y)
    232 {
    233     UWORD8 u1_neighbor_avail=0;
    234 
    235     /**********************************************************************/
    236     /* values of i1_pel_pos_x in the range 0-15 inclusive correspond to   */
    237     /* various columns of a macroblock                                    */
    238     /*                                                                    */
    239     /* values of i1_pel_pos_y in the range 0-15 inclusive correspond to   */
    240     /* various rows of a macroblock                                       */
    241     /*                                                                    */
    242     /* other values of i1_pel_pos_x & i1_pel_pos_y represents elements    */
    243     /* outside the bound of an mb ie., represents its neighbors.          */
    244     /**********************************************************************/
    245     if (i1_pel_pos_x < 0)
    246     { /* column(-1) */
    247         if (i1_pel_pos_y < 0)
    248         { /* row(-1) */
    249             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_d; /* current mb topleft availability */
    250         }
    251         else if (i1_pel_pos_y >= 0 && i1_pel_pos_y < 16)
    252         { /* all rows of a macroblock */
    253             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_a; /* current mb left availability */
    254         }
    255         else /* if (i1_pel_pos_y >= 16) */
    256         { /* rows(+16) */
    257             u1_neighbor_avail = 0;  /* current mb bottom left availability */
    258         }
    259     }
    260     else if (i1_pel_pos_x >= 0 && i1_pel_pos_x < 16)
    261     { /* all columns of a macroblock */
    262         if (i1_pel_pos_y < 0)
    263         { /* row(-1) */
    264             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_b; /* current mb top availability */
    265         }
    266         else if (i1_pel_pos_y >= 0 && i1_pel_pos_y < 16)
    267         { /* all rows of a macroblock */
    268             u1_neighbor_avail = 1; /* current mb availability */
    269             /* availability of the partition is dependent on the position of the partition inside the mb */
    270             /* although the availability is declared as 1 in all cases these needs to be corrected somewhere else and this is not done in here */
    271         }
    272         else /* if (i1_pel_pos_y >= 16) */
    273         { /* rows(+16) */
    274             u1_neighbor_avail = 0;  /* current mb bottom availability */
    275         }
    276     }
    277     else if (i1_pel_pos_x >= 16)
    278     { /* column(+16) */
    279         if (i1_pel_pos_y < 0)
    280         { /* row(-1) */
    281             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_c; /* current mb top right availability */
    282         }
    283         else /* if (i1_pel_pos_y >= 0) */
    284         { /* all other rows */
    285             u1_neighbor_avail = 0;  /* current mb right & bottom right availability */
    286         }
    287     }
    288 
    289     return u1_neighbor_avail;
    290 }
    291 
    292 /**
    293 ******************************************************************************
    294 *
    295 * @brief
    296 *  evaluate best intra 16x16 mode (rate distortion opt off)
    297 *
    298 * @par Description
    299 *  This function evaluates all the possible intra 16x16 modes and finds the mode
    300 *  that best represents the macro-block (least distortion) and occupies fewer
    301 *  bits in the bit-stream.
    302 *
    303 * @param[in]   ps_proc_ctxt
    304 *  pointer to process context (handle)
    305 *
    306 * @remarks
    307 *  Ideally the cost of encoding a macroblock is calculated as
    308 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
    309 *  input block and the reconstructed block and rate is the number of bits taken
    310 *  to place the macroblock in the bit-stream. In this routine the rate does not
    311 *  exactly point to the total number of bits it takes, rather it points to header
    312 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
    313 *  and residual bits fall in to texture bits the number of bits taken to encoding
    314 *  mbtype is considered as rate, we compute cost. Further we will approximate
    315 *  the distortion as the deviation b/w input and the predicted block as opposed
    316 *  to input and reconstructed block.
    317 *
    318 *  NOTE: As per the Document JVT-O079, for intra 16x16 macroblock,
    319 *  the SAD and cost are one and the same.
    320 *
    321 * @return     none
    322 *
    323 ******************************************************************************
    324 */
    325 
    326 void ih264e_evaluate_intra16x16_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
    327 {
    328     /* Codec Context */
    329     codec_t *ps_codec = ps_proc->ps_codec;
    330 
    331     /* SAD(distortion metric) of an 8x8 block */
    332     WORD32 i4_mb_distortion = INT_MAX, i4_mb_distortion_least = INT_MAX;
    333 
    334     /* lambda */
    335     UWORD32 u4_lambda = ps_proc->u4_lambda;
    336 
    337     /* cost = distortion + lambda*rate */
    338     WORD32 i4_mb_cost= INT_MAX, i4_mb_cost_least = INT_MAX;
    339 
    340     /* intra mode */
    341     UWORD32 u4_intra_mode, u4_best_intra_16x16_mode = DC_I16x16;
    342 
    343     /* neighbor pels for intra prediction */
    344     UWORD8 *pu1_ngbr_pels_i16 = ps_proc->au1_ngbr_pels;
    345 
    346     /* neighbor availability */
    347     WORD32 i4_ngbr_avbl;
    348 
    349     /* pointer to src macro block */
    350     UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_luma;
    351     UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_luma;
    352 
    353     /* pointer to prediction macro block */
    354     UWORD8 *pu1_pred_mb_intra_16x16 = ps_proc->pu1_pred_mb_intra_16x16;
    355     UWORD8 *pu1_pred_mb_intra_16x16_plane = ps_proc->pu1_pred_mb_intra_16x16_plane;
    356 
    357     /* strides */
    358     WORD32 i4_src_strd = ps_proc->i4_src_strd;
    359     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
    360     WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
    361 
    362     /* pointer to neighbors left, top, topleft */
    363     UWORD8 *pu1_mb_a = pu1_ref_mb - 1;
    364     UWORD8 *pu1_mb_b = pu1_ref_mb - i4_rec_strd;
    365     UWORD8 *pu1_mb_d = pu1_mb_b - 1;
    366     UWORD8 u1_mb_a, u1_mb_b, u1_mb_d;
    367     /* valid intra modes map */
    368     UWORD32 u4_valid_intra_modes;
    369 
    370     /* lut for valid intra modes */
    371     const UWORD8 u1_valid_intra_modes[8] = {4, 6, 4, 6, 5, 7, 5, 15};
    372 
    373     /* temp var */
    374     UWORD32 i, u4_enable_fast_sad = 0, offset = 0;
    375     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
    376     UWORD32 u4_constrained_intra_pred = ps_proc->ps_codec->s_cfg.u4_constrained_intra_pred;
    377 
    378     /* init temp var */
    379     if (ps_proc->i4_slice_type != ISLICE)
    380     {
    381         /* Offset for MBtype */
    382         offset = (ps_proc->i4_slice_type == PSLICE) ? 5 : 23;
    383         u4_enable_fast_sad = ps_proc->s_me_ctxt.u4_enable_fast_sad;
    384     }
    385 
    386     /* locating neighbors that are available for prediction */
    387 
    388     /* gather prediction pels from the neighbors, if particular set is not available
    389      * it is set to zero*/
    390     /* left pels */
    391     u1_mb_a = ((ps_proc->ps_ngbr_avbl->u1_mb_a)
    392                     && (u4_constrained_intra_pred ? ps_proc->s_left_mb_syntax_ele.u2_is_intra : 1));
    393     if (u1_mb_a)
    394     {
    395         for(i = 0; i < 16; i++)
    396             pu1_ngbr_pels_i16[16-1-i] = pu1_mb_a[i * i4_rec_strd];
    397     }
    398     else
    399     {
    400         ps_codec->pf_mem_set_mul8(pu1_ngbr_pels_i16,0,MB_SIZE);
    401     }
    402     /* top pels */
    403     u1_mb_b = ((ps_proc->ps_ngbr_avbl->u1_mb_b)
    404                     && (u4_constrained_intra_pred ? ps_top_mb_syn_ele->u2_is_intra : 1));
    405     if (u1_mb_b)
    406     {
    407         ps_codec->pf_mem_cpy_mul8(pu1_ngbr_pels_i16+16+1,pu1_mb_b,16);
    408     }
    409     else
    410     {
    411         ps_codec->pf_mem_set_mul8(pu1_ngbr_pels_i16+16+1,0,MB_SIZE);
    412     }
    413     /* topleft pels */
    414     u1_mb_d = ((ps_proc->ps_ngbr_avbl->u1_mb_d)
    415                     && (u4_constrained_intra_pred ? ps_proc->s_top_left_mb_syntax_ele.u2_is_intra : 1));
    416     if (u1_mb_d)
    417     {
    418         pu1_ngbr_pels_i16[16] = *pu1_mb_d;
    419     }
    420     else
    421     {
    422         pu1_ngbr_pels_i16[16] = 0;
    423     }
    424 
    425     i4_ngbr_avbl = (u1_mb_a) + (u1_mb_b << 2) + (u1_mb_d << 1);
    426     ps_proc->i4_ngbr_avbl_16x16_mb = i4_ngbr_avbl;
    427 
    428     /* set valid intra modes for evaluation */
    429     u4_valid_intra_modes = u1_valid_intra_modes[i4_ngbr_avbl];
    430 
    431     if (ps_codec->s_cfg.u4_enc_speed_preset == IVE_FAST)
    432         u4_valid_intra_modes &= ~(1 << PLANE_I16x16);
    433 
    434     /* evaluate b/w HORZ_I16x16, VERT_I16x16 & DC_I16x16 */
    435     ps_codec->pf_ih264e_evaluate_intra16x16_modes(pu1_curr_mb, pu1_ngbr_pels_i16, pu1_pred_mb_intra_16x16,
    436                                                   i4_src_strd, i4_pred_strd,
    437                                                   i4_ngbr_avbl, &u4_intra_mode, &i4_mb_distortion_least,
    438                                                   u4_valid_intra_modes);
    439 
    440     /* cost = distortion + lambda*rate */
    441     i4_mb_cost_least = i4_mb_distortion_least;
    442 
    443     if ((( (u4_valid_intra_modes >> 3) & 1) != 0) && (ps_codec->s_cfg.u4_enc_speed_preset != IVE_FASTEST ||
    444                     ps_proc->i4_slice_type == ISLICE))
    445     {
    446         /* intra prediction for PLANE mode*/
    447         (ps_codec->apf_intra_pred_16_l)[PLANE_I16x16](pu1_ngbr_pels_i16, pu1_pred_mb_intra_16x16_plane, 0, i4_pred_strd, i4_ngbr_avbl);
    448 
    449         /* evaluate distortion between the actual blk and the estimated blk for the given mode */
    450         ps_codec->apf_compute_sad_16x16[u4_enable_fast_sad](pu1_curr_mb, pu1_pred_mb_intra_16x16_plane, i4_src_strd, i4_pred_strd, i4_mb_cost_least, &i4_mb_distortion);
    451 
    452         /* cost = distortion + lambda*rate */
    453         i4_mb_cost = i4_mb_distortion;
    454 
    455         /* update the least cost information if necessary */
    456         if(i4_mb_cost < i4_mb_distortion_least)
    457         {
    458             u4_intra_mode = PLANE_I16x16;
    459 
    460             i4_mb_cost_least = i4_mb_cost;
    461             i4_mb_distortion_least = i4_mb_distortion;
    462         }
    463     }
    464 
    465     u4_best_intra_16x16_mode = u4_intra_mode;
    466 
    467     DEBUG("%d partition cost, %d intra mode\n", i4_mb_cost_least * 32, u4_best_intra_16x16_mode);
    468 
    469     ps_proc->u1_l_i16_mode = u4_best_intra_16x16_mode;
    470 
    471     /* cost = distortion + lambda*rate */
    472     i4_mb_cost_least    = i4_mb_distortion_least + u4_lambda*u1_uev_codelength[offset + u4_best_intra_16x16_mode];
    473 
    474 
    475     /* update the type of the mb if necessary */
    476     if (i4_mb_cost_least < ps_proc->i4_mb_cost)
    477     {
    478         ps_proc->i4_mb_cost = i4_mb_cost_least;
    479         ps_proc->i4_mb_distortion = i4_mb_distortion_least;
    480         ps_proc->u4_mb_type = I16x16;
    481     }
    482 
    483     return ;
    484 }
    485 
    486 
    487 /**
    488 ******************************************************************************
    489 *
    490 * @brief
*  evaluate best intra 8x8 mode (rate distortion opt off)
    492 *
    493 * @par Description
    494 *  This function evaluates all the possible intra 8x8 modes and finds the mode
    495 *  that best represents the macro-block (least distortion) and occupies fewer
    496 *  bits in the bit-stream.
    497 *
    498 * @param[in]    ps_proc_ctxt
    499 *  pointer to proc ctxt
    500 *
    501 * @remarks Ideally the cost of encoding a macroblock is calculated as
    502 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
    503 *  input block and the reconstructed block and rate is the number of bits taken
    504 *  to place the macroblock in the bit-stream. In this routine the rate does not
    505 *  exactly point to the total number of bits it takes, rather it points to header
    506 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
    507 *  and residual bits fall in to texture bits the number of bits taken to encoding
    508 *  mbtype is considered as rate, we compute cost. Further we will approximate
    509 *  the distortion as the deviation b/w input and the predicted block as opposed
    510 *  to input and reconstructed block.
    511 *
    512 *  NOTE: TODO: This function needs to be tested
    513 *
    514 *  @return      none
    515 *
    516 ******************************************************************************
    517 */
    518 void ih264e_evaluate_intra8x8_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
    519 {
    520     /* Codec Context */
    521     codec_t *ps_codec = ps_proc->ps_codec;
    522 
    523     /* SAD(distortion metric) of an 4x4 block */
    524     WORD32 i4_partition_distortion, i4_partition_distortion_least = INT_MAX, i4_total_distortion = 0;
    525 
    526     /* lambda */
    527     UWORD32 u4_lambda = ps_proc->u4_lambda;
    528 
    529     /* cost = distortion + lambda*rate */
    530     WORD32 i4_partition_cost, i4_partition_cost_least, i4_total_cost = u4_lambda;
    531 
    532     /* cost due to mbtype */
    533     UWORD32 u4_cost_one_bit = u4_lambda, u4_cost_four_bits = 4 * u4_lambda;
    534 
    535     /* intra mode */
    536     UWORD32 u4_intra_mode, u4_best_intra_8x8_mode = DC_I8x8, u4_estimated_intra_8x8_mode;
    537 
    538     /* neighbor pels for intra prediction */
    539     UWORD8 *pu1_ngbr_pels_i8 = ps_proc->au1_ngbr_pels;
    540 
    541     /* pointer to curr partition */
    542     UWORD8 *pu1_mb_curr;
    543 
    544     /* pointer to prediction macro block */
    545     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
    546 
    547     /* strides */
    548     WORD32 i4_src_strd = ps_proc->i4_src_strd;
    549     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
    550 
    551     /* neighbors left, top, top right, top left */
    552     UWORD8 *pu1_mb_a;
    553     UWORD8 *pu1_mb_b;
    554     UWORD8 *pu1_mb_d;
    555 
    556     /* neighbor availability */
    557     WORD32 i4_ngbr_avbl;
    558     block_neighbors_t s_ngbr_avbl;
    559 
    560     /* temp vars */
    561     UWORD32  b8, u4_pix_x, u4_pix_y;
    562     UWORD32 u4_constrained_intra_pred = ps_proc->ps_codec->s_cfg.u4_constrained_intra_pred;
    563     block_neighbors_t s_ngbr_avbl_MB;
    564 
    565     /* ngbr mb syntax information */
    566     UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (ps_proc->i4_mb_x << 4);
    567     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
    568     mb_info_t *ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
    569     /* valid intra modes map */
    570     UWORD32 u4_valid_intra_modes;
    571 
    572     if (ps_proc->ps_ngbr_avbl->u1_mb_c)
    573     {
    574         ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + (ps_proc->i4_mb_x + 1);
    575     }
    576     /* left pels */
    577     s_ngbr_avbl_MB.u1_mb_a = ((ps_proc->ps_ngbr_avbl->u1_mb_a)
    578                                   && (u4_constrained_intra_pred ? ps_proc->s_left_mb_syntax_ele.u2_is_intra : 1));
    579 
    580     /* top pels */
    581     s_ngbr_avbl_MB.u1_mb_b = ((ps_proc->ps_ngbr_avbl->u1_mb_b)
    582                                   && (u4_constrained_intra_pred ? ps_top_mb_syn_ele->u2_is_intra : 1));
    583 
    584     /* topleft pels */
    585     s_ngbr_avbl_MB.u1_mb_d = ((ps_proc->ps_ngbr_avbl->u1_mb_d)
    586                                   && (u4_constrained_intra_pred ? ps_proc->s_top_left_mb_syntax_ele.u2_is_intra : 1));
    587 
    588     /* top right */
    589     s_ngbr_avbl_MB.u1_mb_c = ((ps_proc->ps_ngbr_avbl->u1_mb_c)
    590                                   && (u4_constrained_intra_pred ? ps_top_right_mb_syn_ele->u2_is_intra : 1));
    591 
    592 
    593     for(b8 = 0; b8 < 4; b8++)
    594     {
    595         u4_pix_x = (b8 & 0x01) << 3;
    596         u4_pix_y = (b8 >> 1) << 3;
    597 
    598         pu1_mb_curr = ps_proc->pu1_src_buf_luma + u4_pix_x + (u4_pix_y * i4_src_strd);
    599         /* when rdopt is off, we use the input as reference for constructing prediction buffer */
    600         /* as opposed to using the recon pels. (open loop intra prediction) */
    601         pu1_mb_a = pu1_mb_curr - 1; /* pointer to left macro block */
    602         pu1_mb_b = pu1_mb_curr - i4_src_strd; /* pointer to top macro block */
    603         pu1_mb_d = pu1_mb_b - 1; /* pointer to top left macro block */
    604 
    605         /* locating neighbors that are available for prediction */
    606         /* TODO : update the neighbor availability information basing on constrained intra pred information */
    607         /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines */
    608         /* basing on neighbors available and hence evade the computation of neighbor availability totally. */
    609         s_ngbr_avbl.u1_mb_a = ih264e_derive_ngbr_avbl_of_mb_partitions(&s_ngbr_avbl_MB, u4_pix_x - 1, u4_pix_y); /* xD = -1, yD = 0 */
    610         s_ngbr_avbl.u1_mb_b = ih264e_derive_ngbr_avbl_of_mb_partitions(&s_ngbr_avbl_MB, u4_pix_x, u4_pix_y - 1); /* xD = 0, yD = -1 */
    611         s_ngbr_avbl.u1_mb_c = ih264e_derive_ngbr_avbl_of_mb_partitions(&s_ngbr_avbl_MB, u4_pix_x + 8, u4_pix_y - 1); /* xD = BLK_8x8_SIZE, yD = -1 */
    612         s_ngbr_avbl.u1_mb_d = ih264e_derive_ngbr_avbl_of_mb_partitions(&s_ngbr_avbl_MB, u4_pix_x - 1, u4_pix_y - 1); /* xD = -1, yD = -1 */
    613 
    614         /* i4_ngbr_avbl = blk_a * LEFT_MB_AVAILABLE_MASK + blk_b * TOP_MB_AVAILABLE_MASK + blk_c * TOP_RIGHT_MB_AVAILABLE_MASK + blk_d * TOP_LEFT_MB_AVAILABLE_MASK */
    615         i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) +  (s_ngbr_avbl.u1_mb_c << 3) +
    616                         (s_ngbr_avbl.u1_mb_a << 4);
    617         /* if top partition is available and top right is not available for intra prediction, then */
    618         /* padd top right samples using top sample and make top right also available */
    619         /* i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) +  ((s_ngbr_avbl.u1_mb_b | s_ngbr_avbl.u1_mb_c) << 3); */
    620         ps_proc->ai4_neighbor_avail_8x8_subblks[b8] = i4_ngbr_avbl;
    621 
    622 
    623         ih264_intra_pred_luma_8x8_mode_ref_filtering(pu1_mb_a, pu1_mb_b, pu1_mb_d, pu1_ngbr_pels_i8,
    624                                                      i4_src_strd, i4_ngbr_avbl);
    625 
    626         i4_partition_cost_least = INT_MAX;
    627         /* set valid intra modes for evaluation */
    628         u4_valid_intra_modes = 0x1ff;
    629 
    630         if (!s_ngbr_avbl.u1_mb_b)
    631         {
    632             u4_valid_intra_modes &= ~(1 << VERT_I4x4);
    633             u4_valid_intra_modes &= ~(1 << DIAG_DL_I4x4);
    634             u4_valid_intra_modes &= ~(1 << VERT_L_I4x4);
    635         }
    636         if (!s_ngbr_avbl.u1_mb_a)
    637         {
    638             u4_valid_intra_modes &= ~(1 << HORZ_I4x4);
    639             u4_valid_intra_modes &= ~(1 << HORZ_U_I4x4);
    640         }
    641         if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b || !s_ngbr_avbl.u1_mb_d)
    642         {
    643             u4_valid_intra_modes &= ~(1 << DIAG_DR_I4x4);
    644             u4_valid_intra_modes &= ~(1 << VERT_R_I4x4);
    645             u4_valid_intra_modes &= ~(1 << HORZ_D_I4x4);
    646         }
    647 
    648         /* estimate the intra 8x8 mode for the current partition (for evaluating cost) */
    649         if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b)
    650         {
    651             u4_estimated_intra_8x8_mode = DC_I8x8;
    652         }
    653         else
    654         {
    655             UWORD32 u4_left_intra_8x8_mode = DC_I8x8;
    656             UWORD32 u4_top_intra_8x8_mode = DC_I8x8;
    657 
    658             if (u4_pix_x == 0)
    659             {
    660                 if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I8x8)
    661                 {
    662                     u4_left_intra_8x8_mode = ps_proc->au1_left_mb_intra_modes[b8+1];
    663                 }
    664                 else if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I4x4)
    665                 {
    666                     u4_left_intra_8x8_mode = ps_proc->au1_left_mb_intra_modes[(b8+1)*4+2];
    667                 }
    668             }
    669             else
    670             {
    671                 u4_left_intra_8x8_mode = ps_proc->au1_intra_luma_mb_8x8_modes[b8-1];
    672             }
    673 
    674             if (u4_pix_y == 0)
    675             {
    676                 if (ps_top_mb_syn_ele->u2_mb_type == I8x8)
    677                 {
    678                     u4_top_intra_8x8_mode = pu1_top_mb_intra_modes[b8+2];
    679                 }
    680                 else if (ps_top_mb_syn_ele->u2_mb_type == I4x4)
    681                 {
    682                     u4_top_intra_8x8_mode = pu1_top_mb_intra_modes[(b8+2)*4+2];
    683                 }
    684             }
    685             else
    686             {
    687                 u4_top_intra_8x8_mode = ps_proc->au1_intra_luma_mb_8x8_modes[b8-2];
    688             }
    689 
    690             u4_estimated_intra_8x8_mode = MIN(u4_left_intra_8x8_mode, u4_top_intra_8x8_mode);
    691         }
    692 
    693         /* perform intra mode 8x8 evaluation */
    694         for (u4_intra_mode = VERT_I8x8; u4_valid_intra_modes != 0; u4_intra_mode++, u4_valid_intra_modes >>= 1)
    695         {
    696             if ( (u4_valid_intra_modes & 1) == 0)
    697                 continue;
    698 
    699             /* intra prediction */
    700             (ps_codec->apf_intra_pred_8_l)[u4_intra_mode](pu1_ngbr_pels_i8, pu1_pred_mb, 0, i4_pred_strd, i4_ngbr_avbl);
    701 
    702             /* evaluate distortion between the actual blk and the estimated blk for the given mode */
    703             ime_compute_sad_8x8(pu1_mb_curr, pu1_pred_mb, i4_src_strd, i4_pred_strd, i4_partition_cost_least, &i4_partition_distortion);
    704 
    705             i4_partition_cost = i4_partition_distortion + ((u4_estimated_intra_8x8_mode == u4_intra_mode)?u4_cost_one_bit:u4_cost_four_bits);
    706 
    707             /* update the least cost information if necessary */
    708             if (i4_partition_cost < i4_partition_cost_least)
    709             {
    710                 i4_partition_cost_least = i4_partition_cost;
    711                 i4_partition_distortion_least = i4_partition_distortion;
    712                 u4_best_intra_8x8_mode = u4_intra_mode;
    713             }
    714         }
    715         /* macroblock distortion */
    716         i4_total_cost += i4_partition_cost_least;
    717         i4_total_distortion += i4_partition_distortion_least;
    718         /* mb partition mode */
    719         ps_proc->au1_intra_luma_mb_8x8_modes[b8] = u4_best_intra_8x8_mode;
    720 
    721     }
    722 
    723     /* update the type of the mb if necessary */
    724     if (i4_total_cost < ps_proc->i4_mb_cost)
    725     {
    726         ps_proc->i4_mb_cost = i4_total_cost;
    727         ps_proc->i4_mb_distortion = i4_total_distortion;
    728         ps_proc->u4_mb_type = I8x8;
    729     }
    730 
    731     return ;
    732 }
    733 
    734 
    735 /**
    736 ******************************************************************************
    737 *
    738 * @brief
    739 *  evaluate best intra 4x4 mode (rate distortion opt off)
    740 *
    741 * @par Description
    742 *  This function evaluates all the possible intra 4x4 modes and finds the mode
    743 *  that best represents the macro-block (least distortion) and occupies fewer
    744 *  bits in the bit-stream.
    745 *
    746 * @param[in]    ps_proc_ctxt
    747 *  pointer to proc ctxt
    748 *
    749 * @remarks
    750 *  Ideally the cost of encoding a macroblock is calculated as
    751 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
    752 *  input block and the reconstructed block and rate is the number of bits taken
    753 *  to place the macroblock in the bit-stream. In this routine the rate does not
    754 *  exactly point to the total number of bits it takes, rather it points to header
    755 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
    756 *  and residual bits fall in to texture bits the number of bits taken to encoding
    757 *  mbtype is considered as rate, we compute cost. Further we will approximate
    758 *  the distortion as the deviation b/w input and the predicted block as opposed
    759 *  to input and reconstructed block.
    760 *
    761 *  NOTE: As per the Document JVT-O079, for the whole intra 4x4 macroblock,
    762 *  24*lambda is added to the SAD before comparison with the best SAD for
    763 *  inter prediction. This is an empirical value to prevent using too many intra
    764 *  blocks.
    765 *
    766 * @return      none
    767 *
    768 ******************************************************************************
    769 */
    770 void ih264e_evaluate_intra4x4_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
    771 {
    772     /* Codec Context */
    773     codec_t *ps_codec = ps_proc->ps_codec;
    774 
    775     /* SAD(distortion metric) of an 4x4 block */
    776     WORD32 i4_partition_distortion_least = INT_MAX, i4_total_distortion = 0;
    777 
    778     /* lambda */
    779     UWORD32 u4_lambda = ps_proc->u4_lambda;
    780 
    781     /* cost = distortion + lambda*rate */
    782     WORD32 i4_partition_cost_least, i4_total_cost = (24 + 1) * u4_lambda;
    783 
    784     /* cost due to mbtype */
    785     UWORD32 u4_cost_one_bit = u4_lambda, u4_cost_four_bits = 4 * u4_lambda;
    786 
    787     /* intra mode */
    788     UWORD32 u4_best_intra_4x4_mode = DC_I4x4, u4_estimated_intra_4x4_mode;
    789 
    790     /* neighbor pels for intra prediction */
    791     UWORD8 *pu1_ngbr_pels_i4 = ps_proc->au1_ngbr_pels;
    792 
    793     /* pointer to curr partition */
    794     UWORD8 *pu1_mb_curr;
    795 
    796     /* pointer to prediction macro block */
    797     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
    798 
    799     /* strides */
    800     WORD32 i4_src_strd = ps_proc->i4_src_strd;
    801     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
    802 
    803     /* neighbors left, top, top right, top left */
    804     UWORD8 *pu1_mb_a;
    805     UWORD8 *pu1_mb_b;
    806     UWORD8 *pu1_mb_c;
    807     UWORD8 *pu1_mb_d;
    808 
    809     /* neighbor availability */
    810     WORD32 i4_ngbr_avbl;
    811     block_neighbors_t s_ngbr_avbl;
    812 
    813     /* temp vars */
    814     UWORD32 i, b8, b4, u4_blk_x, u4_blk_y, u4_pix_x, u4_pix_y;
    815 
    816     /* scan order inside 4x4 block */
    817     const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
    818 
    819     /* ngbr sub mb modes */
    820     UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (ps_proc->i4_mb_x << 4);
    821     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
    822     mb_info_t *ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
    823 
    824     /* valid intra modes map */
    825     UWORD32 u4_valid_intra_modes;
    826     UWORD16 u2_valid_modes[8] = {4, 262, 4, 262, 141, 399, 141, 511};
    827 
    828     UWORD32 u4_constrained_intra_pred = ps_proc->ps_codec->s_cfg.u4_constrained_intra_pred;
    829     UWORD8 u1_mb_a, u1_mb_b, u1_mb_c, u1_mb_d;
    830     if (ps_proc->ps_ngbr_avbl->u1_mb_c)
    831     {
    832         ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x + 1;
    833     }
    834     /* left pels */
    835     u1_mb_a = ((ps_proc->ps_ngbr_avbl->u1_mb_a)
    836                     && (u4_constrained_intra_pred ? ps_proc->s_left_mb_syntax_ele.u2_is_intra : 1));
    837 
    838     /* top pels */
    839     u1_mb_b = ((ps_proc->ps_ngbr_avbl->u1_mb_b)
    840                     && (u4_constrained_intra_pred ? ps_top_mb_syn_ele->u2_is_intra : 1));
    841 
    842     /* topleft pels */
    843     u1_mb_d = ((ps_proc->ps_ngbr_avbl->u1_mb_d)
    844                     && (u4_constrained_intra_pred ? ps_proc->s_top_left_mb_syntax_ele.u2_is_intra : 1));
    845 
    846     /* top right */
    847     u1_mb_c = ((ps_proc->ps_ngbr_avbl->u1_mb_c)
    848                     && (u4_constrained_intra_pred ? ps_top_right_mb_syn_ele->u2_is_intra : 1));
    849 
    850     i4_ngbr_avbl = (u1_mb_a) + (u1_mb_d << 1) + (u1_mb_b << 2) + (u1_mb_c << 3);
    851     memcpy(ps_proc->au1_ngbr_avbl_4x4_subblks, gau1_ih264_4x4_ngbr_avbl[i4_ngbr_avbl], 16);
    852 
    853     for (b8 = 0; b8 < 4; b8++)
    854     {
    855         u4_blk_x = (b8 & 0x01) << 3;
    856         u4_blk_y = (b8 >> 1) << 3;
    857         for (b4 = 0; b4 < 4; b4++)
    858         {
    859             u4_pix_x = u4_blk_x + ((b4 & 0x01) << 2);
    860             u4_pix_y = u4_blk_y + ((b4 >> 1) << 2);
    861 
    862             pu1_mb_curr = ps_proc->pu1_src_buf_luma + u4_pix_x + (u4_pix_y * i4_src_strd);
    863             /* when rdopt is off, we use the input as reference for constructing prediction buffer */
    864             /* as opposed to using the recon pels. (open loop intra prediction) */
    865             pu1_mb_a = pu1_mb_curr - 1; /* pointer to left macro block */
    866             pu1_mb_b = pu1_mb_curr - i4_src_strd; /* pointer to top macro block */
    867             pu1_mb_c = pu1_mb_b + 4; /* pointer to top macro block */
    868             pu1_mb_d = pu1_mb_b - 1; /* pointer to top left macro block */
    869 
    870             /* locating neighbors that are available for prediction */
    871             /* TODO : update the neighbor availability information basing on constrained intra pred information */
    872             /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines */
    873             /* basing on neighbors available and hence evade the computation of neighbor availability totally. */
    874 
    875             i4_ngbr_avbl = ps_proc->au1_ngbr_avbl_4x4_subblks[(b8 << 2) + b4];
    876             s_ngbr_avbl.u1_mb_a = (i4_ngbr_avbl & 0x1);
    877             s_ngbr_avbl.u1_mb_d = (i4_ngbr_avbl & 0x2) >> 1;
    878             s_ngbr_avbl.u1_mb_b = (i4_ngbr_avbl & 0x4) >> 2;
    879             s_ngbr_avbl.u1_mb_c = (i4_ngbr_avbl & 0x8) >> 3;
    880             /* set valid intra modes for evaluation */
    881             u4_valid_intra_modes = u2_valid_modes[i4_ngbr_avbl & 0x7];
    882 
    883             /* if top partition is available and top right is not available for intra prediction, then */
    884             /* padd top right samples using top sample and make top right also available */
    885             /* i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) + ((s_ngbr_avbl.u1_mb_b | s_ngbr_avbl.u1_mb_c) << 3); */
    886 
    887             /* gather prediction pels from the neighbors */
    888             if (s_ngbr_avbl.u1_mb_a)
    889             {
    890                 for(i = 0; i < 4; i++)
    891                     pu1_ngbr_pels_i4[4 - 1 -i] = pu1_mb_a[i * i4_src_strd];
    892             }
    893             else
    894             {
    895                 memset(pu1_ngbr_pels_i4, 0, 4);
    896             }
    897 
    898             if (s_ngbr_avbl.u1_mb_b)
    899             {
    900                 memcpy(pu1_ngbr_pels_i4 + 4 + 1, pu1_mb_b, 4);
    901             }
    902             else
    903             {
    904                 memset(pu1_ngbr_pels_i4 + 5, 0, 4);
    905             }
    906 
    907             if (s_ngbr_avbl.u1_mb_d)
    908                 pu1_ngbr_pels_i4[4] = *pu1_mb_d;
    909             else
    910                 pu1_ngbr_pels_i4[4] = 0;
    911 
    912             if (s_ngbr_avbl.u1_mb_c)
    913             {
    914                 memcpy(pu1_ngbr_pels_i4 + 8 + 1, pu1_mb_c, 4);
    915             }
    916             else if (s_ngbr_avbl.u1_mb_b)
    917             {
    918                 memset(pu1_ngbr_pels_i4 + 8 + 1, pu1_ngbr_pels_i4[8], 4);
    919                 s_ngbr_avbl.u1_mb_c = s_ngbr_avbl.u1_mb_b;
    920             }
    921 
    922             i4_partition_cost_least = INT_MAX;
    923 
    924             /* predict the intra 4x4 mode for the current partition (for evaluating cost) */
    925             if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b)
    926             {
    927                 u4_estimated_intra_4x4_mode = DC_I4x4;
    928             }
    929             else
    930             {
    931                 UWORD32 u4_left_intra_4x4_mode = DC_I4x4;
    932                 UWORD32 u4_top_intra_4x4_mode = DC_I4x4;
    933 
    934                 if (u4_pix_x == 0)
    935                 {
    936                     if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I4x4)
    937                     {
    938                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[u1_scan_order[3 + u4_pix_y]];
    939                     }
    940                     else if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I8x8)
    941                     {
    942                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[b8 + 1];
    943                     }
    944                 }
    945                 else
    946                 {
    947                     u4_left_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 1]];
    948                 }
    949 
    950                 if (u4_pix_y == 0)
    951                 {
    952                     if (ps_top_mb_syn_ele->u2_mb_type == I4x4)
    953                     {
    954                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[u1_scan_order[12 + (u4_pix_x >> 2)]];
    955                     }
    956                     else if (ps_top_mb_syn_ele->u2_mb_type == I8x8)
    957                     {
    958                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[b8 + 2];
    959                     }
    960                 }
    961                 else
    962                 {
    963                     u4_top_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 4]];
    964                 }
    965 
    966                 u4_estimated_intra_4x4_mode = MIN(u4_left_intra_4x4_mode, u4_top_intra_4x4_mode);
    967             }
    968 
    969             ps_proc->au1_predicted_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_estimated_intra_4x4_mode;
    970 
    971             /* mode evaluation and prediction */
    972             ps_codec->pf_ih264e_evaluate_intra_4x4_modes(pu1_mb_curr,
    973                                                          pu1_ngbr_pels_i4,
    974                                                          pu1_pred_mb, i4_src_strd,
    975                                                          i4_pred_strd, i4_ngbr_avbl,
    976                                                          &u4_best_intra_4x4_mode,
    977                                                          &i4_partition_cost_least,
    978                                                          u4_valid_intra_modes,
    979                                                          u4_lambda,
    980                                                          u4_estimated_intra_4x4_mode);
    981 
    982 
    983             i4_partition_distortion_least = i4_partition_cost_least - ((u4_estimated_intra_4x4_mode == u4_best_intra_4x4_mode) ? u4_cost_one_bit : u4_cost_four_bits);
    984 
    985             DEBUG("%d partition cost, %d intra mode\n", i4_partition_cost_least, u4_best_intra_4x4_mode);
    986             /* macroblock distortion */
    987             i4_total_distortion += i4_partition_distortion_least;
    988             i4_total_cost += i4_partition_cost_least;
    989             /* mb partition mode */
    990             ps_proc->au1_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_best_intra_4x4_mode;
    991         }
    992     }
    993 
    994     /* update the type of the mb if necessary */
    995     if (i4_total_cost < ps_proc->i4_mb_cost)
    996     {
    997         ps_proc->i4_mb_cost = i4_total_cost;
    998         ps_proc->i4_mb_distortion = i4_total_distortion;
    999         ps_proc->u4_mb_type = I4x4;
   1000     }
   1001 
   1002     return ;
   1003 }
   1004 
   1005 /**
   1006 ******************************************************************************
   1007 *
   1008 * @brief evaluate best intra 4x4 mode (rate distortion opt on)
   1009 *
   1010 * @par Description
   1011 *  This function evaluates all the possible intra 4x4 modes and finds the mode
   1012 *  that best represents the macro-block (least distortion) and occupies fewer
   1013 *  bits in the bit-stream.
   1014 *
   1015 * @param[in]    ps_proc_ctxt
   1016 *  pointer to proc ctxt
   1017 *
   1018 * @remarks
   1019 *  Ideally the cost of encoding a macroblock is calculated as
   1020 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
   1021 *  input block and the reconstructed block and rate is the number of bits taken
   1022 *  to place the macroblock in the bit-stream. In this routine the rate does not
   1023 *  exactly point to the total number of bits it takes, rather it points to header
   1024 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
   1025 *  and residual bits fall in to texture bits the number of bits taken to encoding
   1026 *  mbtype is considered as rate, we compute cost. Further we will approximate
   1027 *  the distortion as the deviation b/w input and the predicted block as opposed
   1028 *  to input and reconstructed block.
   1029 *
   1030 *  NOTE: As per the Document JVT-O079, for the whole intra 4x4 macroblock,
   1031 *  24*lambda is added to the SAD before comparison with the best SAD for
   1032 *  inter prediction. This is an empirical value to prevent using too many intra
   1033 *  blocks.
   1034 *
   1035 * @return      none
   1036 *
   1037 ******************************************************************************
   1038 */
   1039 void ih264e_evaluate_intra4x4_modes_for_least_cost_rdopton(process_ctxt_t *ps_proc)
   1040 {
   1041     /* Codec Context */
   1042     codec_t *ps_codec = ps_proc->ps_codec;
   1043 
   1044     /* SAD(distortion metric) of an 4x4 block */
   1045     WORD32 i4_partition_distortion_least = INT_MAX, i4_total_distortion = 0;
   1046 
   1047     /* lambda */
   1048     UWORD32 u4_lambda = ps_proc->u4_lambda;
   1049 
   1050     /* cost = distortion + lambda*rate */
   1051     WORD32 i4_partition_cost_least, i4_total_cost = (24 + 1) * u4_lambda;
   1052 
   1053     /* cost due to mbtype */
   1054     UWORD32 u4_cost_one_bit = u4_lambda, u4_cost_four_bits = 4 * u4_lambda;
   1055 
   1056     /* intra mode */
   1057     UWORD32 u4_best_intra_4x4_mode = DC_I4x4, u4_estimated_intra_4x4_mode;
   1058 
   1059     /* neighbor pels for intra prediction */
   1060     UWORD8 *pu1_ngbr_pels_i4 = ps_proc->au1_ngbr_pels;
   1061 
   1062     /* pointer to curr partition */
   1063     UWORD8 *pu1_mb_curr;
   1064     UWORD8 *pu1_mb_ref_left, *pu1_mb_ref_top;
   1065     UWORD8 *pu1_ref_mb_intra_4x4;
   1066 
   1067     /* pointer to residual macro block */
   1068     WORD16 *pi2_res_mb = ps_proc->pi2_res_buf_intra_4x4;
   1069 
   1070     /* pointer to prediction macro block */
   1071     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
   1072 
   1073     /* strides */
   1074     WORD32 i4_src_strd = ps_proc->i4_src_strd;
   1075     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
   1076     WORD32 i4_ref_strd_left, i4_ref_strd_top;
   1077 
   1078     /* neighbors left, top, top right, top left */
   1079     UWORD8 *pu1_mb_a;
   1080     UWORD8 *pu1_mb_b;
   1081     UWORD8 *pu1_mb_c;
   1082     UWORD8 *pu1_mb_d;
   1083 
   1084     /* number of non zero coeffs*/
   1085     UWORD8  *pu1_nnz = (UWORD8 *)ps_proc->au4_nnz_intra_4x4;
   1086 
   1087     /* quantization parameters */
   1088     quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0];
   1089 
   1090     /* neighbor availability */
   1091     WORD32 i4_ngbr_avbl;
   1092     block_neighbors_t s_ngbr_avbl;
   1093 
   1094     /* temp vars */
   1095     UWORD32 i, b8, b4, u4_blk_x, u4_blk_y, u4_pix_x, u4_pix_y;
   1096 
   1097     /* scan order inside 4x4 block */
   1098     const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
   1099 
   1100     /* ngbr sub mb modes */
   1101     UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (ps_proc->i4_mb_x << 4);
   1102     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
   1103     mb_info_t *ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
   1104 
   1105     /* valid intra modes map */
   1106     UWORD32 u4_valid_intra_modes;
   1107     UWORD16 u2_valid_modes[8] = {4, 262, 4, 262, 141, 399, 141, 511};
   1108 
   1109     /* Dummy variable for 4x4 trans function */
   1110     WORD16 i2_dc_dummy;
   1111     UWORD8 u1_mb_a, u1_mb_b, u1_mb_c, u1_mb_d;
   1112     UWORD32 u4_constrained_intra_pred = ps_proc->ps_codec->s_cfg.u4_constrained_intra_pred;
   1113 
   1114     /* compute ngbr availability for sub blks */
   1115     if (ps_proc->ps_ngbr_avbl->u1_mb_c)
   1116     {
   1117         ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + (ps_proc->i4_mb_x + 1);
   1118     }
   1119 
   1120     /* left pels */
   1121     u1_mb_a = ((ps_proc->ps_ngbr_avbl->u1_mb_a)
   1122                     && (u4_constrained_intra_pred ? ps_proc->s_left_mb_syntax_ele.u2_is_intra : 1));
   1123 
   1124        /* top pels */
   1125     u1_mb_b = ((ps_proc->ps_ngbr_avbl->u1_mb_b)
   1126                     && (u4_constrained_intra_pred ? ps_top_mb_syn_ele->u2_is_intra : 1));
   1127 
   1128        /* topleft pels */
   1129     u1_mb_d = ((ps_proc->ps_ngbr_avbl->u1_mb_d)
   1130                     && (u4_constrained_intra_pred ? ps_proc->s_top_left_mb_syntax_ele.u2_is_intra : 1));
   1131 
   1132        /* top right pels */
   1133     u1_mb_c = ((ps_proc->ps_ngbr_avbl->u1_mb_c)
   1134                     && (u4_constrained_intra_pred ? ps_top_right_mb_syn_ele->u2_is_intra : 1));
   1135 
   1136     i4_ngbr_avbl = (u1_mb_a) + (u1_mb_d << 1) + (u1_mb_b << 2) + (u1_mb_c << 3);
   1137     memcpy(ps_proc->au1_ngbr_avbl_4x4_subblks, gau1_ih264_4x4_ngbr_avbl[i4_ngbr_avbl], 16);
   1138 
   1139     for(b8 = 0; b8 < 4; b8++)
   1140     {
   1141         u4_blk_x = (b8 & 0x01) << 3;
   1142         u4_blk_y = (b8 >> 1) << 3;
   1143         for(b4 = 0; b4 < 4; b4++, pu1_nnz++, pi2_res_mb += MB_SIZE)
   1144         {
   1145             u4_pix_x = u4_blk_x + ((b4 & 0x01) << 2);
   1146             u4_pix_y = u4_blk_y + ((b4 >> 1) << 2);
   1147 
   1148             pu1_ref_mb_intra_4x4 = ps_proc->pu1_ref_mb_intra_4x4 + u4_pix_x + (u4_pix_y * i4_pred_strd);
   1149             pu1_mb_curr = ps_proc->pu1_src_buf_luma + u4_pix_x + (u4_pix_y * i4_src_strd);
   1150             if (u4_pix_x == 0)
   1151             {
   1152                 i4_ref_strd_left = ps_proc->i4_rec_strd;
   1153                 pu1_mb_ref_left = ps_proc->pu1_rec_buf_luma + u4_pix_x + (u4_pix_y * i4_ref_strd_left);
   1154             }
   1155             else
   1156             {
   1157                 i4_ref_strd_left = i4_pred_strd;
   1158                 pu1_mb_ref_left = pu1_ref_mb_intra_4x4;
   1159             }
   1160             if (u4_pix_y == 0)
   1161             {
   1162                 i4_ref_strd_top = ps_proc->i4_rec_strd;
   1163                 pu1_mb_ref_top = ps_proc->pu1_rec_buf_luma + u4_pix_x + (u4_pix_y * i4_ref_strd_top);
   1164             }
   1165             else
   1166             {
   1167                 i4_ref_strd_top = i4_pred_strd;
   1168                 pu1_mb_ref_top = pu1_ref_mb_intra_4x4;
   1169             }
   1170 
   1171             pu1_mb_a = pu1_mb_ref_left - 1; /* pointer to left macro block */
   1172             pu1_mb_b = pu1_mb_ref_top - i4_ref_strd_top; /* pointer to top macro block */
   1173             pu1_mb_c = pu1_mb_b + 4; /* pointer to top right macro block */
   1174             if (u4_pix_y == 0)
   1175                 pu1_mb_d = pu1_mb_b - 1;
   1176             else
   1177                 pu1_mb_d = pu1_mb_a - i4_ref_strd_left; /* pointer to top left macro block */
   1178 
   1179             /* locating neighbors that are available for prediction */
   1180             /* TODO : update the neighbor availability information basing on constrained intra pred information */
   1181             /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines */
   1182             /* basing on neighbors available and hence evade the computation of neighbor availability totally. */
   1183 
   1184             i4_ngbr_avbl = ps_proc->au1_ngbr_avbl_4x4_subblks[(b8 << 2) + b4];
   1185             s_ngbr_avbl.u1_mb_a = (i4_ngbr_avbl & 0x1);
   1186             s_ngbr_avbl.u1_mb_d = (i4_ngbr_avbl & 0x2) >> 1;
   1187             s_ngbr_avbl.u1_mb_b = (i4_ngbr_avbl & 0x4) >> 2;
   1188             s_ngbr_avbl.u1_mb_c = (i4_ngbr_avbl & 0x8) >> 3;
   1189             /* set valid intra modes for evaluation */
   1190             u4_valid_intra_modes = u2_valid_modes[i4_ngbr_avbl & 0x7];
   1191 
   1192             /* if top partition is available and top right is not available for intra prediction, then */
   1193             /* padd top right samples using top sample and make top right also available */
   1194             /* i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) + ((s_ngbr_avbl.u1_mb_b | s_ngbr_avbl.u1_mb_c) << 3); */
   1195 
   1196             /* gather prediction pels from the neighbors */
   1197             if (s_ngbr_avbl.u1_mb_a)
   1198             {
   1199                 for(i = 0; i < 4; i++)
   1200                     pu1_ngbr_pels_i4[4 - 1 -i] = pu1_mb_a[i * i4_ref_strd_left];
   1201             }
   1202             else
   1203             {
   1204                 memset(pu1_ngbr_pels_i4,0,4);
   1205             }
   1206             if(s_ngbr_avbl.u1_mb_b)
   1207             {
   1208                 memcpy(pu1_ngbr_pels_i4 + 4 + 1, pu1_mb_b, 4);
   1209             }
   1210             else
   1211             {
   1212                 memset(pu1_ngbr_pels_i4 + 4 + 1, 0, 4);
   1213             }
   1214             if (s_ngbr_avbl.u1_mb_d)
   1215                 pu1_ngbr_pels_i4[4] = *pu1_mb_d;
   1216             else
   1217                 pu1_ngbr_pels_i4[4] = 0;
   1218             if (s_ngbr_avbl.u1_mb_c)
   1219             {
   1220                 memcpy(pu1_ngbr_pels_i4 + 8 + 1, pu1_mb_c, 4);
   1221             }
   1222             else if (s_ngbr_avbl.u1_mb_b)
   1223             {
   1224                 memset(pu1_ngbr_pels_i4 + 8 + 1, pu1_ngbr_pels_i4[8], 4);
   1225                 s_ngbr_avbl.u1_mb_c = s_ngbr_avbl.u1_mb_b;
   1226             }
   1227 
   1228             i4_partition_cost_least = INT_MAX;
   1229 
   1230             /* predict the intra 4x4 mode for the current partition (for evaluating cost) */
   1231             if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b)
   1232             {
   1233                 u4_estimated_intra_4x4_mode = DC_I4x4;
   1234             }
   1235             else
   1236             {
   1237                 UWORD32 u4_left_intra_4x4_mode = DC_I4x4;
   1238                 UWORD32 u4_top_intra_4x4_mode = DC_I4x4;
   1239 
   1240                 if (u4_pix_x == 0)
   1241                 {
   1242                     if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I4x4)
   1243                     {
   1244                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[u1_scan_order[3 + u4_pix_y]];
   1245                     }
   1246                     else if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I8x8)
   1247                     {
   1248                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[b8 + 1];
   1249                     }
   1250                 }
   1251                 else
   1252                 {
   1253                     u4_left_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 1]];
   1254                 }
   1255 
   1256                 if (u4_pix_y == 0)
   1257                 {
   1258                     if (ps_top_mb_syn_ele->u2_mb_type == I4x4)
   1259                     {
   1260                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[u1_scan_order[12 + (u4_pix_x >> 2)]];
   1261                     }
   1262                     else if (ps_top_mb_syn_ele->u2_mb_type == I8x8)
   1263                     {
   1264                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[b8 + 2];
   1265                     }
   1266                 }
   1267                 else
   1268                 {
   1269                     u4_top_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 4]];
   1270                 }
   1271 
   1272                 u4_estimated_intra_4x4_mode = MIN(u4_left_intra_4x4_mode, u4_top_intra_4x4_mode);
   1273             }
   1274 
   1275             ps_proc->au1_predicted_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_estimated_intra_4x4_mode;
   1276 
   1277             /*mode evaluation and prediction*/
   1278             ps_codec->pf_ih264e_evaluate_intra_4x4_modes(pu1_mb_curr,
   1279                                                          pu1_ngbr_pels_i4,
   1280                                                          pu1_pred_mb, i4_src_strd,
   1281                                                          i4_pred_strd, i4_ngbr_avbl,
   1282                                                          &u4_best_intra_4x4_mode,
   1283                                                          &i4_partition_cost_least,
   1284                                                          u4_valid_intra_modes,
   1285                                                          u4_lambda,
   1286                                                          u4_estimated_intra_4x4_mode);
   1287 
   1288 
   1289             i4_partition_distortion_least = i4_partition_cost_least - ((u4_estimated_intra_4x4_mode == u4_best_intra_4x4_mode)?u4_cost_one_bit:u4_cost_four_bits);
   1290 
   1291             DEBUG("%d partition cost, %d intra mode\n", i4_partition_cost_least, u4_best_intra_4x4_mode);
   1292 
   1293             /* macroblock distortion */
   1294             i4_total_distortion += i4_partition_distortion_least;
   1295             i4_total_cost += i4_partition_cost_least;
   1296 
   1297             /* mb partition mode */
   1298             ps_proc->au1_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_best_intra_4x4_mode;
   1299 
   1300 
   1301             /********************************************************/
   1302             /*  error estimation,                                   */
   1303             /*  transform                                           */
   1304             /*  quantization                                        */
   1305             /********************************************************/
   1306             ps_codec->pf_resi_trans_quant_4x4(pu1_mb_curr, pu1_pred_mb,
   1307                                               pi2_res_mb, i4_src_strd,
   1308                                               i4_pred_strd,
   1309                                               /* No op stride, this implies a buff of lenght 1x16 */
   1310                                               ps_qp_params->pu2_scale_mat,
   1311                                               ps_qp_params->pu2_thres_mat,
   1312                                               ps_qp_params->u1_qbits,
   1313                                               ps_qp_params->u4_dead_zone,
   1314                                               pu1_nnz, &i2_dc_dummy);
   1315 
   1316             /********************************************************/
   1317             /*  ierror estimation,                                  */
   1318             /*  itransform                                          */
   1319             /*  iquantization                                       */
   1320             /********************************************************/
   1321             ps_codec->pf_iquant_itrans_recon_4x4(pi2_res_mb, pu1_pred_mb,
   1322                                                  pu1_ref_mb_intra_4x4,
   1323                                                  i4_pred_strd, i4_pred_strd,
   1324                                                  ps_qp_params->pu2_iscale_mat,
   1325                                                  ps_qp_params->pu2_weigh_mat,
   1326                                                  ps_qp_params->u1_qp_div,
   1327                                                  ps_proc->pv_scratch_buff, 0,
   1328                                                  NULL);
   1329         }
   1330     }
   1331 
   1332     /* update the type of the mb if necessary */
   1333     if (i4_total_cost < ps_proc->i4_mb_cost)
   1334     {
   1335         ps_proc->i4_mb_cost = i4_total_cost;
   1336         ps_proc->i4_mb_distortion = i4_total_distortion;
   1337         ps_proc->u4_mb_type = I4x4;
   1338     }
   1339 
   1340     return ;
   1341 }
   1342 
   1343 /**
   1344 ******************************************************************************
   1345 *
   1346 * @brief
   1347 *  evaluate best chroma intra 8x8 mode (rate distortion opt off)
   1348 *
   1349 * @par Description
   1350 *  This function evaluates all the possible chroma intra 8x8 modes and finds
   1351 *  the mode that best represents the macroblock (least distortion) and occupies
   1352 *  fewer bits in the bitstream.
   1353 *
   1354 * @param[in] ps_proc_ctxt
   1355 *  pointer to macroblock context (handle)
   1356 *
   1357 * @remarks
   1358 *  For chroma best intra pred mode is calculated based only on SAD
   1359 *
   1360 * @returns none
   1361 *
   1362 ******************************************************************************
   1363 */
   1364 
   1365 void ih264e_evaluate_chroma_intra8x8_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
   1366 {
   1367     /* Codec Context */
   1368     codec_t *ps_codec = ps_proc->ps_codec;
   1369 
   1370     /* SAD(distortion metric) of an 8x8 block */
   1371     WORD32 i4_mb_distortion, i4_chroma_mb_distortion;
   1372 
   1373     /* intra mode */
   1374     UWORD32  u4_best_chroma_intra_8x8_mode = DC_CH_I8x8;
   1375 
   1376     /* neighbor pels for intra prediction */
   1377     UWORD8 *pu1_ngbr_pels_c_i8x8 = ps_proc->au1_ngbr_pels;
   1378 
   1379     /* pointer to curr macro block */
   1380     UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_chroma;
   1381     UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_chroma;
   1382 
   1383     /* pointer to prediction macro block */
   1384     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb_intra_chroma;
   1385     UWORD8 *pu1_pred_mb_plane = ps_proc->pu1_pred_mb_intra_chroma_plane;
   1386 
   1387     /* strides */
   1388     WORD32 i4_src_strd_c = ps_proc->i4_src_chroma_strd;
   1389     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
   1390     WORD32 i4_rec_strd_c = ps_proc->i4_rec_strd;
   1391 
   1392     /* neighbors left, top, top left */
   1393     UWORD8 *pu1_mb_a = pu1_ref_mb - 2;
   1394     UWORD8 *pu1_mb_b = pu1_ref_mb - i4_rec_strd_c;
   1395     UWORD8 *pu1_mb_d = pu1_mb_b - 2;
   1396 
   1397     /* neighbor availability */
   1398     const UWORD8  u1_valid_intra_modes[8] = {1, 3, 1, 3, 5, 7, 5, 15};
   1399     WORD32 i4_ngbr_avbl;
   1400 
   1401     /* valid intra modes map */
   1402     UWORD32 u4_valid_intra_modes;
   1403     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
   1404 
   1405     /* temp var */
   1406     UWORD8 i;
   1407     UWORD32 u4_constrained_intra_pred = ps_proc->ps_codec->s_cfg.u4_constrained_intra_pred;
   1408     UWORD8 u1_mb_a, u1_mb_b, u1_mb_d;
   1409     /* locating neighbors that are available for prediction */
   1410 
   1411     /* gather prediction pels from the neighbors */
   1412     /* left pels */
   1413     u1_mb_a = ((ps_proc->ps_ngbr_avbl->u1_mb_a)
   1414                     && (u4_constrained_intra_pred ?  ps_proc->s_left_mb_syntax_ele.u2_is_intra : 1));
   1415     if (u1_mb_a)
   1416     {
   1417         for (i = 0; i < 16; i += 2)
   1418         {
   1419             pu1_ngbr_pels_c_i8x8[16 - 2 - i] = pu1_mb_a[(i / 2) * i4_rec_strd_c];
   1420             pu1_ngbr_pels_c_i8x8[16 - 1 - i] = pu1_mb_a[(i / 2) * i4_rec_strd_c + 1];
   1421         }
   1422     }
   1423     else
   1424     {
   1425         ps_codec->pf_mem_set_mul8(pu1_ngbr_pels_c_i8x8, 0, MB_SIZE);
   1426     }
   1427 
   1428     /* top pels */
   1429     u1_mb_b = ((ps_proc->ps_ngbr_avbl->u1_mb_b)
   1430                     && (u4_constrained_intra_pred ? ps_top_mb_syn_ele->u2_is_intra : 1));
   1431     if (u1_mb_b)
   1432     {
   1433         ps_codec->pf_mem_cpy_mul8(&pu1_ngbr_pels_c_i8x8[18], pu1_mb_b, 16);
   1434     }
   1435     else
   1436     {
   1437         ps_codec->pf_mem_set_mul8((pu1_ngbr_pels_c_i8x8 + 18), 0, MB_SIZE);
   1438     }
   1439 
   1440     /* top left pels */
   1441     u1_mb_d = ((ps_proc->ps_ngbr_avbl->u1_mb_d)
   1442                     && (u4_constrained_intra_pred ? ps_proc->s_top_left_mb_syntax_ele.u2_is_intra : 1));
   1443     if (u1_mb_d)
   1444     {
   1445         pu1_ngbr_pels_c_i8x8[16] = *pu1_mb_d;
   1446         pu1_ngbr_pels_c_i8x8[17] = *(pu1_mb_d + 1);
   1447     }
   1448     i4_ngbr_avbl = (u1_mb_a) + (u1_mb_b << 2) + (u1_mb_d << 1);
   1449     ps_proc->i4_chroma_neighbor_avail_8x8_mb = i4_ngbr_avbl;
   1450 
   1451     u4_valid_intra_modes = u1_valid_intra_modes[i4_ngbr_avbl];
   1452 
   1453     if (ps_codec->s_cfg.u4_enc_speed_preset == IVE_FAST)
   1454         u4_valid_intra_modes &= ~(1 << PLANE_CH_I8x8);
   1455 
   1456     i4_chroma_mb_distortion = INT_MAX;
   1457 
   1458     /* perform intra mode chroma  8x8 evaluation */
   1459     /* intra prediction */
   1460     ps_codec->pf_ih264e_evaluate_intra_chroma_modes(pu1_curr_mb,
   1461                                                     pu1_ngbr_pels_c_i8x8,
   1462                                                     pu1_pred_mb,
   1463                                                     i4_src_strd_c,
   1464                                                     i4_pred_strd,
   1465                                                     i4_ngbr_avbl,
   1466                                                     &u4_best_chroma_intra_8x8_mode,
   1467                                                     &i4_chroma_mb_distortion,
   1468                                                     u4_valid_intra_modes);
   1469 
   1470     if (u4_valid_intra_modes & 8)/* if Chroma PLANE is valid*/
   1471     {
   1472         (ps_codec->apf_intra_pred_c)[PLANE_CH_I8x8](pu1_ngbr_pels_c_i8x8, pu1_pred_mb_plane, 0, i4_pred_strd, i4_ngbr_avbl);
   1473 
   1474         /* evaluate distortion(sad) */
   1475         ps_codec->pf_compute_sad_16x8(pu1_curr_mb, pu1_pred_mb_plane, i4_src_strd_c, i4_pred_strd, i4_chroma_mb_distortion, &i4_mb_distortion);
   1476 
   1477         /* update the least distortion information if necessary */
   1478         if(i4_mb_distortion < i4_chroma_mb_distortion)
   1479         {
   1480             i4_chroma_mb_distortion = i4_mb_distortion;
   1481             u4_best_chroma_intra_8x8_mode = PLANE_CH_I8x8;
   1482         }
   1483     }
   1484 
   1485     DEBUG("%d partition cost, %d intra mode\n", i4_chroma_mb_distortion, u4_best_chroma_intra_8x8_mode);
   1486 
   1487     ps_proc->u1_c_i8_mode = u4_best_chroma_intra_8x8_mode;
   1488 
   1489     return ;
   1490 }
   1491 
   1492 
   1493 /**
   1494 ******************************************************************************
   1495 *
   1496 * @brief
   1497 *  Evaluate best intra 16x16 mode (among VERT, HORZ and DC) and do the
   1498 *  prediction.
   1499 *
   1500 * @par Description
   1501 *  This function evaluates first three 16x16 modes and compute corresponding sad
   1502 *  and return the buffer predicted with best mode.
   1503 *
   1504 * @param[in] pu1_src
   1505 *  UWORD8 pointer to the source
   1506 *
   1507 * @param[in] pu1_ngbr_pels_i16
   1508 *  UWORD8 pointer to neighbouring pels
   1509 *
   1510 * @param[out] pu1_dst
   1511 *  UWORD8 pointer to the destination
   1512 *
   1513 * @param[in] src_strd
   1514 *  integer source stride
   1515 *
   1516 * @param[in] dst_strd
   1517 *  integer destination stride
   1518 *
   1519 * @param[in] u4_n_avblty
   1520 *  availability of neighbouring pixels
   1521 *
   1522 * @param[in] u4_intra_mode
   1523 *  Pointer to the variable in which best mode is returned
   1524 *
   1525 * @param[in] pu4_sadmin
   1526 *  Pointer to the variable in which minimum sad is returned
   1527 *
   1528 * @param[in] u4_valid_intra_modes
   1529 *  Says what all modes are valid
   1530 *
   1531 * @returns      none
   1532 *
   1533 ******************************************************************************
   1534 */
   1535 void ih264e_evaluate_intra16x16_modes(UWORD8 *pu1_src,
   1536                                       UWORD8 *pu1_ngbr_pels_i16,
   1537                                       UWORD8 *pu1_dst,
   1538                                       UWORD32 src_strd,
   1539                                       UWORD32 dst_strd,
   1540                                       WORD32 u4_n_avblty,
   1541                                       UWORD32 *u4_intra_mode,
   1542                                       WORD32 *pu4_sadmin,
   1543                                       UWORD32 u4_valid_intra_modes)
   1544 {
   1545     UWORD8 *pu1_neighbour;
   1546     UWORD8 *pu1_src_temp = pu1_src;
   1547     UWORD8 left = 0, top = 0;
   1548     WORD32 u4_dcval = 0;
   1549     WORD32 i, j;
   1550     WORD32 i4_sad_vert = INT_MAX, i4_sad_horz = INT_MAX, i4_sad_dc = INT_MAX,
   1551                     i4_min_sad = INT_MAX;
   1552     UWORD8 val;
   1553 
   1554     left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
   1555     top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;
   1556 
   1557     /* left available */
   1558     if (left)
   1559     {
   1560         i4_sad_horz = 0;
   1561 
   1562         for (i = 0; i < 16; i++)
   1563         {
   1564             val = pu1_ngbr_pels_i16[15 - i];
   1565 
   1566             u4_dcval += val;
   1567 
   1568             for (j = 0; j < 16; j++)
   1569             {
   1570                 i4_sad_horz += ABS(val - pu1_src_temp[j]);
   1571             }
   1572 
   1573             pu1_src_temp += src_strd;
   1574         }
   1575         u4_dcval += 8;
   1576     }
   1577 
   1578     pu1_src_temp = pu1_src;
   1579     /* top available */
   1580     if (top)
   1581     {
   1582         i4_sad_vert = 0;
   1583 
   1584         for (i = 0; i < 16; i++)
   1585         {
   1586             u4_dcval += pu1_ngbr_pels_i16[17 + i];
   1587 
   1588             for (j = 0; j < 16; j++)
   1589             {
   1590                 i4_sad_vert += ABS(pu1_ngbr_pels_i16[17 + j] - pu1_src_temp[j]);
   1591             }
   1592             pu1_src_temp += src_strd;
   1593 
   1594         }
   1595         u4_dcval += 8;
   1596     }
   1597 
   1598     u4_dcval = (u4_dcval) >> (3 + left + top);
   1599 
   1600     pu1_src_temp = pu1_src;
   1601 
   1602     /* none available */
   1603     u4_dcval += (left == 0) * (top == 0) * 128;
   1604 
   1605     i4_sad_dc = 0;
   1606 
   1607     for (i = 0; i < 16; i++)
   1608     {
   1609         for (j = 0; j < 16; j++)
   1610         {
   1611             i4_sad_dc += ABS(u4_dcval - pu1_src_temp[j]);
   1612         }
   1613         pu1_src_temp += src_strd;
   1614     }
   1615 
   1616     if ((u4_valid_intra_modes & 04) == 0)/* If DC is disabled */
   1617         i4_sad_dc = INT_MAX;
   1618 
   1619     if ((u4_valid_intra_modes & 01) == 0)/* If VERT is disabled */
   1620         i4_sad_vert = INT_MAX;
   1621 
   1622     if ((u4_valid_intra_modes & 02) == 0)/* If HORZ is disabled */
   1623         i4_sad_horz = INT_MAX;
   1624 
   1625     i4_min_sad = MIN3(i4_sad_horz, i4_sad_dc, i4_sad_vert);
   1626 
   1627     /* Finding Minimum sad and doing corresponding prediction */
   1628     if (i4_min_sad < *pu4_sadmin)
   1629     {
   1630         *pu4_sadmin = i4_min_sad;
   1631         if (i4_min_sad == i4_sad_vert)
   1632         {
   1633             *u4_intra_mode = VERT_I16x16;
   1634             pu1_neighbour = pu1_ngbr_pels_i16 + 17;
   1635             for (j = 0; j < 16; j++)
   1636             {
   1637                 memcpy(pu1_dst, pu1_neighbour, MB_SIZE);
   1638                 pu1_dst += dst_strd;
   1639             }
   1640         }
   1641         else if (i4_min_sad == i4_sad_horz)
   1642         {
   1643             *u4_intra_mode = HORZ_I16x16;
   1644             for (j = 0; j < 16; j++)
   1645             {
   1646                 val = pu1_ngbr_pels_i16[15 - j];
   1647                 memset(pu1_dst, val, MB_SIZE);
   1648                 pu1_dst += dst_strd;
   1649             }
   1650         }
   1651         else
   1652         {
   1653             *u4_intra_mode = DC_I16x16;
   1654             for (j = 0; j < 16; j++)
   1655             {
   1656                 memset(pu1_dst, u4_dcval, MB_SIZE);
   1657                 pu1_dst += dst_strd;
   1658             }
   1659         }
   1660     }
   1661     return;
   1662 }
   1663 
   1664 /**
   1665 ******************************************************************************
   1666 *
   1667 * @brief
   1668 *  Evaluate best intra 4x4 mode and perform prediction.
   1669 *
   1670 * @par Description
   1671 *  This function evaluates  4x4 modes and compute corresponding sad
   1672 *  and return the buffer predicted with best mode.
   1673 *
   1674 * @param[in] pu1_src
   1675 *  UWORD8 pointer to the source
   1676 *
   1677 * @param[in] pu1_ngbr_pels
   1678 *  UWORD8 pointer to neighbouring pels
   1679 *
   1680 * @param[out] pu1_dst
   1681 *  UWORD8 pointer to the destination
   1682 *
   1683 * @param[in] src_strd
   1684 *  integer source stride
   1685 *
   1686 * @param[in] dst_strd
   1687 *  integer destination stride
   1688 *
   1689 * @param[in] u4_n_avblty
   1690 *  availability of neighbouring pixels
   1691 *
   1692 * @param[out] u4_intra_mode
   1693 *  Pointer to the variable in which best mode is returned
   1694 *
   1695 * @param[out] pu4_sadmin
   1696 *  Pointer to the variable in which minimum cost is returned
   1697 *
   1698 * @param[in] u4_valid_intra_modes
   1699 *  Says what all modes are valid
   1700 *
   1701 * @param[in] u4_lambda
   1702 *  Lambda value for computing cost from SAD
   1703 *
   1704 * @param[in] u4_predictd_mode
   1705 *  Predicted mode for cost computation
   1706 *
   1707 * @returns      none
   1708 *
   1709 ******************************************************************************
   1710 */
   1711 void ih264e_evaluate_intra_4x4_modes(UWORD8 *pu1_src,
   1712                                      UWORD8 *pu1_ngbr_pels,
   1713                                      UWORD8 *pu1_dst,
   1714                                      UWORD32 src_strd,
   1715                                      UWORD32 dst_strd,
   1716                                      WORD32 u4_n_avblty,
   1717                                      UWORD32 *u4_intra_mode,
   1718                                      WORD32 *pu4_sadmin,
   1719                                      UWORD32 u4_valid_intra_modes,
   1720                                      UWORD32  u4_lambda,
   1721                                      UWORD32 u4_predictd_mode)
   1722 {
   1723     UWORD8 *pu1_src_temp = pu1_src;
   1724     UWORD8 *pu1_pred = pu1_ngbr_pels;
   1725     UWORD8 left = 0, top = 0;
   1726     UWORD8 u1_pred_val = 0;
   1727     UWORD8 u1_pred_vals[4] = {0};
   1728     UWORD8 *pu1_pred_val = NULL;
   1729     /* To store FILT121 operated values*/
   1730     UWORD8 u1_pred_vals_diag_121[15] = {0};
   1731     /* To store FILT11 operated values*/
   1732     UWORD8 u1_pred_vals_diag_11[15] = {0};
   1733     UWORD8 u1_pred_vals_vert_r[8] = {0};
   1734     UWORD8 u1_pred_vals_horz_d[10] = {0};
   1735     UWORD8 u1_pred_vals_horz_u[10] = {0};
   1736     WORD32 u4_dcval = 0;
   1737     WORD32 i4_sad[MAX_I4x4] = {INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX,
   1738                                INT_MAX, INT_MAX, INT_MAX, INT_MAX};
   1739 
   1740     WORD32 i4_cost[MAX_I4x4] = {INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX,
   1741                                 INT_MAX, INT_MAX, INT_MAX, INT_MAX};
   1742     WORD32 i, i4_min_cost = INT_MAX;
   1743 
   1744     left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
   1745     top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;
   1746 
   1747     /* Computing SAD */
   1748 
   1749     /* VERT mode valid */
   1750     if (u4_valid_intra_modes & 1)
   1751     {
   1752         pu1_pred = pu1_ngbr_pels + 5;
   1753         i4_sad[VERT_I4x4] = 0;
   1754         i4_cost[VERT_I4x4] = 0;
   1755 
   1756         USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]);
   1757         pu1_src_temp += src_strd;
   1758         USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]);
   1759         pu1_src_temp += src_strd;
   1760         USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]);
   1761         pu1_src_temp += src_strd;
   1762         USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]);
   1763 
   1764         i4_cost[VERT_I4x4] = i4_sad[VERT_I4x4] + ((u4_predictd_mode == VERT_I4x4) ?
   1765                                         u4_lambda : 4 * u4_lambda);
   1766     }
   1767 
   1768     /* HORZ mode valid */
   1769     if (u4_valid_intra_modes & 2)
   1770     {
   1771         i4_sad[HORZ_I4x4] = 0;
   1772         i4_cost[HORZ_I4x4] =0;
   1773         pu1_src_temp = pu1_src;
   1774 
   1775         u1_pred_val = pu1_ngbr_pels[3];
   1776 
   1777         i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val)
   1778                         + ABS(pu1_src_temp[1] - u1_pred_val)
   1779                         + ABS(pu1_src_temp[2] - u1_pred_val)
   1780                         + ABS(pu1_src_temp[3] - u1_pred_val);
   1781         pu1_src_temp += src_strd;
   1782 
   1783         u1_pred_val = pu1_ngbr_pels[2];
   1784 
   1785         i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val)
   1786                         + ABS(pu1_src_temp[1] - u1_pred_val)
   1787                         + ABS(pu1_src_temp[2] - u1_pred_val)
   1788                         + ABS(pu1_src_temp[3] - u1_pred_val);
   1789         pu1_src_temp += src_strd;
   1790 
   1791         u1_pred_val = pu1_ngbr_pels[1];
   1792 
   1793         i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val)
   1794                         + ABS(pu1_src_temp[1] - u1_pred_val)
   1795                         + ABS(pu1_src_temp[2] - u1_pred_val)
   1796                         + ABS(pu1_src_temp[3] - u1_pred_val);
   1797         pu1_src_temp += src_strd;
   1798 
   1799         u1_pred_val = pu1_ngbr_pels[0];
   1800 
   1801         i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val)
   1802                         + ABS(pu1_src_temp[1] - u1_pred_val)
   1803                         + ABS(pu1_src_temp[2] - u1_pred_val)
   1804                         + ABS(pu1_src_temp[3] - u1_pred_val);
   1805 
   1806         i4_cost[HORZ_I4x4] = i4_sad[HORZ_I4x4] + ((u4_predictd_mode == HORZ_I4x4) ?
   1807                                         u4_lambda : 4 * u4_lambda);
   1808     }
   1809 
   1810     /* DC mode valid */
   1811     if (u4_valid_intra_modes & 4)
   1812     {
   1813         i4_sad[DC_I4x4] = 0;
   1814         i4_cost[DC_I4x4] = 0;
   1815         pu1_src_temp = pu1_src;
   1816 
   1817         if (left)
   1818             u4_dcval = pu1_ngbr_pels[0] + pu1_ngbr_pels[1] + pu1_ngbr_pels[2]
   1819                             + pu1_ngbr_pels[3] + 2;
   1820         if (top)
   1821             u4_dcval += pu1_ngbr_pels[5] + pu1_ngbr_pels[6] + pu1_ngbr_pels[7]
   1822                             + pu1_ngbr_pels[8] + 2;
   1823 
   1824         u4_dcval = (u4_dcval) ? (u4_dcval >> (1 + left + top)) : 128;
   1825 
   1826         /* none available */
   1827         memset(u1_pred_vals, u4_dcval, 4);
   1828         USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]);
   1829         pu1_src_temp += src_strd;
   1830         USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]);
   1831         pu1_src_temp += src_strd;
   1832         USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]);
   1833         pu1_src_temp += src_strd;
   1834         USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]);
   1835         pu1_src_temp += src_strd;
   1836 
   1837         i4_cost[DC_I4x4] = i4_sad[DC_I4x4] + ((u4_predictd_mode == DC_I4x4) ?
   1838                                         u4_lambda : 4 * u4_lambda);
   1839     }
   1840 
   1841     /* if modes other than VERT, HORZ and DC are  valid */
   1842     if (u4_valid_intra_modes > 7)
   1843     {
   1844         pu1_pred = pu1_ngbr_pels;
   1845         pu1_pred[13] = pu1_pred[14] = pu1_pred[12];
   1846 
   1847         /* Performing FILT121 and FILT11 operation for all neighbour values*/
   1848         for (i = 0; i < 13; i++)
   1849         {
   1850             u1_pred_vals_diag_121[i] = FILT121(pu1_pred[0], pu1_pred[1], pu1_pred[2]);
   1851             u1_pred_vals_diag_11[i] = FILT11(pu1_pred[0], pu1_pred[1]);
   1852 
   1853             pu1_pred++;
   1854         }
   1855 
   1856         if (u4_valid_intra_modes & 8)/* DIAG_DL */
   1857         {
   1858             i4_sad[DIAG_DL_I4x4] = 0;
   1859             i4_cost[DIAG_DL_I4x4] = 0;
   1860             pu1_src_temp = pu1_src;
   1861             pu1_pred_val = u1_pred_vals_diag_121 + 5;
   1862 
   1863             USADA8(pu1_src_temp, pu1_pred_val, i4_sad[DIAG_DL_I4x4]);
   1864             pu1_src_temp += src_strd;
   1865             USADA8(pu1_src_temp, (pu1_pred_val + 1), i4_sad[DIAG_DL_I4x4]);
   1866             pu1_src_temp += src_strd;
   1867             USADA8(pu1_src_temp, (pu1_pred_val + 2), i4_sad[DIAG_DL_I4x4]);
   1868             pu1_src_temp += src_strd;
   1869             USADA8(pu1_src_temp, (pu1_pred_val + 3), i4_sad[DIAG_DL_I4x4]);
   1870             pu1_src_temp += src_strd;
   1871             i4_cost[DIAG_DL_I4x4] = i4_sad[DIAG_DL_I4x4] + ((u4_predictd_mode == DIAG_DL_I4x4) ?
   1872                                             u4_lambda : 4 * u4_lambda);
   1873         }
   1874 
   1875         if (u4_valid_intra_modes & 16)/* DIAG_DR */
   1876         {
   1877             i4_sad[DIAG_DR_I4x4] = 0;
   1878             i4_cost[DIAG_DR_I4x4] = 0;
   1879             pu1_src_temp = pu1_src;
   1880             pu1_pred_val = u1_pred_vals_diag_121 + 3;
   1881 
   1882             USADA8(pu1_src_temp, pu1_pred_val, i4_sad[DIAG_DR_I4x4]);
   1883             pu1_src_temp += src_strd;
   1884             USADA8(pu1_src_temp, (pu1_pred_val - 1), i4_sad[DIAG_DR_I4x4]);
   1885             pu1_src_temp += src_strd;
   1886             USADA8(pu1_src_temp, (pu1_pred_val - 2), i4_sad[DIAG_DR_I4x4]);
   1887             pu1_src_temp += src_strd;
   1888             USADA8(pu1_src_temp, (pu1_pred_val - 3), i4_sad[DIAG_DR_I4x4]);
   1889             pu1_src_temp += src_strd;
   1890             i4_cost[DIAG_DR_I4x4] = i4_sad[DIAG_DR_I4x4] + ((u4_predictd_mode == DIAG_DR_I4x4) ?
   1891                                             u4_lambda : 4 * u4_lambda);
   1892 
   1893         }
   1894 
   1895         if (u4_valid_intra_modes & 32)/* VERT_R mode valid ????*/
   1896         {
   1897             i4_sad[VERT_R_I4x4] = 0;
   1898 
   1899             pu1_src_temp = pu1_src;
   1900             u1_pred_vals_vert_r[0] = u1_pred_vals_diag_121[2];
   1901             memcpy((u1_pred_vals_vert_r + 1), (u1_pred_vals_diag_11 + 4), 3);
   1902             u1_pred_vals_vert_r[4] = u1_pred_vals_diag_121[1];
   1903             memcpy((u1_pred_vals_vert_r + 5), (u1_pred_vals_diag_121 + 3), 3);
   1904 
   1905             pu1_pred_val = u1_pred_vals_diag_11 + 4;
   1906             USADA8(pu1_src_temp, pu1_pred_val, i4_sad[VERT_R_I4x4]);
   1907             pu1_pred_val = u1_pred_vals_diag_121 + 3;
   1908             pu1_src_temp += src_strd;
   1909             USADA8(pu1_src_temp, pu1_pred_val, i4_sad[VERT_R_I4x4]);
   1910             pu1_src_temp += src_strd;
   1911             USADA8(pu1_src_temp, (u1_pred_vals_vert_r), i4_sad[VERT_R_I4x4]);
   1912             pu1_src_temp += src_strd;
   1913             USADA8(pu1_src_temp, (u1_pred_vals_vert_r + 4),
   1914                    i4_sad[VERT_R_I4x4]);
   1915 
   1916             i4_cost[VERT_R_I4x4] = i4_sad[VERT_R_I4x4] + ((u4_predictd_mode == VERT_R_I4x4) ?
   1917                                             u4_lambda : 4 * u4_lambda);
   1918         }
   1919 
   1920         if (u4_valid_intra_modes & 64)/* HORZ_D mode valid ????*/
   1921         {
   1922             i4_sad[HORZ_D_I4x4] = 0;
   1923 
   1924             pu1_src_temp = pu1_src;
   1925             u1_pred_vals_horz_d[6] = u1_pred_vals_diag_11[3];
   1926             memcpy((u1_pred_vals_horz_d + 7), (u1_pred_vals_diag_121 + 3), 3);
   1927             u1_pred_vals_horz_d[0] = u1_pred_vals_diag_11[0];
   1928             u1_pred_vals_horz_d[1] = u1_pred_vals_diag_121[0];
   1929             u1_pred_vals_horz_d[2] = u1_pred_vals_diag_11[1];
   1930             u1_pred_vals_horz_d[3] = u1_pred_vals_diag_121[1];
   1931             u1_pred_vals_horz_d[4] = u1_pred_vals_diag_11[2];
   1932             u1_pred_vals_horz_d[5] = u1_pred_vals_diag_121[2];
   1933 
   1934             pu1_pred_val = u1_pred_vals_horz_d;
   1935             USADA8(pu1_src_temp, (pu1_pred_val + 6), i4_sad[HORZ_D_I4x4]);
   1936             pu1_src_temp += src_strd;
   1937             USADA8(pu1_src_temp, (pu1_pred_val + 4), i4_sad[HORZ_D_I4x4]);
   1938             pu1_src_temp += src_strd;
   1939             USADA8(pu1_src_temp, (pu1_pred_val + 2), i4_sad[HORZ_D_I4x4]);
   1940             pu1_src_temp += src_strd;
   1941             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[HORZ_D_I4x4]);
   1942 
   1943             i4_cost[HORZ_D_I4x4] = i4_sad[HORZ_D_I4x4] + ((u4_predictd_mode == HORZ_D_I4x4) ?
   1944                                             u4_lambda : 4 * u4_lambda);
   1945         }
   1946 
   1947         if (u4_valid_intra_modes & 128)/* VERT_L mode valid ????*/
   1948         {
   1949             i4_sad[VERT_L_I4x4] = 0;
   1950             pu1_src_temp = pu1_src;
   1951             pu1_pred_val = u1_pred_vals_diag_11 + 5;
   1952             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]);
   1953             pu1_src_temp += src_strd;
   1954             pu1_pred_val = u1_pred_vals_diag_121 + 5;
   1955             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]);
   1956             pu1_src_temp += src_strd;
   1957             pu1_pred_val = u1_pred_vals_diag_11 + 6;
   1958             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]);
   1959             pu1_src_temp += src_strd;
   1960             pu1_pred_val = u1_pred_vals_diag_121 + 6;
   1961             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]);
   1962 
   1963             i4_cost[VERT_L_I4x4] = i4_sad[VERT_L_I4x4] + ((u4_predictd_mode == VERT_L_I4x4) ?
   1964                                             u4_lambda : 4 * u4_lambda);
   1965         }
   1966 
   1967         if (u4_valid_intra_modes & 256)/* HORZ_U mode valid ????*/
   1968         {
   1969             i4_sad[HORZ_U_I4x4] = 0;
   1970             pu1_src_temp = pu1_src;
   1971             u1_pred_vals_horz_u[0] = u1_pred_vals_diag_11[2];
   1972             u1_pred_vals_horz_u[1] = u1_pred_vals_diag_121[1];
   1973             u1_pred_vals_horz_u[2] = u1_pred_vals_diag_11[1];
   1974             u1_pred_vals_horz_u[3] = u1_pred_vals_diag_121[0];
   1975             u1_pred_vals_horz_u[4] = u1_pred_vals_diag_11[0];
   1976             u1_pred_vals_horz_u[5] = FILT121(pu1_ngbr_pels[0], pu1_ngbr_pels[0], pu1_ngbr_pels[1]);
   1977 
   1978             memset((u1_pred_vals_horz_u + 6), pu1_ngbr_pels[0], 4);
   1979 
   1980             pu1_pred_val = u1_pred_vals_horz_u;
   1981             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[HORZ_U_I4x4]);
   1982             pu1_src_temp += src_strd;
   1983             USADA8(pu1_src_temp, (pu1_pred_val + 2), i4_sad[HORZ_U_I4x4]);
   1984             pu1_src_temp += src_strd;
   1985             USADA8(pu1_src_temp, (pu1_pred_val + 4), i4_sad[HORZ_U_I4x4]);
   1986             pu1_src_temp += src_strd;
   1987             USADA8(pu1_src_temp, (pu1_pred_val + 6), i4_sad[HORZ_U_I4x4]);
   1988 
   1989             i4_cost[HORZ_U_I4x4] = i4_sad[HORZ_U_I4x4] + ((u4_predictd_mode == HORZ_U_I4x4) ?
   1990                                             u4_lambda : 4 * u4_lambda);
   1991         }
   1992 
   1993         i4_min_cost = MIN3(MIN3(i4_cost[0], i4_cost[1], i4_cost[2]),
   1994                         MIN3(i4_cost[3], i4_cost[4], i4_cost[5]),
   1995                         MIN3(i4_cost[6], i4_cost[7], i4_cost[8]));
   1996 
   1997     }
   1998     else
   1999     {
   2000         /* Only first three modes valid */
   2001         i4_min_cost = MIN3(i4_cost[0], i4_cost[1], i4_cost[2]);
   2002     }
   2003 
   2004     *pu4_sadmin = i4_min_cost;
   2005 
   2006     if (i4_min_cost == i4_cost[0])
   2007     {
   2008         *u4_intra_mode = VERT_I4x4;
   2009         pu1_pred_val = pu1_ngbr_pels + 5;
   2010         memcpy(pu1_dst, (pu1_pred_val), 4);
   2011         pu1_dst += dst_strd;
   2012         memcpy(pu1_dst, (pu1_pred_val), 4);
   2013         pu1_dst += dst_strd;
   2014         memcpy(pu1_dst, (pu1_pred_val), 4);
   2015         pu1_dst += dst_strd;
   2016         memcpy(pu1_dst, (pu1_pred_val), 4);
   2017     }
   2018     else if (i4_min_cost == i4_cost[1])
   2019     {
   2020         *u4_intra_mode = HORZ_I4x4;
   2021         memset(pu1_dst, pu1_ngbr_pels[3], 4);
   2022         pu1_dst += dst_strd;
   2023         memset(pu1_dst, pu1_ngbr_pels[2], 4);
   2024         pu1_dst += dst_strd;
   2025         memset(pu1_dst, pu1_ngbr_pels[1], 4);
   2026         pu1_dst += dst_strd;
   2027         memset(pu1_dst, pu1_ngbr_pels[0], 4);
   2028     }
   2029     else if (i4_min_cost == i4_cost[2])
   2030     {
   2031         *u4_intra_mode = DC_I4x4;
   2032         memset(pu1_dst, u4_dcval, 4);
   2033         pu1_dst += dst_strd;
   2034         memset(pu1_dst, u4_dcval, 4);
   2035         pu1_dst += dst_strd;
   2036         memset(pu1_dst, u4_dcval, 4);
   2037         pu1_dst += dst_strd;
   2038         memset(pu1_dst, u4_dcval, 4);
   2039     }
   2040 
   2041     else if (i4_min_cost == i4_cost[3])
   2042     {
   2043         *u4_intra_mode = DIAG_DL_I4x4;
   2044         pu1_pred_val = u1_pred_vals_diag_121 + 5;
   2045         memcpy(pu1_dst, (pu1_pred_val), 4);
   2046         pu1_dst += dst_strd;
   2047         memcpy(pu1_dst, (pu1_pred_val + 1), 4);
   2048         pu1_dst += dst_strd;
   2049         memcpy(pu1_dst, (pu1_pred_val + 2), 4);
   2050         pu1_dst += dst_strd;
   2051         memcpy(pu1_dst, (pu1_pred_val + 3), 4);
   2052     }
   2053     else if (i4_min_cost == i4_cost[4])
   2054     {
   2055         *u4_intra_mode = DIAG_DR_I4x4;
   2056         pu1_pred_val = u1_pred_vals_diag_121 + 3;
   2057 
   2058         memcpy(pu1_dst, (pu1_pred_val), 4);
   2059         pu1_dst += dst_strd;
   2060         memcpy(pu1_dst, (pu1_pred_val - 1), 4);
   2061         pu1_dst += dst_strd;
   2062         memcpy(pu1_dst, (pu1_pred_val - 2), 4);
   2063         pu1_dst += dst_strd;
   2064         memcpy(pu1_dst, (pu1_pred_val - 3), 4);
   2065     }
   2066 
   2067     else if (i4_min_cost == i4_cost[5])
   2068     {
   2069         *u4_intra_mode = VERT_R_I4x4;
   2070         pu1_pred_val = u1_pred_vals_diag_11 + 4;
   2071         memcpy(pu1_dst, (pu1_pred_val), 4);
   2072         pu1_dst += dst_strd;
   2073         pu1_pred_val = u1_pred_vals_diag_121 + 3;
   2074         memcpy(pu1_dst, (pu1_pred_val), 4);
   2075         pu1_dst += dst_strd;
   2076         memcpy(pu1_dst, (u1_pred_vals_vert_r), 4);
   2077         pu1_dst += dst_strd;
   2078         memcpy(pu1_dst, (u1_pred_vals_vert_r + 4), 4);
   2079     }
   2080     else if (i4_min_cost == i4_cost[6])
   2081     {
   2082         *u4_intra_mode = HORZ_D_I4x4;
   2083         pu1_pred_val = u1_pred_vals_horz_d;
   2084         memcpy(pu1_dst, (pu1_pred_val + 6), 4);
   2085         pu1_dst += dst_strd;
   2086         memcpy(pu1_dst, (pu1_pred_val + 4), 4);
   2087         pu1_dst += dst_strd;
   2088         memcpy(pu1_dst, (pu1_pred_val + 2), 4);
   2089         pu1_dst += dst_strd;
   2090         memcpy(pu1_dst, (pu1_pred_val), 4);
   2091         pu1_dst += dst_strd;
   2092     }
   2093     else if (i4_min_cost == i4_cost[7])
   2094     {
   2095         *u4_intra_mode = VERT_L_I4x4;
   2096         pu1_pred_val = u1_pred_vals_diag_11 + 5;
   2097         memcpy(pu1_dst, (pu1_pred_val), 4);
   2098         pu1_dst += dst_strd;
   2099         pu1_pred_val = u1_pred_vals_diag_121 + 5;
   2100         memcpy(pu1_dst, (pu1_pred_val), 4);
   2101         pu1_dst += dst_strd;
   2102         pu1_pred_val = u1_pred_vals_diag_11 + 6;
   2103         memcpy(pu1_dst, (pu1_pred_val), 4);
   2104         pu1_dst += dst_strd;
   2105         pu1_pred_val = u1_pred_vals_diag_121 + 6;
   2106         memcpy(pu1_dst, (pu1_pred_val), 4);
   2107     }
   2108     else if (i4_min_cost == i4_cost[8])
   2109     {
   2110         *u4_intra_mode = HORZ_U_I4x4;
   2111         pu1_pred_val = u1_pred_vals_horz_u;
   2112         memcpy(pu1_dst, (pu1_pred_val), 4);
   2113         pu1_dst += dst_strd;
   2114         memcpy(pu1_dst, (pu1_pred_val + 2), 4);
   2115         pu1_dst += dst_strd;
   2116         memcpy(pu1_dst, (pu1_pred_val + 4), 4);
   2117         pu1_dst += dst_strd;
   2118         memcpy(pu1_dst, (pu1_pred_val + 6), 4);
   2119         pu1_dst += dst_strd;
   2120     }
   2121 
   2122     return;
   2123 }
   2124 
   2125 /**
   2126 ******************************************************************************
   2127 *
   2128 * @brief:
   2129 *  Evaluate best intr chroma mode (among VERT, HORZ and DC ) and do the prediction.
   2130 *
   2131 * @par Description
   2132 *  This function evaluates  first three intra chroma modes and compute corresponding sad
   2133 *  and return the buffer predicted with best mode.
   2134 *
   2135 * @param[in] pu1_src
   2136 *  UWORD8 pointer to the source
   2137 *
   2138 * @param[in] pu1_ngbr_pels
   2139 *  UWORD8 pointer to neighbouring pels
   2140 *
   2141 * @param[out] pu1_dst
   2142 *  UWORD8 pointer to the destination
   2143 *
   2144 * @param[in] src_strd
   2145 *  integer source stride
   2146 *
   2147 * @param[in] dst_strd
   2148 *  integer destination stride
   2149 *
   2150 * @param[in] u4_n_avblty
   2151 *  availability of neighbouring pixels
   2152 *
   2153 * @param[in] u4_intra_mode
   2154 *  Pointer to the variable in which best mode is returned
   2155 *
   2156 * @param[in] pu4_sadmin
   2157 *  Pointer to the variable in which minimum sad is returned
   2158 *
   2159 * @param[in] u4_valid_intra_modes
   2160 *  Says what all modes are valid
   2161 *
   2162 * @return      none
   2163 *
   2164 ******************************************************************************
   2165 */
   2166 void ih264e_evaluate_intra_chroma_modes(UWORD8 *pu1_src,
   2167                                         UWORD8 *pu1_ngbr_pels,
   2168                                         UWORD8 *pu1_dst,
   2169                                         UWORD32 src_strd,
   2170                                         UWORD32 dst_strd,
   2171                                         WORD32 u4_n_avblty,
   2172                                         UWORD32 *u4_intra_mode,
   2173                                         WORD32 *pu4_sadmin,
   2174                                         UWORD32 u4_valid_intra_modes)
   2175 {
   2176     UWORD8 *pu1_neighbour;
   2177     UWORD8 *pu1_src_temp = pu1_src;
   2178     UWORD8 left = 0, top = 0;
   2179     WORD32 u4_dcval_u_l[2] = { 0, 0 }, /*sum left neighbours for 'U' ,two separate sets - sum of first four from top,and sum of four values from bottom */
   2180            u4_dcval_u_t[2] = { 0, 0 };  /*sum top neighbours for 'U'*/
   2181 
   2182     WORD32 u4_dcval_v_l[2] = { 0, 0 }, /*sum left neighbours for 'V'*/
   2183            u4_dcval_v_t[2] = { 0, 0 }; /*sum top neighbours for 'V'*/
   2184 
   2185     WORD32 i, j, row, col, i4_sad_vert = INT_MAX, i4_sad_horz = INT_MAX,
   2186                     i4_sad_dc = INT_MAX, i4_min_sad = INT_MAX;
   2187     UWORD8 val_u, val_v;
   2188 
   2189     WORD32 u4_dc_val[2][2][2];/*  -----------
   2190                                   |    |    |  Chroma can have four
   2191                                   | 00 | 01 |  separate dc value...
   2192                                   -----------  u4_dc_val corresponds to this dc values
   2193                                   |    |    |  with u4_dc_val[2][2][U] and u4_dc_val[2][2][V]
   2194                                   | 10 | 11 |
   2195                                   -----------                */
   2196     left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
   2197     top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;
   2198 
   2199     /*Evaluating HORZ*/
   2200     if (left)/* Ifleft available*/
   2201     {
   2202         i4_sad_horz = 0;
   2203 
   2204         for (i = 0; i < 8; i++)
   2205         {
   2206             val_v = pu1_ngbr_pels[15 - 2 * i];
   2207             val_u = pu1_ngbr_pels[15 - 2 * i - 1];
   2208             row = i / 4;
   2209             u4_dcval_u_l[row] += val_u;
   2210             u4_dcval_v_l[row] += val_v;
   2211             for (j = 0; j < 8; j++)
   2212             {
   2213                 i4_sad_horz += ABS(val_u - pu1_src_temp[2 * j]);/* Finding SAD for HORZ mode*/
   2214                 i4_sad_horz += ABS(val_v - pu1_src_temp[2 * j + 1]);
   2215             }
   2216 
   2217             pu1_src_temp += src_strd;
   2218         }
   2219         u4_dcval_u_l[0] += 2;
   2220         u4_dcval_u_l[1] += 2;
   2221         u4_dcval_v_l[0] += 2;
   2222         u4_dcval_v_l[1] += 2;
   2223     }
   2224 
   2225     /*Evaluating VERT**/
   2226     pu1_src_temp = pu1_src;
   2227     if (top) /* top available*/
   2228     {
   2229         i4_sad_vert = 0;
   2230 
   2231         for (i = 0; i < 8; i++)
   2232         {
   2233             col = i / 4;
   2234 
   2235             val_u = pu1_ngbr_pels[18 + i * 2];
   2236             val_v = pu1_ngbr_pels[18 + i * 2 + 1];
   2237             u4_dcval_u_t[col] += val_u;
   2238             u4_dcval_v_t[col] += val_v;
   2239 
   2240             for (j = 0; j < 16; j++)
   2241             {
   2242                 i4_sad_vert += ABS(pu1_ngbr_pels[18 + j] - pu1_src_temp[j]);/* Finding SAD for VERT mode*/
   2243             }
   2244             pu1_src_temp += src_strd;
   2245 
   2246         }
   2247         u4_dcval_u_t[0] += 2;
   2248         u4_dcval_u_t[1] += 2;
   2249         u4_dcval_v_t[0] += 2;
   2250         u4_dcval_v_t[1] += 2;
   2251     }
   2252 
   2253     /* computing DC value*/
   2254     /* Equation  8-128 in spec*/
   2255     u4_dc_val[0][0][0] = (u4_dcval_u_l[0] + u4_dcval_u_t[0]) >> (1 + left + top);
   2256     u4_dc_val[0][0][1] = (u4_dcval_v_l[0] + u4_dcval_v_t[0]) >> (1 + left + top);
   2257     u4_dc_val[1][1][0] = (u4_dcval_u_l[1] + u4_dcval_u_t[1]) >> (1 + left + top);
   2258     u4_dc_val[1][1][1] = (u4_dcval_v_l[1] + u4_dcval_v_t[1]) >> (1 + left + top);
   2259 
   2260     if (top)
   2261     {
   2262         /* Equation  8-132 in spec*/
   2263         u4_dc_val[0][1][0] = (u4_dcval_u_t[1]) >> (1 + top);
   2264         u4_dc_val[0][1][1] = (u4_dcval_v_t[1]) >> (1 + top);
   2265     }
   2266     else
   2267     {
   2268         u4_dc_val[0][1][0] = (u4_dcval_u_l[0]) >> (1 + left);
   2269         u4_dc_val[0][1][1] = (u4_dcval_v_l[0]) >> (1 + left);
   2270     }
   2271 
   2272     if (left)
   2273     {
   2274         u4_dc_val[1][0][0] = (u4_dcval_u_l[1]) >> (1 + left);
   2275         u4_dc_val[1][0][1] = (u4_dcval_v_l[1]) >> (1 + left);
   2276     }
   2277     else
   2278     {
   2279         u4_dc_val[1][0][0] = (u4_dcval_u_t[0]) >> (1 + top);
   2280         u4_dc_val[1][0][1] = (u4_dcval_v_t[0]) >> (1 + top);
   2281     }
   2282 
   2283     if (!(left || top))
   2284     {
   2285         /*none available*/
   2286         u4_dc_val[0][0][0] = u4_dc_val[0][0][1] =
   2287         u4_dc_val[0][1][0] = u4_dc_val[0][1][1] =
   2288         u4_dc_val[1][0][0] = u4_dc_val[1][0][1] =
   2289         u4_dc_val[1][1][0] = u4_dc_val[1][1][1] = 128;
   2290     }
   2291 
   2292     /* Evaluating DC */
   2293     pu1_src_temp = pu1_src;
   2294     i4_sad_dc = 0;
   2295     for (i = 0; i < 8; i++)
   2296     {
   2297         for (j = 0; j < 8; j++)
   2298         {
   2299             col = j / 4;
   2300             row = i / 4;
   2301             val_u = u4_dc_val[row][col][0];
   2302             val_v = u4_dc_val[row][col][1];
   2303 
   2304             i4_sad_dc += ABS(val_u - pu1_src_temp[2 * j]);/* Finding SAD for DC mode*/
   2305             i4_sad_dc += ABS(val_v - pu1_src_temp[2 * j + 1]);
   2306         }
   2307         pu1_src_temp += src_strd;
   2308     }
   2309 
   2310     if ((u4_valid_intra_modes & 01) == 0)/* If DC is disabled*/
   2311         i4_sad_dc = INT_MAX;
   2312     if ((u4_valid_intra_modes & 02) == 0)/* If HORZ is disabled*/
   2313         i4_sad_horz = INT_MAX;
   2314     if ((u4_valid_intra_modes & 04) == 0)/* If VERT is disabled*/
   2315         i4_sad_vert = INT_MAX;
   2316 
   2317     i4_min_sad = MIN3(i4_sad_horz, i4_sad_dc, i4_sad_vert);
   2318 
   2319     /* Finding Minimum sad and doing corresponding prediction*/
   2320     if (i4_min_sad < *pu4_sadmin)
   2321     {
   2322         *pu4_sadmin = i4_min_sad;
   2323 
   2324         if (i4_min_sad == i4_sad_dc)
   2325         {
   2326             *u4_intra_mode = DC_CH_I8x8;
   2327             for (i = 0; i < 8; i++)
   2328             {
   2329                 for (j = 0; j < 8; j++)
   2330                 {
   2331                     col = j / 4;
   2332                     row = i / 4;
   2333 
   2334                     pu1_dst[2 * j] = u4_dc_val[row][col][0];
   2335                     pu1_dst[2 * j + 1] = u4_dc_val[row][col][1];
   2336                 }
   2337                 pu1_dst += dst_strd;
   2338             }
   2339         }
   2340         else if (i4_min_sad == i4_sad_horz)
   2341         {
   2342             *u4_intra_mode = HORZ_CH_I8x8;
   2343             for (j = 0; j < 8; j++)
   2344             {
   2345                 val_v = pu1_ngbr_pels[15 - 2 * j];
   2346                 val_u = pu1_ngbr_pels[15 - 2 * j - 1];
   2347 
   2348                 for (i = 0; i < 8; i++)
   2349                 {
   2350                     pu1_dst[2 * i] = val_u;
   2351                     pu1_dst[2 * i + 1] = val_v;
   2352 
   2353                 }
   2354                 pu1_dst += dst_strd;
   2355             }
   2356         }
   2357         else
   2358         {
   2359             *u4_intra_mode = VERT_CH_I8x8;
   2360             pu1_neighbour = pu1_ngbr_pels + 18;
   2361             for (j = 0; j < 8; j++)
   2362             {
   2363                 memcpy(pu1_dst, pu1_neighbour, MB_SIZE);
   2364                 pu1_dst += dst_strd;
   2365             }
   2366         }
   2367     }
   2368 
   2369     return;
   2370 }
   2371