Home | History | Annotate | Download | only in encoder
      1 /******************************************************************************
      2  *
      3  * Copyright (C) 2015 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at:
      8  *
      9  * http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  *****************************************************************************
     18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 */
     20 
     21 /**
     22 *******************************************************************************
     23 * @file
     24 *  ih264e_intra_modes_eval.c
     25 *
     26 * @brief
     27 *  This file contains definitions of routines that perform rate distortion
     28 *  analysis on a macroblock if they are to be coded as intra.
     29 *
     30 * @author
     31 *  ittiam
     32 *
     33 * @par List of Functions:
     34 *  - ih264e_derive_nghbr_avbl_of_mbs()
     35 *  - ih264e_derive_ngbr_avbl_of_mb_partitions()
     36 *  - ih264e_evaluate_intra16x16_modes_for_least_cost_rdoptoff()
     37 *  - ih264e_evaluate_intra8x8_modes_for_least_cost_rdoptoff()
     38 *  - ih264e_evaluate_intra4x4_modes_for_least_cost_rdoptoff()
     39 *  - ih264e_evaluate_intra4x4_modes_for_least_cost_rdopton()
     40 *  - ih264e_evaluate_chroma_intra8x8_modes_for_least_cost_rdoptoff()
     41 *  - ih264e_evaluate_intra16x16_modes()
     42 *  - ih264e_evaluate_intra4x4_modes()
     43 *  - ih264e_evaluate_intra_chroma_modes()
     44 *
     45 * @remarks
     46 *  None
     47 *
     48 *******************************************************************************
     49 */
     50 
     51 /*****************************************************************************/
     52 /* File Includes                                                             */
     53 /*****************************************************************************/
     54 
     55 /* System include files */
     56 #include <stdio.h>
     57 #include <string.h>
     58 #include <limits.h>
     59 #include <assert.h>
     60 
     61 /* User include files */
     62 #include "ih264e_config.h"
     63 #include "ih264_typedefs.h"
     64 #include "ih264e_defs.h"
     65 #include "iv2.h"
     66 #include "ive2.h"
     67 #include "ih264_debug.h"
     68 #include "ih264_defs.h"
     69 #include "ih264_macros.h"
     70 #include "ih264_intra_pred_filters.h"
     71 #include "ih264_structs.h"
     72 #include "ih264_common_tables.h"
     73 #include "ih264_trans_quant_itrans_iquant.h"
     74 #include "ih264_inter_pred_filters.h"
     75 #include "ih264_mem_fns.h"
     76 #include "ih264_padding.h"
     77 #include "ih264_deblk_edge_filters.h"
     78 #include "ih264_cabac_tables.h"
     79 #include "ime_distortion_metrics.h"
     80 #include "ih264e_error.h"
     81 #include "ih264e_bitstream.h"
     82 #include "ime_defs.h"
     83 #include "ime_structs.h"
     84 #include "irc_cntrl_param.h"
     85 #include "irc_frame_info_collector.h"
     86 #include "ih264e_rate_control.h"
     87 #include "ih264e_cabac_structs.h"
     88 #include "ih264e_structs.h"
     89 #include "ih264e_intra_modes_eval.h"
     90 #include "ih264e_globals.h"
     91 #include "ime_platform_macros.h"
     92 
     93 
     94 /*****************************************************************************/
     95 /* Function Definitions                                                      */
     96 /*****************************************************************************/
     97 
     98 /**
     99 ******************************************************************************
    100 *
    101 * @brief
    102 *  derivation process for macroblock availability
    103 *
    104 * @par   Description
    105 *  Calculates the availability of the left, top, topright and topleft macroblocks.
    106 *
    107 * @param[in] ps_proc_ctxt
    108 *  pointer to proc context (handle)
    109 *
    110 * @remarks Based on section 6.4.5 in H264 spec
    111 *
    112 * @return  none
    113 *
    114 ******************************************************************************
    115 */
    116 void ih264e_derive_nghbr_avbl_of_mbs(process_ctxt_t *ps_proc)
    117 {
    118     UWORD8 *pu1_slice_idx_curr = ps_proc->pu1_slice_idx;
    119     UWORD8 *pu1_slice_idx_b;
    120     UWORD8 *pu1_slice_idx_a;
    121     UWORD8 *pu1_slice_idx_c;
    122     UWORD8 *pu1_slice_idx_d;
    123     block_neighbors_t *ps_ngbr_avbl;
    124     WORD32 i4_mb_x, i4_mb_y;
    125     WORD32 i4_wd_mbs;
    126 
    127     i4_mb_x = ps_proc->i4_mb_x;
    128     i4_mb_y = ps_proc->i4_mb_y;
    129 
    130     i4_wd_mbs = ps_proc->i4_wd_mbs;
    131 
    132     pu1_slice_idx_curr += (i4_mb_y * i4_wd_mbs) + i4_mb_x;
    133     pu1_slice_idx_a = pu1_slice_idx_curr - 1;
    134     pu1_slice_idx_b = pu1_slice_idx_curr - i4_wd_mbs;
    135     pu1_slice_idx_c = pu1_slice_idx_b + 1;
    136     pu1_slice_idx_d = pu1_slice_idx_b - 1;
    137     ps_ngbr_avbl = ps_proc->ps_ngbr_avbl;
    138 
    139     /**********************************************************************/
    140     /* The macroblock is marked as available, unless one of the following */
    141     /* conditions is true in which case the macroblock shall be marked as */
    142     /* not available.                                                     */
    143     /* 1. mbAddr < 0                                                      */
    144     /* 2  mbAddr > CurrMbAddr                                             */
    145     /* 3. the macroblock with address mbAddr belongs to a different slice */
    146     /* than the macroblock with address CurrMbAddr                        */
    147     /**********************************************************************/
    148 
    149     /* left macroblock availability */
    150     if (i4_mb_x == 0)
    151     { /* macroblocks along first column */
    152         ps_ngbr_avbl->u1_mb_a = 0;
    153     }
    154     else
    155     { /* macroblocks belong to same slice? */
    156         if (*pu1_slice_idx_a != *pu1_slice_idx_curr)
    157             ps_ngbr_avbl->u1_mb_a = 0;
    158         else
    159             ps_ngbr_avbl->u1_mb_a = 1;
    160     }
    161 
    162     /* top macroblock availability */
    163     if (i4_mb_y == 0)
    164     { /* macroblocks along first row */
    165         ps_ngbr_avbl->u1_mb_b = 0;
    166     }
    167     else
    168     { /* macroblocks belong to same slice? */
    169         if (*pu1_slice_idx_b != *pu1_slice_idx_curr)
    170             ps_ngbr_avbl->u1_mb_b = 0;
    171         else
    172             ps_ngbr_avbl->u1_mb_b = 1;
    173     }
    174 
    175     /* top right macroblock availability */
    176     if (i4_mb_x == i4_wd_mbs-1 || i4_mb_y == 0)
    177     { /* macroblocks along last column */
    178         ps_ngbr_avbl->u1_mb_c = 0;
    179     }
    180     else
    181     { /* macroblocks belong to same slice? */
    182         if (*pu1_slice_idx_c != *pu1_slice_idx_curr)
    183             ps_ngbr_avbl->u1_mb_c = 0;
    184         else
    185             ps_ngbr_avbl->u1_mb_c = 1;
    186     }
    187 
    188     /* top left macroblock availability */
    189     if (i4_mb_x == 0 || i4_mb_y == 0)
    190     { /* macroblocks along first column */
    191         ps_ngbr_avbl->u1_mb_d = 0;
    192     }
    193     else
    194     { /* macroblocks belong to same slice? */
    195         if (*pu1_slice_idx_d != *pu1_slice_idx_curr)
    196             ps_ngbr_avbl->u1_mb_d = 0;
    197         else
    198             ps_ngbr_avbl->u1_mb_d = 1;
    199     }
    200 }
    201 
    202 /**
    203 ******************************************************************************
    204 *
    205 * @brief
    206 *  derivation process for subblock/partition availability
    207 *
    208 * @par   Description
    209 *  Calculates the availability of the left, top, topright and topleft subblock
    210 *  or partitions.
    211 *
    212 * @param[in]    ps_proc_ctxt
    213 *  pointer to macroblock context (handle)
    214 *
    215 * @param[in]    i1_pel_pos_x
    216 *  column position of the pel wrt the current block
    217 *
    218 * @param[in]    i1_pel_pos_y
    219 *  row position of the pel in wrt current block
    220 *
    221 * @remarks     Assumptions: before calling this function it is assumed that
    222 *   the neighbor availability of the current macroblock is already derived.
    223 *   Based on table 6-3 of H264 specification
    224 *
    225 * @return      availability status (yes or no)
    226 *
    227 ******************************************************************************
    228 */
    229 UWORD8 ih264e_derive_ngbr_avbl_of_mb_partitions(block_neighbors_t *ps_ngbr_avbl,
    230                                                 WORD8 i1_pel_pos_x,
    231                                                 WORD8 i1_pel_pos_y)
    232 {
    233     UWORD8 u1_neighbor_avail=0;
    234 
    235     /**********************************************************************/
    236     /* values of i1_pel_pos_x in the range 0-15 inclusive correspond to   */
    237     /* various columns of a macroblock                                    */
    238     /*                                                                    */
    239     /* values of i1_pel_pos_y in the range 0-15 inclusive correspond to   */
    240     /* various rows of a macroblock                                       */
    241     /*                                                                    */
    242     /* other values of i1_pel_pos_x & i1_pel_pos_y represents elements    */
    243     /* outside the bound of an mb ie., represents its neighbors.          */
    244     /**********************************************************************/
    245     if (i1_pel_pos_x < 0)
    246     { /* column(-1) */
    247         if (i1_pel_pos_y < 0)
    248         { /* row(-1) */
    249             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_d; /* current mb topleft availability */
    250         }
    251         else if (i1_pel_pos_y >= 0 && i1_pel_pos_y < 16)
    252         { /* all rows of a macroblock */
    253             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_a; /* current mb left availability */
    254         }
    255         else /* if (i1_pel_pos_y >= 16) */
    256         { /* rows(+16) */
    257             u1_neighbor_avail = 0;  /* current mb bottom left availability */
    258         }
    259     }
    260     else if (i1_pel_pos_x >= 0 && i1_pel_pos_x < 16)
    261     { /* all columns of a macroblock */
    262         if (i1_pel_pos_y < 0)
    263         { /* row(-1) */
    264             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_b; /* current mb top availability */
    265         }
    266         else if (i1_pel_pos_y >= 0 && i1_pel_pos_y < 16)
    267         { /* all rows of a macroblock */
    268             u1_neighbor_avail = 1; /* current mb availability */
    269             /* availability of the partition is dependent on the position of the partition inside the mb */
    270             /* although the availability is declared as 1 in all cases these needs to be corrected somewhere else and this is not done in here */
    271         }
    272         else /* if (i1_pel_pos_y >= 16) */
    273         { /* rows(+16) */
    274             u1_neighbor_avail = 0;  /* current mb bottom availability */
    275         }
    276     }
    277     else if (i1_pel_pos_x >= 16)
    278     { /* column(+16) */
    279         if (i1_pel_pos_y < 0)
    280         { /* row(-1) */
    281             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_c; /* current mb top right availability */
    282         }
    283         else /* if (i1_pel_pos_y >= 0) */
    284         { /* all other rows */
    285             u1_neighbor_avail = 0;  /* current mb right & bottom right availability */
    286         }
    287     }
    288 
    289     return u1_neighbor_avail;
    290 }
    291 
    292 /**
    293 ******************************************************************************
    294 *
    295 * @brief
    296 *  evaluate best intra 16x16 mode (rate distortion opt off)
    297 *
    298 * @par Description
    299 *  This function evaluates all the possible intra 16x16 modes and finds the mode
    300 *  that best represents the macro-block (least distortion) and occupies fewer
    301 *  bits in the bit-stream.
    302 *
    303 * @param[in]   ps_proc_ctxt
    304 *  pointer to process context (handle)
    305 *
    306 * @remarks
    307 *  Ideally the cost of encoding a macroblock is calculated as
    308 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
    309 *  input block and the reconstructed block and rate is the number of bits taken
    310 *  to place the macroblock in the bit-stream. In this routine the rate does not
    311 *  exactly point to the total number of bits it takes, rather it points to header
    312 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
    313 *  and residual bits fall in to texture bits the number of bits taken to encoding
    314 *  mbtype is considered as rate, we compute cost. Further we will approximate
    315 *  the distortion as the deviation b/w input and the predicted block as opposed
    316 *  to input and reconstructed block.
    317 *
    318 *  NOTE: As per the Document JVT-O079, for intra 16x16 macroblock,
    319 *  the SAD and cost are one and the same.
    320 *
    321 * @return     none
    322 *
    323 ******************************************************************************
    324 */
    325 
    326 void ih264e_evaluate_intra16x16_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
    327 {
    328     /* Codec Context */
    329     codec_t *ps_codec = ps_proc->ps_codec;
    330 
    331     /* SAD(distortion metric) of an 8x8 block */
    332     WORD32 i4_mb_distortion = INT_MAX, i4_mb_distortion_least = INT_MAX;
    333 
    334     /* lambda */
    335     UWORD32 u4_lambda = ps_proc->u4_lambda;
    336 
    337     /* cost = distortion + lambda*rate */
    338     WORD32 i4_mb_cost= INT_MAX, i4_mb_cost_least = INT_MAX;
    339 
    340     /* intra mode */
    341     UWORD32 u4_intra_mode, u4_best_intra_16x16_mode = DC_I16x16;
    342 
    343     /* neighbor pels for intra prediction */
    344     UWORD8 *pu1_ngbr_pels_i16 = ps_proc->au1_ngbr_pels;
    345 
    346     /* neighbor availability */
    347     WORD32 i4_ngbr_avbl;
    348 
    349     /* pointer to src macro block */
    350     UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_luma;
    351     UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_luma;
    352 
    353     /* pointer to prediction macro block */
    354     UWORD8 *pu1_pred_mb_intra_16x16 = ps_proc->pu1_pred_mb_intra_16x16;
    355     UWORD8 *pu1_pred_mb_intra_16x16_plane = ps_proc->pu1_pred_mb_intra_16x16_plane;
    356 
    357     /* strides */
    358     WORD32 i4_src_strd = ps_proc->i4_src_strd;
    359     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
    360     WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
    361 
    362     /* pointer to neighbors left, top, topleft */
    363     UWORD8 *pu1_mb_a = pu1_ref_mb - 1;
    364     UWORD8 *pu1_mb_b = pu1_ref_mb - i4_rec_strd;
    365     UWORD8 *pu1_mb_d = pu1_mb_b - 1;
    366 
    367     /* valid intra modes map */
    368     UWORD32 u4_valid_intra_modes;
    369 
    370     /* lut for valid intra modes */
    371     const UWORD8 u1_valid_intra_modes[8] = {4, 6, 12, 14, 5, 7, 13, 15};
    372 
    373     /* temp var */
    374     UWORD32 i, u4_enable_fast_sad = 0, offset = 0;
    375 
    376     /* init temp var */
    377     if (ps_proc->i4_slice_type != ISLICE)
    378     {
    379         /* Offset for MBtype */
    380         offset = (ps_proc->i4_slice_type == PSLICE) ? 5 : 23;
    381         u4_enable_fast_sad = ps_proc->s_me_ctxt.u4_enable_fast_sad;
    382     }
    383 
    384     /* locating neighbors that are available for prediction */
    385     /* TODO : update the neighbor availability information basing on constrained intra pred information */
    386     /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines
    387      * basing on neighbors available and hence evade the computation of neighbor availability totally. */
    388     /* i4_ngbr_avbl = blk_a * LEFT_MB_AVAILABLE_MASK + blk_b * TOP_MB_AVAILABLE_MASK + blk_d * TOP_LEFT_MB_AVAILABLE_MASK */
    389     i4_ngbr_avbl = (ps_proc->ps_ngbr_avbl->u1_mb_a) + (ps_proc->ps_ngbr_avbl->u1_mb_b << 2) + (ps_proc->ps_ngbr_avbl->u1_mb_d << 1);
    390     ps_proc->i4_ngbr_avbl_16x16_mb = i4_ngbr_avbl;
    391 
    392     /* gather prediction pels from the neighbors, if particular set is not available
    393      * it is set to zero*/
    394     /* left pels */
    395     if (ps_proc->ps_ngbr_avbl->u1_mb_a)
    396     {
    397         for(i = 0; i < 16; i++)
    398             pu1_ngbr_pels_i16[16-1-i] = pu1_mb_a[i * i4_rec_strd];
    399     }
    400     else
    401     {
    402         ps_codec->pf_mem_set_mul8(pu1_ngbr_pels_i16,0,MB_SIZE);
    403     }
    404     /* top pels */
    405     if (ps_proc->ps_ngbr_avbl->u1_mb_b)
    406     {
    407         ps_codec->pf_mem_cpy_mul8(pu1_ngbr_pels_i16+16+1,pu1_mb_b,16);
    408         /*for(i = 0; i < 16; i++)
    409             pu1_ngbr_pels_i16[16+1+i] = pu1_mb_b[i];*/
    410     }
    411     else
    412     {
    413         ps_codec->pf_mem_set_mul8(pu1_ngbr_pels_i16+16+1,0,MB_SIZE);
    414     }
    415     /* topleft pels */
    416     if (ps_proc->ps_ngbr_avbl->u1_mb_d)
    417         pu1_ngbr_pels_i16[16] = *pu1_mb_d;
    418     else
    419         pu1_ngbr_pels_i16[16] = 0;
    420 
    421     /* set valid intra modes for evaluation */
    422 //    u4_valid_intra_modes = 15;
    423 ////    ih264e_filter_intra16x16modes(pu1_mb_curr, i4_src_strd, &u4_valid_intra_modes);
    424 //    if (!ps_proc->ps_ngbr_avbl->u1_mb_a)
    425 //        u4_valid_intra_modes &= ~(1 << HORZ_I16x16);
    426 //    if (!ps_proc->ps_ngbr_avbl->u1_mb_b)
    427 //        u4_valid_intra_modes &= ~(1 << VERT_I16x16);
    428 ////    if (!ps_proc->ps_ngbr_avbl->u1_mb_a || !ps_proc->ps_ngbr_avbl->u1_mb_b || !ps_proc->ps_ngbr_avbl->u1_mb_d)
    429 //    if (i4_ngbr_avbl != 7)
    430 //        u4_valid_intra_modes &= ~(1 << PLANE_I16x16);
    431 
    432     u4_valid_intra_modes = u1_valid_intra_modes[i4_ngbr_avbl];
    433 
    434     if (ps_codec->s_cfg.u4_enc_speed_preset == IVE_FAST)
    435         u4_valid_intra_modes &= ~(1 << PLANE_I16x16);
    436 
    437     /* evaluate b/w HORZ_I16x16, VERT_I16x16 & DC_I16x16 */
    438     ps_codec->pf_ih264e_evaluate_intra16x16_modes(pu1_curr_mb, pu1_ngbr_pels_i16, pu1_pred_mb_intra_16x16,
    439                                                   i4_src_strd, i4_pred_strd,
    440                                                   i4_ngbr_avbl, &u4_intra_mode, &i4_mb_distortion_least,
    441                                                   u4_valid_intra_modes);
    442 
    443     /* cost = distortion + lambda*rate */
    444     i4_mb_cost_least = i4_mb_distortion_least;
    445 
    446     if (( (u4_valid_intra_modes >> 3) & 1) != 0 && (ps_codec->s_cfg.u4_enc_speed_preset != IVE_FASTEST ||
    447                     ps_proc->i4_slice_type == ISLICE))
    448     {
    449         /* intra prediction for PLANE mode*/
    450         (ps_codec->apf_intra_pred_16_l)[PLANE_I16x16](pu1_ngbr_pels_i16, pu1_pred_mb_intra_16x16_plane, 0, i4_pred_strd, i4_ngbr_avbl);
    451 
    452         /* evaluate distortion between the actual blk and the estimated blk for the given mode */
    453         ps_codec->apf_compute_sad_16x16[u4_enable_fast_sad](pu1_curr_mb, pu1_pred_mb_intra_16x16_plane, i4_src_strd, i4_pred_strd, i4_mb_cost_least, &i4_mb_distortion);
    454 
    455         /* cost = distortion + lambda*rate */
    456         i4_mb_cost = i4_mb_distortion;
    457 
    458         /* update the least cost information if necessary */
    459         if(i4_mb_cost < i4_mb_distortion_least)
    460         {
    461             u4_intra_mode = PLANE_I16x16;
    462 
    463             i4_mb_cost_least = i4_mb_cost;
    464             i4_mb_distortion_least = i4_mb_distortion;
    465         }
    466     }
    467 
    468     u4_best_intra_16x16_mode = u4_intra_mode;
    469 
    470     DEBUG("%d partition cost, %d intra mode\n", i4_mb_cost_least * 32, u4_best_intra_16x16_mode);
    471 
    472     ps_proc->u1_l_i16_mode = u4_best_intra_16x16_mode;
    473 
    474     /* cost = distortion + lambda*rate */
    475     i4_mb_cost_least    = i4_mb_distortion_least + u4_lambda*u1_uev_codelength[offset + u4_best_intra_16x16_mode];
    476 
    477 
    478     /* update the type of the mb if necessary */
    479     if (i4_mb_cost_least < ps_proc->i4_mb_cost)
    480     {
    481         ps_proc->i4_mb_cost = i4_mb_cost_least;
    482         ps_proc->i4_mb_distortion = i4_mb_distortion_least;
    483         ps_proc->u4_mb_type = I16x16;
    484     }
    485 
    486     return ;
    487 }
    488 
    489 
    490 /**
    491 ******************************************************************************
    492 *
    493 * @brief
    494 *  evaluate best intra 8x8 mode (rate distortion opt on)
    495 *
    496 * @par Description
    497 *  This function evaluates all the possible intra 8x8 modes and finds the mode
    498 *  that best represents the macro-block (least distortion) and occupies fewer
    499 *  bits in the bit-stream.
    500 *
    501 * @param[in]    ps_proc_ctxt
    502 *  pointer to proc ctxt
    503 *
    504 * @remarks Ideally the cost of encoding a macroblock is calculated as
    505 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
    506 *  input block and the reconstructed block and rate is the number of bits taken
    507 *  to place the macroblock in the bit-stream. In this routine the rate does not
    508 *  exactly point to the total number of bits it takes, rather it points to header
    509 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
    510 *  and residual bits fall in to texture bits the number of bits taken to encoding
    511 *  mbtype is considered as rate, we compute cost. Further we will approximate
    512 *  the distortion as the deviation b/w input and the predicted block as opposed
    513 *  to input and reconstructed block.
    514 *
    515 *  NOTE: TODO: This function needs to be tested
    516 *
    517 *  @return      none
    518 *
    519 ******************************************************************************
    520 */
    521 void ih264e_evaluate_intra8x8_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
    522 {
    523     /* Codec Context */
    524     codec_t *ps_codec = ps_proc->ps_codec;
    525 
    526     /* SAD(distortion metric) of an 4x4 block */
    527     WORD32 i4_partition_distortion, i4_partition_distortion_least = INT_MAX, i4_total_distortion = 0;
    528 
    529     /* lambda */
    530     UWORD32 u4_lambda = ps_proc->u4_lambda;
    531 
    532     /* cost = distortion + lambda*rate */
    533     WORD32 i4_partition_cost, i4_partition_cost_least, i4_total_cost = u4_lambda;
    534 
    535     /* cost due to mbtype */
    536     UWORD32 u4_cost_one_bit = u4_lambda, u4_cost_four_bits = 4 * u4_lambda;
    537 
    538     /* intra mode */
    539     UWORD32 u4_intra_mode, u4_best_intra_8x8_mode = DC_I8x8, u4_estimated_intra_8x8_mode;
    540 
    541     /* neighbor pels for intra prediction */
    542     UWORD8 *pu1_ngbr_pels_i8 = ps_proc->au1_ngbr_pels;
    543 
    544     /* pointer to curr partition */
    545     UWORD8 *pu1_mb_curr;
    546 
    547     /* pointer to prediction macro block */
    548     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
    549 
    550     /* strides */
    551     WORD32 i4_src_strd = ps_proc->i4_src_strd;
    552     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
    553 
    554     /* neighbors left, top, top right, top left */
    555     UWORD8 *pu1_mb_a;
    556     UWORD8 *pu1_mb_b;
    557     UWORD8 *pu1_mb_d;
    558 
    559     /* neighbor availability */
    560     WORD32 i4_ngbr_avbl;
    561     block_neighbors_t s_ngbr_avbl;
    562 
    563     /* temp vars */
    564     UWORD32  b8, u4_pix_x, u4_pix_y;
    565 
    566     /* ngbr mb syntax information */
    567     UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (ps_proc->i4_mb_x << 4);
    568     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
    569 
    570     /* valid intra modes map */
    571     UWORD32 u4_valid_intra_modes;
    572 
    573     for(b8 = 0; b8 < 4; b8++)
    574     {
    575         u4_pix_x = (b8 & 0x01) << 3;
    576         u4_pix_y = (b8 >> 1) << 3;
    577 
    578         pu1_mb_curr = ps_proc->pu1_src_buf_luma + u4_pix_x + (u4_pix_y * i4_src_strd);
    579         /* when rdopt is off, we use the input as reference for constructing prediction buffer */
    580         /* as opposed to using the recon pels. (open loop intra prediction) */
    581         pu1_mb_a = pu1_mb_curr - 1; /* pointer to left macro block */
    582         pu1_mb_b = pu1_mb_curr - i4_src_strd; /* pointer to top macro block */
    583         pu1_mb_d = pu1_mb_b - 1; /* pointer to top left macro block */
    584 
    585         /* locating neighbors that are available for prediction */
    586         /* TODO : update the neighbor availability information basing on constrained intra pred information */
    587         /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines */
    588         /* basing on neighbors available and hence evade the computation of neighbor availability totally. */
    589         s_ngbr_avbl.u1_mb_a = ih264e_derive_ngbr_avbl_of_mb_partitions(ps_proc->ps_ngbr_avbl, u4_pix_x - 1, u4_pix_y); /* xD = -1, yD = 0 */
    590         s_ngbr_avbl.u1_mb_b = ih264e_derive_ngbr_avbl_of_mb_partitions(ps_proc->ps_ngbr_avbl, u4_pix_x, u4_pix_y - 1); /* xD = 0, yD = -1 */
    591         s_ngbr_avbl.u1_mb_c = ih264e_derive_ngbr_avbl_of_mb_partitions(ps_proc->ps_ngbr_avbl, u4_pix_x + 8, u4_pix_y - 1); /* xD = BLK_8x8_SIZE, yD = -1 */
    592         s_ngbr_avbl.u1_mb_d = ih264e_derive_ngbr_avbl_of_mb_partitions(ps_proc->ps_ngbr_avbl, u4_pix_x - 1, u4_pix_y - 1); /* xD = -1, yD = -1 */
    593 
    594         /* i4_ngbr_avbl = blk_a * LEFT_MB_AVAILABLE_MASK + blk_b * TOP_MB_AVAILABLE_MASK + blk_c * TOP_RIGHT_MB_AVAILABLE_MASK + blk_d * TOP_LEFT_MB_AVAILABLE_MASK */
    595         i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) +  (s_ngbr_avbl.u1_mb_c << 3) +
    596                         (s_ngbr_avbl.u1_mb_a << 4);
    597         /* if top partition is available and top right is not available for intra prediction, then */
    598         /* padd top right samples using top sample and make top right also available */
    599         /* i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) +  ((s_ngbr_avbl.u1_mb_b | s_ngbr_avbl.u1_mb_c) << 3); */
    600         ps_proc->ai4_neighbor_avail_8x8_subblks[b8] = i4_ngbr_avbl;
    601 
    602 
    603         ih264_intra_pred_luma_8x8_mode_ref_filtering(pu1_mb_a, pu1_mb_b, pu1_mb_d, pu1_ngbr_pels_i8,
    604                                                      i4_src_strd, i4_ngbr_avbl);
    605 
    606         i4_partition_cost_least = INT_MAX;
    607         /* set valid intra modes for evaluation */
    608         u4_valid_intra_modes = 0x1ff;
    609 
    610         if (!s_ngbr_avbl.u1_mb_b)
    611         {
    612             u4_valid_intra_modes &= ~(1 << VERT_I4x4);
    613             u4_valid_intra_modes &= ~(1 << DIAG_DL_I4x4);
    614             u4_valid_intra_modes &= ~(1 << VERT_L_I4x4);
    615         }
    616         if (!s_ngbr_avbl.u1_mb_a)
    617         {
    618             u4_valid_intra_modes &= ~(1 << HORZ_I4x4);
    619             u4_valid_intra_modes &= ~(1 << HORZ_U_I4x4);
    620         }
    621         if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b || !s_ngbr_avbl.u1_mb_d)
    622         {
    623             u4_valid_intra_modes &= ~(1 << DIAG_DR_I4x4);
    624             u4_valid_intra_modes &= ~(1 << VERT_R_I4x4);
    625             u4_valid_intra_modes &= ~(1 << HORZ_D_I4x4);
    626         }
    627 
    628         /* estimate the intra 8x8 mode for the current partition (for evaluating cost) */
    629         if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b)
    630         {
    631             u4_estimated_intra_8x8_mode = DC_I8x8;
    632         }
    633         else
    634         {
    635             UWORD32 u4_left_intra_8x8_mode = DC_I8x8;
    636             UWORD32 u4_top_intra_8x8_mode = DC_I8x8;
    637 
    638             if (u4_pix_x == 0)
    639             {
    640                 if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I8x8)
    641                 {
    642                     u4_left_intra_8x8_mode = ps_proc->au1_left_mb_intra_modes[b8+1];
    643                 }
    644                 else if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I4x4)
    645                 {
    646                     u4_left_intra_8x8_mode = ps_proc->au1_left_mb_intra_modes[(b8+1)*4+2];
    647                 }
    648             }
    649             else
    650             {
    651                 u4_left_intra_8x8_mode = ps_proc->au1_intra_luma_mb_8x8_modes[b8-1];
    652             }
    653 
    654             if (u4_pix_y == 0)
    655             {
    656                 if (ps_top_mb_syn_ele->u2_mb_type == I8x8)
    657                 {
    658                     u4_top_intra_8x8_mode = pu1_top_mb_intra_modes[b8+2];
    659                 }
    660                 else if (ps_top_mb_syn_ele->u2_mb_type == I4x4)
    661                 {
    662                     u4_top_intra_8x8_mode = pu1_top_mb_intra_modes[(b8+2)*4+2];
    663                 }
    664             }
    665             else
    666             {
    667                 u4_top_intra_8x8_mode = ps_proc->au1_intra_luma_mb_8x8_modes[b8-2];
    668             }
    669 
    670             u4_estimated_intra_8x8_mode = MIN(u4_left_intra_8x8_mode, u4_top_intra_8x8_mode);
    671         }
    672 
    673         /* perform intra mode 8x8 evaluation */
    674         for (u4_intra_mode = VERT_I8x8; u4_valid_intra_modes != 0; u4_intra_mode++, u4_valid_intra_modes >>= 1)
    675         {
    676             if ( (u4_valid_intra_modes & 1) == 0)
    677                 continue;
    678 
    679             /* intra prediction */
    680             (ps_codec->apf_intra_pred_8_l)[u4_intra_mode](pu1_ngbr_pels_i8, pu1_pred_mb, 0, i4_pred_strd, i4_ngbr_avbl);
    681 
    682             /* evaluate distortion between the actual blk and the estimated blk for the given mode */
    683             ime_compute_sad_8x8(pu1_mb_curr, pu1_pred_mb, i4_src_strd, i4_pred_strd, i4_partition_cost_least, &i4_partition_distortion);
    684 
    685             i4_partition_cost = i4_partition_distortion + ((u4_estimated_intra_8x8_mode == u4_intra_mode)?u4_cost_one_bit:u4_cost_four_bits);
    686 
    687             /* update the least cost information if necessary */
    688             if (i4_partition_cost < i4_partition_cost_least)
    689             {
    690                 i4_partition_cost_least = i4_partition_cost;
    691                 i4_partition_distortion_least = i4_partition_distortion;
    692                 u4_best_intra_8x8_mode = u4_intra_mode;
    693             }
    694         }
    695         /* macroblock distortion */
    696         i4_total_cost += i4_partition_cost_least;
    697         i4_total_distortion += i4_partition_distortion_least;
    698         /* mb partition mode */
    699         ps_proc->au1_intra_luma_mb_8x8_modes[b8] = u4_best_intra_8x8_mode;
    700 
    701     }
    702 
    703     /* update the type of the mb if necessary */
    704     if (i4_total_cost < ps_proc->i4_mb_cost)
    705     {
    706         ps_proc->i4_mb_cost = i4_total_cost;
    707         ps_proc->i4_mb_distortion = i4_total_distortion;
    708         ps_proc->u4_mb_type = I8x8;
    709     }
    710 
    711     return ;
    712 }
    713 
    714 
    715 /**
    716 ******************************************************************************
    717 *
    718 * @brief
    719 *  evaluate best intra 4x4 mode (rate distortion opt off)
    720 *
    721 * @par Description
    722 *  This function evaluates all the possible intra 4x4 modes and finds the mode
    723 *  that best represents the macro-block (least distortion) and occupies fewer
    724 *  bits in the bit-stream.
    725 *
    726 * @param[in]    ps_proc_ctxt
    727 *  pointer to proc ctxt
    728 *
    729 * @remarks
    730 *  Ideally the cost of encoding a macroblock is calculated as
    731 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
    732 *  input block and the reconstructed block and rate is the number of bits taken
    733 *  to place the macroblock in the bit-stream. In this routine the rate does not
    734 *  exactly point to the total number of bits it takes, rather it points to header
    735 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
    736 *  and residual bits fall in to texture bits the number of bits taken to encoding
    737 *  mbtype is considered as rate, we compute cost. Further we will approximate
    738 *  the distortion as the deviation b/w input and the predicted block as opposed
    739 *  to input and reconstructed block.
    740 *
    741 *  NOTE: As per the Document JVT-O079, for the whole intra 4x4 macroblock,
    742 *  24*lambda is added to the SAD before comparison with the best SAD for
    743 *  inter prediction. This is an empirical value to prevent using too many intra
    744 *  blocks.
    745 *
    746 * @return      none
    747 *
    748 ******************************************************************************
    749 */
    750 void ih264e_evaluate_intra4x4_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
    751 {
    752     /* Codec Context */
    753     codec_t *ps_codec = ps_proc->ps_codec;
    754 
    755     /* SAD(distortion metric) of an 4x4 block */
    756     WORD32 i4_partition_distortion_least = INT_MAX, i4_total_distortion = 0;
    757 
    758     /* lambda */
    759     UWORD32 u4_lambda = ps_proc->u4_lambda;
    760 
    761     /* cost = distortion + lambda*rate */
    762     WORD32 i4_partition_cost_least, i4_total_cost = (24 + 1) * u4_lambda;
    763 
    764     /* cost due to mbtype */
    765     UWORD32 u4_cost_one_bit = u4_lambda, u4_cost_four_bits = 4 * u4_lambda;
    766 
    767     /* intra mode */
    768     UWORD32 u4_best_intra_4x4_mode = DC_I4x4, u4_estimated_intra_4x4_mode;
    769 
    770     /* neighbor pels for intra prediction */
    771     UWORD8 *pu1_ngbr_pels_i4 = ps_proc->au1_ngbr_pels;
    772 
    773     /* pointer to curr partition */
    774     UWORD8 *pu1_mb_curr;
    775 
    776     /* pointer to prediction macro block */
    777     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
    778 
    779     /* strides */
    780     WORD32 i4_src_strd = ps_proc->i4_src_strd;
    781     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
    782 
    783     /* neighbors left, top, top right, top left */
    784     UWORD8 *pu1_mb_a;
    785     UWORD8 *pu1_mb_b;
    786     UWORD8 *pu1_mb_c;
    787     UWORD8 *pu1_mb_d;
    788 
    789     /* neighbor availability */
    790     WORD32 i4_ngbr_avbl;
    791     block_neighbors_t s_ngbr_avbl;
    792 
    793     /* temp vars */
    794     UWORD32 i, b8, b4, u4_blk_x, u4_blk_y, u4_pix_x, u4_pix_y;
    795 
    796     /* scan order inside 4x4 block */
    797     const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
    798 
    799     /* ngbr sub mb modes */
    800     UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (ps_proc->i4_mb_x << 4);
    801     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
    802 
    803     /* valid intra modes map */
    804     UWORD32 u4_valid_intra_modes;
    805     UWORD16 u2_valid_modes[8] = {4, 262, 4, 262, 141, 399, 141, 511};
    806 
    807     i4_ngbr_avbl = (ps_proc->ps_ngbr_avbl->u1_mb_a) + (ps_proc->ps_ngbr_avbl->u1_mb_d << 1) + (ps_proc->ps_ngbr_avbl->u1_mb_b << 2) + (ps_proc->ps_ngbr_avbl->u1_mb_c << 3);
    808     memcpy(ps_proc->au1_ngbr_avbl_4x4_subblks, gau1_ih264_4x4_ngbr_avbl[i4_ngbr_avbl], 16);
    809 
    810     for (b8 = 0; b8 < 4; b8++)
    811     {
    812         u4_blk_x = (b8 & 0x01) << 3;
    813         u4_blk_y = (b8 >> 1) << 3;
    814         for (b4 = 0; b4 < 4; b4++)
    815         {
    816             u4_pix_x = u4_blk_x + ((b4 & 0x01) << 2);
    817             u4_pix_y = u4_blk_y + ((b4 >> 1) << 2);
    818 
    819             pu1_mb_curr = ps_proc->pu1_src_buf_luma + u4_pix_x + (u4_pix_y * i4_src_strd);
    820             /* when rdopt is off, we use the input as reference for constructing prediction buffer */
    821             /* as opposed to using the recon pels. (open loop intra prediction) */
    822             pu1_mb_a = pu1_mb_curr - 1; /* pointer to left macro block */
    823             pu1_mb_b = pu1_mb_curr - i4_src_strd; /* pointer to top macro block */
    824             pu1_mb_c = pu1_mb_b + 4; /* pointer to top macro block */
    825             pu1_mb_d = pu1_mb_b - 1; /* pointer to top left macro block */
    826 
    827             /* locating neighbors that are available for prediction */
    828             /* TODO : update the neighbor availability information basing on constrained intra pred information */
    829             /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines */
    830             /* basing on neighbors available and hence evade the computation of neighbor availability totally. */
    831 
    832             i4_ngbr_avbl = ps_proc->au1_ngbr_avbl_4x4_subblks[(b8 << 2) + b4];
    833             s_ngbr_avbl.u1_mb_a = (i4_ngbr_avbl & 0x1);
    834             s_ngbr_avbl.u1_mb_d = (i4_ngbr_avbl & 0x2) >> 1;
    835             s_ngbr_avbl.u1_mb_b = (i4_ngbr_avbl & 0x4) >> 2;
    836             s_ngbr_avbl.u1_mb_c = (i4_ngbr_avbl & 0x8) >> 3;
    837             /* set valid intra modes for evaluation */
    838             u4_valid_intra_modes = u2_valid_modes[i4_ngbr_avbl & 0x7];
    839 
    840             /* if top partition is available and top right is not available for intra prediction, then */
    841             /* padd top right samples using top sample and make top right also available */
    842             /* i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) + ((s_ngbr_avbl.u1_mb_b | s_ngbr_avbl.u1_mb_c) << 3); */
    843 
    844             /* gather prediction pels from the neighbors */
    845             if (s_ngbr_avbl.u1_mb_a)
    846             {
    847                 for(i = 0; i < 4; i++)
    848                     pu1_ngbr_pels_i4[4 - 1 -i] = pu1_mb_a[i * i4_src_strd];
    849             }
    850             else
    851             {
    852                 memset(pu1_ngbr_pels_i4, 0, 4);
    853             }
    854 
    855             if (s_ngbr_avbl.u1_mb_b)
    856             {
    857                 memcpy(pu1_ngbr_pels_i4 + 4 + 1, pu1_mb_b, 4);
    858             }
    859             else
    860             {
    861                 memset(pu1_ngbr_pels_i4 + 5, 0, 4);
    862             }
    863 
    864             if (s_ngbr_avbl.u1_mb_d)
    865                 pu1_ngbr_pels_i4[4] = *pu1_mb_d;
    866             else
    867                 pu1_ngbr_pels_i4[4] = 0;
    868 
    869             if (s_ngbr_avbl.u1_mb_c)
    870             {
    871                 memcpy(pu1_ngbr_pels_i4 + 8 + 1, pu1_mb_c, 4);
    872             }
    873             else if (s_ngbr_avbl.u1_mb_b)
    874             {
    875                 memset(pu1_ngbr_pels_i4 + 8 + 1, pu1_ngbr_pels_i4[8], 4);
    876                 s_ngbr_avbl.u1_mb_c = s_ngbr_avbl.u1_mb_b;
    877             }
    878 
    879             i4_partition_cost_least = INT_MAX;
    880 
    881             /* predict the intra 4x4 mode for the current partition (for evaluating cost) */
    882             if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b)
    883             {
    884                 u4_estimated_intra_4x4_mode = DC_I4x4;
    885             }
    886             else
    887             {
    888                 UWORD32 u4_left_intra_4x4_mode = DC_I4x4;
    889                 UWORD32 u4_top_intra_4x4_mode = DC_I4x4;
    890 
    891                 if (u4_pix_x == 0)
    892                 {
    893                     if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I4x4)
    894                     {
    895                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[u1_scan_order[3 + u4_pix_y]];
    896                     }
    897                     else if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I8x8)
    898                     {
    899                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[b8 + 1];
    900                     }
    901                 }
    902                 else
    903                 {
    904                     u4_left_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 1]];
    905                 }
    906 
    907                 if (u4_pix_y == 0)
    908                 {
    909                     if (ps_top_mb_syn_ele->u2_mb_type == I4x4)
    910                     {
    911                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[u1_scan_order[12 + (u4_pix_x >> 2)]];
    912                     }
    913                     else if (ps_top_mb_syn_ele->u2_mb_type == I8x8)
    914                     {
    915                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[b8 + 2];
    916                     }
    917                 }
    918                 else
    919                 {
    920                     u4_top_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 4]];
    921                 }
    922 
    923                 u4_estimated_intra_4x4_mode = MIN(u4_left_intra_4x4_mode, u4_top_intra_4x4_mode);
    924             }
    925 
    926             ps_proc->au1_predicted_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_estimated_intra_4x4_mode;
    927 
    928             /* mode evaluation and prediction */
    929             ps_codec->pf_ih264e_evaluate_intra_4x4_modes(pu1_mb_curr,
    930                                                          pu1_ngbr_pels_i4,
    931                                                          pu1_pred_mb, i4_src_strd,
    932                                                          i4_pred_strd, i4_ngbr_avbl,
    933                                                          &u4_best_intra_4x4_mode,
    934                                                          &i4_partition_cost_least,
    935                                                          u4_valid_intra_modes,
    936                                                          u4_lambda,
    937                                                          u4_estimated_intra_4x4_mode);
    938 
    939 
    940             i4_partition_distortion_least = i4_partition_cost_least - ((u4_estimated_intra_4x4_mode == u4_best_intra_4x4_mode) ? u4_cost_one_bit : u4_cost_four_bits);
    941 
    942             DEBUG("%d partition cost, %d intra mode\n", i4_partition_cost_least, u4_best_intra_4x4_mode);
    943             /* macroblock distortion */
    944             i4_total_distortion += i4_partition_distortion_least;
    945             i4_total_cost += i4_partition_cost_least;
    946             /* mb partition mode */
    947             ps_proc->au1_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_best_intra_4x4_mode;
    948         }
    949     }
    950 
    951     /* update the type of the mb if necessary */
    952     if (i4_total_cost < ps_proc->i4_mb_cost)
    953     {
    954         ps_proc->i4_mb_cost = i4_total_cost;
    955         ps_proc->i4_mb_distortion = i4_total_distortion;
    956         ps_proc->u4_mb_type = I4x4;
    957     }
    958 
    959     return ;
    960 }
    961 
    962 /**
    963 ******************************************************************************
    964 *
    965 * @brief evaluate best intra 4x4 mode (rate distortion opt on)
    966 *
    967 * @par Description
    968 *  This function evaluates all the possible intra 4x4 modes and finds the mode
    969 *  that best represents the macro-block (least distortion) and occupies fewer
    970 *  bits in the bit-stream.
    971 *
    972 * @param[in]    ps_proc_ctxt
    973 *  pointer to proc ctxt
    974 *
    975 * @remarks
    976 *  Ideally the cost of encoding a macroblock is calculated as
    977 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
    978 *  input block and the reconstructed block and rate is the number of bits taken
    979 *  to place the macroblock in the bit-stream. In this routine the rate does not
    980 *  exactly point to the total number of bits it takes, rather it points to header
    981 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
    982 *  and residual bits fall in to texture bits the number of bits taken to encoding
    983 *  mbtype is considered as rate, we compute cost. Further we will approximate
    984 *  the distortion as the deviation b/w input and the predicted block as opposed
    985 *  to input and reconstructed block.
    986 *
    987 *  NOTE: As per the Document JVT-O079, for the whole intra 4x4 macroblock,
    988 *  24*lambda is added to the SAD before comparison with the best SAD for
    989 *  inter prediction. This is an empirical value to prevent using too many intra
    990 *  blocks.
    991 *
    992 * @return      none
    993 *
    994 ******************************************************************************
    995 */
    996 void ih264e_evaluate_intra4x4_modes_for_least_cost_rdopton(process_ctxt_t *ps_proc)
    997 {
    998     /* Codec Context */
    999     codec_t *ps_codec = ps_proc->ps_codec;
   1000 
   1001     /* SAD(distortion metric) of an 4x4 block */
   1002     WORD32 i4_partition_distortion_least = INT_MAX, i4_total_distortion = 0;
   1003 
   1004     /* lambda */
   1005     UWORD32 u4_lambda = ps_proc->u4_lambda;
   1006 
   1007     /* cost = distortion + lambda*rate */
   1008     WORD32 i4_partition_cost_least, i4_total_cost = (24 + 1) * u4_lambda;
   1009 
   1010     /* cost due to mbtype */
   1011     UWORD32 u4_cost_one_bit = u4_lambda, u4_cost_four_bits = 4 * u4_lambda;
   1012 
   1013     /* intra mode */
   1014     UWORD32 u4_best_intra_4x4_mode = DC_I4x4, u4_estimated_intra_4x4_mode;
   1015 
   1016     /* neighbor pels for intra prediction */
   1017     UWORD8 *pu1_ngbr_pels_i4 = ps_proc->au1_ngbr_pels;
   1018 
   1019     /* pointer to curr partition */
   1020     UWORD8 *pu1_mb_curr;
   1021     UWORD8 *pu1_mb_ref_left, *pu1_mb_ref_top;
   1022     UWORD8 *pu1_ref_mb_intra_4x4;
   1023 
   1024     /* pointer to residual macro block */
   1025     WORD16 *pi2_res_mb = ps_proc->pi2_res_buf_intra_4x4;
   1026 
   1027     /* pointer to prediction macro block */
   1028     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
   1029 
   1030     /* strides */
   1031     WORD32 i4_src_strd = ps_proc->i4_src_strd;
   1032     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
   1033     WORD32 i4_ref_strd_left, i4_ref_strd_top;
   1034 
   1035     /* neighbors left, top, top right, top left */
   1036     UWORD8 *pu1_mb_a;
   1037     UWORD8 *pu1_mb_b;
   1038     UWORD8 *pu1_mb_c;
   1039     UWORD8 *pu1_mb_d;
   1040 
   1041     /* number of non zero coeffs*/
   1042     UWORD8  *pu1_nnz = (UWORD8 *)ps_proc->au4_nnz_intra_4x4;
   1043 
   1044     /* quantization parameters */
   1045     quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0];
   1046 
   1047     /* neighbor availability */
   1048     WORD32 i4_ngbr_avbl;
   1049     block_neighbors_t s_ngbr_avbl;
   1050 
   1051     /* temp vars */
   1052     UWORD32 i, b8, b4, u4_blk_x, u4_blk_y, u4_pix_x, u4_pix_y;
   1053 
   1054     /* scan order inside 4x4 block */
   1055     const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
   1056 
   1057     /* ngbr sub mb modes */
   1058     UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (ps_proc->i4_mb_x << 4);
   1059     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
   1060 
   1061     /* valid intra modes map */
   1062     UWORD32 u4_valid_intra_modes;
   1063     UWORD16 u2_valid_modes[8] = {4, 262, 4, 262, 141, 399, 141, 511};
   1064 
   1065     /* Dummy variable for 4x4 trans function */
   1066     WORD16 i2_dc_dummy;
   1067 
   1068     /* compute ngbr availability for sub blks */
   1069     i4_ngbr_avbl = (ps_proc->ps_ngbr_avbl->u1_mb_a) + (ps_proc->ps_ngbr_avbl->u1_mb_d << 1) + (ps_proc->ps_ngbr_avbl->u1_mb_b << 2) + (ps_proc->ps_ngbr_avbl->u1_mb_c << 3);
   1070     memcpy(ps_proc->au1_ngbr_avbl_4x4_subblks, gau1_ih264_4x4_ngbr_avbl[i4_ngbr_avbl], 16);
   1071 
   1072     for(b8 = 0; b8 < 4; b8++)
   1073     {
   1074         u4_blk_x = (b8 & 0x01) << 3;
   1075         u4_blk_y = (b8 >> 1) << 3;
   1076         for(b4 = 0; b4 < 4; b4++, pu1_nnz++, pi2_res_mb += MB_SIZE)
   1077         {
   1078             u4_pix_x = u4_blk_x + ((b4 & 0x01) << 2);
   1079             u4_pix_y = u4_blk_y + ((b4 >> 1) << 2);
   1080 
   1081             pu1_ref_mb_intra_4x4 = ps_proc->pu1_ref_mb_intra_4x4 + u4_pix_x + (u4_pix_y * i4_pred_strd);
   1082             pu1_mb_curr = ps_proc->pu1_src_buf_luma + u4_pix_x + (u4_pix_y * i4_src_strd);
   1083             if (u4_pix_x == 0)
   1084             {
   1085                 i4_ref_strd_left = ps_proc->i4_rec_strd;
   1086                 pu1_mb_ref_left = ps_proc->pu1_rec_buf_luma + u4_pix_x + (u4_pix_y * i4_ref_strd_left);
   1087             }
   1088             else
   1089             {
   1090                 i4_ref_strd_left = i4_pred_strd;
   1091                 pu1_mb_ref_left = pu1_ref_mb_intra_4x4;
   1092             }
   1093             if (u4_pix_y == 0)
   1094             {
   1095                 i4_ref_strd_top = ps_proc->i4_rec_strd;
   1096                 pu1_mb_ref_top = ps_proc->pu1_rec_buf_luma + u4_pix_x + (u4_pix_y * i4_ref_strd_top);
   1097             }
   1098             else
   1099             {
   1100                 i4_ref_strd_top = i4_pred_strd;
   1101                 pu1_mb_ref_top = pu1_ref_mb_intra_4x4;
   1102             }
   1103 
   1104             pu1_mb_a = pu1_mb_ref_left - 1; /* pointer to left macro block */
   1105             pu1_mb_b = pu1_mb_ref_top - i4_ref_strd_top; /* pointer to top macro block */
   1106             pu1_mb_c = pu1_mb_b + 4; /* pointer to top right macro block */
   1107             if (u4_pix_y == 0)
   1108                 pu1_mb_d = pu1_mb_b - 1;
   1109             else
   1110                 pu1_mb_d = pu1_mb_a - i4_ref_strd_left; /* pointer to top left macro block */
   1111 
   1112             /* locating neighbors that are available for prediction */
   1113             /* TODO : update the neighbor availability information basing on constrained intra pred information */
   1114             /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines */
   1115             /* basing on neighbors available and hence evade the computation of neighbor availability totally. */
   1116 
   1117             i4_ngbr_avbl = ps_proc->au1_ngbr_avbl_4x4_subblks[(b8 << 2) + b4];
   1118             s_ngbr_avbl.u1_mb_a = (i4_ngbr_avbl & 0x1);
   1119             s_ngbr_avbl.u1_mb_d = (i4_ngbr_avbl & 0x2) >> 1;
   1120             s_ngbr_avbl.u1_mb_b = (i4_ngbr_avbl & 0x4) >> 2;
   1121             s_ngbr_avbl.u1_mb_c = (i4_ngbr_avbl & 0x8) >> 3;
   1122             /* set valid intra modes for evaluation */
   1123             u4_valid_intra_modes = u2_valid_modes[i4_ngbr_avbl & 0x7];
   1124 
   1125             /* if top partition is available and top right is not available for intra prediction, then */
   1126             /* padd top right samples using top sample and make top right also available */
   1127             /* i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) + ((s_ngbr_avbl.u1_mb_b | s_ngbr_avbl.u1_mb_c) << 3); */
   1128 
   1129             /* gather prediction pels from the neighbors */
   1130             if (s_ngbr_avbl.u1_mb_a)
   1131             {
   1132                 for(i = 0; i < 4; i++)
   1133                     pu1_ngbr_pels_i4[4 - 1 -i] = pu1_mb_a[i * i4_ref_strd_left];
   1134             }
   1135             else
   1136             {
   1137                 memset(pu1_ngbr_pels_i4,0,4);
   1138             }
   1139             if(s_ngbr_avbl.u1_mb_b)
   1140             {
   1141                 memcpy(pu1_ngbr_pels_i4 + 4 + 1, pu1_mb_b, 4);
   1142             }
   1143             else
   1144             {
   1145                 memset(pu1_ngbr_pels_i4 + 4 + 1, 0, 4);
   1146             }
   1147             if (s_ngbr_avbl.u1_mb_d)
   1148                 pu1_ngbr_pels_i4[4] = *pu1_mb_d;
   1149             else
   1150                 pu1_ngbr_pels_i4[4] = 0;
   1151             if (s_ngbr_avbl.u1_mb_c)
   1152             {
   1153                 memcpy(pu1_ngbr_pels_i4 + 8 + 1, pu1_mb_c, 4);
   1154             }
   1155             else if (s_ngbr_avbl.u1_mb_b)
   1156             {
   1157                 memset(pu1_ngbr_pels_i4 + 8 + 1, pu1_ngbr_pels_i4[8], 4);
   1158                 s_ngbr_avbl.u1_mb_c = s_ngbr_avbl.u1_mb_b;
   1159             }
   1160 
   1161             i4_partition_cost_least = INT_MAX;
   1162 
   1163             /* predict the intra 4x4 mode for the current partition (for evaluating cost) */
   1164             if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b)
   1165             {
   1166                 u4_estimated_intra_4x4_mode = DC_I4x4;
   1167             }
   1168             else
   1169             {
   1170                 UWORD32 u4_left_intra_4x4_mode = DC_I4x4;
   1171                 UWORD32 u4_top_intra_4x4_mode = DC_I4x4;
   1172 
   1173                 if (u4_pix_x == 0)
   1174                 {
   1175                     if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I4x4)
   1176                     {
   1177                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[u1_scan_order[3 + u4_pix_y]];
   1178                     }
   1179                     else if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I8x8)
   1180                     {
   1181                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[b8 + 1];
   1182                     }
   1183                 }
   1184                 else
   1185                 {
   1186                     u4_left_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 1]];
   1187                 }
   1188 
   1189                 if (u4_pix_y == 0)
   1190                 {
   1191                     if (ps_top_mb_syn_ele->u2_mb_type == I4x4)
   1192                     {
   1193                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[u1_scan_order[12 + (u4_pix_x >> 2)]];
   1194                     }
   1195                     else if (ps_top_mb_syn_ele->u2_mb_type == I8x8)
   1196                     {
   1197                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[b8 + 2];
   1198                     }
   1199                 }
   1200                 else
   1201                 {
   1202                     u4_top_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 4]];
   1203                 }
   1204 
   1205                 u4_estimated_intra_4x4_mode = MIN(u4_left_intra_4x4_mode, u4_top_intra_4x4_mode);
   1206             }
   1207 
   1208             ps_proc->au1_predicted_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_estimated_intra_4x4_mode;
   1209 
   1210             /*mode evaluation and prediction*/
   1211             ps_codec->pf_ih264e_evaluate_intra_4x4_modes(pu1_mb_curr,
   1212                                                          pu1_ngbr_pels_i4,
   1213                                                          pu1_pred_mb, i4_src_strd,
   1214                                                          i4_pred_strd, i4_ngbr_avbl,
   1215                                                          &u4_best_intra_4x4_mode,
   1216                                                          &i4_partition_cost_least,
   1217                                                          u4_valid_intra_modes,
   1218                                                          u4_lambda,
   1219                                                          u4_estimated_intra_4x4_mode);
   1220 
   1221 
   1222             i4_partition_distortion_least = i4_partition_cost_least - ((u4_estimated_intra_4x4_mode == u4_best_intra_4x4_mode)?u4_cost_one_bit:u4_cost_four_bits);
   1223 
   1224             DEBUG("%d partition cost, %d intra mode\n", i4_partition_cost_least, u4_best_intra_4x4_mode);
   1225 
   1226             /* macroblock distortion */
   1227             i4_total_distortion += i4_partition_distortion_least;
   1228             i4_total_cost += i4_partition_cost_least;
   1229 
   1230             /* mb partition mode */
   1231             ps_proc->au1_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_best_intra_4x4_mode;
   1232 
   1233 
   1234             /********************************************************/
   1235             /*  error estimation,                                   */
   1236             /*  transform                                           */
   1237             /*  quantization                                        */
   1238             /********************************************************/
   1239             ps_codec->pf_resi_trans_quant_4x4(pu1_mb_curr, pu1_pred_mb,
   1240                                               pi2_res_mb, i4_src_strd,
   1241                                               i4_pred_strd,
   1242                                               /* No op stride, this implies a buff of lenght 1x16 */
   1243                                               ps_qp_params->pu2_scale_mat,
   1244                                               ps_qp_params->pu2_thres_mat,
   1245                                               ps_qp_params->u1_qbits,
   1246                                               ps_qp_params->u4_dead_zone,
   1247                                               pu1_nnz, &i2_dc_dummy);
   1248 
   1249             /********************************************************/
   1250             /*  ierror estimation,                                  */
   1251             /*  itransform                                          */
   1252             /*  iquantization                                       */
   1253             /********************************************************/
   1254             ps_codec->pf_iquant_itrans_recon_4x4(pi2_res_mb, pu1_pred_mb,
   1255                                                  pu1_ref_mb_intra_4x4,
   1256                                                  i4_pred_strd, i4_pred_strd,
   1257                                                  ps_qp_params->pu2_iscale_mat,
   1258                                                  ps_qp_params->pu2_weigh_mat,
   1259                                                  ps_qp_params->u1_qp_div,
   1260                                                  ps_proc->pv_scratch_buff, 0,
   1261                                                  NULL);
   1262         }
   1263     }
   1264 
   1265     /* update the type of the mb if necessary */
   1266     if (i4_total_cost < ps_proc->i4_mb_cost)
   1267     {
   1268         ps_proc->i4_mb_cost = i4_total_cost;
   1269         ps_proc->i4_mb_distortion = i4_total_distortion;
   1270         ps_proc->u4_mb_type = I4x4;
   1271     }
   1272 
   1273     return ;
   1274 }
   1275 
   1276 /**
   1277 ******************************************************************************
   1278 *
   1279 * @brief
   1280 *  evaluate best chroma intra 8x8 mode (rate distortion opt off)
   1281 *
   1282 * @par Description
   1283 *  This function evaluates all the possible chroma intra 8x8 modes and finds
   1284 *  the mode that best represents the macroblock (least distortion) and occupies
   1285 *  fewer bits in the bitstream.
   1286 *
   1287 * @param[in] ps_proc_ctxt
   1288 *  pointer to macroblock context (handle)
   1289 *
   1290 * @remarks
   1291 *  For chroma best intra pred mode is calculated based only on SAD
   1292 *
   1293 * @returns none
   1294 *
   1295 ******************************************************************************
   1296 */
   1297 
   1298 void ih264e_evaluate_chroma_intra8x8_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
   1299 {
   1300     /* Codec Context */
   1301     codec_t *ps_codec = ps_proc->ps_codec;
   1302 
   1303     /* SAD(distortion metric) of an 8x8 block */
   1304     WORD32 i4_mb_distortion, i4_chroma_mb_distortion;
   1305 
   1306     /* intra mode */
   1307     UWORD32  u4_best_chroma_intra_8x8_mode = DC_CH_I8x8;
   1308 
   1309     /* neighbor pels for intra prediction */
   1310     UWORD8 *pu1_ngbr_pels_c_i8x8 = ps_proc->au1_ngbr_pels;
   1311 
   1312     /* pointer to curr macro block */
   1313     UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_chroma;
   1314     UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_chroma;
   1315 
   1316     /* pointer to prediction macro block */
   1317     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb_intra_chroma;
   1318     UWORD8 *pu1_pred_mb_plane = ps_proc->pu1_pred_mb_intra_chroma_plane;
   1319 
   1320     /* strides */
   1321     WORD32 i4_src_strd_c = ps_proc->i4_src_chroma_strd;
   1322     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
   1323     WORD32 i4_rec_strd_c = ps_proc->i4_rec_strd;
   1324 
   1325     /* neighbors left, top, top left */
   1326     UWORD8 *pu1_mb_a = pu1_ref_mb - 2;
   1327     UWORD8 *pu1_mb_b = pu1_ref_mb - i4_rec_strd_c;
   1328     UWORD8 *pu1_mb_d = pu1_mb_b - 2;
   1329 
   1330     /* neighbor availability */
   1331     const UWORD8  u1_valid_intra_modes[8] = {1, 3, 9, 11, 5, 7, 13, 15,};
   1332     WORD32 i4_ngbr_avbl;
   1333 
   1334     /* valid intra modes map */
   1335     UWORD32 u4_valid_intra_modes;
   1336 
   1337     /* temp var */
   1338     UWORD8 i;
   1339 
   1340     /* locating neighbors that are available for prediction */
   1341     /* TODO : update the neighbor availability information basing on constrained intra pred information */
   1342     /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines
   1343      * basing on neighbors available and hence evade the computation of neighbor availability totally. */
   1344     /* i4_ngbr_avbl = blk_a * LEFT_MB_AVAILABLE_MASK + blk_b * TOP_MB_AVAILABLE_MASK + blk_d * TOP_LEFT_MB_AVAILABLE_MASK */
   1345     i4_ngbr_avbl = (ps_proc->ps_ngbr_avbl->u1_mb_a) + (ps_proc->ps_ngbr_avbl->u1_mb_b << 2) + (ps_proc->ps_ngbr_avbl->u1_mb_d << 1);
   1346     ps_proc->i4_chroma_neighbor_avail_8x8_mb = i4_ngbr_avbl;
   1347 
   1348     /* gather prediction pels from the neighbors */
   1349     /* left pels */
   1350     if (ps_proc->ps_ngbr_avbl->u1_mb_a)
   1351     {
   1352         for (i = 0; i < 16; i += 2)
   1353         {
   1354             pu1_ngbr_pels_c_i8x8[16 - 2 - i] = pu1_mb_a[(i / 2) * i4_rec_strd_c];
   1355             pu1_ngbr_pels_c_i8x8[16 - 1 - i] = pu1_mb_a[(i / 2) * i4_rec_strd_c + 1];
   1356         }
   1357     }
   1358     else
   1359     {
   1360         ps_codec->pf_mem_set_mul8(pu1_ngbr_pels_c_i8x8, 0, MB_SIZE);
   1361     }
   1362 
   1363     /* top pels */
   1364     if (ps_proc->ps_ngbr_avbl->u1_mb_b)
   1365     {
   1366         ps_codec->pf_mem_cpy_mul8(&pu1_ngbr_pels_c_i8x8[18], pu1_mb_b, 16);
   1367     }
   1368     else
   1369     {
   1370         ps_codec->pf_mem_set_mul8((pu1_ngbr_pels_c_i8x8 + 18), 0, MB_SIZE);
   1371     }
   1372 
   1373     /* top left pels */
   1374     if (ps_proc->ps_ngbr_avbl->u1_mb_d)
   1375     {
   1376         pu1_ngbr_pels_c_i8x8[16] = *pu1_mb_d;
   1377         pu1_ngbr_pels_c_i8x8[17] = *(pu1_mb_d + 1);
   1378     }
   1379 
   1380     u4_valid_intra_modes = u1_valid_intra_modes[i4_ngbr_avbl];
   1381 
   1382     if (ps_codec->s_cfg.u4_enc_speed_preset == IVE_FAST)
   1383         u4_valid_intra_modes &= ~(1 << PLANE_CH_I8x8);
   1384 
   1385     i4_chroma_mb_distortion = INT_MAX;
   1386 
   1387     /* perform intra mode chroma  8x8 evaluation */
   1388     /* intra prediction */
   1389     ps_codec->pf_ih264e_evaluate_intra_chroma_modes(pu1_curr_mb,
   1390                                                     pu1_ngbr_pels_c_i8x8,
   1391                                                     pu1_pred_mb,
   1392                                                     i4_src_strd_c,
   1393                                                     i4_pred_strd,
   1394                                                     i4_ngbr_avbl,
   1395                                                     &u4_best_chroma_intra_8x8_mode,
   1396                                                     &i4_chroma_mb_distortion,
   1397                                                     u4_valid_intra_modes);
   1398 
   1399     if (u4_valid_intra_modes & 8)/* if Chroma PLANE is valid*/
   1400     {
   1401         (ps_codec->apf_intra_pred_c)[PLANE_CH_I8x8](pu1_ngbr_pels_c_i8x8, pu1_pred_mb_plane, 0, i4_pred_strd, i4_ngbr_avbl);
   1402 
   1403         /* evaluate distortion(sad) */
   1404         ps_codec->pf_compute_sad_16x8(pu1_curr_mb, pu1_pred_mb_plane, i4_src_strd_c, i4_pred_strd, i4_chroma_mb_distortion, &i4_mb_distortion);
   1405 
   1406         /* update the least distortion information if necessary */
   1407         if(i4_mb_distortion < i4_chroma_mb_distortion)
   1408         {
   1409             i4_chroma_mb_distortion = i4_mb_distortion;
   1410             u4_best_chroma_intra_8x8_mode = PLANE_CH_I8x8;
   1411         }
   1412     }
   1413 
   1414     DEBUG("%d partition cost, %d intra mode\n", i4_chroma_mb_distortion, u4_best_chroma_intra_8x8_mode);
   1415 
   1416     ps_proc->u1_c_i8_mode = u4_best_chroma_intra_8x8_mode;
   1417 
   1418     return ;
   1419 }
   1420 
   1421 
   1422 /**
   1423 ******************************************************************************
   1424 *
   1425 * @brief
   1426 *  Evaluate best intra 16x16 mode (among VERT, HORZ and DC) and do the
   1427 *  prediction.
   1428 *
   1429 * @par Description
   1430 *  This function evaluates first three 16x16 modes and compute corresponding sad
   1431 *  and return the buffer predicted with best mode.
   1432 *
   1433 * @param[in] pu1_src
   1434 *  UWORD8 pointer to the source
   1435 *
   1436 * @param[in] pu1_ngbr_pels_i16
   1437 *  UWORD8 pointer to neighbouring pels
   1438 *
   1439 * @param[out] pu1_dst
   1440 *  UWORD8 pointer to the destination
   1441 *
   1442 * @param[in] src_strd
   1443 *  integer source stride
   1444 *
   1445 * @param[in] dst_strd
   1446 *  integer destination stride
   1447 *
   1448 * @param[in] u4_n_avblty
   1449 *  availability of neighbouring pixels
   1450 *
   1451 * @param[in] u4_intra_mode
   1452 *  Pointer to the variable in which best mode is returned
   1453 *
   1454 * @param[in] pu4_sadmin
   1455 *  Pointer to the variable in which minimum sad is returned
   1456 *
   1457 * @param[in] u4_valid_intra_modes
   1458 *  Says what all modes are valid
   1459 *
   1460 * @returns      none
   1461 *
   1462 ******************************************************************************
   1463 */
   1464 void ih264e_evaluate_intra16x16_modes(UWORD8 *pu1_src,
   1465                                       UWORD8 *pu1_ngbr_pels_i16,
   1466                                       UWORD8 *pu1_dst,
   1467                                       UWORD32 src_strd,
   1468                                       UWORD32 dst_strd,
   1469                                       WORD32 u4_n_avblty,
   1470                                       UWORD32 *u4_intra_mode,
   1471                                       WORD32 *pu4_sadmin,
   1472                                       UWORD32 u4_valid_intra_modes)
   1473 {
   1474     UWORD8 *pu1_neighbour;
   1475     UWORD8 *pu1_src_temp = pu1_src;
   1476     UWORD8 left = 0, top = 0;
   1477     WORD32 u4_dcval = 0;
   1478     WORD32 i, j;
   1479     WORD32 i4_sad_vert = INT_MAX, i4_sad_horz = INT_MAX, i4_sad_dc = INT_MAX,
   1480                     i4_min_sad = INT_MAX;
   1481     UWORD8 val;
   1482 
   1483     left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
   1484     top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;
   1485 
   1486     /* left available */
   1487     if (left)
   1488     {
   1489         i4_sad_horz = 0;
   1490 
   1491         for (i = 0; i < 16; i++)
   1492         {
   1493             val = pu1_ngbr_pels_i16[15 - i];
   1494 
   1495             u4_dcval += val;
   1496 
   1497             for (j = 0; j < 16; j++)
   1498             {
   1499                 i4_sad_horz += ABS(val - pu1_src_temp[j]);
   1500             }
   1501 
   1502             pu1_src_temp += src_strd;
   1503         }
   1504         u4_dcval += 8;
   1505     }
   1506 
   1507     pu1_src_temp = pu1_src;
   1508     /* top available */
   1509     if (top)
   1510     {
   1511         i4_sad_vert = 0;
   1512 
   1513         for (i = 0; i < 16; i++)
   1514         {
   1515             u4_dcval += pu1_ngbr_pels_i16[17 + i];
   1516 
   1517             for (j = 0; j < 16; j++)
   1518             {
   1519                 i4_sad_vert += ABS(pu1_ngbr_pels_i16[17 + j] - pu1_src_temp[j]);
   1520             }
   1521             pu1_src_temp += src_strd;
   1522 
   1523         }
   1524         u4_dcval += 8;
   1525     }
   1526 
   1527     u4_dcval = (u4_dcval) >> (3 + left + top);
   1528 
   1529     pu1_src_temp = pu1_src;
   1530 
   1531     /* none available */
   1532     u4_dcval += (left == 0) * (top == 0) * 128;
   1533 
   1534     i4_sad_dc = 0;
   1535 
   1536     for (i = 0; i < 16; i++)
   1537     {
   1538         for (j = 0; j < 16; j++)
   1539         {
   1540             i4_sad_dc += ABS(u4_dcval - pu1_src_temp[j]);
   1541         }
   1542         pu1_src_temp += src_strd;
   1543     }
   1544 
   1545     if ((u4_valid_intra_modes & 04) == 0)/* If DC is disabled */
   1546         i4_sad_dc = INT_MAX;
   1547 
   1548     if ((u4_valid_intra_modes & 01) == 0)/* If VERT is disabled */
   1549         i4_sad_vert = INT_MAX;
   1550 
   1551     if ((u4_valid_intra_modes & 02) == 0)/* If HORZ is disabled */
   1552         i4_sad_horz = INT_MAX;
   1553 
   1554     i4_min_sad = MIN3(i4_sad_horz, i4_sad_dc, i4_sad_vert);
   1555 
   1556     /* Finding Minimum sad and doing corresponding prediction */
   1557     if (i4_min_sad < *pu4_sadmin)
   1558     {
   1559         *pu4_sadmin = i4_min_sad;
   1560         if (i4_min_sad == i4_sad_vert)
   1561         {
   1562             *u4_intra_mode = VERT_I16x16;
   1563             pu1_neighbour = pu1_ngbr_pels_i16 + 17;
   1564             for (j = 0; j < 16; j++)
   1565             {
   1566                 memcpy(pu1_dst, pu1_neighbour, MB_SIZE);
   1567                 pu1_dst += dst_strd;
   1568             }
   1569         }
   1570         else if (i4_min_sad == i4_sad_horz)
   1571         {
   1572             *u4_intra_mode = HORZ_I16x16;
   1573             for (j = 0; j < 16; j++)
   1574             {
   1575                 val = pu1_ngbr_pels_i16[15 - j];
   1576                 memset(pu1_dst, val, MB_SIZE);
   1577                 pu1_dst += dst_strd;
   1578             }
   1579         }
   1580         else
   1581         {
   1582             *u4_intra_mode = DC_I16x16;
   1583             for (j = 0; j < 16; j++)
   1584             {
   1585                 memset(pu1_dst, u4_dcval, MB_SIZE);
   1586                 pu1_dst += dst_strd;
   1587             }
   1588         }
   1589     }
   1590     return;
   1591 }
   1592 
   1593 /**
   1594 ******************************************************************************
   1595 *
   1596 * @brief
   1597 *  Evaluate best intra 4x4 mode and perform prediction.
   1598 *
   1599 * @par Description
   1600 *  This function evaluates  4x4 modes and compute corresponding sad
   1601 *  and return the buffer predicted with best mode.
   1602 *
   1603 * @param[in] pu1_src
   1604 *  UWORD8 pointer to the source
   1605 *
   1606 * @param[in] pu1_ngbr_pels
   1607 *  UWORD8 pointer to neighbouring pels
   1608 *
   1609 * @param[out] pu1_dst
   1610 *  UWORD8 pointer to the destination
   1611 *
   1612 * @param[in] src_strd
   1613 *  integer source stride
   1614 *
   1615 * @param[in] dst_strd
   1616 *  integer destination stride
   1617 *
   1618 * @param[in] u4_n_avblty
   1619 *  availability of neighbouring pixels
   1620 *
   1621 * @param[in] u4_intra_mode
   1622 *  Pointer to the variable in which best mode is returned
   1623 *
   1624 * @param[in] pu4_sadmin
   1625 *  Pointer to the variable in which minimum cost is returned
   1626 *
   1627 * @param[in] u4_valid_intra_modes
   1628 *  Says what all modes are valid
   1629 *
   1630 * @param[in] u4_lambda
   1631 *  Lamda value for computing cost from SAD
   1632 *
   1633 * @param[in] u4_predictd_mode
   1634 *  Predicted mode for cost computation
   1635 *
   1636 * @returns      none
   1637 *
   1638 ******************************************************************************
   1639 */
   1640 void ih264e_evaluate_intra_4x4_modes(UWORD8 *pu1_src,
   1641                                      UWORD8 *pu1_ngbr_pels,
   1642                                      UWORD8 *pu1_dst,
   1643                                      UWORD32 src_strd,
   1644                                      UWORD32 dst_strd,
   1645                                      WORD32 u4_n_avblty,
   1646                                      UWORD32 *u4_intra_mode,
   1647                                      WORD32 *pu4_sadmin,
   1648                                      UWORD32 u4_valid_intra_modes,
   1649                                      UWORD32  u4_lambda,
   1650                                      UWORD32 u4_predictd_mode)
   1651 {
   1652     UWORD8 *pu1_src_temp = pu1_src;
   1653     UWORD8 *pu1_pred = pu1_ngbr_pels;
   1654     UWORD8 left = 0, top = 0;
   1655     UWORD8 u1_pred_val = 0;
   1656     UWORD8 u1_pred_vals[4] = {0};
   1657     UWORD8 *pu1_pred_val = NULL;
   1658     /* To store FILT121 operated values*/
   1659     UWORD8 u1_pred_vals_diag_121[15] = {0};
   1660     /* To store FILT11 operated values*/
   1661     UWORD8 u1_pred_vals_diag_11[15] = {0};
   1662     UWORD8 u1_pred_vals_vert_r[8] = {0};
   1663     UWORD8 u1_pred_vals_horz_d[10] = {0};
   1664     UWORD8 u1_pred_vals_horz_u[10] = {0};
   1665     WORD32 u4_dcval = 0;
   1666     WORD32 i4_sad[MAX_I4x4] = {INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX,
   1667                                INT_MAX, INT_MAX, INT_MAX, INT_MAX};
   1668 
   1669     WORD32 i4_cost[MAX_I4x4] = {INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX,
   1670                                 INT_MAX, INT_MAX, INT_MAX, INT_MAX};
   1671     WORD32 i, i4_min_cost = INT_MAX;
   1672 
   1673     left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
   1674     top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;
   1675 
   1676     /* Computing SAD */
   1677 
   1678     /* VERT mode valid */
   1679     if (u4_valid_intra_modes & 1)
   1680     {
   1681         pu1_pred = pu1_ngbr_pels + 5;
   1682         i4_sad[VERT_I4x4] = 0;
   1683         i4_cost[VERT_I4x4] = 0;
   1684 
   1685         USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]);
   1686         pu1_src_temp += src_strd;
   1687         USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]);
   1688         pu1_src_temp += src_strd;
   1689         USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]);
   1690         pu1_src_temp += src_strd;
   1691         USADA8(pu1_src_temp, pu1_pred, i4_sad[VERT_I4x4]);
   1692 
   1693         i4_cost[VERT_I4x4] = i4_sad[VERT_I4x4] + ((u4_predictd_mode == VERT_I4x4) ?
   1694                                         u4_lambda : 4 * u4_lambda);
   1695     }
   1696 
   1697     /* HORZ mode valid */
   1698     if (u4_valid_intra_modes & 2)
   1699     {
   1700         i4_sad[HORZ_I4x4] = 0;
   1701         i4_cost[HORZ_I4x4] =0;
   1702         pu1_src_temp = pu1_src;
   1703 
   1704         u1_pred_val = pu1_ngbr_pels[3];
   1705 
   1706         i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val)
   1707                         + ABS(pu1_src_temp[1] - u1_pred_val)
   1708                         + ABS(pu1_src_temp[2] - u1_pred_val)
   1709                         + ABS(pu1_src_temp[3] - u1_pred_val);
   1710         pu1_src_temp += src_strd;
   1711 
   1712         u1_pred_val = pu1_ngbr_pels[2];
   1713 
   1714         i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val)
   1715                         + ABS(pu1_src_temp[1] - u1_pred_val)
   1716                         + ABS(pu1_src_temp[2] - u1_pred_val)
   1717                         + ABS(pu1_src_temp[3] - u1_pred_val);
   1718         pu1_src_temp += src_strd;
   1719 
   1720         u1_pred_val = pu1_ngbr_pels[1];
   1721 
   1722         i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val)
   1723                         + ABS(pu1_src_temp[1] - u1_pred_val)
   1724                         + ABS(pu1_src_temp[2] - u1_pred_val)
   1725                         + ABS(pu1_src_temp[3] - u1_pred_val);
   1726         pu1_src_temp += src_strd;
   1727 
   1728         u1_pred_val = pu1_ngbr_pels[0];
   1729 
   1730         i4_sad[HORZ_I4x4] += ABS(pu1_src_temp[0] - u1_pred_val)
   1731                         + ABS(pu1_src_temp[1] - u1_pred_val)
   1732                         + ABS(pu1_src_temp[2] - u1_pred_val)
   1733                         + ABS(pu1_src_temp[3] - u1_pred_val);
   1734 
   1735         i4_cost[HORZ_I4x4] = i4_sad[HORZ_I4x4] + ((u4_predictd_mode == HORZ_I4x4) ?
   1736                                         u4_lambda : 4 * u4_lambda);
   1737     }
   1738 
   1739     /* DC mode valid */
   1740     if (u4_valid_intra_modes & 4)
   1741     {
   1742         i4_sad[DC_I4x4] = 0;
   1743         i4_cost[DC_I4x4] = 0;
   1744         pu1_src_temp = pu1_src;
   1745 
   1746         if (left)
   1747             u4_dcval = pu1_ngbr_pels[0] + pu1_ngbr_pels[1] + pu1_ngbr_pels[2]
   1748                             + pu1_ngbr_pels[3] + 2;
   1749         if (top)
   1750             u4_dcval += pu1_ngbr_pels[5] + pu1_ngbr_pels[6] + pu1_ngbr_pels[7]
   1751                             + pu1_ngbr_pels[8] + 2;
   1752 
   1753         u4_dcval = (u4_dcval) ? (u4_dcval >> (1 + left + top)) : 128;
   1754 
   1755         /* none available */
   1756         memset(u1_pred_vals, u4_dcval, 4);
   1757         USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]);
   1758         pu1_src_temp += src_strd;
   1759         USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]);
   1760         pu1_src_temp += src_strd;
   1761         USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]);
   1762         pu1_src_temp += src_strd;
   1763         USADA8(pu1_src_temp, u1_pred_vals, i4_sad[DC_I4x4]);
   1764         pu1_src_temp += src_strd;
   1765 
   1766         i4_cost[DC_I4x4] = i4_sad[DC_I4x4] + ((u4_predictd_mode == DC_I4x4) ?
   1767                                         u4_lambda : 4 * u4_lambda);
   1768     }
   1769 
   1770     /* if modes other than VERT, HORZ and DC are  valid */
   1771     if (u4_valid_intra_modes > 7)
   1772     {
   1773         pu1_pred = pu1_ngbr_pels;
   1774         pu1_pred[13] = pu1_pred[14] = pu1_pred[12];
   1775 
   1776         /* Performing FILT121 and FILT11 operation for all neighbour values*/
   1777         for (i = 0; i < 13; i++)
   1778         {
   1779             u1_pred_vals_diag_121[i] = FILT121(pu1_pred[0], pu1_pred[1], pu1_pred[2]);
   1780             u1_pred_vals_diag_11[i] = FILT11(pu1_pred[0], pu1_pred[1]);
   1781 
   1782             pu1_pred++;
   1783         }
   1784 
   1785         if (u4_valid_intra_modes & 8)/* DIAG_DL */
   1786         {
   1787             i4_sad[DIAG_DL_I4x4] = 0;
   1788             i4_cost[DIAG_DL_I4x4] = 0;
   1789             pu1_src_temp = pu1_src;
   1790             pu1_pred_val = u1_pred_vals_diag_121 + 5;
   1791 
   1792             USADA8(pu1_src_temp, pu1_pred_val, i4_sad[DIAG_DL_I4x4]);
   1793             pu1_src_temp += src_strd;
   1794             USADA8(pu1_src_temp, (pu1_pred_val + 1), i4_sad[DIAG_DL_I4x4]);
   1795             pu1_src_temp += src_strd;
   1796             USADA8(pu1_src_temp, (pu1_pred_val + 2), i4_sad[DIAG_DL_I4x4]);
   1797             pu1_src_temp += src_strd;
   1798             USADA8(pu1_src_temp, (pu1_pred_val + 3), i4_sad[DIAG_DL_I4x4]);
   1799             pu1_src_temp += src_strd;
   1800             i4_cost[DIAG_DL_I4x4] = i4_sad[DIAG_DL_I4x4] + ((u4_predictd_mode == DIAG_DL_I4x4) ?
   1801                                             u4_lambda : 4 * u4_lambda);
   1802         }
   1803 
   1804         if (u4_valid_intra_modes & 16)/* DIAG_DR */
   1805         {
   1806             i4_sad[DIAG_DR_I4x4] = 0;
   1807             i4_cost[DIAG_DR_I4x4] = 0;
   1808             pu1_src_temp = pu1_src;
   1809             pu1_pred_val = u1_pred_vals_diag_121 + 3;
   1810 
   1811             USADA8(pu1_src_temp, pu1_pred_val, i4_sad[DIAG_DR_I4x4]);
   1812             pu1_src_temp += src_strd;
   1813             USADA8(pu1_src_temp, (pu1_pred_val - 1), i4_sad[DIAG_DR_I4x4]);
   1814             pu1_src_temp += src_strd;
   1815             USADA8(pu1_src_temp, (pu1_pred_val - 2), i4_sad[DIAG_DR_I4x4]);
   1816             pu1_src_temp += src_strd;
   1817             USADA8(pu1_src_temp, (pu1_pred_val - 3), i4_sad[DIAG_DR_I4x4]);
   1818             pu1_src_temp += src_strd;
   1819             i4_cost[DIAG_DR_I4x4] = i4_sad[DIAG_DR_I4x4] + ((u4_predictd_mode == DIAG_DR_I4x4) ?
   1820                                             u4_lambda : 4 * u4_lambda);
   1821 
   1822         }
   1823 
   1824         if (u4_valid_intra_modes & 32)/* VERT_R mode valid ????*/
   1825         {
   1826             i4_sad[VERT_R_I4x4] = 0;
   1827 
   1828             pu1_src_temp = pu1_src;
   1829             u1_pred_vals_vert_r[0] = u1_pred_vals_diag_121[2];
   1830             memcpy((u1_pred_vals_vert_r + 1), (u1_pred_vals_diag_11 + 4), 3);
   1831             u1_pred_vals_vert_r[4] = u1_pred_vals_diag_121[1];
   1832             memcpy((u1_pred_vals_vert_r + 5), (u1_pred_vals_diag_121 + 3), 3);
   1833 
   1834             pu1_pred_val = u1_pred_vals_diag_11 + 4;
   1835             USADA8(pu1_src_temp, pu1_pred_val, i4_sad[VERT_R_I4x4]);
   1836             pu1_pred_val = u1_pred_vals_diag_121 + 3;
   1837             pu1_src_temp += src_strd;
   1838             USADA8(pu1_src_temp, pu1_pred_val, i4_sad[VERT_R_I4x4]);
   1839             pu1_src_temp += src_strd;
   1840             USADA8(pu1_src_temp, (u1_pred_vals_vert_r), i4_sad[VERT_R_I4x4]);
   1841             pu1_src_temp += src_strd;
   1842             USADA8(pu1_src_temp, (u1_pred_vals_vert_r + 4),
   1843                    i4_sad[VERT_R_I4x4]);
   1844 
   1845             i4_cost[VERT_R_I4x4] = i4_sad[VERT_R_I4x4] + ((u4_predictd_mode == VERT_R_I4x4) ?
   1846                                             u4_lambda : 4 * u4_lambda);
   1847         }
   1848 
   1849         if (u4_valid_intra_modes & 64)/* HORZ_D mode valid ????*/
   1850         {
   1851             i4_sad[HORZ_D_I4x4] = 0;
   1852 
   1853             pu1_src_temp = pu1_src;
   1854             u1_pred_vals_horz_d[6] = u1_pred_vals_diag_11[3];
   1855             memcpy((u1_pred_vals_horz_d + 7), (u1_pred_vals_diag_121 + 3), 3);
   1856             u1_pred_vals_horz_d[0] = u1_pred_vals_diag_11[0];
   1857             u1_pred_vals_horz_d[1] = u1_pred_vals_diag_121[0];
   1858             u1_pred_vals_horz_d[2] = u1_pred_vals_diag_11[1];
   1859             u1_pred_vals_horz_d[3] = u1_pred_vals_diag_121[1];
   1860             u1_pred_vals_horz_d[4] = u1_pred_vals_diag_11[2];
   1861             u1_pred_vals_horz_d[5] = u1_pred_vals_diag_121[2];
   1862 
   1863             pu1_pred_val = u1_pred_vals_horz_d;
   1864             USADA8(pu1_src_temp, (pu1_pred_val + 6), i4_sad[HORZ_D_I4x4]);
   1865             pu1_src_temp += src_strd;
   1866             USADA8(pu1_src_temp, (pu1_pred_val + 4), i4_sad[HORZ_D_I4x4]);
   1867             pu1_src_temp += src_strd;
   1868             USADA8(pu1_src_temp, (pu1_pred_val + 2), i4_sad[HORZ_D_I4x4]);
   1869             pu1_src_temp += src_strd;
   1870             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[HORZ_D_I4x4]);
   1871 
   1872             i4_cost[HORZ_D_I4x4] = i4_sad[HORZ_D_I4x4] + ((u4_predictd_mode == HORZ_D_I4x4) ?
   1873                                             u4_lambda : 4 * u4_lambda);
   1874         }
   1875 
   1876         if (u4_valid_intra_modes & 128)/* VERT_L mode valid ????*/
   1877         {
   1878             i4_sad[VERT_L_I4x4] = 0;
   1879             pu1_src_temp = pu1_src;
   1880             pu1_pred_val = u1_pred_vals_diag_11 + 5;
   1881             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]);
   1882             pu1_src_temp += src_strd;
   1883             pu1_pred_val = u1_pred_vals_diag_121 + 5;
   1884             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]);
   1885             pu1_src_temp += src_strd;
   1886             pu1_pred_val = u1_pred_vals_diag_11 + 6;
   1887             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]);
   1888             pu1_src_temp += src_strd;
   1889             pu1_pred_val = u1_pred_vals_diag_121 + 6;
   1890             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[VERT_L_I4x4]);
   1891 
   1892             i4_cost[VERT_L_I4x4] = i4_sad[VERT_L_I4x4] + ((u4_predictd_mode == VERT_L_I4x4) ?
   1893                                             u4_lambda : 4 * u4_lambda);
   1894         }
   1895 
   1896         if (u4_valid_intra_modes & 256)/* HORZ_U mode valid ????*/
   1897         {
   1898             i4_sad[HORZ_U_I4x4] = 0;
   1899             pu1_src_temp = pu1_src;
   1900             u1_pred_vals_horz_u[0] = u1_pred_vals_diag_11[2];
   1901             u1_pred_vals_horz_u[1] = u1_pred_vals_diag_121[1];
   1902             u1_pred_vals_horz_u[2] = u1_pred_vals_diag_11[1];
   1903             u1_pred_vals_horz_u[3] = u1_pred_vals_diag_121[0];
   1904             u1_pred_vals_horz_u[4] = u1_pred_vals_diag_11[0];
   1905             u1_pred_vals_horz_u[5] = FILT121(pu1_ngbr_pels[0], pu1_ngbr_pels[0], pu1_ngbr_pels[1]);
   1906 
   1907             memset((u1_pred_vals_horz_u + 6), pu1_ngbr_pels[0], 4);
   1908 
   1909             pu1_pred_val = u1_pred_vals_horz_u;
   1910             USADA8(pu1_src_temp, (pu1_pred_val), i4_sad[HORZ_U_I4x4]);
   1911             pu1_src_temp += src_strd;
   1912             USADA8(pu1_src_temp, (pu1_pred_val + 2), i4_sad[HORZ_U_I4x4]);
   1913             pu1_src_temp += src_strd;
   1914             USADA8(pu1_src_temp, (pu1_pred_val + 4), i4_sad[HORZ_U_I4x4]);
   1915             pu1_src_temp += src_strd;
   1916             USADA8(pu1_src_temp, (pu1_pred_val + 6), i4_sad[HORZ_U_I4x4]);
   1917 
   1918             i4_cost[HORZ_U_I4x4] = i4_sad[HORZ_U_I4x4] + ((u4_predictd_mode == HORZ_U_I4x4) ?
   1919                                             u4_lambda : 4 * u4_lambda);
   1920         }
   1921 
   1922         i4_min_cost = MIN3(MIN3(i4_cost[0], i4_cost[1], i4_cost[2]),
   1923                         MIN3(i4_cost[3], i4_cost[4], i4_cost[5]),
   1924                         MIN3(i4_cost[6], i4_cost[7], i4_cost[8]));
   1925 
   1926     }
   1927     else
   1928     {
   1929         /* Only first three modes valid */
   1930         i4_min_cost = MIN3(i4_cost[0], i4_cost[1], i4_cost[2]);
   1931     }
   1932 
   1933     *pu4_sadmin = i4_min_cost;
   1934 
   1935     if (i4_min_cost == i4_cost[0])
   1936     {
   1937         *u4_intra_mode = VERT_I4x4;
   1938         pu1_pred_val = pu1_ngbr_pels + 5;
   1939         memcpy(pu1_dst, (pu1_pred_val), 4);
   1940         pu1_dst += dst_strd;
   1941         memcpy(pu1_dst, (pu1_pred_val), 4);
   1942         pu1_dst += dst_strd;
   1943         memcpy(pu1_dst, (pu1_pred_val), 4);
   1944         pu1_dst += dst_strd;
   1945         memcpy(pu1_dst, (pu1_pred_val), 4);
   1946     }
   1947     else if (i4_min_cost == i4_cost[1])
   1948     {
   1949         *u4_intra_mode = HORZ_I4x4;
   1950         memset(pu1_dst, pu1_ngbr_pels[3], 4);
   1951         pu1_dst += dst_strd;
   1952         memset(pu1_dst, pu1_ngbr_pels[2], 4);
   1953         pu1_dst += dst_strd;
   1954         memset(pu1_dst, pu1_ngbr_pels[1], 4);
   1955         pu1_dst += dst_strd;
   1956         memset(pu1_dst, pu1_ngbr_pels[0], 4);
   1957     }
   1958     else if (i4_min_cost == i4_cost[2])
   1959     {
   1960         *u4_intra_mode = DC_I4x4;
   1961         memset(pu1_dst, u4_dcval, 4);
   1962         pu1_dst += dst_strd;
   1963         memset(pu1_dst, u4_dcval, 4);
   1964         pu1_dst += dst_strd;
   1965         memset(pu1_dst, u4_dcval, 4);
   1966         pu1_dst += dst_strd;
   1967         memset(pu1_dst, u4_dcval, 4);
   1968     }
   1969 
   1970     else if (i4_min_cost == i4_cost[3])
   1971     {
   1972         *u4_intra_mode = DIAG_DL_I4x4;
   1973         pu1_pred_val = u1_pred_vals_diag_121 + 5;
   1974         memcpy(pu1_dst, (pu1_pred_val), 4);
   1975         pu1_dst += dst_strd;
   1976         memcpy(pu1_dst, (pu1_pred_val + 1), 4);
   1977         pu1_dst += dst_strd;
   1978         memcpy(pu1_dst, (pu1_pred_val + 2), 4);
   1979         pu1_dst += dst_strd;
   1980         memcpy(pu1_dst, (pu1_pred_val + 3), 4);
   1981     }
   1982     else if (i4_min_cost == i4_cost[4])
   1983     {
   1984         *u4_intra_mode = DIAG_DR_I4x4;
   1985         pu1_pred_val = u1_pred_vals_diag_121 + 3;
   1986 
   1987         memcpy(pu1_dst, (pu1_pred_val), 4);
   1988         pu1_dst += dst_strd;
   1989         memcpy(pu1_dst, (pu1_pred_val - 1), 4);
   1990         pu1_dst += dst_strd;
   1991         memcpy(pu1_dst, (pu1_pred_val - 2), 4);
   1992         pu1_dst += dst_strd;
   1993         memcpy(pu1_dst, (pu1_pred_val - 3), 4);
   1994     }
   1995 
   1996     else if (i4_min_cost == i4_cost[5])
   1997     {
   1998         *u4_intra_mode = VERT_R_I4x4;
   1999         pu1_pred_val = u1_pred_vals_diag_11 + 4;
   2000         memcpy(pu1_dst, (pu1_pred_val), 4);
   2001         pu1_dst += dst_strd;
   2002         pu1_pred_val = u1_pred_vals_diag_121 + 3;
   2003         memcpy(pu1_dst, (pu1_pred_val), 4);
   2004         pu1_dst += dst_strd;
   2005         memcpy(pu1_dst, (u1_pred_vals_vert_r), 4);
   2006         pu1_dst += dst_strd;
   2007         memcpy(pu1_dst, (u1_pred_vals_vert_r + 4), 4);
   2008     }
   2009     else if (i4_min_cost == i4_cost[6])
   2010     {
   2011         *u4_intra_mode = HORZ_D_I4x4;
   2012         pu1_pred_val = u1_pred_vals_horz_d;
   2013         memcpy(pu1_dst, (pu1_pred_val + 6), 4);
   2014         pu1_dst += dst_strd;
   2015         memcpy(pu1_dst, (pu1_pred_val + 4), 4);
   2016         pu1_dst += dst_strd;
   2017         memcpy(pu1_dst, (pu1_pred_val + 2), 4);
   2018         pu1_dst += dst_strd;
   2019         memcpy(pu1_dst, (pu1_pred_val), 4);
   2020         pu1_dst += dst_strd;
   2021     }
   2022     else if (i4_min_cost == i4_cost[7])
   2023     {
   2024         *u4_intra_mode = VERT_L_I4x4;
   2025         pu1_pred_val = u1_pred_vals_diag_11 + 5;
   2026         memcpy(pu1_dst, (pu1_pred_val), 4);
   2027         pu1_dst += dst_strd;
   2028         pu1_pred_val = u1_pred_vals_diag_121 + 5;
   2029         memcpy(pu1_dst, (pu1_pred_val), 4);
   2030         pu1_dst += dst_strd;
   2031         pu1_pred_val = u1_pred_vals_diag_11 + 6;
   2032         memcpy(pu1_dst, (pu1_pred_val), 4);
   2033         pu1_dst += dst_strd;
   2034         pu1_pred_val = u1_pred_vals_diag_121 + 6;
   2035         memcpy(pu1_dst, (pu1_pred_val), 4);
   2036     }
   2037     else if (i4_min_cost == i4_cost[8])
   2038     {
   2039         *u4_intra_mode = HORZ_U_I4x4;
   2040         pu1_pred_val = u1_pred_vals_horz_u;
   2041         memcpy(pu1_dst, (pu1_pred_val), 4);
   2042         pu1_dst += dst_strd;
   2043         memcpy(pu1_dst, (pu1_pred_val + 2), 4);
   2044         pu1_dst += dst_strd;
   2045         memcpy(pu1_dst, (pu1_pred_val + 4), 4);
   2046         pu1_dst += dst_strd;
   2047         memcpy(pu1_dst, (pu1_pred_val + 6), 4);
   2048         pu1_dst += dst_strd;
   2049     }
   2050 
   2051     return;
   2052 }
   2053 
   2054 /**
   2055 ******************************************************************************
   2056 *
   2057 * @brief:
   2058 *  Evaluate best intr chroma mode (among VERT, HORZ and DC ) and do the prediction.
   2059 *
   2060 * @par Description
   2061 *  This function evaluates  first three intra chroma modes and compute corresponding sad
   2062 *  and return the buffer predicted with best mode.
   2063 *
   2064 * @param[in] pu1_src
   2065 *  UWORD8 pointer to the source
   2066 *
   2067 * @param[in] pu1_ngbr_pels
   2068 *  UWORD8 pointer to neighbouring pels
   2069 *
   2070 * @param[out] pu1_dst
   2071 *  UWORD8 pointer to the destination
   2072 *
   2073 * @param[in] src_strd
   2074 *  integer source stride
   2075 *
   2076 * @param[in] dst_strd
   2077 *  integer destination stride
   2078 *
   2079 * @param[in] u4_n_avblty
   2080 *  availability of neighbouring pixels
   2081 *
   2082 * @param[in] u4_intra_mode
   2083 *  Pointer to the variable in which best mode is returned
   2084 *
   2085 * @param[in] pu4_sadmin
   2086 *  Pointer to the variable in which minimum sad is returned
   2087 *
   2088 * @param[in] u4_valid_intra_modes
   2089 *  Says what all modes are valid
   2090 *
   2091 * @return      none
   2092 *
   2093 ******************************************************************************
   2094 */
   2095 void ih264e_evaluate_intra_chroma_modes(UWORD8 *pu1_src,
   2096                                         UWORD8 *pu1_ngbr_pels,
   2097                                         UWORD8 *pu1_dst,
   2098                                         UWORD32 src_strd,
   2099                                         UWORD32 dst_strd,
   2100                                         WORD32 u4_n_avblty,
   2101                                         UWORD32 *u4_intra_mode,
   2102                                         WORD32 *pu4_sadmin,
   2103                                         UWORD32 u4_valid_intra_modes)
   2104 {
   2105     UWORD8 *pu1_neighbour;
   2106     UWORD8 *pu1_src_temp = pu1_src;
   2107     UWORD8 left = 0, top = 0;
   2108     WORD32 u4_dcval_u_l[2] = { 0, 0 }, /*sum left neighbours for 'U' ,two separate sets - sum of first four from top,and sum of four values from bottom */
   2109            u4_dcval_u_t[2] = { 0, 0 };  /*sum top neighbours for 'U'*/
   2110 
   2111     WORD32 u4_dcval_v_l[2] = { 0, 0 }, /*sum left neighbours for 'V'*/
   2112            u4_dcval_v_t[2] = { 0, 0 }; /*sum top neighbours for 'V'*/
   2113 
   2114     WORD32 i, j, row, col, i4_sad_vert = INT_MAX, i4_sad_horz = INT_MAX,
   2115                     i4_sad_dc = INT_MAX, i4_min_sad = INT_MAX;
   2116     UWORD8 val_u, val_v;
   2117 
   2118     WORD32 u4_dc_val[2][2][2];/*  -----------
   2119                                   |    |    |  Chroma can have four
   2120                                   | 00 | 01 |  separate dc value...
   2121                                   -----------  u4_dc_val corresponds to this dc values
   2122                                   |    |    |  with u4_dc_val[2][2][U] and u4_dc_val[2][2][V]
   2123                                   | 10 | 11 |
   2124                                   -----------                */
   2125     left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
   2126     top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;
   2127 
   2128     /*Evaluating HORZ*/
   2129     if (left)/* Ifleft available*/
   2130     {
   2131         i4_sad_horz = 0;
   2132 
   2133         for (i = 0; i < 8; i++)
   2134         {
   2135             val_v = pu1_ngbr_pels[15 - 2 * i];
   2136             val_u = pu1_ngbr_pels[15 - 2 * i - 1];
   2137             row = i / 4;
   2138             u4_dcval_u_l[row] += val_u;
   2139             u4_dcval_v_l[row] += val_v;
   2140             for (j = 0; j < 8; j++)
   2141             {
   2142                 i4_sad_horz += ABS(val_u - pu1_src_temp[2 * j]);/* Finding SAD for HORZ mode*/
   2143                 i4_sad_horz += ABS(val_v - pu1_src_temp[2 * j + 1]);
   2144             }
   2145 
   2146             pu1_src_temp += src_strd;
   2147         }
   2148         u4_dcval_u_l[0] += 2;
   2149         u4_dcval_u_l[1] += 2;
   2150         u4_dcval_v_l[0] += 2;
   2151         u4_dcval_v_l[1] += 2;
   2152     }
   2153 
   2154     /*Evaluating VERT**/
   2155     pu1_src_temp = pu1_src;
   2156     if (top) /* top available*/
   2157     {
   2158         i4_sad_vert = 0;
   2159 
   2160         for (i = 0; i < 8; i++)
   2161         {
   2162             col = i / 4;
   2163 
   2164             val_u = pu1_ngbr_pels[18 + i * 2];
   2165             val_v = pu1_ngbr_pels[18 + i * 2 + 1];
   2166             u4_dcval_u_t[col] += val_u;
   2167             u4_dcval_v_t[col] += val_v;
   2168 
   2169             for (j = 0; j < 16; j++)
   2170             {
   2171                 i4_sad_vert += ABS(pu1_ngbr_pels[18 + j] - pu1_src_temp[j]);/* Finding SAD for VERT mode*/
   2172             }
   2173             pu1_src_temp += src_strd;
   2174 
   2175         }
   2176         u4_dcval_u_t[0] += 2;
   2177         u4_dcval_u_t[1] += 2;
   2178         u4_dcval_v_t[0] += 2;
   2179         u4_dcval_v_t[1] += 2;
   2180     }
   2181 
   2182     /* computing DC value*/
   2183     /* Equation  8-128 in spec*/
   2184     u4_dc_val[0][0][0] = (u4_dcval_u_l[0] + u4_dcval_u_t[0]) >> (1 + left + top);
   2185     u4_dc_val[0][0][1] = (u4_dcval_v_l[0] + u4_dcval_v_t[0]) >> (1 + left + top);
   2186     u4_dc_val[1][1][0] = (u4_dcval_u_l[1] + u4_dcval_u_t[1]) >> (1 + left + top);
   2187     u4_dc_val[1][1][1] = (u4_dcval_v_l[1] + u4_dcval_v_t[1]) >> (1 + left + top);
   2188 
   2189     if (top)
   2190     {
   2191         /* Equation  8-132 in spec*/
   2192         u4_dc_val[0][1][0] = (u4_dcval_u_t[1]) >> (1 + top);
   2193         u4_dc_val[0][1][1] = (u4_dcval_v_t[1]) >> (1 + top);
   2194     }
   2195     else
   2196     {
   2197         u4_dc_val[0][1][0] = (u4_dcval_u_l[0]) >> (1 + left);
   2198         u4_dc_val[0][1][1] = (u4_dcval_v_l[0]) >> (1 + left);
   2199     }
   2200 
   2201     if (left)
   2202     {
   2203         u4_dc_val[1][0][0] = (u4_dcval_u_l[1]) >> (1 + left);
   2204         u4_dc_val[1][0][1] = (u4_dcval_v_l[1]) >> (1 + left);
   2205     }
   2206     else
   2207     {
   2208         u4_dc_val[1][0][0] = (u4_dcval_u_t[0]) >> (1 + top);
   2209         u4_dc_val[1][0][1] = (u4_dcval_v_t[0]) >> (1 + top);
   2210     }
   2211 
   2212     if (!(left || top))
   2213     {
   2214         /*none available*/
   2215         u4_dc_val[0][0][0] = u4_dc_val[0][0][1] =
   2216         u4_dc_val[0][1][0] = u4_dc_val[0][1][1] =
   2217         u4_dc_val[1][0][0] = u4_dc_val[1][0][1] =
   2218         u4_dc_val[1][1][0] = u4_dc_val[1][1][1] = 128;
   2219     }
   2220 
   2221     /* Evaluating DC */
   2222     pu1_src_temp = pu1_src;
   2223     i4_sad_dc = 0;
   2224     for (i = 0; i < 8; i++)
   2225     {
   2226         for (j = 0; j < 8; j++)
   2227         {
   2228             col = j / 4;
   2229             row = i / 4;
   2230             val_u = u4_dc_val[row][col][0];
   2231             val_v = u4_dc_val[row][col][1];
   2232 
   2233             i4_sad_dc += ABS(val_u - pu1_src_temp[2 * j]);/* Finding SAD for DC mode*/
   2234             i4_sad_dc += ABS(val_v - pu1_src_temp[2 * j + 1]);
   2235         }
   2236         pu1_src_temp += src_strd;
   2237     }
   2238 
   2239     if ((u4_valid_intra_modes & 01) == 0)/* If DC is disabled*/
   2240         i4_sad_dc = INT_MAX;
   2241     if ((u4_valid_intra_modes & 02) == 0)/* If HORZ is disabled*/
   2242         i4_sad_horz = INT_MAX;
   2243     if ((u4_valid_intra_modes & 04) == 0)/* If VERT is disabled*/
   2244         i4_sad_vert = INT_MAX;
   2245 
   2246     i4_min_sad = MIN3(i4_sad_horz, i4_sad_dc, i4_sad_vert);
   2247 
   2248     /* Finding Minimum sad and doing corresponding prediction*/
   2249     if (i4_min_sad < *pu4_sadmin)
   2250     {
   2251         *pu4_sadmin = i4_min_sad;
   2252 
   2253         if (i4_min_sad == i4_sad_dc)
   2254         {
   2255             *u4_intra_mode = DC_CH_I8x8;
   2256             for (i = 0; i < 8; i++)
   2257             {
   2258                 for (j = 0; j < 8; j++)
   2259                 {
   2260                     col = j / 4;
   2261                     row = i / 4;
   2262 
   2263                     pu1_dst[2 * j] = u4_dc_val[row][col][0];
   2264                     pu1_dst[2 * j + 1] = u4_dc_val[row][col][1];
   2265                 }
   2266                 pu1_dst += dst_strd;
   2267             }
   2268         }
   2269         else if (i4_min_sad == i4_sad_horz)
   2270         {
   2271             *u4_intra_mode = HORZ_CH_I8x8;
   2272             for (j = 0; j < 8; j++)
   2273             {
   2274                 val_v = pu1_ngbr_pels[15 - 2 * j];
   2275                 val_u = pu1_ngbr_pels[15 - 2 * j - 1];
   2276 
   2277                 for (i = 0; i < 8; i++)
   2278                 {
   2279                     pu1_dst[2 * i] = val_u;
   2280                     pu1_dst[2 * i + 1] = val_v;
   2281 
   2282                 }
   2283                 pu1_dst += dst_strd;
   2284             }
   2285         }
   2286         else
   2287         {
   2288             *u4_intra_mode = VERT_CH_I8x8;
   2289             pu1_neighbour = pu1_ngbr_pels + 18;
   2290             for (j = 0; j < 8; j++)
   2291             {
   2292                 memcpy(pu1_dst, pu1_neighbour, MB_SIZE);
   2293                 pu1_dst += dst_strd;
   2294             }
   2295         }
   2296     }
   2297 
   2298     return;
   2299 }
   2300