/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
*  ih264e_intra_modes_eval_ssse3.c
*
* @brief
*  This file contains definitions of routines that perform rate distortion
*  analysis on a macroblock if it is to be coded as intra.
*
* @author
*  Ittiam
*
* @par List of Functions:
*  ih264e_evaluate_intra16x16_modes_ssse3
*  ih264e_evaluate_intra_4x4_modes_ssse3
*  ih264e_evaluate_intra_chroma_modes_ssse3
*
* @remarks
*  None
*
*******************************************************************************
*/

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

/* System include files */
#include <stdio.h>
#include <string.h>
#include <limits.h>
#include <assert.h>
#include <immintrin.h>

/* User include files */
#include "ih264e_config.h"
#include "ih264_typedefs.h"
#include "ih264e_defs.h"
#include "iv2.h"
#include "ive2.h"
#include "ih264_debug.h"
#include "ih264_defs.h"
#include "ih264_macros.h"
#include "ih264_intra_pred_filters.h"
#include "ih264_structs.h"
#include "ih264_common_tables.h"
#include "ih264_trans_quant_itrans_iquant.h"
#include "ih264_inter_pred_filters.h"
#include "ih264_mem_fns.h"
#include "ih264_padding.h"
#include "ih264_deblk_edge_filters.h"
#include "ime_distortion_metrics.h"
#include "ih264e_error.h"
#include "ih264e_bitstream.h"
#include "ime_defs.h"
#include "ime_structs.h"
#include "ih264_cabac_tables.h"
#include "irc_cntrl_param.h"
#include "irc_frame_info_collector.h"
#include "ih264e_rate_control.h"

#include "ih264e_cabac_structs.h"
#include "ih264e_structs.h"
#include "ih264e_cabac.h"
#include "ih264e_intra_modes_eval.h"
#include "ih264e_globals.h"
#include "ime_platform_macros.h"


/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/
/**
******************************************************************************
*
* @brief
*  Evaluates the best intra 16x16 mode (among VERT, HORZ and DC) and does
*  the prediction.
*
* @par Description
*  This function evaluates the first three 16x16 modes, computes the
*  corresponding SADs and returns the buffer predicted with the best mode.
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[in] pu1_ngbr_pels_i16
*  UWORD8 pointer to neighbouring pels
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] n_avblty
*  availability of neighbouring pixels
*
* @param[out] u4_intra_mode
*  pointer to the variable in which the best mode is returned
*
* @param[in,out] pu4_sadmin
*  pointer to the variable in which the minimum SAD is returned
*
* @param[in] u4_valid_intra_modes
*  bitmask indicating which modes are valid
*
* @return
*  None
*
******************************************************************************
*/
void ih264e_evaluate_intra16x16_modes_ssse3(UWORD8 *pu1_src,
                                            UWORD8 *pu1_ngbr_pels_i16,
                                            UWORD8 *pu1_dst,
                                            UWORD32 src_strd,
                                            UWORD32 dst_strd,
                                            WORD32 n_avblty,
                                            UWORD32 *u4_intra_mode,
                                            WORD32 *pu4_sadmin,
                                            UWORD32 u4_valid_intra_modes)
{
    UWORD8 *pu1_src_temp;

    WORD32 left, top, horz_flag, vert_flag, dc_flag;
    WORD32 sad_vert, sad_horz, sad_dc, min_sad;

    WORD32 cnt, dcval;
    WORD32 src_strd2, src_strd3, src_strd4;
    WORD32 dst_strd2, dst_strd3, dst_strd4;

    __m128i src1_16x8b, src2_16x8b, src3_16x8b, src4_16x8b;
    __m128i val1_16x8b, val2_16x8b, val3_16x8b, val4_16x8b;
    __m128i sad1_8x16b, sad2_8x16b, sad3_8x16b, sad4_8x16b;

    __m128i sad_8x16b, val_16x8b, zero_vector;

    sad_vert = INT_MAX;
    sad_horz = INT_MAX;
    sad_dc = INT_MAX;

    src_strd2 = src_strd << 1;
    src_strd4 = src_strd << 2;
    src_strd3 = src_strd + src_strd2;

    dst_strd2 = dst_strd << 1;
    dst_strd4 = dst_strd << 2;
    dst_strd3 = dst_strd + dst_strd2;

    left = (n_avblty & LEFT_MB_AVAILABLE_MASK);
    top = (n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;

    zero_vector = _mm_setzero_si128();

    horz_flag = left && ((u4_valid_intra_modes & 02) != 0);
    vert_flag = top && ((u4_valid_intra_modes & 01) != 0);
    dc_flag = (u4_valid_intra_modes & 04) != 0;

    if(horz_flag)
    {
        pu1_src_temp = pu1_src;

        val1_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[15]);
        val2_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[14]);
        val3_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[13]);
        val4_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[12]);

        src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp);
        src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd));
        src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2));
        src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3));

        sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b);
        sad2_8x16b = _mm_sad_epu8(val2_16x8b, src2_16x8b);
        sad3_8x16b = _mm_sad_epu8(val3_16x8b, src3_16x8b);
        sad4_8x16b = _mm_sad_epu8(val4_16x8b, src4_16x8b);

        sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b);
        sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b);

        cnt = 11;
        sad_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b);
        do
        {
            pu1_src_temp += src_strd4;

            val1_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt]);
            val2_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 1]);
            val3_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 2]);
            val4_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 3]);

            src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp);
            src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd));
            src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2));
            src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3));

            sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b);
            sad2_8x16b = _mm_sad_epu8(val2_16x8b, src2_16x8b);
            sad3_8x16b = _mm_sad_epu8(val3_16x8b, src3_16x8b);
            sad4_8x16b = _mm_sad_epu8(val4_16x8b, src4_16x8b);

            sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b);
            sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b);
            sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b);

            cnt -= 4;
            sad_8x16b = _mm_add_epi16(sad_8x16b, sad1_8x16b);
        }
        while(cnt >= 0);

        sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);
        sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);
        sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);

        sad_horz = _mm_extract_epi16(sad_8x16b, 0);
    }

    if(vert_flag)
    {
        pu1_src_temp = pu1_src;

        val1_16x8b = _mm_loadu_si128((__m128i *)(pu1_ngbr_pels_i16 + 17));

        src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp);
        src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd));
        src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2));
        src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3));

        sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b);
        sad2_8x16b = _mm_sad_epu8(val1_16x8b, src2_16x8b);
        sad3_8x16b = _mm_sad_epu8(val1_16x8b, src3_16x8b);
        sad4_8x16b = _mm_sad_epu8(val1_16x8b, src4_16x8b);

        sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b);
        sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b);

        cnt = 11;
        sad_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b);
        do
        {
            pu1_src_temp += src_strd4;

            src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp);
            src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd));
            src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2));
            src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3));

            sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b);
            sad2_8x16b = _mm_sad_epu8(val1_16x8b, src2_16x8b);
            sad3_8x16b = _mm_sad_epu8(val1_16x8b, src3_16x8b);
            sad4_8x16b = _mm_sad_epu8(val1_16x8b, src4_16x8b);

            sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b);
            sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b);
            sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b);

            cnt -= 4;
            sad_8x16b = _mm_add_epi16(sad_8x16b, sad1_8x16b);
        }
        while(cnt >= 0);

        sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);
        sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);
        sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);

        sad_vert = _mm_extract_epi16(sad_8x16b, 0);
    }

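    /* DC value: sum of the available neighbour pels, with +8 per available
     * side for rounding, right-shifted by log2 of the pel count. With both
     * sides available this is (sum_left + sum_top + 16) >> 5; with neither
     * available, the prediction defaults to 128 (mid-grey). */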
    dcval = 0;

    if(left)
    {
        val_16x8b = _mm_loadu_si128((__m128i *)pu1_ngbr_pels_i16);
        dcval += 8;

        sad1_8x16b = _mm_sad_epu8(val_16x8b, zero_vector);
        dcval += _mm_extract_epi16(sad1_8x16b, 0);
        dcval += _mm_extract_epi16(sad1_8x16b, 4);
    }
    if(top)
    {
        val_16x8b = _mm_loadu_si128((__m128i *)(pu1_ngbr_pels_i16 + 17));
        dcval += 8;

        sad1_8x16b = _mm_sad_epu8(val_16x8b, zero_vector);
        dcval += _mm_extract_epi16(sad1_8x16b, 0);
        dcval += _mm_extract_epi16(sad1_8x16b, 4);
    }
    dcval = dcval >> (3 + left + top);
    dcval += ((left == 0) & (top == 0)) << 7;

    if(dc_flag)
    {
        pu1_src_temp = pu1_src;
        val1_16x8b = _mm_set1_epi8(dcval);

        src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp);
        src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd));
        src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2));
        src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3));

        sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b);
        sad2_8x16b = _mm_sad_epu8(val1_16x8b, src2_16x8b);
        sad3_8x16b = _mm_sad_epu8(val1_16x8b, src3_16x8b);
        sad4_8x16b = _mm_sad_epu8(val1_16x8b, src4_16x8b);

        sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b);
        sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b);

        cnt = 12;
        sad_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b);
        do
        {
            pu1_src_temp += src_strd4;

            src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src_temp);
            src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd));
            src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd2));
            src4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_temp + src_strd3));

            sad1_8x16b = _mm_sad_epu8(val1_16x8b, src1_16x8b);
            sad2_8x16b = _mm_sad_epu8(val1_16x8b, src2_16x8b);
            sad3_8x16b = _mm_sad_epu8(val1_16x8b, src3_16x8b);
            sad4_8x16b = _mm_sad_epu8(val1_16x8b, src4_16x8b);

            sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad2_8x16b);
            sad3_8x16b = _mm_packs_epi32(sad3_8x16b, sad4_8x16b);
            sad1_8x16b = _mm_packs_epi32(sad1_8x16b, sad3_8x16b);

            cnt -= 4;
            sad_8x16b = _mm_add_epi16(sad_8x16b, sad1_8x16b);
        }
        while(cnt > 0);

        sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);
        sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);
        sad_8x16b = _mm_hadd_epi16(sad_8x16b, sad_8x16b);

        sad_dc = _mm_extract_epi16(sad_8x16b, 0);
    }

    // Doing prediction for minimum SAD
    min_sad = MIN3(sad_horz, sad_vert, sad_dc);
    if(min_sad < *pu4_sadmin)
    {
        *pu4_sadmin = min_sad;
        if(min_sad == sad_vert)
        {
            *u4_intra_mode = VERT_I16x16;
            val1_16x8b = _mm_loadu_si128((__m128i *)(pu1_ngbr_pels_i16 + 17));
            cnt = 15;
            do
            {
                _mm_storeu_si128((__m128i *)pu1_dst, val1_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), val1_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), val1_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), val1_16x8b);

                cnt -= 4;
                pu1_dst += dst_strd4;
            }
            while(cnt > 0);
        }
        else if(min_sad == sad_horz)
        {
            *u4_intra_mode = HORZ_I16x16;
            cnt = 15;
            do
            {
                val1_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt]);
                val2_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 1]);
                val3_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 2]);
                val4_16x8b = _mm_set1_epi8(pu1_ngbr_pels_i16[cnt - 3]);

                _mm_storeu_si128((__m128i *)pu1_dst, val1_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), val2_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), val3_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), val4_16x8b);

                cnt -= 4;
                pu1_dst += dst_strd4;
            }
            while(cnt >= 0);
        }
        else
        {
            *u4_intra_mode = DC_I16x16;
            val1_16x8b = _mm_set1_epi8(dcval);
            cnt = 15;
            do
            {
                _mm_storeu_si128((__m128i *)pu1_dst, val1_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), val1_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), val1_16x8b);
                _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), val1_16x8b);

                cnt -= 4;
                pu1_dst += dst_strd4;
            }
            while(cnt > 0);
        }
    }
}

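/* Illustrative sketch (not part of the library, compiled out): one possible
 * way to invoke the 16x16 evaluator. The neighbour layout assumed here --
 * 16 left pels at indices 0..15, the top-left pel at 16 and 16 top pels at
 * indices 17..32, matching the offsets the function reads -- and the local
 * buffer sizes are illustrative assumptions, not a documented contract. */
#if 0
static void example_evaluate_i16x16(UWORD8 *pu1_mb, UWORD32 src_strd,
                                    UWORD8 *pu1_ngbr_pels_i16)
{
    UWORD8 au1_pred[16 * 16];       /* prediction output, stride 16 */
    UWORD32 u4_mode = DC_I16x16;
    WORD32 i4_min_sad = INT_MAX;    /* only improved-upon SADs are written back */

    ih264e_evaluate_intra16x16_modes_ssse3(
        pu1_mb, pu1_ngbr_pels_i16, au1_pred, src_strd, 16,
        LEFT_MB_AVAILABLE_MASK | TOP_MB_AVAILABLE_MASK, /* both neighbours present */
        &u4_mode, &i4_min_sad,
        7 /* VERT | HORZ | DC valid */);
}
#endif
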
/**
******************************************************************************
*
* @brief
*  Evaluates the best intra 4x4 mode and does the prediction.
*
* @par Description
*  This function evaluates the intra 4x4 modes, computes the corresponding
*  costs and returns the buffer predicted with the best mode.
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[in] pu1_ngbr_pels
*  UWORD8 pointer to neighbouring pels
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] u4_n_avblty
*  availability of neighbouring pixels
*
* @param[out] u4_intra_mode
*  pointer to the variable in which the best mode is returned
*
* @param[out] pu4_sadmin
*  pointer to the variable in which the minimum cost is returned
*
* @param[in] u4_valid_intra_modes
*  bitmask indicating which modes are valid
*
* @param[in] u4_lambda
*  lambda value for computing cost from SAD
*
* @param[in] u4_predictd_mode
*  predicted mode, used for cost computation
*
* @return
*  None
*
******************************************************************************
*/
void ih264e_evaluate_intra_4x4_modes_ssse3(UWORD8 *pu1_src,
                                           UWORD8 *pu1_ngbr_pels,
                                           UWORD8 *pu1_dst,
                                           UWORD32 src_strd,
                                           UWORD32 dst_strd,
                                           WORD32 u4_n_avblty,
                                           UWORD32 *u4_intra_mode,
                                           WORD32 *pu4_sadmin,
                                           UWORD32 u4_valid_intra_modes,
                                           UWORD32 u4_lambda,
                                           UWORD32 u4_predictd_mode)
{
    WORD32 left, top;
    WORD32 sad[MAX_I4x4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX,
                             INT_MAX, INT_MAX, INT_MAX, INT_MAX };
    WORD32 cost[MAX_I4x4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX,
                              INT_MAX, INT_MAX, INT_MAX, INT_MAX };

    WORD32 min_cost;
    UWORD32 lambda4 = u4_lambda << 2;
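    /* Mode-bits cost approximation: signalling the predicted mode costs
     * u4_lambda, any other mode costs lambda4 (4 * u4_lambda); this term is
     * added to the SAD of each candidate mode below. */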
    WORD32 dst_strd2, dst_strd3;

    __m128i left_top_16x8b, src_16x8b, pred0_16x8b, sad_8x16b;
    __m128i pred1_16x8b, pred2_16x8b, pred3_16x8b, pred4_16x8b;
    __m128i pred5_16x8b, pred6_16x8b, pred7_16x8b, pred8_16x8b;
    __m128i shuffle_16x8b, zero_vector, mask_low_32b;

    left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
    top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;

    dst_strd2 = dst_strd << 1;
    dst_strd3 = dst_strd + dst_strd2;

    // loading the 4x4 source block and neighbouring pixels
    {
        __m128i row1_16x8b, row2_16x8b;

        row1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        row2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
        left_top_16x8b = _mm_loadu_si128((__m128i *)pu1_ngbr_pels);

        pu1_src += src_strd << 1;
        src_16x8b = _mm_unpacklo_epi32(row1_16x8b, row2_16x8b);

        row1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        row2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
        zero_vector = _mm_setzero_si128();

        row1_16x8b = _mm_unpacklo_epi32(row1_16x8b, row2_16x8b);
        src_16x8b = _mm_unpacklo_epi64(src_16x8b, row1_16x8b);
    }

    /* Computing SADs */
    if(u4_valid_intra_modes & 1) /* Is the VERT mode valid? */
    {
        pred0_16x8b = _mm_srli_si128(left_top_16x8b, 5);
        pred0_16x8b = _mm_shuffle_epi32(pred0_16x8b, 0);
        sad_8x16b = _mm_sad_epu8(src_16x8b, pred0_16x8b);

        sad[VERT_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
        cost[VERT_I4x4] = sad[VERT_I4x4] + ((u4_predictd_mode == VERT_I4x4) ? u4_lambda : lambda4);
    }

    if(u4_valid_intra_modes & 2) /* Is the HORZ mode valid? */
    {
        shuffle_16x8b = _mm_setr_epi8(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
        pred1_16x8b = _mm_shuffle_epi8(left_top_16x8b, shuffle_16x8b);

        sad_8x16b = _mm_sad_epu8(src_16x8b, pred1_16x8b);

        sad[HORZ_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
        cost[HORZ_I4x4] = sad[HORZ_I4x4] + ((u4_predictd_mode == HORZ_I4x4) ? u4_lambda : lambda4);
    }

    if(u4_valid_intra_modes & 4) /* Is the DC mode valid? */
    {
        if(top + left)
        {
            WORD32 shft = 1, dcval = 0;

            __m128i val_16x8b, temp_16x8b, temp_8x16b;

            val_16x8b = _mm_setzero_si128();

            if(top)
            {
                temp_16x8b = _mm_srli_si128(left_top_16x8b, 5);
                val_16x8b = _mm_alignr_epi8(temp_16x8b, val_16x8b, 4);
                shft++;
                dcval += 2;
            }
            if(left)
            {
                val_16x8b = _mm_alignr_epi8(left_top_16x8b, val_16x8b, 4);
                shft++;
                dcval += 2;
            }

            temp_8x16b = _mm_sad_epu8(val_16x8b, zero_vector);
            dcval += _mm_extract_epi16(temp_8x16b, 4);
            dcval = dcval >> shft;
            pred2_16x8b = _mm_set1_epi8(dcval);
        }
        else
            pred2_16x8b = _mm_set1_epi8(128);

        sad_8x16b = _mm_sad_epu8(src_16x8b, pred2_16x8b);

        sad[DC_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
        cost[DC_I4x4] = sad[DC_I4x4] + ((u4_predictd_mode == DC_I4x4) ? u4_lambda : lambda4);
    }

    if(u4_valid_intra_modes > 7) /* Are modes other than VERT, HORZ and DC valid? */
    {
        __m128i w11_16x8b, w121_16x8b;
        __m128i temp1_16x8b, temp2_16x8b;

        /* Performing the FILT121 and FILT11 operations for all neighbour values */
        {
            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b;
            __m128i const_2_8x16b;

            const_2_8x16b = _mm_set1_epi16(2);

            temp1_8x16b = _mm_unpacklo_epi8(left_top_16x8b, zero_vector);   //l3 l2 l1 l0 tl t0 t1 t2
            temp2_8x16b = _mm_slli_si128(temp1_8x16b, 2);                   // 0 l3 l2 l1 l0 tl t0 t1
            temp2_8x16b = _mm_shufflelo_epi16(temp2_8x16b, 0xe5);           //l3 l3 l2 l1 l0 tl t0 t1

            temp1_8x16b = _mm_add_epi16(temp1_8x16b, temp2_8x16b);          //l3+l3  l3+l2       l2+l1...       t1+t2
            temp2_8x16b = _mm_slli_si128(temp1_8x16b, 2);                   //l3+l3  l3+l3       l3+l2...       t0+t1
            temp2_8x16b = _mm_shufflelo_epi16(temp2_8x16b, 0xe5);
            temp1_8x16b = _mm_add_epi16(temp1_8x16b, temp2_8x16b);          //4*l3   l3+2*l3+l2  l3+2*l2+l1...  t0+2*t1+t2

            temp1_8x16b = _mm_add_epi16(const_2_8x16b, temp1_8x16b);        //4*l3+2 3*l3+l2+2   l3+2*l2+l1+2.. t0+2*t1+t2+2
            temp1_8x16b = _mm_srli_epi16(temp1_8x16b, 2);

            temp1_16x8b = _mm_srli_si128(left_top_16x8b, 1);
            w11_16x8b = _mm_avg_epu8(left_top_16x8b, temp1_16x8b);

            temp2_16x8b = _mm_srli_si128(left_top_16x8b, 6);
            temp2_8x16b = _mm_unpacklo_epi8(temp2_16x8b, zero_vector);      //t1 t2 t3 t4 t5 t6 t7 0
            temp3_8x16b = _mm_srli_si128(temp2_8x16b, 2);                   //t2 t3 t4 t5 t6 t7 0  0
            temp3_8x16b = _mm_shufflehi_epi16(temp3_8x16b, 0xd4);           //t2 t3 t4 t5 t6 t7 t7 0

            temp2_8x16b = _mm_add_epi16(temp2_8x16b, temp3_8x16b);          //t1+t2      t2+t3...     t6+t7      t7+t7 0
            temp3_8x16b = _mm_srli_si128(temp2_8x16b, 2);                   //t2+t3      t3+t4...     t7+t7      0     0
            temp2_8x16b = _mm_add_epi16(temp2_8x16b, temp3_8x16b);          //t1+2*t2+t3 t2+2*t3+t4.. t6+2*t7+t7 t7+t7 0

            temp2_8x16b = _mm_add_epi16(const_2_8x16b, temp2_8x16b);        //t1+2*t2+t3+2 t2+2*t3+t4+2 t3+2*t4+t5+2... t6+2*t7+t7+2 t7+t7+2  2
            temp2_8x16b = _mm_srli_epi16(temp2_8x16b, 2);

            w121_16x8b = _mm_packus_epi16(temp1_8x16b, temp2_8x16b);
        }

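        /* At this point w121_16x8b holds the 3-tap filtered neighbours,
         * (a + 2*b + c + 2) >> 2, and w11_16x8b the 2-tap averages,
         * (a + b + 1) >> 1; the diagonal and angular predictions below are
         * assembled from these two registers by byte shuffles. */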
        if(u4_valid_intra_modes & 8) /* Is the DIAG_DL mode valid? */
        {
            shuffle_16x8b = _mm_setr_epi8( 7,  8,  9,  10,
                                           8,  9,  10, 11,
                                           9,  10, 11, 12,
                                          10,  11, 12, 13);
            pred3_16x8b = _mm_shuffle_epi8(w121_16x8b, shuffle_16x8b);
            sad_8x16b = _mm_sad_epu8(src_16x8b, pred3_16x8b);

            sad[DIAG_DL_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
            cost[DIAG_DL_I4x4] = sad[DIAG_DL_I4x4] + ((u4_predictd_mode == DIAG_DL_I4x4) ? u4_lambda : lambda4);
        }

        if(u4_valid_intra_modes & 16) /* Is the DIAG_DR mode valid? */
        {
            shuffle_16x8b = _mm_setr_epi8(5, 6, 7, 8,
                                          4, 5, 6, 7,
                                          3, 4, 5, 6,
                                          2, 3, 4, 5);
            pred4_16x8b = _mm_shuffle_epi8(w121_16x8b, shuffle_16x8b);
            sad_8x16b = _mm_sad_epu8(src_16x8b, pred4_16x8b);

            sad[DIAG_DR_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
            cost[DIAG_DR_I4x4] = sad[DIAG_DR_I4x4] + ((u4_predictd_mode == DIAG_DR_I4x4) ? u4_lambda : lambda4);
        }

        if(u4_valid_intra_modes & 32) /* Is the VERT_R mode valid? */
        {
            temp1_16x8b = _mm_srli_si128(w121_16x8b, 1);
            temp1_16x8b = _mm_unpacklo_epi64(temp1_16x8b, w11_16x8b);
            shuffle_16x8b = _mm_setr_epi8(12, 13, 14, 15,
                                           4,  5,  6,  7,
                                           3, 12, 13, 14,
                                           2,  4,  5,  6);
            pred5_16x8b = _mm_shuffle_epi8(temp1_16x8b, shuffle_16x8b);
            sad_8x16b = _mm_sad_epu8(src_16x8b, pred5_16x8b);

            sad[VERT_R_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
            cost[VERT_R_I4x4] = sad[VERT_R_I4x4] + ((u4_predictd_mode == VERT_R_I4x4) ? u4_lambda : lambda4);
        }

        if(u4_valid_intra_modes & 64) /* Is the HORZ_D mode valid? */
        {
            temp1_16x8b = _mm_unpacklo_epi64(w121_16x8b, w11_16x8b);
            shuffle_16x8b = _mm_setr_epi8(11, 5,  6, 7,
                                          10, 4, 11, 5,
                                           9, 3, 10, 4,
                                           8, 2,  9, 3);
            pred6_16x8b = _mm_shuffle_epi8(temp1_16x8b, shuffle_16x8b);
            sad_8x16b = _mm_sad_epu8(src_16x8b, pred6_16x8b);

            sad[HORZ_D_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
            cost[HORZ_D_I4x4] = sad[HORZ_D_I4x4] + ((u4_predictd_mode == HORZ_D_I4x4) ? u4_lambda : lambda4);
        }

        if(u4_valid_intra_modes & 128) /* Is the VERT_L mode valid? */
        {
            temp1_16x8b = _mm_srli_si128(w121_16x8b, 5);
            temp2_16x8b = _mm_srli_si128(w11_16x8b, 5);
            temp1_16x8b = _mm_unpacklo_epi64(temp1_16x8b, temp2_16x8b);
            shuffle_16x8b = _mm_setr_epi8(8,  9, 10, 11,
                                          2,  3,  4,  5,
                                          9, 10, 11, 12,
                                          3,  4,  5,  6);
            pred7_16x8b = _mm_shuffle_epi8(temp1_16x8b, shuffle_16x8b);
            sad_8x16b = _mm_sad_epu8(src_16x8b, pred7_16x8b);

            sad[VERT_L_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
            cost[VERT_L_I4x4] = sad[VERT_L_I4x4] + ((u4_predictd_mode == VERT_L_I4x4) ? u4_lambda : lambda4);
        }

        if(u4_valid_intra_modes & 256) /* Is the HORZ_U mode valid? */
        {
            temp1_16x8b = _mm_unpacklo_epi64(w121_16x8b, w11_16x8b);
            shuffle_16x8b = _mm_setr_epi8(10, 3, 9, 2,
                                           9, 2, 8, 1,
                                           8, 1, 0, 0,
                                           0, 0, 0, 0);
            pred8_16x8b = _mm_shuffle_epi8(temp1_16x8b, shuffle_16x8b);
            sad_8x16b = _mm_sad_epu8(src_16x8b, pred8_16x8b);

            sad[HORZ_U_I4x4] = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
            cost[HORZ_U_I4x4] = sad[HORZ_U_I4x4] + ((u4_predictd_mode == HORZ_U_I4x4) ? u4_lambda : lambda4);
        }

        min_cost = MIN3(MIN3(cost[0], cost[1], cost[2]),
                        MIN3(cost[3], cost[4], cost[5]),
                        MIN3(cost[6], cost[7], cost[8]));
    }
    else
    {  /* Only the first three modes are valid */
        min_cost = MIN3(cost[0], cost[1], cost[2]);
    }

    *pu4_sadmin = min_cost;

    if(min_cost == cost[0])
    {
        *u4_intra_mode = VERT_I4x4;
    }
    else if(min_cost == cost[1])
    {
        *u4_intra_mode = HORZ_I4x4;
        pred0_16x8b = pred1_16x8b;
    }
    else if(min_cost == cost[2])
    {
        *u4_intra_mode = DC_I4x4;
        pred0_16x8b = pred2_16x8b;
    }
    else if(min_cost == cost[3])
    {
        *u4_intra_mode = DIAG_DL_I4x4;
        pred0_16x8b = pred3_16x8b;
    }
    else if(min_cost == cost[4])
    {
        *u4_intra_mode = DIAG_DR_I4x4;
        pred0_16x8b = pred4_16x8b;
    }
    else if(min_cost == cost[5])
    {
        *u4_intra_mode = VERT_R_I4x4;
        pred0_16x8b = pred5_16x8b;
    }
    else if(min_cost == cost[6])
    {
        *u4_intra_mode = HORZ_D_I4x4;
        pred0_16x8b = pred6_16x8b;
    }
    else if(min_cost == cost[7])
    {
        *u4_intra_mode = VERT_L_I4x4;
        pred0_16x8b = pred7_16x8b;
    }
    else if(min_cost == cost[8])
    {
        *u4_intra_mode = HORZ_U_I4x4;
        pred0_16x8b = pred8_16x8b;
    }

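    /* Write out the chosen 4x4 prediction: the byte mask selects the low
     * four bytes of the register per row, so _mm_maskmoveu_si128 stores
     * exactly four pixels per row and leaves the rest of the destination
     * untouched. */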
    mask_low_32b = _mm_set1_epi8(0xff);
    mask_low_32b = _mm_srli_si128(mask_low_32b, 12);

    _mm_maskmoveu_si128(pred0_16x8b, mask_low_32b, (char*)pu1_dst);
    pred0_16x8b = _mm_srli_si128(pred0_16x8b, 4);
    _mm_maskmoveu_si128(pred0_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd));
    pred0_16x8b = _mm_srli_si128(pred0_16x8b, 4);
    _mm_maskmoveu_si128(pred0_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd2));
    pred0_16x8b = _mm_srli_si128(pred0_16x8b, 4);
    _mm_maskmoveu_si128(pred0_16x8b, mask_low_32b, (char*)(pu1_dst + dst_strd3));
}

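/* Illustrative sketch (not part of the library, compiled out): a minimal,
 * hypothetical caller of the 4x4 evaluator. The neighbour layout assumed
 * here (4 left pels at 0..3, top-left at 4, top and top-right pels at
 * 5..12, matching the offsets read above; the load touches 16 bytes, so
 * the buffer should hold at least that) and the mode/lambda values are
 * assumptions for illustration only. */
#if 0
static void example_evaluate_i4x4(UWORD8 *pu1_blk, UWORD32 src_strd,
                                  UWORD8 *pu1_ngbr_pels, UWORD32 u4_lambda)
{
    UWORD8 au1_pred[4 * 4];
    UWORD32 u4_mode;
    WORD32 i4_min_cost;             /* written unconditionally by the evaluator */

    ih264e_evaluate_intra_4x4_modes_ssse3(
        pu1_blk, pu1_ngbr_pels, au1_pred, src_strd, 4,
        LEFT_MB_AVAILABLE_MASK | TOP_MB_AVAILABLE_MASK,
        &u4_mode, &i4_min_cost,
        511,                        /* all nine 4x4 modes valid */
        u4_lambda, DC_I4x4 /* most probable mode, assumed */);
}
#endif
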
/**
******************************************************************************
*
* @brief
*  Evaluates the best intra chroma mode (among VERT, HORZ and DC) and does
*  the prediction.
*
* @par Description
*  This function evaluates the first three intra chroma modes, computes the
*  corresponding SADs and returns the buffer predicted with the best mode.
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[in] pu1_ngbr_pels
*  UWORD8 pointer to neighbouring pels
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] u4_n_avblty
*  availability of neighbouring pixels
*
* @param[out] u4_intra_mode
*  pointer to the variable in which the best mode is returned
*
* @param[in,out] pu4_sadmin
*  pointer to the variable in which the minimum SAD is returned
*
* @param[in] u4_valid_intra_modes
*  bitmask indicating which modes are valid
*
* @return
*  None
*
******************************************************************************
*/

void ih264e_evaluate_intra_chroma_modes_ssse3(UWORD8 *pu1_src,
                                              UWORD8 *pu1_ngbr_pels,
                                              UWORD8 *pu1_dst,
                                              UWORD32 src_strd,
                                              UWORD32 dst_strd,
                                              WORD32 u4_n_avblty,
                                              UWORD32 *u4_intra_mode,
                                              WORD32 *pu4_sadmin,
                                              UWORD32 u4_valid_intra_modes)
{
    WORD32 left, top;
    WORD32 sad_vert = INT_MAX, sad_horz = INT_MAX, sad_dc = INT_MAX, min_sad;

    __m128i src1_16x8b, src2_16x8b, src3_16x8b, src4_16x8b;
    __m128i src5_16x8b, src6_16x8b, src7_16x8b, src8_16x8b;

    __m128i top_16x8b, left_16x8b;
    __m128i pred1_16x8b, pred2_16x8b;
    __m128i tmp1_8x16b, tmp2_8x16b, sad_8x16b;

    left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
    top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;

    // Loading the source
    {
        src1_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src2_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src3_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src4_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src5_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src6_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src7_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src8_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
    }

    if(left)
    {
        left_16x8b = _mm_loadu_si128((__m128i *)pu1_ngbr_pels);

        if(u4_valid_intra_modes & 02) // If HORZ mode is valid
        {
            __m128i left_tmp_16x8b, left_sh_16x8b;
            __m128i const_14_15_16x8b;

            const_14_15_16x8b = _mm_set1_epi16(0x0f0e);
            left_sh_16x8b = _mm_slli_si128(left_16x8b, 2);

            pred1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b);    //row 1
            pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 2
            tmp1_8x16b = _mm_sad_epu8(src1_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src2_16x8b, pred2_16x8b);

            left_tmp_16x8b = _mm_slli_si128(left_16x8b, 4);
            left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
            sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b);

            pred1_16x8b = _mm_shuffle_epi8(left_tmp_16x8b, const_14_15_16x8b); //row 3
            pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b);  //row 4
            tmp1_8x16b = _mm_sad_epu8(src3_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src4_16x8b, pred2_16x8b);

            left_tmp_16x8b = _mm_slli_si128(left_tmp_16x8b, 4);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            pred1_16x8b = _mm_shuffle_epi8(left_tmp_16x8b, const_14_15_16x8b); //row 5
            pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b);  //row 6
            tmp1_8x16b = _mm_sad_epu8(src5_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src6_16x8b, pred2_16x8b);

            left_tmp_16x8b = _mm_slli_si128(left_tmp_16x8b, 4);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            pred1_16x8b = _mm_shuffle_epi8(left_tmp_16x8b, const_14_15_16x8b); //row 7
            pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b);  //row 8
            tmp1_8x16b = _mm_sad_epu8(src7_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src8_16x8b, pred2_16x8b);

            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            sad_horz = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
        }
    }

    if(top)
    {
        UWORD8 *pu1_top;

        pu1_top = pu1_ngbr_pels + 2 * BLK8x8SIZE + 2;
        top_16x8b = _mm_loadu_si128((__m128i *)pu1_top);

        if(u4_valid_intra_modes & 04) // If VERT mode is valid
        {
            tmp1_8x16b = _mm_sad_epu8(src1_16x8b, top_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src2_16x8b, top_16x8b);
            sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src3_16x8b, top_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src4_16x8b, top_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src5_16x8b, top_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src6_16x8b, top_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src7_16x8b, top_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src8_16x8b, top_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            sad_vert = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
        }
    }

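    /* Chroma DC prediction: each 8x8 component is treated as four 4x4
     * quadrants with their own DC value (the block below works on the
     * interleaved UV data). The top-left quadrant averages left and top
     * neighbours, the top-right uses only top, the bottom-left only left,
     * and the bottom-right averages both, matching the H.264 rule. */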
    if(u4_valid_intra_modes & 01) // If DC mode is valid
    {
        if(left && top)
        {
            WORD32 left_up_u, left_down_u, left_up_v, left_down_v;
            WORD32 top_left_u, top_right_u, top_left_v, top_right_v;
            WORD32 dc_1u, dc_1v, dc_2u, dc_2v;

            __m128i val_sh_16x8b;
            __m128i intrlv_mask_8x16b, zero_vector;

            intrlv_mask_8x16b = _mm_set1_epi16(0x00ff);
            zero_vector = _mm_setzero_si128();

            val_sh_16x8b = _mm_srli_si128(left_16x8b, 1);

            tmp1_8x16b = _mm_and_si128(intrlv_mask_8x16b, left_16x8b);
            tmp2_8x16b = _mm_and_si128(intrlv_mask_8x16b, val_sh_16x8b);
            tmp1_8x16b = _mm_sad_epu8(zero_vector, tmp1_8x16b);
            tmp2_8x16b = _mm_sad_epu8(zero_vector, tmp2_8x16b);

            left_up_u = _mm_extract_epi16(tmp1_8x16b, 4);
            left_up_v = _mm_extract_epi16(tmp2_8x16b, 4);
            left_down_u = _mm_extract_epi16(tmp1_8x16b, 0);
            left_down_v = _mm_extract_epi16(tmp2_8x16b, 0);

            val_sh_16x8b = _mm_srli_si128(top_16x8b, 1);

            tmp1_8x16b = _mm_and_si128(intrlv_mask_8x16b, top_16x8b);
            tmp2_8x16b = _mm_and_si128(intrlv_mask_8x16b, val_sh_16x8b);
            tmp1_8x16b = _mm_sad_epu8(zero_vector, tmp1_8x16b);
            tmp2_8x16b = _mm_sad_epu8(zero_vector, tmp2_8x16b);

            top_left_u = _mm_extract_epi16(tmp1_8x16b, 0);
            top_left_v = _mm_extract_epi16(tmp2_8x16b, 0);
            top_right_u = _mm_extract_epi16(tmp1_8x16b, 4);
            top_right_v = _mm_extract_epi16(tmp2_8x16b, 4);

            // First four rows
            dc_1u = (left_up_u + top_left_u + 4) >> 3;
            dc_1v = (left_up_v + top_left_v + 4) >> 3;
            dc_2u = (top_right_u + 2) >> 2;
            dc_2v = (top_right_v + 2) >> 2;

            pred1_16x8b = _mm_setr_epi8(dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v,
                                        dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v);

            tmp1_8x16b = _mm_sad_epu8(src1_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src2_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src3_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src4_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            // Second four rows
            dc_1u = (left_down_u + 2) >> 2;
            dc_1v = (left_down_v + 2) >> 2;
            dc_2u = (left_down_u + top_right_u + 4) >> 3;
            dc_2v = (left_down_v + top_right_v + 4) >> 3;

            pred2_16x8b = _mm_setr_epi8(dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v,
                                        dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v);

            tmp1_8x16b = _mm_sad_epu8(src5_16x8b, pred2_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src6_16x8b, pred2_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src7_16x8b, pred2_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src8_16x8b, pred2_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            sad_dc = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
        }
        else if(left)
        {
            WORD32 left_up_u, left_down_u, left_up_v, left_down_v;
            WORD32 dc_u, dc_v;

            __m128i left_sh_16x8b;
            __m128i intrlv_mask_8x16b, zero_vector;

            intrlv_mask_8x16b = _mm_set1_epi16(0x00ff);
            zero_vector = _mm_setzero_si128();

            left_sh_16x8b = _mm_srli_si128(left_16x8b, 1);

            tmp1_8x16b = _mm_and_si128(intrlv_mask_8x16b, left_16x8b);
            tmp2_8x16b = _mm_and_si128(intrlv_mask_8x16b, left_sh_16x8b);
            tmp1_8x16b = _mm_sad_epu8(zero_vector, tmp1_8x16b);
            tmp2_8x16b = _mm_sad_epu8(zero_vector, tmp2_8x16b);

            left_up_u = _mm_extract_epi16(tmp1_8x16b, 4);
            left_up_v = _mm_extract_epi16(tmp2_8x16b, 4);
            left_down_u = _mm_extract_epi16(tmp1_8x16b, 0);
            left_down_v = _mm_extract_epi16(tmp2_8x16b, 0);

            // First four rows
            dc_u = (left_up_u + 2) >> 2;
            dc_v = (left_up_v + 2) >> 2;

            pred1_16x8b = _mm_set1_epi16(dc_u | (dc_v << 8));

            tmp1_8x16b = _mm_sad_epu8(src1_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src2_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src3_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src4_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            // Second four rows
            dc_u = (left_down_u + 2) >> 2;
            dc_v = (left_down_v + 2) >> 2;

            pred2_16x8b = _mm_set1_epi16(dc_u | (dc_v << 8));

            tmp1_8x16b = _mm_sad_epu8(src5_16x8b, pred2_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src6_16x8b, pred2_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src7_16x8b, pred2_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src8_16x8b, pred2_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            sad_dc = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
        }
        else if(top)
        {
            WORD32 top_left_u, top_right_u, top_left_v, top_right_v;
            WORD32 dc_1u, dc_1v, dc_2u, dc_2v;

            __m128i top_sh_16x8b;
            __m128i intrlv_mask_8x16b, zero_vector;

            intrlv_mask_8x16b = _mm_set1_epi16(0x00ff);
            zero_vector = _mm_setzero_si128();

            top_sh_16x8b = _mm_srli_si128(top_16x8b, 1);

            tmp1_8x16b = _mm_and_si128(intrlv_mask_8x16b, top_16x8b);
            tmp2_8x16b = _mm_and_si128(intrlv_mask_8x16b, top_sh_16x8b);
            tmp1_8x16b = _mm_sad_epu8(zero_vector, tmp1_8x16b);
            tmp2_8x16b = _mm_sad_epu8(zero_vector, tmp2_8x16b);

            top_left_u = _mm_extract_epi16(tmp1_8x16b, 0);
            top_left_v = _mm_extract_epi16(tmp2_8x16b, 0);
            top_right_u = _mm_extract_epi16(tmp1_8x16b, 4);
            top_right_v = _mm_extract_epi16(tmp2_8x16b, 4);

            dc_1u = (top_left_u + 2) >> 2;
            dc_1v = (top_left_v + 2) >> 2;
            dc_2u = (top_right_u + 2) >> 2;
            dc_2v = (top_right_v + 2) >> 2;

            pred1_16x8b = _mm_setr_epi8(dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v, dc_1u, dc_1v,
                                        dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v, dc_2u, dc_2v);

            tmp1_8x16b = _mm_sad_epu8(src1_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src2_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src3_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src4_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src5_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src6_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src7_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src8_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            sad_dc = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
        }
        else
        {
            pred1_16x8b = _mm_set1_epi8(128);

            tmp1_8x16b = _mm_sad_epu8(src1_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src2_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(tmp1_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src3_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src4_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src5_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src6_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            tmp1_8x16b = _mm_sad_epu8(src7_16x8b, pred1_16x8b);
            tmp2_8x16b = _mm_sad_epu8(src8_16x8b, pred1_16x8b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp1_8x16b);
            sad_8x16b = _mm_add_epi16(sad_8x16b, tmp2_8x16b);

            sad_dc = _mm_extract_epi16(sad_8x16b, 0) + _mm_extract_epi16(sad_8x16b, 4);
        }
    }

    min_sad = MIN3(sad_horz, sad_vert, sad_dc);

    /* Finding the minimum SAD and doing the corresponding prediction */
    if(min_sad < *pu4_sadmin)
    {
        *pu4_sadmin = min_sad;

        if(min_sad == sad_dc)
        {
            *u4_intra_mode = DC_CH_I8x8;

            if(!left)
                pred2_16x8b = pred1_16x8b;

            _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
            pu1_dst += dst_strd;

            _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);
        }
        else if(min_sad == sad_horz)
        {
            __m128i left_sh_16x8b, const_14_15_16x8b;

            *u4_intra_mode = HORZ_CH_I8x8;

            const_14_15_16x8b = _mm_set1_epi16(0x0f0e);

            left_sh_16x8b = _mm_slli_si128(left_16x8b, 2);
            pred1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b);    //row 1
            pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 2

            _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);

            left_16x8b = _mm_slli_si128(left_16x8b, 4);
            left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
            pred1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b);    //row 3
            pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 4

            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);

            left_16x8b = _mm_slli_si128(left_16x8b, 4);
            left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
            pred1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b);    //row 5
            pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 6

            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);

            left_16x8b = _mm_slli_si128(left_16x8b, 4);
            left_sh_16x8b = _mm_slli_si128(left_sh_16x8b, 4);
            pred1_16x8b = _mm_shuffle_epi8(left_16x8b, const_14_15_16x8b);    //row 7
            pred2_16x8b = _mm_shuffle_epi8(left_sh_16x8b, const_14_15_16x8b); //row 8

            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred1_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, pred2_16x8b);
        }
        else
        {
            *u4_intra_mode = VERT_CH_I8x8;

            _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
            pu1_dst += dst_strd;
            _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
        }
    }
}

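/* Illustrative sketch (not part of the library, compiled out): a minimal,
 * hypothetical caller of the chroma evaluator on an interleaved UV 8x8
 * block. The neighbour layout (16 interleaved left pels first, then the
 * 16 interleaved top pels at offset 2 * BLK8x8SIZE + 2, as read above) and
 * the buffer sizes are assumptions for illustration only. */
#if 0
static void example_evaluate_chroma(UWORD8 *pu1_uv, UWORD32 src_strd,
                                    UWORD8 *pu1_ngbr_pels)
{
    UWORD8 au1_pred[16 * 8];        /* interleaved UV prediction, stride 16 */
    UWORD32 u4_mode;
    WORD32 i4_min_sad = INT_MAX;    /* only improved-upon SADs are written back */

    ih264e_evaluate_intra_chroma_modes_ssse3(
        pu1_uv, pu1_ngbr_pels, au1_pred, src_strd, 16,
        LEFT_MB_AVAILABLE_MASK | TOP_MB_AVAILABLE_MASK,
        &u4_mode, &i4_min_sad,
        7 /* DC | HORZ | VERT valid */);
}
#endif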