/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_intra_pred_filters_x86_intr.c
*
* @brief
*  Contains function definitions for intra prediction interpolation filters
*
*
* @author
* Ittiam
*
* @par List of Functions:
*  - ihevc_intra_pred_ref_filtering_sse42()
*  - ihevc_intra_pred_luma_dc_sse42()
*  - ihevc_intra_pred_luma_horz_sse42()
*  - ihevc_intra_pred_luma_ver_sse42()
*  - ihevc_intra_pred_luma_mode_3_to_9_sse42()
*  - ihevc_intra_pred_luma_mode_11_to_17_sse42()
*  - ihevc_intra_pred_luma_mode_19_to_25_sse42()
*  - ihevc_intra_pred_luma_mode_27_to_33_sse42()
*
* @remarks
*  None
*
*******************************************************************************
*/


/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include <stdlib.h>

#include "ihevc_typedefs.h"
#include "ihevc_intra_pred.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_platform_macros.h"
#include "ihevc_common_tables.h"
#include "ihevc_defs.h"
#include "ihevc_tables_x86_intr.h"

#include <immintrin.h>

/****************************************************************************/
/* Constant Macros                                                          */
/****************************************************************************/
#define MAX_CU_SIZE 64
#define BIT_DEPTH 8
#define T32_4NT 128   /* 4 * nt for nt = 32 */
#define T16_4NT 64    /* 4 * nt for nt = 16 */


/****************************************************************************/
/* Function Macros                                                          */
/****************************************************************************/
/* Extract bit x of y as 0 or 1 */
#define GET_BITS(y,x) (((y) >> (x)) & 1)

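/* Illustration (not in the original): GET_BITS(0x14, 2) == 1 and
 * GET_BITS(0x14, 3) == 0, since 0x14 is binary 10100.
 */
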
/* tables to shuffle 8-bit values */

/*****************************************************************************/
/* global tables Definition                                                  */
/*****************************************************************************/



/*****************************************************************************/
/* Function Definition                                                      */
/*****************************************************************************/

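/* Reference sample layout assumed throughout (inferred from the indexing in
 * the functions below, not stated in the original): pu1_ref[0] is the
 * bottom-left sample, pu1_ref[nt .. 2*nt - 1] walks up the left column,
 * pu1_ref[2*nt] is the top-left corner, and pu1_ref[2*nt + 1 .. 4*nt] is the
 * top row extending to the top-right.
 */
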
/**
*******************************************************************************
*
* @brief
*    Intra prediction interpolation filter for ref_filtering
*
*
* @par Description:
*    Reference DC filtering for neighboring samples dependent on TU size and
*    mode. Refer to section 8.4.4.2.3 in the standard
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode
*
* @param[in] strong_intra_smoothing_enable_flag
*  flag that enables strong intra smoothing for 32x32 blocks
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/


void ihevc_intra_pred_ref_filtering_sse42(UWORD8 *pu1_src,
                                          WORD32 nt,
                                          UWORD8 *pu1_dst,
                                          WORD32 mode,
                                          WORD32 strong_intra_smoothing_enable_flag)
{
    WORD32 filter_flag;
    WORD32 i; /* Generic indexing variable */
    WORD32 four_nt = 4 * nt;
    UWORD8 au1_flt[(4 * MAX_CU_SIZE) + 1];
    WORD32 bi_linear_int_flag = 0;
    WORD32 abs_cond_left_flag = 0;
    WORD32 abs_cond_top_flag = 0;
    WORD32 dc_val = 1 << (BIT_DEPTH - 5);
    __m128i src_temp1, src_temp2, src_temp3, src_temp7;
    __m128i src_temp4, src_temp5, src_temp6, src_temp8;

    //WORD32 strong_intra_smoothing_enable_flag  = 1;



    filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));
    if(0 == filter_flag)
    {
        if(pu1_src == pu1_dst)
        {
            return;
        }
        else
        {
            if(nt == 4)
            {
                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
                pu1_dst[four_nt] = pu1_src[four_nt];

            }

            else if(nt == 8)
            {

                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16));

                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);


                pu1_dst[four_nt] = pu1_src[four_nt];
            }
            else if(nt == 16)
            {

                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16));
                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_src + 32));
                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 48));

                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);

                pu1_dst[four_nt] = pu1_src[four_nt];
            }
            else if(nt == 32)
            {

                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16));
                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_src + 32));
                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 48));

                src_temp5 = _mm_loadu_si128((__m128i *)(pu1_src + 64));
                src_temp6 = _mm_loadu_si128((__m128i *)(pu1_src + 80));
                src_temp7 = _mm_loadu_si128((__m128i *)(pu1_src + 96));
                src_temp8 = _mm_loadu_si128((__m128i *)(pu1_src + 112));

                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);

                _mm_storeu_si128((__m128i *)(pu1_dst + 64), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + 80), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + 96), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + 112), src_temp8);

                pu1_dst[four_nt] = pu1_src[four_nt];
            }

        }
    }

    else
    {
        /* If strong intra smoothing is enabled and transform size is 32 */
        if((1 == strong_intra_smoothing_enable_flag) && (32 == nt))
        {
            /* Strong Intra Filtering */
            abs_cond_top_flag = (abs(pu1_src[2 * nt] + pu1_src[4 * nt]
                                     - (2 * pu1_src[3 * nt]))) < dc_val;
            abs_cond_left_flag = (abs(pu1_src[2 * nt] + pu1_src[0]
                                      - (2 * pu1_src[nt]))) < dc_val;

            bi_linear_int_flag = ((1 == abs_cond_left_flag)
                            && (1 == abs_cond_top_flag));
        }
        /* Extremities untouched */
        au1_flt[0] = pu1_src[0];
        au1_flt[4 * nt] = pu1_src[4 * nt];

        /* Strong (bilinear) filtering of reference samples */
        if(1 == bi_linear_int_flag)
        {
            au1_flt[2 * nt] = pu1_src[2 * nt];

            for(i = 1; i < (2 * nt); i++)
                au1_flt[i] = (((2 * nt) - i) * pu1_src[0] + i * pu1_src[2 * nt] + 32) >> 6;

            for(i = 1; i < (2 * nt); i++)
                au1_flt[i + (2 * nt)] = (((2 * nt) - i) * pu1_src[2 * nt] + i * pu1_src[4 * nt] + 32) >> 6;
        }
        else
        {
            __m128i const_value_8x16;

            const_value_8x16 = _mm_set1_epi16(2);

            au1_flt[0] = pu1_src[0];
            au1_flt[4 * nt] = pu1_src[4 * nt];

            /* Perform [1 2 1] three-tap filtering of reference samples */
            for(i = 0; i < (four_nt); i += 16)
            {
                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src + i));
                src_temp2 = _mm_srli_si128(src_temp1, 1);
                src_temp3 = _mm_srli_si128(src_temp2, 1);

                src_temp1 =  _mm_cvtepu8_epi16(src_temp1);
                src_temp2 =  _mm_cvtepu8_epi16(src_temp2);
                src_temp3 =  _mm_cvtepu8_epi16(src_temp3);

                src_temp2 = _mm_slli_epi16(src_temp2,  1);

                src_temp1 = _mm_add_epi16(src_temp1, src_temp2);
                src_temp1 = _mm_add_epi16(src_temp1, src_temp3);
                src_temp1 = _mm_add_epi16(src_temp1, const_value_8x16);

                src_temp1 = _mm_srai_epi16(src_temp1,  2);

                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 8 + i));
                src_temp5 = _mm_srli_si128(src_temp4, 1);
                src_temp6 = _mm_srli_si128(src_temp5, 1);

                src_temp4 =  _mm_cvtepu8_epi16(src_temp4);
                src_temp5 =  _mm_cvtepu8_epi16(src_temp5);
                src_temp6 =  _mm_cvtepu8_epi16(src_temp6);

                src_temp5 = _mm_slli_epi16(src_temp5,  1);

                src_temp4 = _mm_add_epi16(src_temp4, src_temp5);
                src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
                src_temp4 = _mm_add_epi16(src_temp4, const_value_8x16);

                src_temp4 = _mm_srai_epi16(src_temp4,  2);

                /* converting 16 bit to 8 bit */
                src_temp1 = _mm_packus_epi16(src_temp1, src_temp4);

                _mm_storeu_si128((__m128i *)(au1_flt + 1 + i), src_temp1);
            }
            au1_flt[4 * nt] = pu1_src[4 * nt];
        }

        if(nt == 4)
        {
            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
            pu1_dst[four_nt] = au1_flt[four_nt];
        }
        else if(nt == 8)
        {

            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
            src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16));

            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);

            pu1_dst[four_nt] = au1_flt[four_nt];
        }
        else if(nt == 16)
        {

            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
            src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16));
            src_temp3 = _mm_loadu_si128((__m128i *)(au1_flt + 32));
            src_temp4 = _mm_loadu_si128((__m128i *)(au1_flt + 48));

            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
            _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);

            pu1_dst[four_nt] = au1_flt[four_nt];
        }

        else if(nt == 32)
        {

            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
            src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16));
            src_temp3 = _mm_loadu_si128((__m128i *)(au1_flt + 32));
            src_temp4 = _mm_loadu_si128((__m128i *)(au1_flt + 48));

            src_temp5 = _mm_loadu_si128((__m128i *)(au1_flt + 64));
            src_temp6 = _mm_loadu_si128((__m128i *)(au1_flt + 80));
            src_temp7 = _mm_loadu_si128((__m128i *)(au1_flt + 96));
            src_temp8 = _mm_loadu_si128((__m128i *)(au1_flt + 112));

            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
            _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);

            _mm_storeu_si128((__m128i *)(pu1_dst + 64), src_temp5);
            _mm_storeu_si128((__m128i *)(pu1_dst + 80), src_temp6);
            _mm_storeu_si128((__m128i *)(pu1_dst + 96), src_temp7);
            _mm_storeu_si128((__m128i *)(pu1_dst + 112), src_temp8);

            pu1_dst[four_nt] = au1_flt[four_nt];
        }

    }
}

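/* For reference, a minimal scalar sketch (not part of the original file; the
 * function name is hypothetical) of the [1 2 1]/4 smoothing that the SIMD
 * loop above implements per section 8.4.4.2.3, with the two extremities left
 * untouched.
 */
static void ihevc_ref_filtering_scalar_sketch(const UWORD8 *pu1_src,
                                              UWORD8 *pu1_dst,
                                              WORD32 nt)
{
    WORD32 i;

    /* extremities are copied unfiltered */
    pu1_dst[0] = pu1_src[0];
    pu1_dst[4 * nt] = pu1_src[4 * nt];

    /* interior samples get the rounded [1 2 1]/4 filter */
    for(i = 1; i < (4 * nt); i++)
        pu1_dst[i] = (pu1_src[i - 1] + 2 * pu1_src[i] + pu1_src[i + 1] + 2) >> 2;
}
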


/**
*******************************************************************************
*
* @brief
*    Intra prediction interpolation filter for luma dc
*
* @par Description:
*   Intraprediction for DC mode with reference neighboring samples location
*   pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'. Refer
*   to section 8.4.4.2.5 in the standard
*
* @param[in] pu1_ref
*  UWORD8 pointer to the source
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_dc_sse42(UWORD8 *pu1_ref,
                                    WORD32 src_strd,
                                    UWORD8 *pu1_dst,
                                    WORD32 dst_strd,
                                    WORD32 nt,
                                    WORD32 mode)
{

    WORD32 acc_dc;
    WORD32 dc_val, two_dc_val, three_dc_val;
    WORD32 row;
    WORD32 log2nt = 5;
    WORD32 two_nt, three_nt;
    __m128i src_temp1, src_temp7, src_temp3, src_temp4, src_temp5, src_temp6;
    __m128i src_temp8, src_temp9, src_temp10, src_temp2;
    __m128i m_zero = _mm_set1_epi32(0);
    __m128i sm = _mm_loadu_si128((__m128i *)&IHEVCE_SHUFFLEMASK5[0]);
    UNUSED(src_strd);
    UNUSED(mode);


    switch(nt)
    {
        case 32:
            log2nt = 5;
            break;
        case 16:
            log2nt = 4;
            break;
        case 8:
            log2nt = 3;
            break;
        case 4:
            log2nt = 2;
            break;
        default:
            break;
    }
    two_nt = 2 * nt;
    three_nt = 3 * nt;

    acc_dc = 0;
    /* Calculate DC value for the transform block */



    if(nt == 32)
    {
        __m128i temp;
        WORD32 itr_count;

        src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt));
        src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 16));
        src_temp7 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 32));
        src_temp8 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 48));

        src_temp3 = _mm_sad_epu8(src_temp3, m_zero);
        src_temp4 = _mm_sad_epu8(src_temp4, m_zero);
        src_temp7 = _mm_sad_epu8(src_temp7, m_zero);
        src_temp8 = _mm_sad_epu8(src_temp8, m_zero);

        src_temp4 = _mm_add_epi16(src_temp3, src_temp4);
        src_temp8 = _mm_add_epi16(src_temp7, src_temp8);
        src_temp4 = _mm_add_epi16(src_temp4, src_temp8);

        src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

        acc_dc = _mm_cvtsi128_si32(src_temp4);

        acc_dc += pu1_ref[three_nt];
        acc_dc -= pu1_ref[two_nt];

        /* compute the DC value */
        dc_val = (acc_dc + nt) >> (log2nt + 1);

        two_dc_val = 2 * dc_val;
        three_dc_val = 3 * dc_val;

        temp = _mm_set1_epi8(dc_val);

        for(itr_count = 0; itr_count < 2; itr_count++)
        {
            /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
            _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), temp);

            _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), temp);

            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((0) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((1) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((2) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((3) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((4) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((5) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((6) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((7) * dst_strd)), temp);

            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((8) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((9) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((10) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((11) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((12) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((13) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((14) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((15) * dst_strd)), temp);

            pu1_dst += 16 * dst_strd;
        }
    }

    else
    {
        __m128i  zero_8x16b;
        __m128i sm1 = _mm_loadu_si128((__m128i *)&IHEVCE_SHUFFLEMASK4[0]);

        /* DC filtering for the first top row and first left column */

        zero_8x16b = _mm_set1_epi16(0);

        if(nt == 4) /* nt == 4 */
        {
            WORD32 temp1, temp2, temp3;

            src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt));
            src_temp2 =  _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));

            src_temp4 =  _mm_cvtepu8_epi16(src_temp3);
            src_temp2 =  _mm_cvtepu8_epi16(src_temp2);

            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

            acc_dc = _mm_cvtsi128_si32(src_temp4);
            acc_dc += pu1_ref[three_nt];
            acc_dc -= pu1_ref[two_nt];

            /* compute the DC value */
            dc_val = (acc_dc + nt) >> (log2nt + 1);

            three_dc_val = 3 * dc_val;

            /* broadcast (three_dc_val + 2) as 16-bit lanes */
            src_temp1 = _mm_set1_epi16(three_dc_val + 2);
            two_dc_val = 2 * dc_val;

            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */
            src_temp2 = _mm_add_epi16(src_temp2, src_temp1);

            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2) >> 2 */
            src_temp2 = _mm_srli_epi16(src_temp2, 2);

            src_temp2 = _mm_packus_epi16(src_temp2, zero_8x16b);

            temp1 = _mm_cvtsi128_si32(src_temp2);

            *(WORD32 *)(&pu1_dst[0]) = temp1;

            /* restore the first value */
            pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
                            >> 2);

            for(row = 1; row < nt; row++)
                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
                                >> 2;

            src_temp2 = _mm_insert_epi8(src_temp2, dc_val, 0);

            src_temp2 =  _mm_shuffle_epi8(src_temp2, sm1);
            src_temp3 =  _mm_shuffle_epi8(src_temp2, sm1);
            src_temp4 =  _mm_shuffle_epi8(src_temp2, sm1);

            src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[(1 * dst_strd) + 0], 0);
            src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[(2 * dst_strd) + 0], 0);
            src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[(3 * dst_strd) + 0], 0);

            temp1 = _mm_cvtsi128_si32(src_temp2);
            temp2 = _mm_cvtsi128_si32(src_temp3);
            temp3 = _mm_cvtsi128_si32(src_temp4);

            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp1;
            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp2;
            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp3;

        }
        else if(nt == 8) /* nt == 8 */
        {

            src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt));

            src_temp4 = _mm_sad_epu8(src_temp3, m_zero);
            src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

            acc_dc = _mm_cvtsi128_si32(src_temp4);

            acc_dc += pu1_ref[three_nt];
            acc_dc -= pu1_ref[two_nt];

            /* compute the DC value */
            dc_val = (acc_dc + nt) >> (log2nt + 1);

            three_dc_val = 3 * dc_val;
            src_temp1 = _mm_set1_epi16(three_dc_val + 2);
            two_dc_val = 2 * dc_val;

            /* loading 16 8-bit pixels */
            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
            src_temp2 =  _mm_cvtepu8_epi16(src_temp2);

            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */
            src_temp2 = _mm_add_epi16(src_temp2, src_temp1);

            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2)>>2 */
            src_temp2 = _mm_srli_epi16(src_temp2, 2);
            src_temp2 = _mm_packus_epi16(src_temp2, zero_8x16b);

            _mm_storel_epi64((__m128i *)(pu1_dst), src_temp2);

            /* restore the first value */
            pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
                            >> 2);

            for(row = 1; row < nt; row++)
                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
                                >> 2;

            /* Fill the remaining rows with the DC value */

            src_temp1 = _mm_set1_epi8(dc_val);
            src_temp2 = _mm_set1_epi8(dc_val);
            src_temp3 = _mm_set1_epi8(dc_val);
            src_temp4 = _mm_set1_epi8(dc_val);
            src_temp5 = _mm_set1_epi8(dc_val);
            src_temp6 = _mm_set1_epi8(dc_val);
            src_temp7 = _mm_set1_epi8(dc_val);

            src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((1) * dst_strd)], 0);
            src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((2) * dst_strd)], 0);
            src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((3) * dst_strd)], 0);
            src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((4) * dst_strd)], 0);
            src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[((5) * dst_strd)], 0);
            src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((6) * dst_strd)], 0);
            src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((7) * dst_strd)], 0);

            _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);

        }
        else if(nt == 16) /* nt == 16 */
        {

            src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt));
            src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 16));

            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
            src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 8));

            src_temp3 = _mm_sad_epu8(src_temp3, m_zero);
            src_temp4 = _mm_sad_epu8(src_temp4, m_zero);

            src_temp2 =  _mm_cvtepu8_epi16(src_temp2);
            src_temp10 =  _mm_cvtepu8_epi16(src_temp10);

            src_temp4 = _mm_add_epi16(src_temp3, src_temp4);
            src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

            acc_dc = _mm_cvtsi128_si32(src_temp4);

            acc_dc += pu1_ref[three_nt];
            acc_dc -= pu1_ref[two_nt];

            /* compute the DC value */
            dc_val = (acc_dc + nt) >> (log2nt + 1);

            three_dc_val = 3 * dc_val;
            src_temp1 = _mm_set1_epi16(three_dc_val + 2);
            two_dc_val = 2 * dc_val;

            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */
            src_temp2 = _mm_add_epi16(src_temp2, src_temp1);
            src_temp10 = _mm_add_epi16(src_temp10, src_temp1);
            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2)>>2 */
            src_temp2 = _mm_srli_epi16(src_temp2, 2);
            src_temp10 = _mm_srli_epi16(src_temp10, 2);

            src_temp2 = _mm_packus_epi16(src_temp2, src_temp10);

            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp2);

            /* restore the first value */
            pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
                            >> 2);

            for(row = 1; row < nt; row++)
                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
                                >> 2;
            /* Fill the remaining rows with the DC value in a single pass;
               lane 0 of each row keeps the filtered left-column pixel
               written above */
            src_temp1 =  _mm_set1_epi8(dc_val);
            src_temp2 =  _mm_set1_epi8(dc_val);
            src_temp3 =  _mm_set1_epi8(dc_val);
            src_temp4 =  _mm_set1_epi8(dc_val);
            src_temp5 =  _mm_set1_epi8(dc_val);
            src_temp6 =  _mm_set1_epi8(dc_val);
            src_temp7 =  _mm_set1_epi8(dc_val);

            src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((1) * dst_strd)], 0);
            src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((2) * dst_strd)], 0);
            src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((3) * dst_strd)], 0);
            src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((4) * dst_strd)], 0);
            src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[((5) * dst_strd)], 0);
            src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((6) * dst_strd)], 0);
            src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((7) * dst_strd)], 0);

            _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);

            src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((8) * dst_strd)], 0);
            src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((9) * dst_strd)], 0);
            src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((10) * dst_strd)], 0);
            src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((11) * dst_strd)], 0);
            src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[((12) * dst_strd)], 0);
            src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((13) * dst_strd)], 0);
            src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((14) * dst_strd)], 0);

            _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), src_temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), src_temp3);

            src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((15) * dst_strd)], 0);

            _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), src_temp4);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), src_temp5);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), src_temp6);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), src_temp7);

            _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), src_temp1);

        }
        else if(nt == 32) /* nt == 32 */
        {

            __m128i src_temp11;

            src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt));
            src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 16));
            src_temp7 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 32));
            src_temp8 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 48));

            /* loading the 32 top-row 8-bit pixels */
            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
            src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 8));
            src_temp9 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 16));
            src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 24));

            src_temp3 = _mm_sad_epu8(src_temp3, m_zero);
            src_temp4 = _mm_sad_epu8(src_temp4, m_zero);
            src_temp7 = _mm_sad_epu8(src_temp7, m_zero);
            src_temp8 = _mm_sad_epu8(src_temp8, m_zero);

            src_temp2 =  _mm_cvtepu8_epi16(src_temp2);
            src_temp6 =  _mm_cvtepu8_epi16(src_temp6);
            src_temp9 =  _mm_cvtepu8_epi16(src_temp9);
            src_temp10 =  _mm_cvtepu8_epi16(src_temp10);

            src_temp4 = _mm_add_epi16(src_temp3, src_temp4);
            src_temp8 = _mm_add_epi16(src_temp7, src_temp8);
            src_temp4 = _mm_add_epi16(src_temp4, src_temp8);

            src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

            acc_dc = _mm_cvtsi128_si32(src_temp4);

            acc_dc += pu1_ref[three_nt];
            acc_dc -= pu1_ref[two_nt];

            /* compute the DC value */
            dc_val = (acc_dc + nt) >> (log2nt + 1);

            three_dc_val = 3 * dc_val;
            src_temp1 = _mm_set1_epi16(three_dc_val + 2);
            two_dc_val = 2 * dc_val;

            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */
            src_temp2 = _mm_add_epi16(src_temp2, src_temp1);
            src_temp6 = _mm_add_epi16(src_temp6, src_temp1);
            src_temp9 = _mm_add_epi16(src_temp9, src_temp1);
            src_temp10 = _mm_add_epi16(src_temp10, src_temp1);

            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2)>>2 */
            src_temp2 = _mm_srli_epi16(src_temp2, 2);
            src_temp6 = _mm_srli_epi16(src_temp6, 2);
            src_temp9 = _mm_srli_epi16(src_temp9, 2);
            src_temp10 = _mm_srli_epi16(src_temp10, 2);

            src_temp2 = _mm_packus_epi16(src_temp2, src_temp6);
            src_temp10 = _mm_packus_epi16(src_temp9, src_temp10);

            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp10);

            /* restore the first value */
            pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
                            >> 2);

            for(row = 1; row < nt; row++)
                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
                                >> 2;
            /* Fill the remaining rows with the DC value; lane 0 of each
               row keeps the filtered left-column pixel written above */
            src_temp11 = _mm_set1_epi8(dc_val);

            for(row = 1; row < nt; row++)
            {
                src_temp1 = _mm_insert_epi8(src_temp11, pu1_dst[row * dst_strd], 0);

                _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd)), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd) + 16), src_temp11);
            }

        }
    }
}

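/* For reference, a minimal scalar sketch (not part of the original file; the
 * function name is hypothetical) of the DC prediction the SIMD paths above
 * implement: dc_val is the rounded mean of the left column and top row, and
 * for nt < 32 the first row and column are additionally filtered per
 * section 8.4.4.2.5.
 */
static void ihevc_luma_dc_scalar_sketch(const UWORD8 *pu1_ref,
                                        UWORD8 *pu1_dst,
                                        WORD32 dst_strd,
                                        WORD32 nt,
                                        WORD32 log2nt)
{
    WORD32 row, col, acc_dc = 0, dc_val;
    WORD32 two_nt = 2 * nt;

    /* left column: pu1_ref[nt .. 2nt-1], top row: pu1_ref[2nt+1 .. 3nt] */
    for(col = nt; col < two_nt; col++)
        acc_dc += pu1_ref[col];
    for(col = two_nt + 1; col <= 3 * nt; col++)
        acc_dc += pu1_ref[col];

    dc_val = (acc_dc + nt) >> (log2nt + 1);

    for(row = 0; row < nt; row++)
        for(col = 0; col < nt; col++)
            pu1_dst[(row * dst_strd) + col] = (UWORD8)dc_val;

    /* first row/column filtering, skipped for the 32x32 case */
    if(nt < 32)
    {
        pu1_dst[0] = (pu1_ref[two_nt - 1] + 2 * dc_val + pu1_ref[two_nt + 1] + 2) >> 2;

        for(col = 1; col < nt; col++)
            pu1_dst[col] = (pu1_ref[two_nt + 1 + col] + 3 * dc_val + 2) >> 2;

        for(row = 1; row < nt; row++)
            pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + 3 * dc_val + 2) >> 2;
    }
}
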
/**
*******************************************************************************
*
* @brief
*     Intra prediction interpolation filter for luma horz mode
*
* @par Description:
*      Horizontal intraprediction (mode 10) with reference samples location
*      pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'.
*      Refer to section 8.4.4.2.6 in the standard (Special case)
*
* @param[in] pu1_ref
*  UWORD8 pointer to the source
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_luma_horz_sse42(UWORD8 *pu1_ref,
                                      WORD32 src_strd,
                                      UWORD8 *pu1_dst,
                                      WORD32 dst_strd,
                                      WORD32 nt,
                                      WORD32 mode)
{

    WORD32 row;
    WORD32 two_nt;
    UNUSED(src_strd);
    UNUSED(mode);

    two_nt = 2 * nt;


    if(nt == 32)
    {
        __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8;
        __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
        __m128i sm = _mm_loadu_si128((__m128i *)&IHEVCE_SHUFFLEMASK4[0]);

        for(row = 0; row < nt; row += 16)
        {
            {
                src_temp1 =  _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 1 - row - 15));

                src_temp2 =  _mm_srli_si128(src_temp1, 1);
                src_temp3 =  _mm_srli_si128(src_temp1, 2);
                src_temp4 =  _mm_srli_si128(src_temp1, 3);
                src_temp5 =  _mm_srli_si128(src_temp1, 4);
                src_temp6 =  _mm_srli_si128(src_temp1, 5);
                src_temp7 =  _mm_srli_si128(src_temp1, 6);
                src_temp8 =  _mm_srli_si128(src_temp1, 7);

                src_temp9 =  _mm_srli_si128(src_temp1, 8);
                src_temp10 =  _mm_srli_si128(src_temp1, 9);
                src_temp11 =  _mm_srli_si128(src_temp1, 10);
                src_temp12 =  _mm_srli_si128(src_temp1, 11);
                src_temp13 =  _mm_srli_si128(src_temp1, 12);
                src_temp14 =  _mm_srli_si128(src_temp1, 13);
                src_temp15 =  _mm_srli_si128(src_temp1, 14);
                src_temp16 =  _mm_srli_si128(src_temp1, 15);

                src_temp8 =  _mm_shuffle_epi8(src_temp8, sm);
                src_temp7 =  _mm_shuffle_epi8(src_temp7, sm);
                src_temp6 =  _mm_shuffle_epi8(src_temp6, sm);
                src_temp5 =  _mm_shuffle_epi8(src_temp5, sm);
                src_temp4 =  _mm_shuffle_epi8(src_temp4, sm);
                src_temp3 =  _mm_shuffle_epi8(src_temp3, sm);
                src_temp2 =  _mm_shuffle_epi8(src_temp2, sm);
                src_temp1 =  _mm_shuffle_epi8(src_temp1, sm);

                src_temp16 =  _mm_shuffle_epi8(src_temp16, sm);
                src_temp15 =  _mm_shuffle_epi8(src_temp15, sm);
                src_temp14 =  _mm_shuffle_epi8(src_temp14, sm);
                src_temp13 =  _mm_shuffle_epi8(src_temp13, sm);
                src_temp12 =  _mm_shuffle_epi8(src_temp12, sm);
                src_temp11 =  _mm_shuffle_epi8(src_temp11, sm);
                src_temp10 =  _mm_shuffle_epi8(src_temp10, sm);
                src_temp9 =  _mm_shuffle_epi8(src_temp9, sm);

                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp16);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp15);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp14);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp13);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp12);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp11);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp10);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp9);

                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 8) * dst_strd)), src_temp8);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 9) * dst_strd)), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 10) * dst_strd)), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 11) * dst_strd)), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 12) * dst_strd)), src_temp4);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 13) * dst_strd)), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 14) * dst_strd)), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 15) * dst_strd)), src_temp1);

                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 0) * dst_strd)), src_temp16);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 1) * dst_strd)), src_temp15);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 2) * dst_strd)), src_temp14);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 3) * dst_strd)), src_temp13);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 4) * dst_strd)), src_temp12);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 5) * dst_strd)), src_temp11);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 6) * dst_strd)), src_temp10);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 7) * dst_strd)), src_temp9);

                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 8) * dst_strd)), src_temp8);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 9) * dst_strd)), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 10) * dst_strd)), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 11) * dst_strd)), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 12) * dst_strd)), src_temp4);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 13) * dst_strd)), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 14) * dst_strd)), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 15) * dst_strd)), src_temp1);

            }

        }

    }
    else
    {
        __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6;
        __m128i src_temp10, zero_8x16b, src_temp7;

        /* Gradient filtering for the first row; the remaining rows
           replicate the left-column reference samples */

        zero_8x16b = _mm_set1_epi16(0);

        /* Filtering done for the 1st row */

        src_temp2 =  _mm_set1_epi16(pu1_ref[two_nt - 1]);
        src_temp10 =  _mm_set1_epi16(pu1_ref[two_nt]);

        /* loading 16 8-bit pixels */
        src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));

        src_temp4 =  _mm_cvtepu8_epi16(src_temp4);

        /*(pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt])*/
        src_temp3 = _mm_sub_epi16(src_temp4, src_temp10);

        /* ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1)*/
        src_temp3 = _mm_srai_epi16(src_temp3, 1);

        /* pu1_ref[two_nt - 1]+((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1)*/
        src_temp3 = _mm_add_epi16(src_temp2, src_temp3);

        if(nt == 4)
        {
            WORD32 temp1, temp2, temp3;
            src_temp3 = _mm_packus_epi16(src_temp3, zero_8x16b);
            temp1 = _mm_cvtsi128_si32(src_temp3);

            *(WORD32 *)(&pu1_dst[0]) = temp1;

            src_temp2 =  _mm_set1_epi8(pu1_ref[two_nt - 2]);
            src_temp3 =  _mm_set1_epi8(pu1_ref[two_nt - 3]);
            src_temp4 =  _mm_set1_epi8(pu1_ref[two_nt - 4]);

            temp1 = _mm_cvtsi128_si32(src_temp2);
            temp2 = _mm_cvtsi128_si32(src_temp3);
            temp3 = _mm_cvtsi128_si32(src_temp4);

            /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp1;
            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp2;
            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp3;

        }
        else if(nt == 8)
        {
            src_temp10 = _mm_packus_epi16(src_temp3, zero_8x16b);


            src_temp1 =  _mm_set1_epi8(pu1_ref[two_nt - 2]);
            src_temp2 =  _mm_set1_epi8(pu1_ref[two_nt - 3]);
            src_temp3 =  _mm_set1_epi8(pu1_ref[two_nt - 4]);
            src_temp4 =  _mm_set1_epi8(pu1_ref[two_nt - 5]);
            src_temp5 =  _mm_set1_epi8(pu1_ref[two_nt - 6]);
            src_temp6 =  _mm_set1_epi8(pu1_ref[two_nt - 7]);
            src_temp7 =  _mm_set1_epi8(pu1_ref[two_nt - 8]);

            _mm_storel_epi64((__m128i *)(pu1_dst), src_temp10);

            /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
            _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
            _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp2);
            _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp3);
            _mm_storel_epi64((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp4);
            _mm_storel_epi64((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp5);
            _mm_storel_epi64((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp6);
            _mm_storel_epi64((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp7);

        }
        else if(nt == 16)
        {
            src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 8));
            src_temp4 =  _mm_cvtepu8_epi16(src_temp4);

            src_temp10 = _mm_sub_epi16(src_temp4, src_temp10);
            src_temp10 = _mm_srai_epi16(src_temp10, 1);
            src_temp10 = _mm_add_epi16(src_temp2, src_temp10);

            src_temp3 = _mm_packus_epi16(src_temp3, src_temp10);
            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp3);

            /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
            src_temp1 =  _mm_set1_epi8(pu1_ref[two_nt - 2]);
            src_temp2 =  _mm_set1_epi8(pu1_ref[two_nt - 3]);
            src_temp3 =  _mm_set1_epi8(pu1_ref[two_nt - 4]);
            src_temp4 =  _mm_set1_epi8(pu1_ref[two_nt - 5]);
            src_temp5 =  _mm_set1_epi8(pu1_ref[two_nt - 6]);
            src_temp6 =  _mm_set1_epi8(pu1_ref[two_nt - 7]);
            src_temp7 =  _mm_set1_epi8(pu1_ref[two_nt - 8]);
            src_temp10 =  _mm_set1_epi8(pu1_ref[two_nt - 9]);

            _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), src_temp10);

            src_temp1 =  _mm_set1_epi8(pu1_ref[two_nt - 10]);
            src_temp2 =  _mm_set1_epi8(pu1_ref[two_nt - 11]);
            src_temp3 =  _mm_set1_epi8(pu1_ref[two_nt - 12]);
            src_temp4 =  _mm_set1_epi8(pu1_ref[two_nt - 13]);
            src_temp5 =  _mm_set1_epi8(pu1_ref[two_nt - 14]);
            src_temp6 =  _mm_set1_epi8(pu1_ref[two_nt - 15]);
            src_temp7 =  _mm_set1_epi8(pu1_ref[two_nt - 16]);

            _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), src_temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), src_temp3);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), src_temp4);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), src_temp5);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), src_temp6);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), src_temp7);

        }
    }
}

   1159 /**
   1160 *******************************************************************************
   1161 *
   1162 * @brief
   1163 *     Intra prediction interpolation filter for vertical luma mode.
   1164 *
   1165 * @par Description:
   1166 *    Vertical intra prediction with reference neighboring samples located
   1167 *    at 'pu1_ref', copied to the TU block located at 'pu1_dst'. Refer
   1168 *    to section 8.4.4.2.6 in the standard (special case)
   1169 *
   1170 * @param[in] pu1_ref
   1171 *  UWORD8 pointer to the source reference array
   1172 *
   1173 * @param[out] pu1_dst
   1174 *  UWORD8 pointer to the destination
   1175 *
   1176 * @param[in] src_strd
   1177 *  integer source stride
   1178 *
   1179 * @param[in] dst_strd
   1180 *  integer destination stride
   1181 *
   1182 * @param[in] nt
   1183 *  integer Transform Block size
   1184 *
   1185 * @param[in] mode
   1186 *  integer intraprediction mode
   1187 *
   1188 * @returns
   1189 *
   1190 * @remarks
   1191 *  None
   1192 *
   1193 *******************************************************************************
   1194 */
   1195 
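        /* A minimal scalar sketch (illustration only, compiled out) of the
         * mode-26 prediction the SSE4.2 kernel below vectorizes: every row
         * copies the top reference row, and for nt < 32 the first column is
         * additionally gradient-filtered from the left reference samples.
         * The helper name is hypothetical and not part of the library. */
        #if 0
        static void ihevc_intra_pred_luma_ver_sketch(UWORD8 *pu1_ref,
                                                     UWORD8 *pu1_dst,
                                                     WORD32 dst_strd,
                                                     WORD32 nt)
        {
            WORD32 row, col;
            WORD32 two_nt = 2 * nt;

            /* Replicate the top reference row into every destination row */
            for(row = 0; row < nt; row++)
                for(col = 0; col < nt; col++)
                    pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt + 1 + col];

            if(nt < 32) /* first-column filtering is skipped for nt == 32 */
                for(row = 0; row < nt; row++)
                    pu1_dst[row * dst_strd] = CLIP_U8(pu1_ref[two_nt + 1]
                                    + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1));
        }
        #endif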
   1196 
   1197 void ihevc_intra_pred_luma_ver_sse42(UWORD8 *pu1_ref,
   1198                                      WORD32 src_strd,
   1199                                      UWORD8 *pu1_dst,
   1200                                      WORD32 dst_strd,
   1201                                      WORD32 nt,
   1202                                      WORD32 mode)
   1203 {
   1204     WORD32 row;
   1205     WORD16 s2_predpixel;
   1206     WORD32 two_nt = 2 * nt;
   1207     __m128i src_temp0, src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7;
   1208 
   1209     UNUSED(src_strd);
   1210     UNUSED(mode);
   1211 
   1212     if(nt == 32)
   1213     {
   1214         __m128i temp1, temp2;
   1215         WORD32 itr_count;
   1216 
   1217         temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
   1218         temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 16));
   1219 
   1220         for(itr_count = 0; itr_count < 2; itr_count++)
   1221         {
   1222             /*  pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt + 1 + col]; */
   1223             _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), temp1);
   1224             _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), temp1);
   1225             _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), temp1);
   1226             _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), temp1);
   1227             _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), temp1);
   1228             _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), temp1);
   1229             _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), temp1);
   1230             _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), temp1);
   1231 
   1232             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((0) * dst_strd)), temp2);
   1233             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((1) * dst_strd)), temp2);
   1234             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((2) * dst_strd)), temp2);
   1235             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((3) * dst_strd)), temp2);
   1236             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((4) * dst_strd)), temp2);
   1237             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((5) * dst_strd)), temp2);
   1238             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((6) * dst_strd)), temp2);
   1239             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((7) * dst_strd)), temp2);
   1240 
   1241             _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), temp1);
   1242             _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), temp1);
   1243             _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), temp1);
   1244             _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), temp1);
   1245             _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), temp1);
   1246             _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), temp1);
   1247             _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), temp1);
   1248             _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), temp1);
   1249 
   1250             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((8) * dst_strd)), temp2);
   1251             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((9) * dst_strd)), temp2);
   1252             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((10) * dst_strd)), temp2);
   1253             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((11) * dst_strd)), temp2);
   1254             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((12) * dst_strd)), temp2);
   1255             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((13) * dst_strd)), temp2);
   1256             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((14) * dst_strd)), temp2);
   1257             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((15) * dst_strd)), temp2);
   1258 
   1259             pu1_dst += 16 * dst_strd;
   1260         }
   1261     }
   1262 
   1263     else
   1265     {
   1266         /*Filtering done for the 1st column */
   1267         for(row = nt - 1; row >= 0; row--)
   1268         {
   1269             s2_predpixel = pu1_ref[two_nt + 1]
   1270                             + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1);
   1271             pu1_dst[row * dst_strd] = CLIP_U8(s2_predpixel);
   1272         }
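                /* e.g. with pu1_ref[two_nt] = 100, pu1_ref[two_nt + 1] = 110 and a
                 * left sample of 90 (hypothetical values), the filtered pixel is
                 * 110 + ((90 - 100) >> 1) = 105 */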
   1273 
   1274         /* Replication to next columns*/
   1275 
   1276         if(nt == 4)
   1277         {
   1278             WORD32 temp1, temp2, temp3, temp4;
   1279 
   1280             src_temp2 =   _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
   1281             src_temp3 =  src_temp2;
   1282             src_temp4 =  src_temp2;
   1283             src_temp5 =  src_temp2;
   1284 
   1285             src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[(0 * dst_strd)], 0);
   1286             src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[(1 * dst_strd)], 0);
   1287             src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[(2 * dst_strd)], 0);
   1288             src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[(3 * dst_strd)], 0);
   1289 
   1290             temp1 = _mm_cvtsi128_si32(src_temp2);
   1291             temp2 = _mm_cvtsi128_si32(src_temp3);
   1292             temp3 = _mm_cvtsi128_si32(src_temp4);
   1293             temp4 = _mm_cvtsi128_si32(src_temp5);
   1294 
   1295             /* storing 4 8-bit pixel values per row */
   1296             *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp1;
   1297             *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp2;
   1298             *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp3;
   1299             *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp4;
   1300 
   1301         }
   1302         else if(nt == 8)
   1303         {
   1304 
   1305             src_temp0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
   1306             src_temp1 = src_temp0;
   1307             src_temp2 = src_temp0;
   1308             src_temp3 = src_temp0;
   1309             src_temp4 = src_temp0;
   1310             src_temp5 = src_temp0;
   1311             src_temp6 = src_temp0;
   1312             src_temp7 = src_temp0;
   1313 
   1314             src_temp0 = _mm_insert_epi8(src_temp0, pu1_dst[((0) * dst_strd)], 0);
   1315             src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((1) * dst_strd)], 0);
   1316             src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((2) * dst_strd)], 0);
   1317             src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((3) * dst_strd)], 0);
   1318             src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((4) * dst_strd)], 0);
   1319             src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[((5) * dst_strd)], 0);
   1320             src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((6) * dst_strd)], 0);
   1321             src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((7) * dst_strd)], 0);
   1322 
   1323             _mm_storel_epi64((__m128i *)(pu1_dst + ((0) * dst_strd)), src_temp0);
   1324             _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
   1325             _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
   1326             _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
   1327             _mm_storel_epi64((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
   1328             _mm_storel_epi64((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
   1329             _mm_storel_epi64((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
   1330             _mm_storel_epi64((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);
   1331 
   1332 
   1333         }
   1334         else if(nt == 16)
   1335         {
   1336             for(row = 0; row < nt; row += 8)
   1337             {
   1338 
   1339                 src_temp0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
   1340                 src_temp1 = src_temp0;
   1341                 src_temp2 = src_temp0;
   1342                 src_temp3 = src_temp0;
   1343                 src_temp4 = src_temp0;
   1344                 src_temp5 = src_temp0;
   1345                 src_temp6 = src_temp0;
   1346                 src_temp7 = src_temp0;
   1347 
   1348                 src_temp0 = _mm_insert_epi8(src_temp0, pu1_dst[((row + 0) * dst_strd)], 0);
   1349                 src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((row + 1) * dst_strd)], 0);
   1350                 src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((row + 2) * dst_strd)], 0);
   1351                 src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((row + 3) * dst_strd)], 0);
   1352                 src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((row + 4) * dst_strd)], 0);
   1353                 src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[((row + 5) * dst_strd)], 0);
   1354                 src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((row + 6) * dst_strd)], 0);
   1355                 src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((row + 7) * dst_strd)], 0);
   1356 
   1357                 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp0);
   1358                 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp1);
   1359                 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp2);
   1360                 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp3);
   1361                 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp4);
   1362                 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp5);
   1363                 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp6);
   1364                 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp7);
   1365 
   1366             }
   1367 
   1368         }
   1369 
   1370 
   1371     }
   1372 }
   1373 
   1374 
   1375 /**
   1376 *******************************************************************************
   1377 *
   1378 * @brief
   1379 *    Intra prediction interpolation filter for luma mode 3 to mode 9
   1380 *
   1381 * @par Description:
   1382 *    Intra prediction for modes 3 to 9 (positive angle, horizontal modes) with
   1383 *    reference neighboring samples located at 'pu1_ref', written to the TU
   1384 *    block located at 'pu1_dst'
   1385 *
   1386 * @param[in] pu1_ref
   1387 *  UWORD8 pointer to the source reference array
   1388 *
   1389 * @param[out] pu1_dst
   1390 *  UWORD8 pointer to the destination
   1391 *
   1392 * @param[in] src_strd
   1393 *  integer source stride
   1394 *
   1395 * @param[in] dst_strd
   1396 *  integer destination stride
   1397 *
   1398 * @param[in] nt
   1399 *  integer Transform Block size
   1400 *
   1401 * @param[in] mode
   1402 *  integer intraprediction mode
   1403 *
   1404 * @returns
   1405 *
   1406 * @remarks
   1407 *  None
   1408 *
   1409 *******************************************************************************
   1410 */
   1411 
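        /* A minimal scalar sketch (illustration only, compiled out) of the angular
         * prediction this kernel vectorizes, assumed to mirror the scalar library
         * routine: each column projects onto the left reference column and is
         * two-tap interpolated with (32 - fract)/fract weights. The helper name
         * is hypothetical and not part of the library. */
        #if 0
        static void ihevc_intra_pred_luma_mode_3_to_9_sketch(UWORD8 *pu1_ref,
                                                             UWORD8 *pu1_dst,
                                                             WORD32 dst_strd,
                                                             WORD32 nt,
                                                             WORD32 mode)
        {
            WORD32 row, col, pos, idx, fract, ref_main_idx;
            WORD32 two_nt = 2 * nt;
            WORD32 intra_pred_ang = gai4_ihevc_ang_table[mode];

            for(col = 0; col < nt; col++)
            {
                pos = ((col + 1) * intra_pred_ang);
                idx = pos >> 5;
                fract = pos & (31);

                /* Two-tap filtering between neighboring left reference samples */
                for(row = 0; row < nt; row++)
                {
                    ref_main_idx = two_nt - row - idx - 1;
                    pu1_dst[col + (row * dst_strd)] =
                        (UWORD8)(((32 - fract) * pu1_ref[ref_main_idx]
                                  + fract * pu1_ref[ref_main_idx - 1] + 16) >> 5);
                }
            }
        }
        #endif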
   1412 
   1413 void ihevc_intra_pred_luma_mode_3_to_9_sse42(UWORD8 *pu1_ref,
   1414                                              WORD32 src_strd,
   1415                                              UWORD8 *pu1_dst,
   1416                                              WORD32 dst_strd,
   1417                                              WORD32 nt,
   1418                                              WORD32 mode)
   1419 {
   1420     WORD32 row, col;
   1421     WORD32 two_nt = 2 * nt;
   1422     WORD32 intra_pred_ang;
   1423 
   1424 
   1425     __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
   1426     __m128i fract_4x32b, intra_pred_ang_4x32b;
   1427     __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b, sm3;
   1428     UNUSED(src_strd);
   1429 
   1430 
   1431     /* Intra Pred Angle according to the mode */
   1432     intra_pred_ang = gai4_ihevc_ang_table[mode];
   1433 
   1434     /* For angles other than 45 degrees, interpolate between 2 neighboring */
   1435     /* samples, weighted by distance, to obtain each destination sample */
   1439 
   1440     const_temp_4x32b  = _mm_set1_epi16(16);
   1441     const_temp2_4x32b = _mm_set1_epi32(31);
   1442     const_temp3_4x32b = _mm_set1_epi32(32);
   1443     const_temp4_4x32b = _mm_set1_epi32(4);
   1444 
   1445     two_nt_4x32b = _mm_set1_epi32(two_nt - nt);
   1446 
   1447 
   1448     sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
   1449 
   1450     /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
   1451     intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
   1452 
   1453     row_4x32b = _mm_set_epi32(4, 3, 2, 1);
   1454 
   1455     if(nt == 4)
   1456     {
   1457 
   1458         WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
   1459         WORD32 temp11, temp21, temp31, temp41;
   1460         // WORD8  ai1_fract_temp_val[16], ai1_row_temp_val[16];
   1461 
   1462         __m128i fract1_8x16b, fract2_8x16b;
   1463         __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
   1464 
   1465         __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
   1466         __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b; //, src_temp8_8x16b;
   1467         __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2;
   1468 
   1469         /* pos = ((row + 1) * intra_pred_ang); */
   1470         res_temp5_4x32b  = _mm_mullo_epi32(row_4x32b, intra_pred_ang_4x32b);
   1471 
   1472         /* fract = pos & (31); */
   1473         fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   1474 
   1475         /* idx = pos >> 5; */
   1476         ref_main_idx_4x32b = _mm_sub_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b,  5));
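                /* e.g. mode 3 (intra_pred_ang = 26 per the HEVC angle table): first
                 * element: pos = 26, idx = 0, fract = 26; fourth element: pos = 104,
                 * idx = 3, fract = 8 */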
   1477 
   1478         /*(32 - fract) */
   1479         row_4x32b = _mm_sub_epi32(const_temp3_4x32b, fract_4x32b);
   1480 
   1481         fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
   1482         fract2_8x16b = _mm_slli_epi16(row_4x32b, 8);
   1483 
   1484         fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
   1485         row_4x32b = _mm_or_si128(row_4x32b, fract2_8x16b); /*(32 - fract) */
   1486 
   1487         fract2_8x16b = _mm_unpackhi_epi8(row_4x32b, fract_4x32b);
   1488         fract1_8x16b = _mm_unpacklo_epi8(row_4x32b, fract_4x32b);
   1489 
   1490         temp1_8x16b =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
   1491         temp2_8x16b =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
   1492         temp3_8x16b =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
   1493         temp4_8x16b =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
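                /* In fract1/fract2 the (32 - fract, fract) byte pairs occupy the even
                 * 32-bit lanes (the odd lanes are zero padding), so _mm_shuffle_epi32
                 * with 0x00 broadcasts lane 0 and 0xaa broadcasts lane 2, replicating
                 * one column's weight pair across the register for _mm_maddubs_epi16 */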
   1494 
   1495         ref_main_temp0 = _mm_srli_si128(ref_main_idx_4x32b, 4);  /* next 32 bit values */
   1496         ref_main_temp1 = _mm_srli_si128(ref_main_idx_4x32b, 8);  /* next 32 bit values */
   1497         ref_main_temp2 = _mm_srli_si128(ref_main_idx_4x32b, 12); /* next 32 bit values */
   1498         ref_main_idx1  = _mm_cvtsi128_si32(ref_main_idx_4x32b);    /* col=0*/
   1499         ref_main_idx2  = _mm_cvtsi128_si32(ref_main_temp0);  /* col=1*/
   1500         ref_main_idx3  = _mm_cvtsi128_si32(ref_main_temp1);  /* col=2*/
   1501         ref_main_idx4  = _mm_cvtsi128_si32(ref_main_temp2);  /* col=3*/
   1502 
   1503         /* loading 16 8-bit pixels */
   1504         src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1 - 1)); /* col=0*/
   1505         src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2 - 1)); /* col=1*/
   1506         src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3 - 1)); /* col=2*/
   1507         src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4 - 1)); /* col=3*/
   1508 
   1509         src_temp1_8x16b =  _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/
   1510         src_temp2_8x16b =  _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/
   1511         src_temp3_8x16b =  _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/
   1512         src_temp4_8x16b =  _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/
   1513 
   1514         /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   1515         src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
   1516         src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
   1517         src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
   1518         src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
   1519 
   1520         /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   1521         src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
   1522         src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
   1523         src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
   1524         src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
   1525 
   1526         /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   1527         src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
   1528         src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
   1529         src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
   1530         src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
   1531 
   1532         /* converting 16 bit to 8 bit */
   1533         src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
   1534         src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
   1535 
   1536 
   1537         src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
   1538         src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
   1539 
   1540         src_temp3_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
   1541         src_temp2_8x16b = _mm_srli_si128(src_temp3_8x16b, 4);
   1542         src_temp1_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
   1543         src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 12);
   1544 
   1545         temp11 = _mm_cvtsi128_si32(src_temp7_8x16b);
   1546         temp21 = _mm_cvtsi128_si32(src_temp1_8x16b);
   1547         temp31 = _mm_cvtsi128_si32(src_temp2_8x16b);
   1548         temp41 = _mm_cvtsi128_si32(src_temp3_8x16b);
   1549 
   1550         /* storing 4 8-bit pixel values per row */
   1551         *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
   1552         *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
   1553         *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
   1554         *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;
   1555 
   1556     }
   1557 
   1558     else if(nt == 16 || nt == 32)
   1559     {
   1560         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   1561         row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
   1562         const_temp2_4x32b = _mm_set1_epi16(31);
   1563         const_temp4_4x32b = _mm_set1_epi16(8);
   1564         const_temp3_4x32b = _mm_set1_epi16(32);
   1565         two_nt_4x32b = _mm_set1_epi16(two_nt);
   1566 
   1567         for(col = 0; col < nt; col += 8)
   1568         {
   1569             WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
   1570             WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
   1571             //WORD8  ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
   1572 
   1573             __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
   1574 
   1575             __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
   1576             __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
   1577 
   1578             /* pos = ((row + 1) * intra_pred_ang); */
   1579             res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   1580 
   1581             /* fract = pos & (31); */
   1582             fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   1583 
   1584             /*(32 - fract) */
   1585             fract2_8x16b =  _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
   1586 
   1587             fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
   1588             fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
   1589 
   1590             fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
   1591             fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
   1592 
   1593 
   1594             fract8_8x16b = _mm_unpackhi_epi8(fract2_8x16b, fract_4x32b);
   1595             fract_4x32b = _mm_unpacklo_epi8(fract2_8x16b, fract_4x32b);
   1596 
   1597             temp1_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x00);
   1598             temp2_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x55);
   1599             temp3_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xaa);
   1600             temp4_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xff);
   1601 
   1602             temp11_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x00);
   1603             temp12_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x55);
   1604             temp13_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xaa);
   1605             temp14_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xff);
   1606 
   1607             /* idx = pos >> 5; */
   1608             ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
   1609 
   1610             row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
   1611 
   1612             pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
   1613             pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
   1614             pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
   1615             pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
   1616 
   1617             pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=4*/
   1618             pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=5*/
   1619             pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=6*/
   1620             pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=7*/
   1621 
   1622             for(row = 0; row < nt; row += 8)
   1623             {
   1624                 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
   1625                 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
   1626 
   1627 
   1628                 __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
   1629                 __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
   1630 
   1631                 /* loading 16 8-bit pixels */
   1632                 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 1 - (8 + row))); /* col=0*/
   1633                 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 1 - (8 + row))); /* col=1*/
   1634                 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 1 - (8 + row))); /* col=2*/
   1635                 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 1 - (8 + row))); /* col=3*/
   1636 
   1637                 /* loading 16 8-bit pixels */
   1638                 src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - 1 - (8 + row))); /* col=4*/
   1639                 src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - 1 - (8 + row))); /* col=5*/
   1640                 src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - 1 - (8 + row))); /* col=6*/
   1641                 src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - 1 - (8 + row))); /* col=7*/
   1642 
   1643                 src_temp1_8x16b =  _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/
   1644                 src_temp2_8x16b =  _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/
   1645                 src_temp3_8x16b =  _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/
   1646                 src_temp4_8x16b =  _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/
   1647 
   1648                 src_temp11_8x16b =  _mm_shuffle_epi8(src_temp11_8x16b, sm3); /* col=4*/
   1649                 src_temp12_8x16b =  _mm_shuffle_epi8(src_temp12_8x16b, sm3); /* col=5*/
   1650                 src_temp13_8x16b =  _mm_shuffle_epi8(src_temp13_8x16b, sm3); /* col=6*/
   1651                 src_temp14_8x16b =  _mm_shuffle_epi8(src_temp14_8x16b, sm3); /* col=7*/
   1652 
   1653                 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   1654                 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
   1655                 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
   1656                 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
   1657                 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
   1658 
   1659                 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   1660                 src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
   1661                 src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
   1662                 src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
   1663                 src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
   1664 
   1665                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   1666                 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
   1667                 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
   1668                 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
   1669                 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
   1670 
   1671                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   1672                 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
   1673                 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
   1674                 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
   1675                 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
   1676 
   1677                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   1678                 src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
   1679                 src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
   1680                 src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
   1681                 src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
   1682 
   1683                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   1684                 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=4*/
   1685                 src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=5*/
   1686                 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=6*/
   1687                 src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=7*/
   1688 
   1689                 /* converting 16 bit to 8 bit */
   1690                 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
   1691                 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
   1692 
   1693                 /* converting 16 bit to 8 bit */
   1694                 src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=4*/
   1695                 src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=5*/
   1696 
   1697                 src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
   1698                 src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
   1699 
   1700                 src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
   1701                 src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
   1702 
   1703                 src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
   1704                 src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
   1705 
   1706                 src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
   1707                 src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
   1708 
   1709                 src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
   1710                 src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
   1711 
   1712                 src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
   1713                 src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
   1714 
   1715                 src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
   1716                 src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
   1717 
   1718                 src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
   1719                 src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
   1720 
   1721                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp1_8x16b);          /* row=7*/
   1722 
   1723                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp5_8x16b);       /* row=6*/
   1724 
   1725                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp2_8x16b);       /* row=5*/
   1726 
   1727                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp6_8x16b);       /* row=4*/
   1728 
   1729                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp3_8x16b);       /* row=3*/
   1730 
   1731                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp7_8x16b);       /* row=2*/
   1732 
   1733                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp4_8x16b);       /* row=1*/
   1734 
   1735                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 0))), src_temp8_8x16b);       /* row=0*/
   1736 
   1737             }
   1738         }
   1739     }
   1740     else
   1741     {
   1742         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   1743         row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
   1744         const_temp2_4x32b = _mm_set1_epi16(31);
   1745         const_temp4_4x32b = _mm_set1_epi16(8);
   1746         const_temp3_4x32b = _mm_set1_epi16(32);
   1747         two_nt_4x32b = _mm_set1_epi16(two_nt - nt);
   1748         {
   1749             WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
   1750             WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
   1751 
   1752             __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
   1753 
   1754             __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
   1755             __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
   1756 
   1757             /* pos = ((row + 1) * intra_pred_ang); */
   1758             res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   1759 
   1760             /* fract = pos & (31); */
   1761             fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   1762 
   1763             /* idx = pos >> 5; */
   1764             ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
   1765 
   1766             /*(32 - fract) */
   1767             fract2_8x16b =  _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
   1768 
   1769             fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
   1770             fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
   1771 
   1772             fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
   1773             fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
   1774 
   1775 
   1776             fract8_8x16b = _mm_unpackhi_epi8(fract2_8x16b, fract_4x32b);
   1777             fract_4x32b = _mm_unpacklo_epi8(fract2_8x16b, fract_4x32b);
   1778 
   1779             temp1_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x00);
   1780             temp2_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x55);
   1781             temp3_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xaa);
   1782             temp4_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xff);
   1783 
   1784             temp11_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x00);
   1785             temp12_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x55);
   1786             temp13_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xaa);
   1787             temp14_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xff);
   1788 
   1789             pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
   1790             pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
   1791             pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
   1792             pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
   1793 
   1794             pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=4*/
   1795             pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=5*/
   1796             pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=6*/
   1797             pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=7*/
   1798 
   1799             {
   1800                 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
   1801                 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
   1802 
   1803                 __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
   1804                 __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
   1805 
   1806                 /* loading 16 8-bit pixels */
   1807                 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 1)); /* col=0*/
   1808                 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 1)); /* col=1*/
   1809                 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 1)); /* col=2*/
   1810                 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 1)); /* col=3*/
   1811 
   1812                 /* loading 16 8-bit pixels */
   1813                 src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - 1)); /* col=4*/
   1814                 src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - 1)); /* col=5*/
   1815                 src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - 1)); /* col=6*/
   1816                 src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - 1)); /* col=7*/
   1817 
   1818                 src_temp1_8x16b =  _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/
   1819                 src_temp2_8x16b =  _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/
   1820                 src_temp3_8x16b =  _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/
   1821                 src_temp4_8x16b =  _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/
   1822 
   1823                 src_temp11_8x16b =  _mm_shuffle_epi8(src_temp11_8x16b, sm3); /* col=4*/
   1824                 src_temp12_8x16b =  _mm_shuffle_epi8(src_temp12_8x16b, sm3); /* col=5*/
   1825                 src_temp13_8x16b =  _mm_shuffle_epi8(src_temp13_8x16b, sm3); /* col=6*/
   1826                 src_temp14_8x16b =  _mm_shuffle_epi8(src_temp14_8x16b, sm3); /* col=7*/
   1827 
   1828                 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   1829                 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
   1830                 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
   1831                 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
   1832                 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
   1833 
   1834                 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   1835                 src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
   1836                 src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
   1837                 src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
   1838                 src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
   1839 
   1840                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   1841                 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
   1842                 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
   1843                 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
   1844                 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
   1845 
   1846                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   1847                 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
   1848                 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
   1849                 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
   1850                 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
   1851 
   1852                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   1853                 src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
   1854                 src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
   1855                 src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
   1856                 src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
   1857 
   1858                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   1859                 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=4*/
   1860                 src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=5*/
   1861                 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=6*/
   1862                 src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=7*/
   1863 
   1864                 /* converting 16 bit to 8 bit */
   1865                 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
   1866                 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
   1867 
   1868                 /* converting 16 bit to 8 bit */
   1869                 src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=4*/
   1870                 src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=5*/
   1871 
   1872                 src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
   1873                 src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
   1874 
   1875                 src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
   1876                 src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
   1877 
   1878                 src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
   1879                 src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
   1880 
   1881                 src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
   1882                 src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
   1883 
   1884                 src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
   1885                 src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
   1886 
   1887                 src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
   1888                 src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
   1889 
   1890                 src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
   1891                 src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
   1892 
   1893                 src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
   1894                 src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
   1895 
   1896                 _mm_storel_epi64((__m128i *)(pu1_dst), src_temp8_8x16b);       /* row=0*/
   1897                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 1)), src_temp4_8x16b);       /* row=1*/
   1898                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 2)), src_temp7_8x16b);       /* row=2*/
   1899                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 3)), src_temp3_8x16b);       /* row=3*/
   1900                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 4)), src_temp6_8x16b);       /* row=4*/
   1901                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 5)), src_temp2_8x16b);       /* row=5*/
   1902 
   1903                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 6)), src_temp5_8x16b);       /* row=6*/
   1904                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 7)), src_temp1_8x16b);          /* row=7*/
   1905 
   1906             }
   1907         }
   1908     }
   1909 
   1910 }
   1911 
   1912 /**
   1913 *******************************************************************************
   1914 *
   1915 * @brief
   1916 *   Intra prediction interpolation filter for luma mode 11 to mode 17
   1917 *
   1918 * @par Description:
   1919 *    Intra prediction for modes 11 to 17 (negative angle, horizontal modes)
   1920 *    with reference neighboring samples located at 'pu1_ref', written to the
   1921 *    TU block located at 'pu1_dst'
   1922 *
   1923 * @param[in] pu1_ref
   1924 *  UWORD8 pointer to the source reference array
   1925 *
   1926 * @param[out] pu1_dst
   1927 *  UWORD8 pointer to the destination
   1928 *
   1929 * @param[in] src_strd
   1930 *  integer source stride
   1931 *
   1932 * @param[in] dst_strd
   1933 *  integer destination stride
   1934 *
   1935 * @param[in] nt
   1936 *  integer Transform Block size
   1937 *
   1938 * @param[in] mode
   1939 *  integer intraprediction mode
   1940 *
   1941 * @returns
   1942 *
   1943 * @remarks
   1944 *  None
   1945 *
   1946 *******************************************************************************
   1947 */
   1948 
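        /* A minimal scalar sketch (illustration only, compiled out) of the
         * negative-angle flow below, assumed to mirror the scalar library
         * routine: the left column is reversed into a temporary main array,
         * the above samples are projected onto its negative indices via the
         * inverse angle, and each pixel is the usual two-tap interpolation.
         * The helper name is hypothetical and not part of the library. */
        #if 0
        static void ihevc_intra_pred_luma_mode_11_to_17_sketch(UWORD8 *pu1_ref,
                                                               UWORD8 *pu1_dst,
                                                               WORD32 dst_strd,
                                                               WORD32 nt,
                                                               WORD32 mode)
        {
            UWORD8 ref_tmp[2 * MAX_CU_SIZE + 2];
            UWORD8 *ref_main = ref_tmp + nt; /* ref_main[0] is the top-left sample */
            WORD32 two_nt = 2 * nt;
            WORD32 intra_pred_ang = gai4_ihevc_ang_table[mode];
            WORD32 inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
            WORD32 inv_ang_sum = 128;
            WORD32 k, row, col, pos, idx, fract, ref_idx;

            /* For horizontal modes the main reference is the reversed left column */
            for(k = 0; k < nt + 1; k++)
                ref_main[k] = pu1_ref[two_nt - k];

            /* Project the above (side) samples onto the negative indices */
            ref_idx = (nt * intra_pred_ang) >> 5;
            for(k = -1; k > ref_idx; k--)
            {
                inv_ang_sum += inv_ang;
                ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
            }

            for(col = 0; col < nt; col++)
            {
                pos = ((col + 1) * intra_pred_ang);
                idx = pos >> 5;   /* negative: reaches the projected samples */
                fract = pos & (31);

                for(row = 0; row < nt; row++)
                    pu1_dst[col + (row * dst_strd)] =
                        (UWORD8)(((32 - fract) * ref_main[row + idx + 1]
                                  + fract * ref_main[row + idx + 2] + 16) >> 5);
            }
        }
        #endif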
   1949 
   1950 void ihevc_intra_pred_luma_mode_11_to_17_sse42(UWORD8 *pu1_ref,
   1951                                                WORD32 src_strd,
   1952                                                UWORD8 *pu1_dst,
   1953                                                WORD32 dst_strd,
   1954                                                WORD32 nt,
   1955                                                WORD32 mode)
   1956 {
   1957 
   1958     /* This function and ihevc_intra_pred_luma_mode_19_to_25 are the same */
   1959     /* except for the ref main & side samples assignment; they can be */
   1960     /* combined for optimization */
   1961 
   1962     WORD32 row, col, k;
   1963     WORD32 two_nt;
   1964     WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
   1965     WORD32 ref_idx;
   1966 
   1967     __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
   1968     __m128i fract_4x32b,  intra_pred_ang_4x32b;
   1969     __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b, sm3;
   1970 
   1971 
   1972     UWORD8 ref_tmp[2 * MAX_CU_SIZE + 2];
   1973     UWORD8 *ref_main;
   1974     UWORD8 *ref_temp;
   1975     UNUSED(src_strd);
   1976 
   1977     inv_ang_sum = 128;
   1978     two_nt    = 2 * nt;
   1979     ref_temp = ref_tmp + 1;
   1980     ref_main = ref_temp + nt - 1;
   1981     intra_pred_ang = gai4_ihevc_ang_table[mode];
   1982 
   1983     /* For angles other than 45 degrees, interpolate between 2 neighboring */
   1984     /* samples, weighted by distance, to obtain each destination sample */
   1985     const_temp_4x32b  = _mm_set1_epi16(16);
   1986     const_temp2_4x32b = _mm_set1_epi32(31);
   1987     const_temp3_4x32b = _mm_set1_epi32(32);
   1988     const_temp4_4x32b = _mm_set1_epi32(4);
   1989 
   1990     two_nt_4x32b = _mm_set1_epi32(1);
   1991 
   1992 
   1993     sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
   1994 
   1995     /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
   1996     intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
   1997 
   1998     row_4x32b = _mm_set_epi32(4, 3, 2, 1);
   1999 
   2000     if(nt == 4)
   2001     {
   2002 
   2003         WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
   2004         WORD32 temp11, temp21, temp31, temp41;
   2005 //        WORD8  ai1_fract_temp_val[16], ai1_row_temp_val[16];
   2006 
   2007         __m128i fract1_8x16b, fract2_8x16b;
   2008         __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
   2009 
   2010         __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
   2011         __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
   2012         __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2;
   2013 
   2014         /* Intermediate reference samples for negative angle modes */
   2015         /* This has to be removed during optimization */
   2016         /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
   2017         inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
   2018 
   2019         ref_main = ref_temp + nt - 1;
   2020         for(k = 0; k < nt + 1; k++)
   2021             ref_temp[k + nt - 1] = pu1_ref[two_nt - k];
   2022 
   2023         ref_main = ref_temp + nt - 1;
   2024         ref_idx = (nt * intra_pred_ang) >> 5;
   2025 
   2026         /* SIMD optimization can be done using a look-up table for the loop */
   2027         /* For negative angles, derive the main reference samples from the side */
   2028         /* reference samples; refer to section 8.4.4.2.6 */
   2029         for(k = -1; k > ref_idx; k--)
   2030         {
   2031             inv_ang_sum += inv_ang;
   2032             ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
   2033         }
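                /* Unrolled, this projection is
                 * ref_main[k] = pu1_ref[two_nt + ((128 + (-k) * inv_ang) >> 8)]
                 * for k = -1 down to ref_idx + 1 (section 8.4.4.2.6) */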
   2034 
   2035 
   2036         /* pos = ((row + 1) * intra_pred_ang); */
   2037         res_temp5_4x32b  = _mm_mullo_epi32(row_4x32b, intra_pred_ang_4x32b);
   2038 
   2039         /* fract = pos & (31); */
   2040         fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   2041 
   2042         /* idx = pos >> 5; */
   2043         ref_main_idx_4x32b = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b,  5));
   2044 
   2045         /*(32 - fract) */
   2046         row_4x32b = _mm_sub_epi32(const_temp3_4x32b, fract_4x32b);
   2047 
   2048         fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
   2049         fract2_8x16b = _mm_slli_epi16(row_4x32b, 8);
   2050 
   2051         fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
   2052         row_4x32b = _mm_or_si128(row_4x32b, fract2_8x16b); /*(32 - fract) */
   2053 
   2054         fract2_8x16b = _mm_unpackhi_epi8(fract_4x32b, row_4x32b);
   2055         fract1_8x16b = _mm_unpacklo_epi8(fract_4x32b, row_4x32b);
   2056 
   2057         temp1_8x16b =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
   2058         temp2_8x16b =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
   2059         temp3_8x16b =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
   2060         temp4_8x16b =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
   2061 
   2062         ref_main_temp0 = _mm_srli_si128(ref_main_idx_4x32b, 4);  /* next 32 bit values */
   2063         ref_main_temp1 = _mm_srli_si128(ref_main_idx_4x32b, 8);  /* next 32 bit values */
   2064         ref_main_temp2 = _mm_srli_si128(ref_main_idx_4x32b, 12); /* next 32 bit values */
   2065         ref_main_idx1  = _mm_cvtsi128_si32(ref_main_idx_4x32b);    /* col=0*/
   2066         ref_main_idx2  = _mm_cvtsi128_si32(ref_main_temp0);  /* col=1*/
   2067         ref_main_idx3  = _mm_cvtsi128_si32(ref_main_temp1);  /* col=2*/
   2068         ref_main_idx4  = _mm_cvtsi128_si32(ref_main_temp2);  /* col=3*/
   2069 
   2070         /* loading 16 8-bit pixels */
   2071         src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1)); /* col=0*/
   2072         src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2)); /* col=1*/
   2073         src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3)); /* col=2*/
   2074         src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4)); /* col=3*/
   2075 
   2076         src_temp1_8x16b =  _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
   2077         src_temp2_8x16b =  _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
   2078         src_temp3_8x16b =  _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
   2079         src_temp4_8x16b =  _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
   2080 
   2081         /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   2082         src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
   2083         src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
   2084         src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
   2085         src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
   2086 
   2087         /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   2088         src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
   2089         src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
   2090         src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
   2091         src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
   2092 
   2093         /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   2094         src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
   2095         src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
   2096         src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
   2097         src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
   2098 
   2099         /* converting 16 bit to 8 bit */
   2100         src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
   2101         src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
   2102 
   2103 
   2104         src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
   2105         src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
   2106 
   2107         src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
   2108         src_temp1_8x16b = _mm_srli_si128(src_temp7_8x16b, 4);
   2109         src_temp2_8x16b = _mm_srli_si128(src_temp7_8x16b, 8);
   2110         src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 12);
   2111 
   2112         temp11 = _mm_cvtsi128_si32(src_temp7_8x16b);
   2113         temp21 = _mm_cvtsi128_si32(src_temp1_8x16b);
   2114         temp31 = _mm_cvtsi128_si32(src_temp2_8x16b);
   2115         temp41 = _mm_cvtsi128_si32(src_temp3_8x16b);
   2116 
   2117         /* storing 4 8-bit pixel values per row */
   2118         *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
   2119         *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
   2120         *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
   2121         *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;
   2122     }
   2123 
   2124     else if(nt == 32)
   2125     {
   2126 
   2127 
   2128         __m128i temp1, temp2, temp3, temp11, temp12;
   2129         __m128i src_values0, src_values1;
   2130         /* Intermediate reference samples for negative angle modes */
   2131 
   2132         ref_temp[two_nt - 1] = pu1_ref[two_nt - nt];
   2133         temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 1));
   2134         temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 17));
   2135         temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3);
   2136 
   2137         /* For negative angles, derive the main reference samples from the side */
   2138 
   2139         src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/
   2140         src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 17)); /*(nt+16)-(two_nt-1)*/
   2141 
   2142         temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode]));
   2143         temp12 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16));
   2144 
   2145         src_values0 = _mm_shuffle_epi8(src_values0, temp2);
   2146         src_values1 = _mm_shuffle_epi8(src_values1, temp2);
   2147         src_values0 = _mm_shuffle_epi8(src_values0, temp12);
   2148         src_values1 = _mm_shuffle_epi8(src_values1, temp11);
   2149 
   2150         temp1 = _mm_shuffle_epi8(temp1, temp2);
   2151         temp3 = _mm_shuffle_epi8(temp3, temp2);
   2152 
   2153         _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp3);
   2154         _mm_storeu_si128((__m128i *)(ref_temp + nt - 1 + 16), temp1);
   2155         _mm_storeu_si128((__m128i *)(ref_main - 16), src_values0);
   2156         _mm_storeu_si128((__m128i *)(ref_main - nt + inv_angle_shuffle[17 - mode][0]), src_values1);
   2157 
   2158 
   2159         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   2160         row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
   2161         const_temp2_4x32b = _mm_set1_epi16(31);
   2162         const_temp4_4x32b = _mm_set1_epi16(8);
   2163         const_temp3_4x32b = _mm_set1_epi16(32);
   2164         two_nt_4x32b = _mm_set1_epi16(1);
   2165 
   2166         for(col = 0; col < nt; col += 8)
   2167         {
   2168             WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
   2169             WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
   2170             // WORD8  ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
   2171 
   2172             __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
   2173 
   2174             __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
   2175             __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
   2176 
   2177             /* pos = ((row + 1) * intra_pred_ang); */
   2178             res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   2179 
    2180         /* fract = pos & (31); */
   2181             fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   2182 
    2183         /* ref_main_idx = (pos >> 5) + 1; */
   2184             ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
   2185 
   2186             row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
   2187             /*(32 - fract) */
   2188             fract2_8x16b =  _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
   2189 
   2190             fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
   2191             fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
   2192 
   2193             fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
   2194             fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
   2195 
   2196 
   2197             fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b);
   2198             fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b);
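                     /* Descriptive note (added): each 16-bit lane now pairs fract
                        with (32 - fract), so a single _mm_maddubs_epi16 against
                        adjacent reference-sample pairs yields
                        (32 - fract) * pu1_ref[ref_main_idx] + fract * pu1_ref[ref_main_idx + 1] */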
   2199 
   2200             temp1_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x00);
   2201             temp2_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x55);
   2202             temp3_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xaa);
   2203             temp4_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xff);
   2204 
   2205             temp11_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x00);
   2206             temp12_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x55);
   2207             temp13_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xaa);
   2208             temp14_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xff);
   2209 
   2210             pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
   2211             pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
   2212             pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
   2213             pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
   2214 
    2215             pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=4*/
    2216             pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=5*/
    2217             pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=6*/
    2218             pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=7*/
   2219 
   2220             for(row = 0; row < nt; row += 8)
   2221             {
   2222                 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
   2223                 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
   2224 
   2225 
   2226                 __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
   2227                 __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
   2228 
    2229                 /* loading 16 8-bit pixels */
   2230                 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row)); /* col=0*/
   2231                 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row)); /* col=1*/
   2232                 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row)); /* col=2*/
   2233                 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row)); /* col=3*/
   2234 
    2240                 /* loading 16 8-bit pixels */
    2241                 src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row)); /* col=4*/
    2242                 src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row)); /* col=5*/
    2243                 src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row)); /* col=6*/
    2244                 src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row)); /* col=7*/
   2245 
   2246                 src_temp1_8x16b =  _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
   2247                 src_temp2_8x16b =  _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
   2248                 src_temp3_8x16b =  _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
   2249                 src_temp4_8x16b =  _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
   2250 
    2251                 src_temp11_8x16b =  _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=4*/
    2252                 src_temp12_8x16b =  _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=5*/
    2253                 src_temp13_8x16b =  _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=6*/
    2254                 src_temp14_8x16b =  _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=7*/
   2255 
   2256                 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   2257                 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
   2258                 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
   2259                 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
   2260                 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
   2261 
   2262                 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   2263                 src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
   2264                 src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
   2265                 src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
   2266                 src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
   2267 
   2268                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   2269                 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
   2270                 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
   2271                 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
   2272                 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
   2273 
   2274                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   2275                 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
   2276                 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
   2277                 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
   2278                 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
   2279 
   2280                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   2281                 src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
   2282                 src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
   2283                 src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
   2284                 src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
   2285 
   2286                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
    2287                 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=4*/
    2288                 src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=5*/
    2289                 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=6*/
    2290                 src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=7*/
   2291 
   2292                 /* converting 16 bit to 8 bit */
   2293                 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
   2294                 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
   2295 
   2296                 /* converting 16 bit to 8 bit */
    2297                 src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=4*/
    2298                 src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=5*/
   2299 
   2300                 src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
   2301                 src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
   2302 
   2303                 src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
   2304                 src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
   2305 
   2306                 src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
   2307                 src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
   2308 
   2309                 src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
   2310                 src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
   2311 
   2312 
   2313                 src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
   2314                 src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
   2315 
   2316                 src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
   2317                 src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
   2318 
   2319                 src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
   2320                 src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
   2321                 src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
   2322                 src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
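                         /* Descriptive note (added): the unpack/shift network above
                            amounts to an 8x8 byte transpose; the eight registers now
                            hold output rows 0..7 */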
   2323 
   2324                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp1_8x16b);          /* row=0*/
   2325 
   2326                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp5_8x16b);       /* row=1*/
   2327 
   2328                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp2_8x16b);       /* row=2*/
   2329 
    2330                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp6_8x16b);       /* row=3*/
    2331 
    2332                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp3_8x16b);       /* row=4*/
    2333 
    2334                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp7_8x16b);       /* row=5*/
    2335 
    2336                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp4_8x16b);       /* row=6*/
    2337 
    2338                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp8_8x16b);       /* row=7*/
   2339 
   2340             }
   2341         }
   2342     }
   2343     else if(nt == 16)
   2344     {
   2345 
   2346         __m128i temp1, temp2, temp11, src_values0;
   2347         /* Intermediate reference samples for negative angle modes */
    2348         /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
   2349         ref_temp[two_nt - 1] = pu1_ref[two_nt - nt];
   2350         temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 1));
   2351         temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3);
   2352         src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/
   2353 
   2354         temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16));
   2355 
   2356         src_values0 = _mm_shuffle_epi8(src_values0, temp2);
   2357         temp1 = _mm_shuffle_epi8(temp1, temp2);
   2358         src_values0 = _mm_shuffle_epi8(src_values0, temp11);
   2359 
   2360         _mm_storeu_si128((__m128i *)(ref_main - nt), src_values0);
   2361         _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1);
   2362 
   2363         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   2364         row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
   2365         const_temp2_4x32b = _mm_set1_epi16(31);
   2366         const_temp4_4x32b = _mm_set1_epi16(8);
   2367         const_temp3_4x32b = _mm_set1_epi16(32);
   2368         two_nt_4x32b = _mm_set1_epi16(1);
   2369 
   2370         for(col = 0; col < nt; col += 8)
   2371         {
   2372             WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
   2373             WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
   2374             // WORD8  ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
   2375 
   2376             __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
   2377 
   2378             __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
   2379             __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
   2380 
   2381             /* pos = ((row + 1) * intra_pred_ang); */
   2382             res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   2383 
    2384             /* fract = pos & (31); */
   2385             fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   2386 
    2387             /* ref_main_idx = (pos >> 5) + 1; */
   2388             ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
   2389 
   2390             row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
   2391             /*(32 - fract) */
   2392             fract2_8x16b =  _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
   2393 
   2394             fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
   2395             fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
   2396 
   2397             fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
   2398             fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
   2399 
   2400 
   2401             fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b);
   2402             fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b);
   2403 
   2404             temp1_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x00);
   2405             temp2_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x55);
   2406             temp3_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xaa);
   2407             temp4_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xff);
   2408 
   2409             temp11_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x00);
   2410             temp12_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x55);
   2411             temp13_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xaa);
   2412             temp14_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xff);
   2413 
   2414             pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
   2415             pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
   2416             pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
   2417             pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
   2418 
    2419             pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=4*/
    2420             pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=5*/
    2421             pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=6*/
    2422             pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=7*/
   2423 
   2424             for(row = 0; row < nt; row += 8)
   2425             {
   2426                 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
   2427                 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
   2428 
   2429 
   2430                 __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
   2431                 __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
   2432 
    2433                 /* loading 16 8-bit pixels */
   2434                 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row)); /* col=0*/
   2435                 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row)); /* col=1*/
   2436                 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row)); /* col=2*/
   2437                 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row)); /* col=3*/
   2438 
    2444                 /* loading 16 8-bit pixels */
    2445                 src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row)); /* col=4*/
    2446                 src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row)); /* col=5*/
    2447                 src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row)); /* col=6*/
    2448                 src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row)); /* col=7*/
   2449 
   2450                 src_temp1_8x16b =  _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
   2451                 src_temp2_8x16b =  _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
   2452                 src_temp3_8x16b =  _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
   2453                 src_temp4_8x16b =  _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
   2454 
    2455                 src_temp11_8x16b =  _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=4*/
    2456                 src_temp12_8x16b =  _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=5*/
    2457                 src_temp13_8x16b =  _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=6*/
    2458                 src_temp14_8x16b =  _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=7*/
   2459 
   2460                 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   2461                 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
   2462                 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
   2463                 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
   2464                 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
   2465 
   2466                 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   2467                 src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
   2468                 src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
   2469                 src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
   2470                 src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
   2471 
   2472                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   2473                 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
   2474                 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
   2475                 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
   2476                 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
   2477 
   2478                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   2479                 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
   2480                 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
   2481                 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
   2482                 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
   2483 
   2484                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   2485                 src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
   2486                 src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
   2487                 src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
   2488                 src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
   2489 
   2490                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
    2491                 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=4*/
    2492                 src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=5*/
    2493                 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=6*/
    2494                 src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=7*/
   2495 
   2496                 /* converting 16 bit to 8 bit */
   2497                 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
   2498                 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
   2499 
   2500                 /* converting 16 bit to 8 bit */
    2501                 src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=4*/
    2502                 src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=5*/
   2503 
   2504                 src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
   2505                 src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
   2506 
   2507                 src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
   2508                 src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
   2509 
   2510                 src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
   2511                 src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
   2512 
   2513                 src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
   2514                 src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
   2515 
   2516 
   2517                 src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
   2518                 src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
   2519 
   2520                 src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
   2521                 src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
   2522 
   2523                 src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
   2524                 src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
   2525                 src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
   2526                 src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
   2527 
   2528                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp1_8x16b);          /* row=0*/
   2529 
   2530                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp5_8x16b);       /* row=1*/
   2531 
   2532                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp2_8x16b);       /* row=2*/
   2533 
    2534                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp6_8x16b);       /* row=3*/
    2535 
    2536                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp3_8x16b);       /* row=4*/
    2537 
    2538                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp7_8x16b);       /* row=5*/
    2539 
    2540                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp4_8x16b);       /* row=6*/
    2541 
    2542                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp8_8x16b);       /* row=7*/
   2543 
   2544             }
   2545         }
   2546     }
   2547     else
   2548     {
   2549 
   2550 
   2551         __m128i temp1, temp2, temp11, src_values0;
   2552         /* Intermediate reference samples for negative angle modes */
    2553         /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
   2554         ref_temp[two_nt - 1] = pu1_ref[nt];
   2555         temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 1));
   2556 
    2557         /* For negative angles, derive the main reference samples from the side reference */
   2558 
   2559         src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/
   2560         temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3);
   2561         temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16));
   2562 
   2563         src_values0 = _mm_shuffle_epi8(src_values0, temp2);
   2564         temp1 = _mm_shuffle_epi8(temp1, temp2);
   2565         src_values0 = _mm_shuffle_epi8(src_values0, temp11);
   2566         src_values0 = _mm_srli_si128(src_values0, 8);
   2567 
   2568         _mm_storel_epi64((__m128i *)(ref_temp + nt - 1), temp1);
   2569         _mm_storel_epi64((__m128i *)(ref_main - nt), src_values0);
   2570 
   2571 
   2572         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   2573         row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
   2574         const_temp2_4x32b = _mm_set1_epi16(31);
   2575         const_temp4_4x32b = _mm_set1_epi16(8);
   2576         const_temp3_4x32b = _mm_set1_epi16(32);
   2577         two_nt_4x32b = _mm_set1_epi16(1);
   2578 
   2579         {
   2580             WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
   2581             WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
   2582             //WORD8  ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
   2583 
   2584             __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
   2585 
   2586             __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
   2587             __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
   2588 
   2589             /* pos = ((row + 1) * intra_pred_ang); */
   2590             res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   2591 
    2592             /* fract = pos & (31); */
   2593             fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   2594 
    2595             /* ref_main_idx = (pos >> 5) + 1; */
   2596             ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
   2597 
   2598             /*(32 - fract) */
   2599             fract2_8x16b =  _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
   2600 
   2601             fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
   2602             fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
   2603 
   2604             fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
   2605             fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
   2606 
   2607             fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b);
   2608             fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b);
   2609 
   2610             temp1_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x00);
   2611             temp2_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x55);
   2612             temp3_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xaa);
   2613             temp4_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xff);
   2614 
   2615             temp11_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x00);
   2616             temp12_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x55);
   2617             temp13_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xaa);
   2618             temp14_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xff);
   2619 
   2620             pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
   2621             pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
   2622             pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
   2623             pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
   2624 
    2625             pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=4*/
    2626             pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=5*/
    2627             pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=6*/
    2628             pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=7*/
   2629 
   2630             {
   2631                 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
   2632                 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
   2633 
   2634                 __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
   2635                 __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
   2636 
    2637                 /* loading 16 8-bit pixels */
   2638                 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1)); /* col=0*/
   2639                 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2)); /* col=1*/
   2640                 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3)); /* col=2*/
   2641                 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4)); /* col=3*/
   2642 
    2643                 /* loading 16 8-bit pixels */
    2644                 src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5)); /* col=4*/
    2645                 src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6)); /* col=5*/
    2646                 src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7)); /* col=6*/
    2647                 src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8)); /* col=7*/
   2648 
   2649                 src_temp1_8x16b =  _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
   2650                 src_temp2_8x16b =  _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
   2651                 src_temp3_8x16b =  _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
   2652                 src_temp4_8x16b =  _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
   2653 
    2654                 src_temp11_8x16b =  _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=4*/
    2655                 src_temp12_8x16b =  _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=5*/
    2656                 src_temp13_8x16b =  _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=6*/
    2657                 src_temp14_8x16b =  _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=7*/
   2658 
   2659                 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   2660                 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
   2661                 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
   2662                 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
   2663                 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
   2664 
   2665                 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   2666                 src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
   2667                 src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
   2668                 src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
   2669                 src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
   2670 
   2671                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   2672                 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
   2673                 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
   2674                 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
   2675                 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
   2676 
   2677                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
    2678                 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
    2679                 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
    2680                 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
    2681                 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
   2682 
   2683                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   2684                 src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
   2685                 src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
   2686                 src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
   2687                 src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
   2688 
   2689                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
    2690                 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=4*/
    2691                 src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=5*/
    2692                 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=6*/
    2693                 src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=7*/
   2694 
   2695                 /* converting 16 bit to 8 bit */
   2696                 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
   2697                 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
   2698 
   2699                 /* converting 16 bit to 8 bit */
   2700                 src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=4*/
   2701                 src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=5*/
   2702 
   2703                 src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
   2704                 src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
   2705 
   2706                 src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
   2707                 src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
   2708 
   2709                 src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
   2710                 src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
   2711 
   2712                 src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
   2713                 src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
   2714 
   2715 
   2716                 src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
   2717                 src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
   2718 
   2719                 src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
   2720                 src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
   2721 
   2722                 src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
   2723                 src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
   2724                 src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
   2725                 src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
   2726 
   2727                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 0)), src_temp1_8x16b);       /* row=0*/
   2728 
   2729                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (1))), src_temp5_8x16b);       /* row=1*/
   2730 
   2731                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (2))), src_temp2_8x16b);       /* row=2*/
   2732 
   2733                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (3))), src_temp6_8x16b);       /* row=3*/
   2734 
   2735                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (4))), src_temp3_8x16b);       /* row=4*/
   2736 
   2737                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (5))), src_temp7_8x16b);       /* row=5*/
   2738 
   2739                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (6))), src_temp4_8x16b);       /* row=6*/
   2740 
   2741                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (7))), src_temp8_8x16b);       /* row=7*/
   2742 
   2743             }
   2744         }
   2745     }
   2746 
   2747 }
   2748 
   2749 
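         /* Illustrative scalar equivalent (added sketch, not part of the
          * original Ittiam sources): once ref_main has been extended with the
          * projected side samples, the SIMD kernels in the surrounding mode
          * functions reduce, per output sample, to the two-tap interpolation
          * below (for the horizontal mode 11 to 17 kernels the row/column
          * roles are swapped). The helper name and its standalone form are
          * assumptions added for clarity only.
          */
         static void sketch_intra_pred_ang_scalar(UWORD8 *ref_main,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 dst_strd,
                                                  WORD32 nt,
                                                  WORD32 intra_pred_ang)
         {
             WORD32 row, col;
             for(row = 0; row < nt; row++)
             {
                 WORD32 pos   = (row + 1) * intra_pred_ang; /* angular offset accumulated per row */
                 WORD32 idx   = pos >> 5;                   /* whole-sample displacement          */
                 WORD32 fract = pos & 31;                   /* 1/32-sample fractional part        */

                 for(col = 0; col < nt; col++)
                 {
                     /* ((32 - fract) * ref[x] + fract * ref[x + 1] + 16) >> 5 */
                     pu1_dst[col + (row * dst_strd)] =
                         (UWORD8)(((32 - fract) * ref_main[col + idx + 1]
                                   + fract * ref_main[col + idx + 2] + 16) >> 5);
                 }
             }
         }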
   2750 
   2751 /**
   2752 *******************************************************************************
   2753 *
   2754 * @brief
   2755 *   Intra prediction interpolation filter for luma mode 19 to mode 25
   2756 *
   2757 * @par Description:
    2758 *    Intra prediction for modes 19 to 25 (negative angle, vertical modes) with
    2759 *    reference neighboring samples located at 'pu1_ref' applied to the TU
    2760 *    block located at 'pu1_dst'
   2761 *
    2762 * @param[in] pu1_ref
    2763 *  UWORD8 pointer to the reference samples
   2764 *
   2765 * @param[out] pu1_dst
   2766 *  UWORD8 pointer to the destination
   2767 *
   2768 * @param[in] src_strd
   2769 *  integer source stride
   2770 *
   2771 * @param[in] dst_strd
   2772 *  integer destination stride
   2773 *
   2774 * @param[in] nt
   2775 *  integer Transform Block size
   2776 *
   2777 * @param[in] mode
   2778 *  integer intraprediction mode
   2779 *
   2780 * @returns
   2781 *
   2782 * @remarks
   2783 *  None
   2784 *
   2785 *******************************************************************************
   2786 */
   2787 
   2788 
   2789 void ihevc_intra_pred_luma_mode_19_to_25_sse42(UWORD8 *pu1_ref,
   2790                                                WORD32 src_strd,
   2791                                                UWORD8 *pu1_dst,
   2792                                                WORD32 dst_strd,
   2793                                                WORD32 nt,
   2794                                                WORD32 mode)
   2795 {
   2796 
   2797     WORD32 row, k;
   2798     WORD32 two_nt, intra_pred_ang;
   2799     WORD32 inv_ang, inv_ang_sum;
   2800     //WORD32 ref_main_idx, pos, fract, idx;
   2801     WORD32 ref_idx;
   2802     UWORD8 ref_tmp[(2 * MAX_CU_SIZE) + 2];
   2803     UWORD8 *ref_main, *ref_temp;
   2804 
   2805     __m128i  /*fract_8x16b,*/ const_temp_8x16b, sm3;
   2806     __m128i temp1, temp2, temp3, temp4;
   2807     __m128i temp11, temp12, temp13, temp14;
   2808     UNUSED(src_strd);
   2809 
   2810     two_nt = 2 * nt;
   2811     intra_pred_ang = gai4_ihevc_ang_table[mode];
   2812     inv_ang = gai4_ihevc_inv_ang_table[mode - 12];
   2813 
    2814     /* Intermediate reference samples for negative angle modes */
    2815     /* This has to be removed during optimization */
    2816     /* For vertical modes, (ref main = ref above) (ref side = ref left) */
   2817     ref_temp = ref_tmp + 1;
   2818     ref_main = ref_temp + nt - 1;
   2819 
   2820 
   2821     sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
   2822 
   2823 
   2824 
   2825     const_temp_8x16b = _mm_set1_epi16(16);
   2826 
   2827     if(nt == 32)
   2828     {
   2829 
   2830         __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
   2831         __m128i src_values10, src_values11, intra_pred_ang_4x32b;
   2832         __m128i row_4x32b, two_nt_4x32b, src_values12;
   2833 
   2834         __m128i src_values0, src_values1, src_values2, src_values3;
   2835         __m128i  src_values4, src_values5, src_values6, src_values7;
   2836         WORD32 col = 0;
   2837 
    2838         /* Intermediate reference samples for negative angle modes */
    2839         /* This has to be removed during optimization */
    2840         /* For vertical modes, (ref main = ref above) (ref side = ref left) */
   2841         ref_temp[two_nt - 1] = pu1_ref[two_nt + nt];
   2842         temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt));
   2843         temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 16));
   2844 
    2845         /* The inverse-angle projection loop is implemented here with look-up tables */
    2846         /* For negative angles, derive the main reference samples from the side */
    2847         /* reference samples; refer to section 8.4.4.2.6 */
   2848         src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - nt)); /*nt-(nt+15)*/
   2849         src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 16)); /*(nt+16)-(two_nt-1)*/
   2850 
   2851         temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19]));
   2852         temp12 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16));
   2853 
   2854         src_values0 = _mm_shuffle_epi8(src_values0, temp11);
   2855         src_values1 = _mm_shuffle_epi8(src_values1, temp12);
   2856 
   2857         _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1);
   2858         _mm_storeu_si128((__m128i *)(ref_temp + nt - 1 + 16), temp3);
   2859         _mm_storeu_si128((__m128i *)(ref_main - 16), src_values1);
   2860         _mm_storeu_si128((__m128i *)(ref_main - nt + inv_angle_shuffle[mode - 19][0]), src_values0);
   2861 
   2862         const_temp2_4x32b = _mm_set1_epi16(31);
   2863         const_temp3_4x32b = _mm_set1_epi16(32);
   2864         const_temp8_4x32b = _mm_set1_epi16(8);
   2865 
   2866         two_nt_4x32b = _mm_set1_epi16(1);
   2867 
   2868         /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
   2869         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   2870 
   2871         row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
   2872 
   2873         for(row = 0; row < nt; row += 8)
   2874         {
   2875 
   2876             WORD16 ref_main_idx[9];
   2877 
   2878             __m128i res_temp5_4x32b;
   2879             __m128i fract1_8x16b, fract2_8x16b;
   2880 
   2881             /* pos = ((row + 1) * intra_pred_ang); */
   2882             res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   2883 
    2884             /* ref_main_idx = (pos >> 5) + 1; */
   2885             src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
   2886 
    2887             /* fract = pos & (31); */
   2888             src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   2889 
   2890             /*(32 - fract) */
   2891             src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
   2892 
   2893             fract1_8x16b = _mm_slli_epi16(src_values11, 8);
   2894             fract2_8x16b = _mm_slli_epi16(src_values10, 8);
   2895 
   2896             src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
   2897             src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
   2898 
   2899             fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
   2900             fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
   2901 
   2902             temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
   2903             temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
   2904             temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
   2905             temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
   2906 
   2907             temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
   2908             temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
   2909             temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
   2910             temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
   2911 
   2912             row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
   2913             _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
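                     /* Descriptive note (added): the eight per-row reference offsets
                        are spilled to ref_main_idx[] so the inner loop can address
                        ref_main with plain scalar indices */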
   2914             for(col = 0; col < nt; col += 16)
   2915             {
   2916                 src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + col));
   2917                 src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + col));
   2918                 src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + col));
   2919                 src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + col));
   2920                 src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + 8 + col));
   2921                 src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + 8 + col));
   2922                 src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + 8 + col));
   2923                 src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + 8 + col));
   2924 
   2925                 src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
   2926                 src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
   2927                 src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
   2928                 src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
   2929                 src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
   2930                 src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
   2931                 src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
   2932                 src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
   2933 
   2934 
   2935                 src_values0 = _mm_maddubs_epi16(src_values0, temp1);
   2936                 src_values1 = _mm_maddubs_epi16(src_values1, temp2);
   2937                 src_values2 = _mm_maddubs_epi16(src_values2, temp3);
   2938                 src_values3 = _mm_maddubs_epi16(src_values3, temp4);
   2939                 src_values4 = _mm_maddubs_epi16(src_values4, temp1);
   2940                 src_values5 = _mm_maddubs_epi16(src_values5, temp2);
   2941                 src_values6 = _mm_maddubs_epi16(src_values6, temp3);
   2942                 src_values7 = _mm_maddubs_epi16(src_values7, temp4);
   2943 
   2944                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   2945                 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
   2946                 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
   2947                 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
   2948                 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
   2949                 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
   2950                 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
   2951                 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
   2952                 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
   2953 
   2954                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   2955                 src_values0 = _mm_srai_epi16(src_values0,  5);
   2956                 src_values1 = _mm_srai_epi16(src_values1,  5);
   2957                 src_values2 = _mm_srai_epi16(src_values2,  5);
   2958                 src_values3 = _mm_srai_epi16(src_values3,  5);
   2959                 src_values4 = _mm_srai_epi16(src_values4,  5);
   2960                 src_values5 = _mm_srai_epi16(src_values5,  5);
   2961                 src_values6 = _mm_srai_epi16(src_values6,  5);
   2962                 src_values7 = _mm_srai_epi16(src_values7,  5);
   2963 
   2964                 /* converting 16 bit to 8 bit */
   2965                 src_values0 = _mm_packus_epi16(src_values0, src_values4);
   2966                 src_values1 = _mm_packus_epi16(src_values1, src_values5);
   2967                 src_values2 = _mm_packus_epi16(src_values2, src_values6);
   2968                 src_values3 = _mm_packus_epi16(src_values3, src_values7);
   2969 
    2970                 /* storing 16 8-bit pixel values */
   2971                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (0) * dst_strd), src_values0);       /* row=0*/
   2972                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (1) * dst_strd), src_values1);   /* row=1*/
   2973                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (2) * dst_strd), src_values2);   /* row=2*/
   2974                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (3) * dst_strd), src_values3);   /* row=3*/
   2975 
   2976 
   2977                 src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + col));
   2978                 src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + col));
   2979                 src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + col));
   2980                 src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + col));
   2981                 src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + 8 + col));
   2982                 src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + 8 + col));
   2983                 src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + 8 + col));
   2984                 src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + 8 + col));
   2985 
   2986                 src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
   2987                 src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
   2988                 src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
   2989                 src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
   2990                 src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
   2991                 src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
   2992                 src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
   2993                 src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
   2994 
   2995 
   2996                 src_values0 = _mm_maddubs_epi16(src_values0, temp11);
   2997                 src_values1 = _mm_maddubs_epi16(src_values1, temp12);
   2998                 src_values2 = _mm_maddubs_epi16(src_values2, temp13);
   2999                 src_values3 = _mm_maddubs_epi16(src_values3, temp14);
   3000                 src_values4 = _mm_maddubs_epi16(src_values4, temp11);
   3001                 src_values5 = _mm_maddubs_epi16(src_values5, temp12);
   3002                 src_values6 = _mm_maddubs_epi16(src_values6, temp13);
   3003                 src_values7 = _mm_maddubs_epi16(src_values7, temp14);
   3004 
   3005                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   3006                 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
   3007                 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
   3008                 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
   3009                 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
   3010                 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
   3011                 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
   3012                 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
   3013                 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
   3014 
   3015                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   3016                 src_values0 = _mm_srai_epi16(src_values0,  5);
   3017                 src_values1 = _mm_srai_epi16(src_values1,  5);
   3018                 src_values2 = _mm_srai_epi16(src_values2,  5);
   3019                 src_values3 = _mm_srai_epi16(src_values3,  5);
   3020                 src_values4 = _mm_srai_epi16(src_values4,  5);
   3021                 src_values5 = _mm_srai_epi16(src_values5,  5);
   3022                 src_values6 = _mm_srai_epi16(src_values6,  5);
   3023                 src_values7 = _mm_srai_epi16(src_values7,  5);
   3024 
   3025                 /* converting 16 bit to 8 bit */
   3026                 src_values0 = _mm_packus_epi16(src_values0, src_values4);
   3027                 src_values1 = _mm_packus_epi16(src_values1, src_values5);
   3028                 src_values2 = _mm_packus_epi16(src_values2, src_values6);
   3029                 src_values3 = _mm_packus_epi16(src_values3, src_values7);
   3030 
    3031                 /* storing 16 8-bit pixel values */
   3032                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (4) * dst_strd), src_values0);   /* row=4*/
   3033                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (5) * dst_strd), src_values1);   /* row=5*/
   3034                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (6) * dst_strd), src_values2);   /* row=6*/
   3035                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (7) * dst_strd), src_values3);   /* row=7*/
   3036 
   3037             }
   3038             pu1_dst += 8 * dst_strd;
   3039         }
   3040 
   3041     }
   3042     else if(nt == 16) /* for nt = 16 case */
   3043     {
   3044 
   3045         __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
   3046         __m128i src_values10, src_values11, intra_pred_ang_4x32b;
   3047         __m128i row_4x32b, two_nt_4x32b, src_values12;
   3048         __m128i src_values0, src_values1, src_values2, src_values3;
   3049         __m128i  src_values4, src_values5, src_values6, src_values7;
   3050 
   3051 
   3052         /* Intermediate reference samples for negative angle modes */
    3053         /* For vertical modes, (ref main = ref above) (ref side = ref left) */
   3054         ref_temp[two_nt - 1] = pu1_ref[two_nt + nt];
   3055         temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt));
   3056 
   3057         src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - nt)); /*nt-(nt+15)*/
   3058 
   3059         temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16));
   3060 
   3061         src_values0 = _mm_shuffle_epi8(src_values0, temp11);
   3062 
   3063         _mm_storeu_si128((__m128i *)(ref_main - nt), src_values0);
   3064         _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1);
   3065 
   3066         const_temp2_4x32b = _mm_set1_epi16(31);
   3067         const_temp3_4x32b = _mm_set1_epi16(32);
   3068         const_temp8_4x32b = _mm_set1_epi16(8);
   3069 
   3070         two_nt_4x32b = _mm_set1_epi16(1);
   3071 
   3072         /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
   3073         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   3074 
   3075         row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
   3076 
   3077         for(row = 0; row < nt; row += 8)
   3078         {
   3079 
   3080             WORD16 ref_main_idx[9];
   3081 
   3082             __m128i res_temp5_4x32b;
   3083             __m128i fract1_8x16b, fract2_8x16b;
   3084 
   3085             /* pos = ((row + 1) * intra_pred_ang); */
   3086             res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   3087 
   3088             /* ref_main_idx = 1 + (pos >> 5); */
   3089             src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
   3090 
   3091             /* fract = pos & (31); */
   3092             src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   3093 
   3094             /*(32 - fract) */
   3095             src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
   3096 
   3097             fract1_8x16b = _mm_slli_epi16(src_values11, 8);
   3098             fract2_8x16b = _mm_slli_epi16(src_values10, 8);
   3099 
   3100             src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
   3101             src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
   3102 
   3103             fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
   3104             fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
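            /* fract and (32 - fract) were replicated into both bytes of each
             * 16-bit lane and are byte-interleaved here, so a single
             * _mm_maddubs_epi16 against the pixel pairs produced by the sm3
             * shuffle evaluates (32 - fract) * ref[idx] + fract * ref[idx + 1]
             * per 16-bit lane. */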
   3105 
   3106             temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
   3107             temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
   3108             temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
   3109             temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
   3110 
   3111             temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
   3112             temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
   3113             temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
   3114             temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
   3115 
   3116             row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
   3117             _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
   3118 
   3119             {
   3120                 src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0]));
   3121                 src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1]));
   3122                 src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2]));
   3123                 src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3]));
   3124                 src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + 8));
   3125                 src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + 8));
   3126                 src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + 8));
   3127                 src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + 8));
   3128 
   3129                 src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
   3130                 src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
   3131                 src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
   3132                 src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
   3133                 src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
   3134                 src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
   3135                 src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
   3136                 src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
   3137 
   3138 
   3139                 src_values0 = _mm_maddubs_epi16(src_values0, temp1);
   3140                 src_values1 = _mm_maddubs_epi16(src_values1, temp2);
   3141                 src_values2 = _mm_maddubs_epi16(src_values2, temp3);
   3142                 src_values3 = _mm_maddubs_epi16(src_values3, temp4);
   3143                 src_values4 = _mm_maddubs_epi16(src_values4, temp1);
   3144                 src_values5 = _mm_maddubs_epi16(src_values5, temp2);
   3145                 src_values6 = _mm_maddubs_epi16(src_values6, temp3);
   3146                 src_values7 = _mm_maddubs_epi16(src_values7, temp4);
   3147 
   3148                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   3149                 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
   3150                 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
   3151                 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
   3152                 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
   3153                 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
   3154                 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
   3155                 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
   3156                 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
   3157 
   3158                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   3159                 src_values0 = _mm_srai_epi16(src_values0,  5);
   3160                 src_values1 = _mm_srai_epi16(src_values1,  5);
   3161                 src_values2 = _mm_srai_epi16(src_values2,  5);
   3162                 src_values3 = _mm_srai_epi16(src_values3,  5);
   3163                 src_values4 = _mm_srai_epi16(src_values4,  5);
   3164                 src_values5 = _mm_srai_epi16(src_values5,  5);
   3165                 src_values6 = _mm_srai_epi16(src_values6,  5);
   3166                 src_values7 = _mm_srai_epi16(src_values7,  5);
   3167 
   3168                 /* converting 16 bit to 8 bit */
   3169                 src_values0 = _mm_packus_epi16(src_values0, src_values4);
   3170                 src_values1 = _mm_packus_epi16(src_values1, src_values5);
   3171                 src_values2 = _mm_packus_epi16(src_values2, src_values6);
   3172                 src_values3 = _mm_packus_epi16(src_values3, src_values7);
   3173 
   3174                 /* storing 16 8-bit result pixels per row */
   3175                 _mm_storeu_si128((__m128i *)(pu1_dst + (0) * dst_strd), src_values0);       /* row=0*/
   3176                 _mm_storeu_si128((__m128i *)(pu1_dst + (1) * dst_strd), src_values1);   /* row=1*/
   3177                 _mm_storeu_si128((__m128i *)(pu1_dst + (2) * dst_strd), src_values2);   /* row=2*/
   3178                 _mm_storeu_si128((__m128i *)(pu1_dst + (3) * dst_strd), src_values3);   /* row=3*/
   3179 
   3180 
   3181                 src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4]));
   3182                 src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5]));
   3183                 src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6]));
   3184                 src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7]));
   3185                 src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + 8));
   3186                 src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + 8));
   3187                 src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + 8));
   3188                 src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + 8));
   3189 
   3190                 src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
   3191                 src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
   3192                 src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
   3193                 src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
   3194                 src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
   3195                 src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
   3196                 src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
   3197                 src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
   3198 
   3199 
   3200                 src_values0 = _mm_maddubs_epi16(src_values0, temp11);
   3201                 src_values1 = _mm_maddubs_epi16(src_values1, temp12);
   3202                 src_values2 = _mm_maddubs_epi16(src_values2, temp13);
   3203                 src_values3 = _mm_maddubs_epi16(src_values3, temp14);
   3204                 src_values4 = _mm_maddubs_epi16(src_values4, temp11);
   3205                 src_values5 = _mm_maddubs_epi16(src_values5, temp12);
   3206                 src_values6 = _mm_maddubs_epi16(src_values6, temp13);
   3207                 src_values7 = _mm_maddubs_epi16(src_values7, temp14);
   3208 
   3209                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   3210                 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
   3211                 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
   3212                 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
   3213                 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
   3214                 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
   3215                 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
   3216                 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
   3217                 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
   3218 
   3219                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   3220                 src_values0 = _mm_srai_epi16(src_values0,  5);
   3221                 src_values1 = _mm_srai_epi16(src_values1,  5);
   3222                 src_values2 = _mm_srai_epi16(src_values2,  5);
   3223                 src_values3 = _mm_srai_epi16(src_values3,  5);
   3224                 src_values4 = _mm_srai_epi16(src_values4,  5);
   3225                 src_values5 = _mm_srai_epi16(src_values5,  5);
   3226                 src_values6 = _mm_srai_epi16(src_values6,  5);
   3227                 src_values7 = _mm_srai_epi16(src_values7,  5);
   3228 
   3229                 /* converting 16 bit to 8 bit */
   3230                 src_values0 = _mm_packus_epi16(src_values0, src_values4);
   3231                 src_values1 = _mm_packus_epi16(src_values1, src_values5);
   3232                 src_values2 = _mm_packus_epi16(src_values2, src_values6);
   3233                 src_values3 = _mm_packus_epi16(src_values3, src_values7);
   3234 
   3235                 /* storing 16 8-bit result pixels per row */
   3236                 _mm_storeu_si128((__m128i *)(pu1_dst + (4) * dst_strd), src_values0);   /* row=4*/
   3237                 _mm_storeu_si128((__m128i *)(pu1_dst + (5) * dst_strd), src_values1);   /* row=5*/
   3238                 _mm_storeu_si128((__m128i *)(pu1_dst + (6) * dst_strd), src_values2);   /* row=6*/
   3239                 _mm_storeu_si128((__m128i *)(pu1_dst + (7) * dst_strd), src_values3);   /* row=7*/
   3240 
   3241             }
   3242             pu1_dst += 8 * dst_strd;
   3243         }
   3244     }
   3245     else if(nt == 8)
   3246     {
   3247 
   3248 
   3249         __m128i const_temp2_4x32b, const_temp3_4x32b;
   3250         __m128i src_values10, src_values11, intra_pred_ang_4x32b;
   3251 
   3252         __m128i row_4x32b, two_nt_4x32b, src_values12;
   3253         __m128i src_values0, src_values1, src_values2, src_values3;
   3254         __m128i  src_values4, src_values5, src_values6, src_values7;
   3255 
   3256 
   3257         /* Intermediate reference samples for negative angle modes */
   3258         /* For these vertical modes, (ref main = ref above) and (ref side = ref left) */
   3259         ref_temp[two_nt - 1] = pu1_ref[two_nt + nt];
   3260         temp1 = _mm_loadl_epi64((__m128i *)(pu1_ref + two_nt));
   3261 
   3262         /* For negative angles, derive the main reference samples from the side reference */
   3263 
   3264         src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref)); /* loads pu1_ref[0] .. pu1_ref[15] */
   3265 
   3266         temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16));
   3267 
   3268         src_values0 = _mm_shuffle_epi8(src_values0, temp11);
   3269         src_values0 = _mm_srli_si128(src_values0, 8);
   3270         _mm_storel_epi64((__m128i *)(ref_temp + nt - 1), temp1);
   3271         _mm_storel_epi64((__m128i *)(ref_main - nt), src_values0);
   3272 
   3273 
   3274 
   3275         const_temp2_4x32b = _mm_set1_epi16(31);
   3276         const_temp3_4x32b = _mm_set1_epi16(32);
   3277 
   3278 
   3279         two_nt_4x32b = _mm_set1_epi16(1);
   3280 
   3281 
   3282         /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
   3283         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   3284 
   3285         row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
   3286 
   3287         {
   3288 
   3289             WORD16 ref_main_idx[9];
   3290 
   3291             __m128i res_temp5_4x32b;
   3292             __m128i fract1_8x16b, fract2_8x16b;
   3293 
   3294             /* pos = ((row + 1) * intra_pred_ang); */
   3295             res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   3296 
   3297             /* ref_main_idx = 1 + (pos >> 5); */
   3298             src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
   3299 
   3300             /* fract = pos & (31); */
   3301             src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   3302 
   3303             /*(32 - fract) */
   3304             src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
   3305 
   3306             fract1_8x16b = _mm_slli_epi16(src_values11, 8);
   3307             fract2_8x16b = _mm_slli_epi16(src_values10, 8);
   3308 
   3309             src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
   3310             src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
   3311 
   3312             fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
   3313             fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
   3314 
   3315             temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
   3316             temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
   3317             temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
   3318             temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
   3319 
   3320             temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
   3321             temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
   3322             temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
   3323             temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
   3324 
   3325             _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
   3326 
   3327             src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0]));  /* row = 0 */
   3328             src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1]));  /* row = 1 */
   3329             src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2]));  /* row = 2 */
   3330             src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3]));  /* row = 3 */
   3331             src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4]));  /* row = 4 */
   3332             src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5]));  /* row = 5 */
   3333             src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6]));  /* row = 6 */
   3334             src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7]));  /* row = 7 */
   3335 
   3336             src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
   3337             src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
   3338             src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
   3339             src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
   3340             src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
   3341             src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
   3342             src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
   3343             src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
   3344 
   3345 
   3346             src_values0 = _mm_maddubs_epi16(src_values0, temp1);
   3347             src_values1 = _mm_maddubs_epi16(src_values1, temp2);
   3348             src_values2 = _mm_maddubs_epi16(src_values2, temp3);
   3349             src_values3 = _mm_maddubs_epi16(src_values3, temp4);
   3350             src_values4 = _mm_maddubs_epi16(src_values4, temp11);
   3351             src_values5 = _mm_maddubs_epi16(src_values5, temp12);
   3352             src_values6 = _mm_maddubs_epi16(src_values6, temp13);
   3353             src_values7 = _mm_maddubs_epi16(src_values7, temp14);
   3354 
   3355             /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   3356             src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
   3357             src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
   3358             src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
   3359             src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
   3360             src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
   3361             src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
   3362             src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
   3363             src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
   3364 
   3365             /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   3366             src_values0 = _mm_srai_epi16(src_values0,  5);
   3367             src_values1 = _mm_srai_epi16(src_values1,  5);
   3368             src_values2 = _mm_srai_epi16(src_values2,  5);
   3369             src_values3 = _mm_srai_epi16(src_values3,  5);
   3370             src_values4 = _mm_srai_epi16(src_values4,  5);
   3371             src_values5 = _mm_srai_epi16(src_values5,  5);
   3372             src_values6 = _mm_srai_epi16(src_values6,  5);
   3373             src_values7 = _mm_srai_epi16(src_values7,  5);
   3374 
   3375             /* converting 16 bit to 8 bit */
   3376             src_values0 = _mm_packus_epi16(src_values0, src_values1);
   3377             src_values2 = _mm_packus_epi16(src_values2, src_values3);
   3378             src_values1 = _mm_srli_si128(src_values0, 8);
   3379             src_values3 = _mm_srli_si128(src_values2, 8);
   3380             src_values4 = _mm_packus_epi16(src_values4, src_values5);
   3381             src_values6 = _mm_packus_epi16(src_values6, src_values7);
   3382             src_values5 = _mm_srli_si128(src_values4, 8);
   3383             src_values7 = _mm_srli_si128(src_values6, 8);
   3384 
   3385             /* storing 8 8-bit result pixels per row */
   3386             _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_values0);       /* row=0*/
   3387             _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_values1);   /* row=1*/
   3388             _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_values2);   /* row=2*/
   3389             _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_values3);   /* row=3*/
   3390             _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), src_values4);   /* row=4*/
   3391             _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), src_values5);   /* row=5*/
   3392             _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), src_values6);   /* row=6*/
   3393             _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), src_values7);   /* row=7*/
   3394         }
   3395     }
   3396     else /* nt == 4 */
   3397     {
   3398 
   3399         __m128i const_temp2_4x32b, const_temp3_4x32b;
   3400         __m128i src_values10, src_values11, intra_pred_ang_4x32b;
   3401 
   3402         __m128i row_4x32b, two_nt_4x32b, src_values12;
   3403 
   3404 
   3405         for(k = 0; k < (nt + 1); k++)
   3406             ref_temp[k + nt - 1] = pu1_ref[two_nt + k];
   3407         ref_idx = (nt * intra_pred_ang) >> 5;
   3408         inv_ang_sum = 128;
   3409 
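        /* inv_ang is the inverse angle in 1/256-sample units; starting
         * inv_ang_sum at 128 makes (inv_ang_sum >> 8) a rounded projection
         * of the side-reference index, as in the HEVC angular derivation. */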
   3410         for(k = -1; k > ref_idx; k--)
   3411         {
   3412             inv_ang_sum += inv_ang;
   3413             ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
   3414         }
   3415 
   3416 
   3417         const_temp2_4x32b = _mm_set1_epi32(31);
   3418         const_temp3_4x32b = _mm_set1_epi32(32);
   3419 
   3420         two_nt_4x32b = _mm_set1_epi32(1);
   3421 
   3422 
   3423         /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
   3424         intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
   3425 
   3426         row_4x32b = _mm_set_epi32(4, 3, 2, 1);
   3427         {
   3428             WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
   3429             WORD32 temp11, temp21, temp31, temp41;
   3430 
   3431 
   3432             __m128i fract1_8x16b, fract2_8x16b,  res_temp5_4x32b;
   3433             __m128i src_values0, src_values1, src_values2, src_values3;
   3434             __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2;
   3435 
   3436             /* pos = ((row + 1) * intra_pred_ang); */
   3437             res_temp5_4x32b  = _mm_mullo_epi32(row_4x32b, intra_pred_ang_4x32b);
   3438 
   3439             /* ref_main_idx = 1 + (pos >> 5); */
   3440             src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b,  5));
   3441 
   3442             ref_main_temp0 = _mm_srli_si128(src_values12, 4);  /* next 32 bit values */
   3443             ref_main_temp1 = _mm_srli_si128(src_values12, 8);  /* next 32 bit values */
   3444             ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */
   3445             ref_main_idx1  = _mm_cvtsi128_si32(src_values12);    /* row=0*/
   3446             ref_main_idx2  = _mm_cvtsi128_si32(ref_main_temp0);  /* row=1*/
   3447             ref_main_idx3  = _mm_cvtsi128_si32(ref_main_temp1);  /* row=2*/
   3448             ref_main_idx4  = _mm_cvtsi128_si32(ref_main_temp2);  /* row=3*/
   3449 
   3450             /* fract = pos & (31); */
   3451             src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   3452 
   3453             /*(32 - fract) */
   3454             src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);
   3455 
   3456             fract1_8x16b = _mm_slli_epi16(src_values11, 8);
   3457             fract2_8x16b = _mm_slli_epi16(src_values10, 8);
   3458 
   3459             src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
   3460             src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
   3461 
   3462             fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
   3463             fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
   3464 
   3465             temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
   3466             temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
   3467             temp3 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
   3468             temp4 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
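            /* Only selectors 0x00 and 0xaa are needed here: with 32-bit
             * lanes, the interleaved (fract, 32 - fract) byte pairs for the
             * four rows land in dwords 0 and 2 of fract1/fract2. */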
   3469 
   3470             src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1));  /* row = 0 */
   3471             src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2));  /* row = 1 */
   3472             src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3));  /* row = 2 */
   3473             src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4));  /* row = 3 */
   3474 
   3475             src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
   3476             src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
   3477             src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
   3478             src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
   3479 
   3480 
   3481             src_values0 = _mm_maddubs_epi16(src_values0, temp1);
   3482             src_values1 = _mm_maddubs_epi16(src_values1, temp2);
   3483             src_values2 = _mm_maddubs_epi16(src_values2, temp3);
   3484             src_values3 = _mm_maddubs_epi16(src_values3, temp4);
   3485 
   3486             /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   3487             src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
   3488             src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
   3489             src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
   3490             src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
   3491 
   3492             /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   3493             src_values0 = _mm_srai_epi16(src_values0,  5);
   3494             src_values1 = _mm_srai_epi16(src_values1,  5);
   3495             src_values2 = _mm_srai_epi16(src_values2,  5);
   3496             src_values3 = _mm_srai_epi16(src_values3,  5);
   3497 
   3498             /* converting 16 bit to 8 bit */
   3499             src_values0 = _mm_packus_epi16(src_values0, src_values1);
   3500             src_values2 = _mm_packus_epi16(src_values2, src_values3);
   3501             src_values1 = _mm_srli_si128(src_values0, 8);
   3502             src_values3 = _mm_srli_si128(src_values2, 8);
   3503 
   3504             temp11 = _mm_cvtsi128_si32(src_values0);
   3505             temp21 = _mm_cvtsi128_si32(src_values1);
   3506             temp31 = _mm_cvtsi128_si32(src_values2);
   3507             temp41 = _mm_cvtsi128_si32(src_values3);
   3508 
   3509             /* storing 4 8-bit result pixels per row */
   3510             *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
   3511             *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
   3512             *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
   3513             *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;
   3514 
   3515         }
   3516     }
   3517 }
   3518 
   3519 
   3520 /**
   3521 *******************************************************************************
   3522 *
   3523 * @brief
   3524 *    Intra prediction interpolation filter for luma mode 27 to mode 33
   3525 *
   3526 * @par Description:
   3527 *    Intra prediction for modes 27 to 33 (positive angle, vertical modes),
   3528 *    interpolating the neighboring reference samples pointed to by 'pu1_ref'
   3529 *    into the TU block pointed to by 'pu1_dst'
   3530 *
   3531 * @param[in] pu1_ref
   3532 *  UWORD8 pointer to the reference samples
   3533 *
   3534 * @param[out] pu1_dst
   3535 *  UWORD8 pointer to the destination
   3536 *
   3537 * @param[in] src_strd
   3538 *  integer source stride
   3539 *
   3540 * @param[in] dst_strd
   3541 *  integer destination stride
   3542 *
   3543 * @param[in] nt
   3544 *  integer Transform Block size
   3545 *
   3546 * @param[in] mode
   3547 *  integer intraprediction mode
   3548 *
   3549 * @returns
   3550 *
   3551 * @remarks
   3552 *  None
   3553 *
   3554 *******************************************************************************
   3555 */
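
        /* A scalar sketch of the per-pixel computation that the SIMD routine
         * below vectorizes, following the pos/idx/fract equations quoted in
         * the intrinsic comments. Illustration only: the helper name is ours
         * (not part of the library API) and it is not called anywhere. */
        static void ihevc_intra_pred_27_to_33_scalar_sketch(UWORD8 *pu1_ref,
                                                            UWORD8 *pu1_dst,
                                                            WORD32 dst_strd,
                                                            WORD32 nt,
                                                            WORD32 mode)
        {
            WORD32 row, col;
            WORD32 two_nt = 2 * nt;
            WORD32 intra_pred_ang = gai4_ihevc_ang_table[mode];

            for(row = 0; row < nt; row++)
            {
                WORD32 pos = (row + 1) * intra_pred_ang;
                WORD32 idx = pos >> 5;   /* whole-sample step along the top reference */
                WORD32 fract = pos & 31; /* 1/32-sample fractional offset */

                for(col = 0; col < nt; col++)
                {
                    /* ((32 - fract) * ref[idx] + fract * ref[idx + 1] + 16) >> 5 */
                    pu1_dst[row * dst_strd + col] =
                        (UWORD8)(((32 - fract) * pu1_ref[two_nt + 1 + idx + col] +
                                  fract * pu1_ref[two_nt + 1 + idx + col + 1] + 16) >> 5);
                }
            }
        }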
   3556 
   3557 
   3558 void ihevc_intra_pred_luma_mode_27_to_33_sse42(UWORD8 *pu1_ref,
   3559                                                WORD32 src_strd,
   3560                                                UWORD8 *pu1_dst,
   3561                                                WORD32 dst_strd,
   3562                                                WORD32 nt,
   3563                                                WORD32 mode)
   3564 {
   3565     WORD32 row;
   3566     WORD32 two_nt;
   3567     WORD32 intra_pred_ang;
   3568 
   3569     __m128i temp11, temp12, temp13, temp14;
   3570 
   3571     __m128i     const_temp_8x16b;
   3572     __m128i temp1, temp2, temp3, temp4, sm3;
   3573     UNUSED(src_strd);
   3574 
   3575     two_nt = 2 * nt;
   3576     intra_pred_ang = gai4_ihevc_ang_table[mode];
   3577 
   3578     const_temp_8x16b = _mm_set1_epi16(16);
   3579     sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
   3580     if(nt == 32)
   3581     {
   3582 
   3583         __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
   3584         __m128i src_values10, src_values11, intra_pred_ang_4x32b;
   3585         __m128i row_4x32b, two_nt_4x32b, src_values12;
   3586         int col = 0;
   3587 
   3588         const_temp2_4x32b = _mm_set1_epi16(31);
   3589         const_temp3_4x32b = _mm_set1_epi16(32);
   3590         const_temp8_4x32b = _mm_set1_epi16(8);
   3591 
   3592         two_nt_4x32b = _mm_set1_epi16(two_nt + 1);
   3593 
   3594         /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
   3595         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   3596 
   3597         row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
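        /* row_4x32b holds (row + 1) for eight consecutive rows; it is
         * advanced by 8 (const_temp8_4x32b) on each pass of the row loop so
         * the pos = (row + 1) * intra_pred_ang products stay vectorized. */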
   3598 
   3599         for(row = 0; row < nt; row += 8)
   3600         {
   3601 
   3602             WORD16 ref_main_idx[9];
   3603 
   3604             __m128i res_temp5_4x32b;
   3605             __m128i fract1_8x16b, fract2_8x16b;
   3606             __m128i src_values0, src_values1, src_values2, src_values3;
   3607             __m128i  src_values4, src_values5, src_values6, src_values7;
   3608 
   3609             /* pos = ((row + 1) * intra_pred_ang); */
   3610             res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   3611 
   3612             /* ref_main_idx = (two_nt + 1) + (pos >> 5); */
   3613             src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
   3614 
   3615             /* fract = pos & (31); */
   3616             src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   3617 
   3618             /*(32 - fract) */
   3619             src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
   3620 
   3621             fract1_8x16b = _mm_slli_epi16(src_values11, 8);
   3622             fract2_8x16b = _mm_slli_epi16(src_values10, 8);
   3623 
   3624             src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
   3625             src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
   3626 
   3627             fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
   3628             fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
   3629 
   3630             temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
   3631             temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
   3632             temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
   3633             temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
   3634 
   3635             temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
   3636             temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
   3637             temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
   3638             temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
   3639 
   3640             row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
   3641             _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
   3642             for(col = 0; col < nt; col += 16)
   3643             {
   3644                 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + col));
   3645                 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + col));
   3646                 src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + col));
   3647                 src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + col));
   3648                 src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + 8 + col));
   3649                 src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + 8 + col));
   3650                 src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + 8 + col));
   3651                 src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + 8 + col));
   3652 
   3653                 src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
   3654                 src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
   3655                 src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
   3656                 src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
   3657                 src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
   3658                 src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
   3659                 src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
   3660                 src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
   3661 
   3662 
   3663                 src_values0 = _mm_maddubs_epi16(src_values0, temp1);
   3664                 src_values1 = _mm_maddubs_epi16(src_values1, temp2);
   3665                 src_values2 = _mm_maddubs_epi16(src_values2, temp3);
   3666                 src_values3 = _mm_maddubs_epi16(src_values3, temp4);
   3667                 src_values4 = _mm_maddubs_epi16(src_values4, temp1);
   3668                 src_values5 = _mm_maddubs_epi16(src_values5, temp2);
   3669                 src_values6 = _mm_maddubs_epi16(src_values6, temp3);
   3670                 src_values7 = _mm_maddubs_epi16(src_values7, temp4);
   3671 
   3672                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   3673                 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
   3674                 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
   3675                 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
   3676                 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
   3677                 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
   3678                 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
   3679                 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
   3680                 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
   3681 
   3682                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   3683                 src_values0 = _mm_srai_epi16(src_values0,  5);
   3684                 src_values1 = _mm_srai_epi16(src_values1,  5);
   3685                 src_values2 = _mm_srai_epi16(src_values2,  5);
   3686                 src_values3 = _mm_srai_epi16(src_values3,  5);
   3687                 src_values4 = _mm_srai_epi16(src_values4,  5);
   3688                 src_values5 = _mm_srai_epi16(src_values5,  5);
   3689                 src_values6 = _mm_srai_epi16(src_values6,  5);
   3690                 src_values7 = _mm_srai_epi16(src_values7,  5);
   3691 
   3692                 /* converting 16 bit to 8 bit */
   3693                 src_values0 = _mm_packus_epi16(src_values0, src_values4);
   3694                 src_values1 = _mm_packus_epi16(src_values1, src_values5);
   3695                 src_values2 = _mm_packus_epi16(src_values2, src_values6);
   3696                 src_values3 = _mm_packus_epi16(src_values3, src_values7);
   3697 
   3698                 /* storing 16 8-bit result pixels per row */
   3699                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (0) * dst_strd), src_values0);       /* row=0*/
   3700                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (1) * dst_strd), src_values1);   /* row=1*/
   3701                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (2) * dst_strd), src_values2);   /* row=2*/
   3702                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (3) * dst_strd), src_values3);   /* row=3*/
   3703 
   3704 
   3705                 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + col));
   3706                 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + col));
   3707                 src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + col));
   3708                 src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + col));
   3709                 src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + 8 + col));
   3710                 src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + 8 + col));
   3711                 src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + 8 + col));
   3712                 src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + 8 + col));
   3713 
   3714                 src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
   3715                 src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
   3716                 src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
   3717                 src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
   3718                 src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
   3719                 src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
   3720                 src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
   3721                 src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
   3722 
   3723 
   3724                 src_values0 = _mm_maddubs_epi16(src_values0, temp11);
   3725                 src_values1 = _mm_maddubs_epi16(src_values1, temp12);
   3726                 src_values2 = _mm_maddubs_epi16(src_values2, temp13);
   3727                 src_values3 = _mm_maddubs_epi16(src_values3, temp14);
   3728                 src_values4 = _mm_maddubs_epi16(src_values4, temp11);
   3729                 src_values5 = _mm_maddubs_epi16(src_values5, temp12);
   3730                 src_values6 = _mm_maddubs_epi16(src_values6, temp13);
   3731                 src_values7 = _mm_maddubs_epi16(src_values7, temp14);
   3732 
   3733                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   3734                 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
   3735                 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
   3736                 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
   3737                 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
   3738                 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
   3739                 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
   3740                 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
   3741                 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
   3742 
   3743                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   3744                 src_values0 = _mm_srai_epi16(src_values0,  5);
   3745                 src_values1 = _mm_srai_epi16(src_values1,  5);
   3746                 src_values2 = _mm_srai_epi16(src_values2,  5);
   3747                 src_values3 = _mm_srai_epi16(src_values3,  5);
   3748                 src_values4 = _mm_srai_epi16(src_values4,  5);
   3749                 src_values5 = _mm_srai_epi16(src_values5,  5);
   3750                 src_values6 = _mm_srai_epi16(src_values6,  5);
   3751                 src_values7 = _mm_srai_epi16(src_values7,  5);
   3752 
   3753                 /* converting 16 bit to 8 bit */
   3754                 src_values0 = _mm_packus_epi16(src_values0, src_values4);
   3755                 src_values1 = _mm_packus_epi16(src_values1, src_values5);
   3756                 src_values2 = _mm_packus_epi16(src_values2, src_values6);
   3757                 src_values3 = _mm_packus_epi16(src_values3, src_values7);
   3758 
   3759                 /* storing 16 8-bit result pixels per row */
   3760                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (4) * dst_strd), src_values0);   /* row=4*/
   3761                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (5) * dst_strd), src_values1);   /* row=5*/
   3762                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (6) * dst_strd), src_values2);   /* row=6*/
   3763                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (7) * dst_strd), src_values3);   /* row=7*/
   3764 
   3765             }
   3766             pu1_dst += 8 * dst_strd;
   3767         }
   3768 
   3769     }
   3770     else if(nt == 16) /* for nt = 16 case */
   3771     {
   3772 
   3773         __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
   3774         __m128i src_values10, src_values11, intra_pred_ang_4x32b;
   3775         __m128i row_4x32b, two_nt_4x32b, src_values12;
   3776 
   3777 
   3778         const_temp2_4x32b = _mm_set1_epi16(31);
   3779         const_temp3_4x32b = _mm_set1_epi16(32);
   3780         const_temp8_4x32b = _mm_set1_epi16(8);
   3781 
   3782         two_nt_4x32b = _mm_set1_epi16(two_nt + 1);
   3783 
   3784         /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
   3785         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   3786 
   3787         row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
   3788 
   3789         for(row = 0; row < nt; row += 8)
   3790         {
   3791 
   3792             WORD16 ref_main_idx[9];
   3793 
   3794             __m128i res_temp5_4x32b;
   3795             __m128i fract1_8x16b, fract2_8x16b;
   3796             __m128i src_values0, src_values1, src_values2, src_values3;
   3797             __m128i  src_values4, src_values5, src_values6, src_values7;
   3798 
   3799             /* pos = ((row + 1) * intra_pred_ang); */
   3800             res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   3801 
   3802             /* ref_main_idx = (two_nt + 1) + (pos >> 5); */
   3803             src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
   3804 
   3805             /* fract = pos & (31); */
   3806             src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   3807 
   3808             /*(32 - fract) */
   3809             src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
   3810 
   3811             fract1_8x16b = _mm_slli_epi16(src_values11, 8);
   3812             fract2_8x16b = _mm_slli_epi16(src_values10, 8);
   3813 
   3814             src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
   3815             src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
   3816 
   3817             fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
   3818             fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
   3819 
   3820             temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
   3821             temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
   3822             temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
   3823             temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
   3824 
   3825             temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
   3826             temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
   3827             temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
   3828             temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
   3829 
   3830             row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
   3831             _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
   3832 
   3833             {
   3834                 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0]));
   3835                 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1]));
   3836                 src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2]));
   3837                 src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3]));
   3838                 src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + 8));
   3839                 src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + 8));
   3840                 src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + 8));
   3841                 src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + 8));
   3842 
   3843                 src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
   3844                 src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
   3845                 src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
   3846                 src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
   3847                 src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
   3848                 src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
   3849                 src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
   3850                 src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
   3851 
   3852 
   3853                 src_values0 = _mm_maddubs_epi16(src_values0, temp1);
   3854                 src_values1 = _mm_maddubs_epi16(src_values1, temp2);
   3855                 src_values2 = _mm_maddubs_epi16(src_values2, temp3);
   3856                 src_values3 = _mm_maddubs_epi16(src_values3, temp4);
   3857                 src_values4 = _mm_maddubs_epi16(src_values4, temp1);
   3858                 src_values5 = _mm_maddubs_epi16(src_values5, temp2);
   3859                 src_values6 = _mm_maddubs_epi16(src_values6, temp3);
   3860                 src_values7 = _mm_maddubs_epi16(src_values7, temp4);
   3861 
   3862                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   3863                 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
   3864                 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
   3865                 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
   3866                 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
   3867                 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
   3868                 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
   3869                 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
   3870                 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
   3871 
   3872                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   3873                 src_values0 = _mm_srai_epi16(src_values0,  5);
   3874                 src_values1 = _mm_srai_epi16(src_values1,  5);
   3875                 src_values2 = _mm_srai_epi16(src_values2,  5);
   3876                 src_values3 = _mm_srai_epi16(src_values3,  5);
   3877                 src_values4 = _mm_srai_epi16(src_values4,  5);
   3878                 src_values5 = _mm_srai_epi16(src_values5,  5);
   3879                 src_values6 = _mm_srai_epi16(src_values6,  5);
   3880                 src_values7 = _mm_srai_epi16(src_values7,  5);
   3881 
   3882                 /* converting 16 bit to 8 bit */
   3883                 src_values0 = _mm_packus_epi16(src_values0, src_values4);
   3884                 src_values1 = _mm_packus_epi16(src_values1, src_values5);
   3885                 src_values2 = _mm_packus_epi16(src_values2, src_values6);
   3886                 src_values3 = _mm_packus_epi16(src_values3, src_values7);
   3887 
   3888                 /* storing 16 8-bit result pixels per row */
   3889                 _mm_storeu_si128((__m128i *)(pu1_dst + (0) * dst_strd), src_values0);       /* row=0*/
   3890                 _mm_storeu_si128((__m128i *)(pu1_dst + (1) * dst_strd), src_values1);   /* row=1*/
   3891                 _mm_storeu_si128((__m128i *)(pu1_dst + (2) * dst_strd), src_values2);   /* row=2*/
   3892                 _mm_storeu_si128((__m128i *)(pu1_dst + (3) * dst_strd), src_values3);   /* row=3*/
   3893 
   3894 
   3895                 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4]));
   3896                 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5]));
   3897                 src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6]));
   3898                 src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7]));
   3899                 src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + 8));
   3900                 src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + 8));
   3901                 src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + 8));
   3902                 src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + 8));
   3903 
   3904                 src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
   3905                 src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
   3906                 src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
   3907                 src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
   3908                 src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
   3909                 src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
   3910                 src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
   3911                 src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
   3912 
   3913 
   3914                 src_values0 = _mm_maddubs_epi16(src_values0, temp11);
   3915                 src_values1 = _mm_maddubs_epi16(src_values1, temp12);
   3916                 src_values2 = _mm_maddubs_epi16(src_values2, temp13);
   3917                 src_values3 = _mm_maddubs_epi16(src_values3, temp14);
   3918                 src_values4 = _mm_maddubs_epi16(src_values4, temp11);
   3919                 src_values5 = _mm_maddubs_epi16(src_values5, temp12);
   3920                 src_values6 = _mm_maddubs_epi16(src_values6, temp13);
   3921                 src_values7 = _mm_maddubs_epi16(src_values7, temp14);
   3922 
   3923                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   3924                 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
   3925                 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
   3926                 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
   3927                 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
   3928                 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
   3929                 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
   3930                 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
   3931                 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
   3932 
   3933                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   3934                 src_values0 = _mm_srai_epi16(src_values0,  5);
   3935                 src_values1 = _mm_srai_epi16(src_values1,  5);
   3936                 src_values2 = _mm_srai_epi16(src_values2,  5);
   3937                 src_values3 = _mm_srai_epi16(src_values3,  5);
   3938                 src_values4 = _mm_srai_epi16(src_values4,  5);
   3939                 src_values5 = _mm_srai_epi16(src_values5,  5);
   3940                 src_values6 = _mm_srai_epi16(src_values6,  5);
   3941                 src_values7 = _mm_srai_epi16(src_values7,  5);
   3942 
   3943                 /* converting 16 bit to 8 bit */
   3944                 src_values0 = _mm_packus_epi16(src_values0, src_values4);
   3945                 src_values1 = _mm_packus_epi16(src_values1, src_values5);
   3946                 src_values2 = _mm_packus_epi16(src_values2, src_values6);
   3947                 src_values3 = _mm_packus_epi16(src_values3, src_values7);
   3948 
   3949                 /* storing 16 8-bit result pixels per row */
   3950                 _mm_storeu_si128((__m128i *)(pu1_dst + (4) * dst_strd), src_values0);   /* row=4*/
   3951                 _mm_storeu_si128((__m128i *)(pu1_dst + (5) * dst_strd), src_values1);   /* row=5*/
   3952                 _mm_storeu_si128((__m128i *)(pu1_dst + (6) * dst_strd), src_values2);   /* row=6*/
   3953                 _mm_storeu_si128((__m128i *)(pu1_dst + (7) * dst_strd), src_values3);   /* row=7*/
   3954 
   3955             }
   3956             pu1_dst += 8 * dst_strd;
   3957         }
   3958 
   3959     }
   3960     else if(nt == 8)
   3961     {
   3962 
   3963         __m128i const_temp2_4x32b, const_temp3_4x32b;
   3964         __m128i src_values10, src_values11, intra_pred_ang_4x32b;
   3965         __m128i row_4x32b, two_nt_4x32b, src_values12;
   3966 
   3967 
   3968         const_temp2_4x32b = _mm_set1_epi16(31);
   3969         const_temp3_4x32b = _mm_set1_epi16(32);
   3970 
   3971         two_nt_4x32b = _mm_set1_epi16(two_nt + 1);
   3972 
   3973 
   3974         /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
   3975         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   3976 
   3977         row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
   3978 
   3979         /* nt == 8: all eight rows are produced in one pass, so no row loop is needed */
   3980         {
   3981 
   3982             WORD16 ref_main_idx[9];
   3983 
   3984             __m128i res_temp5_4x32b;
   3985             __m128i fract1_8x16b, fract2_8x16b;
   3986             __m128i src_values0, src_values1, src_values2, src_values3;
   3987             __m128i  src_values4, src_values5, src_values6, src_values7;
   3988 
   3989             /* pos = ((row + 1) * intra_pred_ang); */
   3990             res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   3991 
   3992             /* ref_main_idx = (two_nt + 1) + (pos >> 5); */
   3993             src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
   3994 
   3995             /* fract = pos & (31); */
   3996             src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   3997 
   3998             /*(32 - fract) */
   3999             src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
   4000 
   4001             fract1_8x16b = _mm_slli_epi16(src_values11, 8);
   4002             fract2_8x16b = _mm_slli_epi16(src_values10, 8);
   4003 
   4004             src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
   4005             src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
   4006 
   4007             fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
   4008             fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
   4009 
   4010             temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
   4011             temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
   4012             temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
   4013             temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
   4014 
   4015             temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
   4016             temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
   4017             temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
   4018             temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
   4019 
   4020             _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
   4021 
   4022             src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0]));  /* row = 0 */
   4023             src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1]));  /* row = 1 */
   4024             src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2]));  /* row = 2 */
   4025             src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3]));  /* row = 3 */
   4026             src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4]));  /* row = 4 */
   4027             src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5]));  /* row = 5 */
   4028             src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6]));  /* row = 6 */
   4029             src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7]));  /* row = 7 */
   4030 
   4031             src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
   4032             src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
   4033             src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
   4034             src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
   4035             src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
   4036             src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
   4037             src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
   4038             src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
   4039 
   4040 
   4041             src_values0 = _mm_maddubs_epi16(src_values0, temp1);
   4042             src_values1 = _mm_maddubs_epi16(src_values1, temp2);
   4043             src_values2 = _mm_maddubs_epi16(src_values2, temp3);
   4044             src_values3 = _mm_maddubs_epi16(src_values3, temp4);
   4045             src_values4 = _mm_maddubs_epi16(src_values4, temp11);
   4046             src_values5 = _mm_maddubs_epi16(src_values5, temp12);
   4047             src_values6 = _mm_maddubs_epi16(src_values6, temp13);
   4048             src_values7 = _mm_maddubs_epi16(src_values7, temp14);
   4049 
   4050             /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   4051             src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
   4052             src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
   4053             src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
   4054             src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
   4055             src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
   4056             src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
   4057             src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
   4058             src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
   4059 
   4060             /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   4061             src_values0 = _mm_srai_epi16(src_values0,  5);
   4062             src_values1 = _mm_srai_epi16(src_values1,  5);
   4063             src_values2 = _mm_srai_epi16(src_values2,  5);
   4064             src_values3 = _mm_srai_epi16(src_values3,  5);
   4065             src_values4 = _mm_srai_epi16(src_values4,  5);
   4066             src_values5 = _mm_srai_epi16(src_values5,  5);
   4067             src_values6 = _mm_srai_epi16(src_values6,  5);
   4068             src_values7 = _mm_srai_epi16(src_values7,  5);
   4069 
    4070             /* pack 16-bit results to 8 bit; each packus result holds two rows and the 8-byte shifts peel the upper row into its own register */
   4071             src_values0 = _mm_packus_epi16(src_values0, src_values1);
   4072             src_values2 = _mm_packus_epi16(src_values2, src_values3);
   4073             src_values1 = _mm_srli_si128(src_values0, 8);
   4074             src_values3 = _mm_srli_si128(src_values2, 8);
   4075             src_values4 = _mm_packus_epi16(src_values4, src_values5);
   4076             src_values6 = _mm_packus_epi16(src_values6, src_values7);
   4077             src_values5 = _mm_srli_si128(src_values4, 8);
   4078             src_values7 = _mm_srli_si128(src_values6, 8);
   4079 
    4080             /* store eight 8-bit pixels per row */
    4081             _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_values0);   /* row=0*/
    4082             _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_values1);   /* row=1*/
    4083             _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_values2);   /* row=2*/
    4084             _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_values3);   /* row=3*/
    4085             _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), src_values4);   /* row=4*/
    4086             _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), src_values5);   /* row=5*/
    4087             _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), src_values6);   /* row=6*/
    4088             _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), src_values7);   /* row=7*/
   4089         }
   4090 
   4091     }
    4092     else /* nt == 4 */
   4093     {
   4094 
   4095         __m128i const_temp2_4x32b, const_temp3_4x32b;
   4096         __m128i src_values10, src_values11, intra_pred_ang_4x32b;
   4097 
   4098         __m128i row_4x32b, two_nt_4x32b, src_values12;
   4099 
   4100 
   4101         const_temp2_4x32b = _mm_set1_epi32(31);
   4102         const_temp3_4x32b = _mm_set1_epi32(32);
   4103 
    4104         two_nt_4x32b = _mm_set1_epi32(two_nt + 1); /* (two_nt + 1) offsets every reference index */
   4105 
   4106 
   4107         /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
   4108         intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
   4109 
    4110         row_4x32b = _mm_set_epi32(4, 3, 2, 1); /* (row + 1) for rows 0..3 */
   4111         {
    4112             WORD32 temp11, temp21, temp31, temp41;
   4113 
   4114             WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
   4115 
   4116             __m128i fract1_8x16b, fract2_8x16b, res_temp5_4x32b;
   4117             __m128i src_values0, src_values1, src_values2, src_values3;
   4118             __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2;
   4119 
   4120             /* pos = ((row + 1) * intra_pred_ang); */
   4121             res_temp5_4x32b  = _mm_mullo_epi32(row_4x32b, intra_pred_ang_4x32b);
   4122 
    4123             /* ref_main_idx = (two_nt + 1) + (pos >> 5); */
   4124             src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b,  5));
   4125 
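                     /* move the four 32-bit indices into scalar registers via
                        byte shifts plus _mm_cvtsi128_si32 rather than spilling
                        them to memory */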
    4126             ref_main_temp0 = _mm_srli_si128(src_values12, 4);  /* index for row 1 in low dword */
    4127             ref_main_temp1 = _mm_srli_si128(src_values12, 8);  /* index for row 2 in low dword */
    4128             ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* index for row 3 in low dword */
   4129             ref_main_idx1  = _mm_cvtsi128_si32(src_values12);    /* row=0*/
   4130             ref_main_idx2  = _mm_cvtsi128_si32(ref_main_temp0);  /* row=1*/
   4131             ref_main_idx3  = _mm_cvtsi128_si32(ref_main_temp1);  /* row=2*/
   4132             ref_main_idx4  = _mm_cvtsi128_si32(ref_main_temp2);  /* row=3*/
   4133 
    4134             /* fract = pos & (31); */
   4135             src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   4136 
   4137             /*(32 - fract) */
   4138             src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);
   4139 
   4140             fract1_8x16b = _mm_slli_epi16(src_values11, 8);
   4141             fract2_8x16b = _mm_slli_epi16(src_values10, 8);
   4142 
   4143             src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
   4144             src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
   4145 
   4146             fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
   4147             fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
   4148 
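                     /* the upper halves of the 32-bit lanes are zero, so only
                        dwords 0 and 2 of the interleaved registers hold valid
                        coefficient pairs; hence the 0x00 / 0xaa broadcasts */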
   4149             temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
   4150             temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
   4151             temp3 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
   4152             temp4 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
   4153 
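                     /* 16 reference bytes are loaded per row, though only the
                        low four filtered results are kept for the 4-pixel rows */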
    4154             src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1));  /* row = 0 */
    4155             src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2));  /* row = 1 */
    4156             src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3));  /* row = 2 */
    4157             src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4));  /* row = 3 */
   4158 
   4159             src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
   4160             src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
   4161             src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
   4162             src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
   4163 
   4164             src_values0 = _mm_maddubs_epi16(src_values0, temp1);
   4165             src_values1 = _mm_maddubs_epi16(src_values1, temp2);
   4166             src_values2 = _mm_maddubs_epi16(src_values2, temp3);
   4167             src_values3 = _mm_maddubs_epi16(src_values3, temp4);
   4168 
   4169             /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   4170             src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
   4171             src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
   4172             src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
   4173             src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
   4174 
   4175             /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   4176             src_values0 = _mm_srai_epi16(src_values0,  5);
   4177             src_values1 = _mm_srai_epi16(src_values1,  5);
   4178             src_values2 = _mm_srai_epi16(src_values2,  5);
   4179             src_values3 = _mm_srai_epi16(src_values3,  5);
   4180 
    4181             /* pack 16-bit results to 8 bit; each packus result holds two rows */
   4182             src_values0 = _mm_packus_epi16(src_values0, src_values1);
   4183             src_values2 = _mm_packus_epi16(src_values2, src_values3);
   4184             src_values1 = _mm_srli_si128(src_values0, 8);
   4185             src_values3 = _mm_srli_si128(src_values2, 8);
   4186 
   4187             temp11 = _mm_cvtsi128_si32(src_values0);
   4188             temp21 = _mm_cvtsi128_si32(src_values1);
   4189             temp31 = _mm_cvtsi128_si32(src_values2);
   4190             temp41 = _mm_cvtsi128_si32(src_values3);
   4191 
    4192             /* store four 8-bit pixels per row */
   4193             *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
   4194             *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
   4195             *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
   4196             *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;
   4197 
   4198         }
   4199     }
   4200 }
   4201 
   4202