/* (scraped listing header removed: "Home | History | Annotate | Download | only in x86") */
      1 /******************************************************************************
      2 *
      3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 *
      5 * Licensed under the Apache License, Version 2.0 (the "License");
      6 * you may not use this file except in compliance with the License.
      7 * You may obtain a copy of the License at:
      8 *
      9 * http://www.apache.org/licenses/LICENSE-2.0
     10 *
     11 * Unless required by applicable law or agreed to in writing, software
     12 * distributed under the License is distributed on an "AS IS" BASIS,
     13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 * See the License for the specific language governing permissions and
     15 * limitations under the License.
     16 *
     17 ******************************************************************************/
     18 /**
     19 *******************************************************************************
     20 * @file
     21 *  ihevc_intra_pred_filters_atom_intr.c
     22 *
     23 * @brief
     24 *  Contains function Definition for intra prediction  interpolation filters
     25 *
     26 *
     27 * @author
     28 * Ittiam
     29 *
     30 * @par List of Functions:
     31 *  - ihevc_intra_pred_luma_planar_ssse3()
     32 *  - ihevc_intra_pred_luma_dc_ssse3()
     33 *  - ihevc_intra_pred_luma_horz_ssse3()
     34 *  - ihevc_intra_pred_luma_ver_ssse3()
     35 *  - ihevc_intra_pred_luma_mode2_ssse3()
     36 *  - ihevc_intra_pred_luma_mode_18_34_ssse3()
     37 *  - ihevc_intra_pred_luma_mode_3_to_9_ssse3()
     38 *  - ihevc_intra_pred_luma_mode_11_to_17_ssse3()
     39 *  - ihevc_intra_pred_luma_mode_19_to_25_ssse3()
     40 *  - ihevc_intra_pred_luma_mode_27_to_33_ssse3()
     41 *  - ihevc_intra_pred_luma_ref_substitution_ssse3()
     42 *
     43 * @remarks
     44 *  None
     45 *
     46 *******************************************************************************
     47 */
     48 
     49 
     50 /*****************************************************************************/
     51 /* File Includes                                                             */
     52 /*****************************************************************************/
     53 #include <stdlib.h>
     54 
     55 #include "ihevc_typedefs.h"
     56 #include "ihevc_intra_pred.h"
     57 #include "ihevc_platform_macros.h"
     58 #include "ihevc_macros.h"
     59 #include "ihevc_func_selector.h"
     60 #include "ihevc_common_tables.h"
     61 #include "ihevc_defs.h"
     62 #include "ihevc_tables_x86_intr.h"
     63 
     64 #include <immintrin.h>
     65 
     66 /****************************************************************************/
     67 /* Constant Macros                                                          */
     68 /****************************************************************************/
     69 #define MAX_CU_SIZE 64
     70 #define BIT_DEPTH 8
     71 #define T32_4NT 128
     72 #define T16_4NT 64
     73 
     74 
     75 /****************************************************************************/
     76 /* Function Macros                                                          */
     77 /****************************************************************************/
     78 #define GET_BITS(y,x) ((y) & (1 << x)) && (1 << x)
     79 
     80 /* tables to shuffle 8-bit values */
     81 
     82 
     83 /*****************************************************************************/
     84 /* global tables Definition                                                  */
     85 /*****************************************************************************/
     86 
     87 
     88 /*****************************************************************************/
     89 /* Function Definition                                                      */
     90 /*****************************************************************************/
     91 
     92 
     93 /**
     94 *******************************************************************************
     95 *
     96 * @brief
     97 *    Intra prediction interpolation filter for pu1_ref substitution
     98 *
     99 *
    100 * @par Description:
    101 *    Reference substitution process for samples unavailable  for prediction
    102 *    Refer to section 8.4.4.2.2
    103 *
    104 * @param[in] pu1_top_left
    105 *  UWORD8 pointer to the top-left
    106 *
    107 * @param[in] pu1_top
    108 *  UWORD8 pointer to the top
    109 *
    110 * @param[in] pu1_left
    111 *  UWORD8 pointer to the left
    112 *
    113 * @param[in] src_strd
    114 *  WORD32 Source stride
    115 *
    116 * @param[in] nbr_flags
    117 *  WORD32 neighbor availability flags
    118 *
    119 * @param[in] nt
    120 *  WORD32 transform Block size
    121 *
    122 * @param[in] dst_strd
    123 *  WORD32 Destination stride
    124 *
    125 * @returns
    126 *
    127 * @remarks
    128 *  None
    129 *
    130 *******************************************************************************
    131 */
    132 
/* Gathers the (4*nt + 1) neighbouring reference samples into pu1_dst in the
 * order bottom-left (bottom to top), left, top-left, top, top-right, and
 * substitutes any unavailable samples per section 8.4.4.2.2 of the HEVC spec.
 * pu1_dst layout: [0 .. 2*nt-1] = left column (bottom-up), [2*nt] = top-left,
 * [2*nt+1 .. 4*nt] = top row and top-right. */
void ihevc_intra_pred_luma_ref_substitution_ssse3(UWORD8 *pu1_top_left,
                                                  UWORD8 *pu1_top,
                                                  UWORD8 *pu1_left,
                                                  WORD32 src_strd,
                                                  WORD32 nt,
                                                  WORD32 nbr_flags,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 dst_strd)
{
    UWORD8 pu1_ref;
    WORD32 dc_val, i;
    WORD32 total_samples = (4 * nt) + 1;
    WORD32 two_nt = 2 * nt;

    WORD32 three_nt = 3 * nt;
    WORD32 get_bits;
    WORD32 next;
    WORD32 bot_left, left, top, tp_right, tp_left;

    WORD32 idx, nbr_id_from_bl, frwd_nbr_flag;
    UNUSED(dst_strd);

    /* DC fallback value: mid-grey for the configured bit depth (128 for 8-bit) */
    dc_val = 1 << (BIT_DEPTH - 1);


    /* Neighbor Flag Structure*/
    /* MSB ---> LSB */
    /*    Top-Left | Top-Right | Top | Left | Bottom-Left
              1         4         4     4         4
     */
    /* If no neighbor flags are present, fill the neighbor samples with DC value */
    if(nbr_flags == 0)
    {
        for(i = 0; i < total_samples; i++)
        {
            pu1_dst[i] = dc_val;
        }
    }
    else
    {
        /* Else fill the corresponding samples */
        /* Copy whatever neighbours exist; unavailable ones are overwritten below. */
        pu1_dst[two_nt] = *pu1_top_left;
        /* Left column is stored bottom-up: pu1_left walks down in memory (stride),
         * destination index walks from two_nt-1 toward 0. */
        for(i = 0; i < two_nt; i++)
            pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
        for(i = 0; i < two_nt; i++)
            pu1_dst[two_nt + 1 + i] = pu1_top[i];

        if(nt <= 8)
        {
            /* For 4x4/8x8 each neighbour region is a single flag bit.
             * 1 bit extraction for all the neighboring blocks */
            tp_left = (nbr_flags & 0x10000) >> 16;
            bot_left = (nbr_flags & 0x8) >> 3;
            left = (nbr_flags & 0x80) >> 7;
            top = (nbr_flags & 0x100) >> 8;
            tp_right = (nbr_flags & 0x1000) >> 12;

            next = 1;

            /* If bottom-left is not available, reverse substitution process:
             * replicate the first available sample (scanning upward from the
             * bottom-left) downward over the unavailable region. */
            if(bot_left == 0)
            {
                WORD32 a_nbr_flag[5] = { bot_left, left, tp_left, top, tp_right };

                /* Check for the 1st available sample from bottom-left
                 * (index 0 is bot_left itself, known to be 0 here). */
                while(!a_nbr_flag[next])
                    next++;

                /* If Left, top-left are available */
                if(next <= 2)
                {
                    /* next==1 -> first available is the left column (offset nt);
                     * next==2 -> first available is top-left (offset 2*nt). */
                    idx = nt * next;
                    pu1_ref = pu1_dst[idx];
                    for(i = 0; i < idx; i++)
                        pu1_dst[i] = pu1_ref;
                }
                else /* If top, top-right are available */
                {
                    /* Idx is changed to copy 1 pixel value for top-left, since
                     * the top-left region is only 1 sample wide (not nt). */
                    idx = (nt * (next - 1)) + 1;
                    pu1_ref = pu1_dst[idx];
                    for(i = 0; i < idx; i++)
                        pu1_dst[i] = pu1_ref;
                }
            }

            /* Forward Substitution Process: each remaining unavailable region
             * is filled by replicating the last sample of the region below/left. */
            /* If left is Unavailable, copy the last bottom-left value */
            if(left == 0)
            {
                for(i = 0; i < nt; i++)
                    pu1_dst[nt + i] = pu1_dst[nt - 1];
            }
            /* If top-left is Unavailable, copy the last left value */
            if(tp_left == 0)
                pu1_dst[two_nt] = pu1_dst[two_nt - 1];
            /* If top is Unavailable, copy the last top-left value */
            if(top == 0)
            {
                for(i = 0; i < nt; i++)
                    pu1_dst[two_nt + 1 + i] = pu1_dst[two_nt];
            }
            /* If top-right is Unavailable, copy the last top value */
            if(tp_right == 0)
            {
                for(i = 0; i < nt; i++)
                    pu1_dst[three_nt + 1 + i] = pu1_dst[three_nt];
            }
        }

        if(nt == 16)
        {
            /* Repack the flag word so that each remaining bit covers 8 samples:
             * pairs of 4-sample bits are OR-free summed into adjacent positions,
             * yielding 4 bits (bl+left), 4 bits (top+tr), 1 bit (top-left) at bit 8. */
            WORD32 nbr_flags_temp = 0;
            nbr_flags_temp = ((nbr_flags & 0xC) >> 2) + ((nbr_flags & 0xC0) >> 4)
                            + ((nbr_flags & 0x300) >> 4)
                            + ((nbr_flags & 0x3000) >> 6)
                            + ((nbr_flags & 0x10000) >> 8);

            /* compute trailing zeros based on nbr_flag for substitution process of below left (see spec section) */
            /* as each bit in nbr flags corresponds to 8 pels for bot_left, left, top and topright but 1 pel for topleft */
            {
                nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 8; /* for below left and left */

                /* look_up_trailing_zeros returns 8 for a zero nibble -> clamp:
                 * all of bottom-left+left (32 samples) unavailable. */
                if(nbr_id_from_bl == 64)
                    nbr_id_from_bl = 32;

                if(nbr_id_from_bl == 32)
                {
                    /* for top left : 1 pel per nbr bit */
                    if(!((nbr_flags_temp >> 8) & 0x1))
                    {
                        nbr_id_from_bl++;
                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 8; /* top and top right;  8 pels per nbr bit */
                        //nbr_id_from_bl += idx * 8;
                    }
                }
                /* Reverse Substitution Process*/
                if(nbr_id_from_bl)
                {
                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
                    pu1_ref = pu1_dst[nbr_id_from_bl];
                    for(i = (nbr_id_from_bl - 1); i >= 0; i--)
                    {
                        pu1_dst[i] = pu1_ref;
                    }
                }
            }

            /* Forward substitution over the remaining 4*nt+1 samples
             * (excluding pixels already fixed by reverse substitution). */
            while(nbr_id_from_bl < ((T16_4NT) + 1))
            {
                /* To obtain the next unavailable idx flag after reverse neighbor substitution */
                /* Divide by 8 to obtain the original flag index */
                frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/

                /* The Top-left flag is at the last bit location of nbr_flags*/
                if(nbr_id_from_bl == (T16_4NT / 2))
                {
                    get_bits = GET_BITS(nbr_flags_temp, 8);

                    /* only 1-pel substitution for TL */
                    if(!get_bits)
                        pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
                }
                else
                {
                    get_bits = GET_BITS(nbr_flags_temp, frwd_nbr_flag);
                    if(!get_bits)
                    {
                        /* 8 pel substitution (other than TL) */
                        pu1_ref = pu1_dst[nbr_id_from_bl - 1];
                        for(i = 0; i < 8; i++)
                            pu1_dst[nbr_id_from_bl + i] = pu1_ref;
                    }

                }
                /* Advance 1 sample across top-left, 8 samples otherwise */
                nbr_id_from_bl += (nbr_id_from_bl == (T16_4NT / 2)) ? 1 : 8;
            }


        }

        if(nt == 32)
        {
            /* For 32x32 the raw flag layout already has 8 pels per bit;
             * compute trailing zeros for the substitution of below-left (see spec section). */
            /* as each bit in nbr flags corresponds to 8 pels for bot_left, left, top and topright but 1 pel for topleft */
            {
                nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 8; /* for below left and left */

                if(nbr_id_from_bl == 64)
                {
                    /* for top left : 1 pel per nbr bit */
                    if(!((nbr_flags >> 16) & 0x1))
                    {
                        /* top left not available */
                        nbr_id_from_bl++;
                        /* top and top right;  8 pels per nbr bit */
                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 8;
                    }
                }
                /* Reverse Substitution Process*/
                if(nbr_id_from_bl)
                {
                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
                    pu1_ref = pu1_dst[nbr_id_from_bl];
                    for(i = (nbr_id_from_bl - 1); i >= 0; i--)
                        pu1_dst[i] = pu1_ref;
                }
            }

            /* Forward substitution over the remaining 4*nt+1 samples
             * (excluding pixels already fixed by reverse substitution). */
            while(nbr_id_from_bl < ((T32_4NT) + 1))
            {
                /* To obtain the next unavailable idx flag after reverse neighbor substitution */
                /* Divide by 8 to obtain the original flag index */
                frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/

                /* The Top-left flag is at the last bit location of nbr_flags*/
                if(nbr_id_from_bl == (T32_4NT / 2))
                {
                    get_bits = GET_BITS(nbr_flags, 16);
                    /* only 1-pel substitution for TL */
                    if(!get_bits)
                        pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
                }
                else
                {
                    get_bits = GET_BITS(nbr_flags, frwd_nbr_flag);
                    if(!get_bits)
                    {
                        /* 8 pel substitution (other than TL) */
                        pu1_ref = pu1_dst[nbr_id_from_bl - 1];
                        for(i = 0; i < 8; i++)
                            pu1_dst[nbr_id_from_bl + i] = pu1_ref;
                    }

                }
                /* Advance 1 sample across top-left, 8 samples otherwise */
                nbr_id_from_bl += (nbr_id_from_bl == (T32_4NT / 2)) ? 1 : 8;
            }
        }

    }
}
    375 
    376 /**
    377 *******************************************************************************
    378 *
    379 * @brief
    380 *    Intra prediction interpolation filter for ref_filtering
    381 *
    382 *
    383 * @par Description:
    384 *    Reference DC filtering for neighboring samples dependent  on TU size and
    385 *    mode  Refer to section 8.4.4.2.3 in the standard
    386 *
    387 * @param[in] pu1_src
    388 *  UWORD8 pointer to the source
    389 *
    390 * @param[out] pu1_dst
    391 *  UWORD8 pointer to the destination
    392 *
    393 * @param[in] nt
    394 *  integer Transform Block size
    395 *
    396 * @param[in] mode
    397 *  integer intraprediction mode
    398 *
    399 * @returns
    400 *
    401 * @remarks
    402 *  None
    403 *
    404 *******************************************************************************
    405 */
    406 
/* Reference-sample smoothing per section 8.4.4.2.3: depending on mode and
 * transform size, either copies the 4*nt+1 reference samples unfiltered,
 * applies the [1 2 1]/4 filter, or (strong smoothing, nt==32 only) replaces
 * them with a bilinear interpolation between the corner samples.
 * pu1_src and pu1_dst may alias; the unfiltered path returns early then. */
void ihevc_intra_pred_ref_filtering_ssse3(UWORD8 *pu1_src,
                                          WORD32 nt,
                                          UWORD8 *pu1_dst,
                                          WORD32 mode,
                                          WORD32 strong_intra_smoothing_enable_flag)
{
    WORD32 filter_flag;
    WORD32 i; /* Generic indexing variable */
    WORD32 four_nt = 4 * nt;
    /* Scratch buffer for the filtered samples (max 4*64 + 1) */
    UWORD8 au1_flt[(4 * MAX_CU_SIZE) + 1];
    WORD32 bi_linear_int_flag = 0;
    WORD32 abs_cond_left_flag = 0;
    WORD32 abs_cond_top_flag = 0;
    /* Strong-smoothing flatness threshold: 1 << (bitDepth - 5) = 8 for 8-bit */
    WORD32 dc_val = 1 << (BIT_DEPTH - 5);
    __m128i src_temp1, src_temp2, src_temp3, src_temp7;
    __m128i src_temp4, src_temp5, src_temp6, src_temp8;

    //WORD32 strong_intra_smoothing_enable_flag  = 1;

    /* Per-mode filter decision table, indexed by log2(nt)-2 bit.
     * NOTE(review): assumes nt is a power of two in {4,8,16,32} — confirm callers. */
    filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));
    if(0 == filter_flag)
    {
        /* No filtering: plain copy of the 4*nt+1 samples (16 bytes at a time).
         * If src and dst alias, nothing to do. */
        if(pu1_src == pu1_dst)
        {
            return;
        }
        else
        {
            if(nt == 4)
            {
                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
                pu1_dst[four_nt] = pu1_src[four_nt];

            }

            else if(nt == 8)
            {

                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16));

                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);


                pu1_dst[four_nt] = pu1_src[four_nt];
            }
            else if(nt == 16)
            {

                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16));
                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_src + 32));
                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 48));

                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);

                pu1_dst[four_nt] = pu1_src[four_nt];
            }
            else if(nt == 32)
            {

                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16));
                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_src + 32));
                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 48));

                src_temp5 = _mm_loadu_si128((__m128i *)(pu1_src + 64));
                src_temp6 = _mm_loadu_si128((__m128i *)(pu1_src + 80));
                src_temp7 = _mm_loadu_si128((__m128i *)(pu1_src + 96));
                src_temp8 = _mm_loadu_si128((__m128i *)(pu1_src + 112));

                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);

                _mm_storeu_si128((__m128i *)(pu1_dst + 64), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + 80), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + 96), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + 112), src_temp8);

                pu1_dst[four_nt] = pu1_src[four_nt];
            }

        }
    }

    else
    {
        /* If strong intra smoothing is enabled and transform size is 32 */
        if((1 == strong_intra_smoothing_enable_flag) && (32 == nt))
        {
            /* Strong Intra Filtering: check both the top and left reference
             * rows for near-linearity (second difference below threshold). */
            abs_cond_top_flag = (abs(pu1_src[2 * nt] + pu1_src[4 * nt]
                            - (2 * pu1_src[3 * nt]))) < dc_val;
            abs_cond_left_flag = (abs(pu1_src[2 * nt] + pu1_src[0]
                            - (2 * pu1_src[nt]))) < dc_val;

            bi_linear_int_flag = ((1 == abs_cond_left_flag)
                            && (1 == abs_cond_top_flag));
        }
        /* Extremities Untouched*/
        au1_flt[0] = pu1_src[0];
        au1_flt[4 * nt] = pu1_src[4 * nt];

        /* Strong filtering of reference samples: bilinear interpolation
         * between the three corner samples (0, 2*nt, 4*nt); nt==32 so the
         * >> 6 divides by 2*nt with rounding. */
        if(1 == bi_linear_int_flag)
        {
            au1_flt[2 * nt] = pu1_src[2 * nt];

            for(i = 1; i < (2 * nt); i++)
                au1_flt[i] = (((2 * nt) - i) * pu1_src[0] + i * pu1_src[2 * nt] + 32) >> 6;

            for(i = 1; i < (2 * nt); i++)
                au1_flt[i + (2 * nt)] = (((2 * nt) - i) * pu1_src[2 * nt] + i * pu1_src[4 * nt] + 32) >> 6;
        }
        else
        {
            /* Normal [1 2 1]/4 smoothing, 16 output pixels per iteration:
             * two 8-lane 16-bit halves computed from overlapping byte loads. */
            __m128i const_value_8x16, zero_8x16b;

            const_value_8x16 = _mm_set1_epi16(2); /* rounding term before >> 2 */

            au1_flt[0] = pu1_src[0];
            au1_flt[4 * nt] = pu1_src[4 * nt];

            zero_8x16b = _mm_setzero_si128();

            /* Perform [1 2 1] filtering of reference samples.
             * NOTE(review): the last iteration loads pu1_src + 8 + i + 15,
             * i.e. up to 7 bytes past the 4*nt+1 valid samples — presumably the
             * caller's buffer is padded; confirm against callers. */
            for(i = 0; i < (four_nt); i += 16)
            {
                /* src_temp1/2/3 hold samples x[i], x[i+1], x[i+2] alignments */
                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src + i));
                src_temp2 = _mm_srli_si128(src_temp1, 1);
                src_temp3 = _mm_srli_si128(src_temp2, 1);

                /* widen low 8 bytes to 16-bit lanes */
                src_temp1 = _mm_unpacklo_epi8(src_temp1, zero_8x16b);
                src_temp2 = _mm_unpacklo_epi8(src_temp2, zero_8x16b);
                src_temp3 = _mm_unpacklo_epi8(src_temp3, zero_8x16b);

                src_temp2 = _mm_slli_epi16(src_temp2,  1); /* centre tap x2 */

                src_temp1 = _mm_add_epi16(src_temp1, src_temp2);
                src_temp1 = _mm_add_epi16(src_temp1, src_temp3);
                src_temp1 = _mm_add_epi16(src_temp1, const_value_8x16);

                src_temp1 = _mm_srai_epi16(src_temp1,  2); /* (a + 2b + c + 2) >> 2 */

                /* second half: samples i+8 .. i+15 */
                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 8 + i));
                src_temp5 = _mm_srli_si128(src_temp4, 1);
                src_temp6 = _mm_srli_si128(src_temp5, 1);

                src_temp4 = _mm_unpacklo_epi8(src_temp4, zero_8x16b);
                src_temp5 = _mm_unpacklo_epi8(src_temp5, zero_8x16b);
                src_temp6 = _mm_unpacklo_epi8(src_temp6, zero_8x16b);

                src_temp5 = _mm_slli_epi16(src_temp5,  1);

                src_temp4 = _mm_add_epi16(src_temp4, src_temp5);
                src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
                src_temp4 = _mm_add_epi16(src_temp4, const_value_8x16);

                src_temp4 = _mm_srai_epi16(src_temp4,  2);

                /* converting 16 bit to 8 bit (with unsigned saturation) */
                src_temp1 = _mm_packus_epi16(src_temp1, src_temp4);

                /* output index is i+1: extremity au1_flt[0] stays unfiltered */
                _mm_storeu_si128((__m128i *)(au1_flt + 1 + i), src_temp1);
            }
            /* restore the last extremity (the vector loop overwrote it) */
            au1_flt[4 * nt] = pu1_src[4 * nt];
        }

        /* Copy the filtered scratch buffer to the destination */
        if(nt == 4)
        {
            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
            pu1_dst[four_nt] = au1_flt[four_nt];
        }
        else if(nt == 8)
        {

            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
            src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16));

            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);

            pu1_dst[four_nt] = au1_flt[four_nt];
        }
        else if(nt == 16)
        {

            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
            src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16));
            src_temp3 = _mm_loadu_si128((__m128i *)(au1_flt + 32));
            src_temp4 = _mm_loadu_si128((__m128i *)(au1_flt + 48));

            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
            _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);

            pu1_dst[four_nt] = au1_flt[four_nt];
        }

        else if(nt == 32)
        {

            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
            src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16));
            src_temp3 = _mm_loadu_si128((__m128i *)(au1_flt + 32));
            src_temp4 = _mm_loadu_si128((__m128i *)(au1_flt + 48));

            src_temp5 = _mm_loadu_si128((__m128i *)(au1_flt + 64));
            src_temp6 = _mm_loadu_si128((__m128i *)(au1_flt + 80));
            src_temp7 = _mm_loadu_si128((__m128i *)(au1_flt + 96));
            src_temp8 = _mm_loadu_si128((__m128i *)(au1_flt + 112));

            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
            _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);

            _mm_storeu_si128((__m128i *)(pu1_dst + 64), src_temp5);
            _mm_storeu_si128((__m128i *)(pu1_dst + 80), src_temp6);
            _mm_storeu_si128((__m128i *)(pu1_dst + 96), src_temp7);
            _mm_storeu_si128((__m128i *)(pu1_dst + 112), src_temp8);

            pu1_dst[four_nt] = au1_flt[four_nt];
        }

    }
}
    643 
    644 /**
    645 *******************************************************************************
    646 *
    647 * @brief
    648 *    Intra prediction interpolation filter for luma planar
    649 *
    650 * @par Description:
    651 *    Planar Intraprediction with reference neighboring samples location
    652 *    pointed by 'pu1_ref' to the TU block location  pointed by 'pu1_dst'  Refer
    653 *    to section 8.4.4.2.4 in the standard
    654 *
    655 * @param[in] pu1_src
    656 *  UWORD8 pointer to the source
    657 *
    658 * @param[out] pu1_dst
    659 *  UWORD8 pointer to the destination
    660 *
    661 * @param[in] src_strd
    662 *  integer source stride
    663 *
    664 * @param[in] dst_strd
    665 *  integer destination stride
    666 *
    667 * @param[in] nt
    668 *  integer Transform Block size
    669 *
    670 * @param[in] mode
    671 *  integer intraprediction mode
    672 *
    673 * @returns
    674 *
    675 * @remarks
    676 *  None
    677 *
    678 *******************************************************************************
    679 */
    680 
    681 
    682 void ihevc_intra_pred_luma_planar_ssse3(UWORD8 *pu1_ref,
    683                                         WORD32 src_strd,
    684                                         UWORD8 *pu1_dst,
    685                                         WORD32 dst_strd,
    686                                         WORD32 nt,
    687                                         WORD32 mode)
    688 {
    689 
    690 
    691     WORD32 row, col;
    692     WORD32 two_nt, three_nt;
    693     UWORD16 temp;
    694 
    695     __m128i pu1_ref_16x8b, const_temp_4x32b, const_temp1_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
    696     __m128i col_8x16b, const_temp5_4x32b, const_temp6_4x32b, zero_8x16b, const_temp7_4x32b, const_temp8_4x32b;
    697     __m128i nt_row_16x8b, nt_row1_16x8b, nt_row2_16x8b, nt_row3_16x8b; //nt-1-row
    698     __m128i row_16x8b, row1_16x8b, row2_16x8b, row3_16x8b; //row+1
    699     UNUSED(src_strd);
    700     UNUSED(mode);
    701 
    702     two_nt = 2 * nt;
    703     three_nt = 3 * nt;
    704 
    705     /* Planar filtering */
    706     temp = pu1_ref[nt - 1];
    707     temp = (temp << 8) | ((UWORD16)pu1_ref[three_nt + 1]);
    708     /* setting vallues in  registera*/
    709     pu1_ref_16x8b  = _mm_set1_epi16(temp);
    710     const_temp6_4x32b = _mm_set1_epi16(nt);
    711 
    712 
    713 
    714     if(nt == 32) /* for nt multiple of 8*/
    715     {
    716 
    717 
    718         const_temp4_4x32b = _mm_set1_epi16(0x0400);
    719         const_temp1_4x32b = _mm_set1_epi16(0x0100);
    720         const_temp8_4x32b = _mm_set1_epi16(0x0008);
    721         //(nt-1-y) (nt-1-x) ; x= 0..15 , y = row
    722         //const_temp5_4x32b = _mm_set_epi8(nt_row, 0,nt_row, 1,nt_row, 2,nt_row, 3,nt_row, 4,nt_row, 5,nt_row, 6,nt_row, 7);
    723         nt_row_16x8b = _mm_set_epi16(0x1f18, 0x1f19, 0x1f1a, 0x1f1b, 0x1f1c, 0x1f1d, 0x1f1e, 0x1f1f);
    724         //(y+1) (x+1) ; x= 0..15 , y = row
    725         //const_temp3_4x32b = _mm_set_epi16(row1,8,row1, 7,row1, 6, row1, 5,row1, 4, row1, 3, row1, 2, row1, 1);
    726         row_16x8b = _mm_set_epi16(0x0108, 0x0107, 0x0106, 0x0105, 0x0104, 0x0103, 0x0102, 0x0101);
    727 
    728         for(row = 0; row < nt; row += 1)
    729         {
    730             __m128i res_temp_8x16b, res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b;
    731             __m128i res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b, res_temp7_8x16b;
    732 
    733             __m128i src_temp_8x16b, src_temp1_8x16b;
    734 
    735 
    736             res_temp1_8x16b  = _mm_set1_epi8(pu1_ref[two_nt - 1 - row]);
    737 
    738             nt_row1_16x8b = _mm_sub_epi16(nt_row_16x8b,  const_temp8_4x32b);
    739             row1_16x8b    = _mm_add_epi16(row_16x8b,     const_temp8_4x32b);
    740             nt_row2_16x8b = _mm_sub_epi16(nt_row1_16x8b, const_temp8_4x32b);
    741             row2_16x8b    = _mm_add_epi16(row1_16x8b,    const_temp8_4x32b);
    742             nt_row3_16x8b = _mm_sub_epi16(nt_row2_16x8b, const_temp8_4x32b);
    743             row3_16x8b    = _mm_add_epi16(row2_16x8b,    const_temp8_4x32b);
    744             /* loding 8bit 16 pixles*/
    745             src_temp_8x16b  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
    746             src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 17));
    747 
    748             res_temp4_8x16b =  _mm_unpacklo_epi8(res_temp1_8x16b, src_temp_8x16b); /* row=0*/
    749             res_temp5_8x16b =  _mm_unpackhi_epi8(res_temp1_8x16b, src_temp_8x16b); /* row=1*/
    750             res_temp6_8x16b =  _mm_unpacklo_epi8(res_temp1_8x16b, src_temp1_8x16b); /* row=2*/
    751             res_temp7_8x16b =  _mm_unpackhi_epi8(res_temp1_8x16b, src_temp1_8x16b); /* row=3*/
    752 
    753             /*(row + 1) * pu1_ref[nt - 1] + (col + 1) * pu1_ref[three_nt + 1] */
    754             res_temp_8x16b  = _mm_maddubs_epi16(pu1_ref_16x8b, row_16x8b);
    755             res_temp1_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row1_16x8b);
    756             res_temp2_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row2_16x8b);
    757             res_temp3_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row3_16x8b);
    758             /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] + (nt - 1 - col)* pu1_ref[two_nt - 1 - row] */
    759             res_temp4_8x16b = _mm_maddubs_epi16(res_temp4_8x16b, nt_row_16x8b);
    760             res_temp5_8x16b = _mm_maddubs_epi16(res_temp5_8x16b, nt_row1_16x8b);
    761             res_temp6_8x16b = _mm_maddubs_epi16(res_temp6_8x16b, nt_row2_16x8b);
    762             res_temp7_8x16b = _mm_maddubs_epi16(res_temp7_8x16b, nt_row3_16x8b);
    763 
    764             res_temp_8x16b  = _mm_add_epi16(res_temp_8x16b, res_temp4_8x16b);
    765             res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp5_8x16b);
    766             res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, res_temp6_8x16b);
    767             res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp7_8x16b);
    768             /*res_temp + nt)*/
    769             res_temp_8x16b  = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b);
    770             res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, const_temp6_4x32b);
    771             res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, const_temp6_4x32b);
    772             res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, const_temp6_4x32b);
    773 
    774             res_temp_8x16b  = _mm_srli_epi16(res_temp_8x16b, 6); //log2(32)+1
    775             res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, 6);
    776             res_temp2_8x16b = _mm_srli_epi16(res_temp2_8x16b, 6);
    777             res_temp3_8x16b = _mm_srli_epi16(res_temp3_8x16b, 6);
    778 
    779             res_temp_8x16b  = _mm_packus_epi16(res_temp_8x16b, res_temp1_8x16b);
    780             res_temp1_8x16b = _mm_packus_epi16(res_temp2_8x16b, res_temp3_8x16b);
    781 
    782 
    783             _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd)), res_temp_8x16b);
    784             _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd) + 16), res_temp1_8x16b);
    785 
    786 
    787             nt_row_16x8b = _mm_sub_epi16(nt_row_16x8b, const_temp1_4x32b);
    788             row_16x8b    = _mm_add_epi16(row_16x8b,    const_temp1_4x32b);
    789         }
    790     }
    791     else if(nt == 16) /* for nt multiple of 8*/
    792     {
    793 
    794         const_temp4_4x32b = _mm_set1_epi16(0x0400);
    795         const_temp1_4x32b = _mm_set1_epi16(0x0100);
    796         const_temp8_4x32b = _mm_set1_epi16(0x0008);
    797         //(nt-1-y) (nt-1-x) ; x= 0..15 , y = row
    798         //const_temp5_4x32b = _mm_set_epi8(nt_row, 0,nt_row, 1,nt_row, 2,nt_row, 3,nt_row, 4,nt_row, 5,nt_row, 6,nt_row, 7);
    799         nt_row_16x8b = _mm_set_epi16(0x0f08, 0x0f09, 0x0f0a, 0x0f0b, 0x0f0c, 0x0f0d, 0x0f0e, 0x0f0f);
    800         //(y+1) (x+1) ; x= 0..15 , y = row
    801         //const_temp3_4x32b = _mm_set_epi16(row1,8,row1, 7,row1, 6, row1, 5,row1, 4, row1, 3, row1, 2, row1, 1);
    802         row_16x8b = _mm_set_epi16(0x0108, 0x0107, 0x0106, 0x0105, 0x0104, 0x0103, 0x0102, 0x0101);
    803 
    804         for(row = 0; row < nt; row += 2)
    805         {
    806             __m128i res_temp_8x16b, res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b;
    807             __m128i res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b, res_temp7_8x16b;
    808 
    809             __m128i src_temp_8x16b;
    810 
    811 
    812             res_temp1_8x16b  = _mm_set1_epi8(pu1_ref[two_nt - 1 - row]);
    813             res_temp2_8x16b  = _mm_set1_epi8(pu1_ref[two_nt - 2 - row]);
    814 
    815 
    816             nt_row1_16x8b = _mm_sub_epi16(nt_row_16x8b,  const_temp1_4x32b);
    817             row1_16x8b    = _mm_add_epi16(row_16x8b,     const_temp1_4x32b);
    818             nt_row2_16x8b = _mm_sub_epi16(nt_row_16x8b,  const_temp8_4x32b);
    819             row2_16x8b    = _mm_add_epi16(row_16x8b,     const_temp8_4x32b);
    820             nt_row3_16x8b = _mm_sub_epi16(nt_row1_16x8b, const_temp8_4x32b);
    821             row3_16x8b    = _mm_add_epi16(row1_16x8b,    const_temp8_4x32b);
    822             /* loding 8bit 16 pixles*/
    823             src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
    824 
    825 
    826             res_temp4_8x16b =  _mm_unpacklo_epi8(res_temp1_8x16b, src_temp_8x16b); /* row=0*/
    827             res_temp5_8x16b =  _mm_unpacklo_epi8(res_temp2_8x16b, src_temp_8x16b); /* row=1*/
    828             res_temp6_8x16b =  _mm_unpackhi_epi8(res_temp1_8x16b, src_temp_8x16b); /* row=2*/
    829             res_temp7_8x16b =  _mm_unpackhi_epi8(res_temp2_8x16b, src_temp_8x16b); /* row=3*/
    830 
    831             /*(row + 1) * pu1_ref[nt - 1] + (col + 1) * pu1_ref[three_nt + 1] */
    832             res_temp_8x16b  = _mm_maddubs_epi16(pu1_ref_16x8b, row_16x8b);
    833             res_temp1_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row1_16x8b);
    834             res_temp2_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row2_16x8b);
    835             res_temp3_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row3_16x8b);
    836             /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] + (nt - 1 - col)* pu1_ref[two_nt - 1 - row] */
    837             res_temp4_8x16b = _mm_maddubs_epi16(res_temp4_8x16b, nt_row_16x8b);
    838             res_temp5_8x16b = _mm_maddubs_epi16(res_temp5_8x16b, nt_row1_16x8b);
    839             res_temp6_8x16b = _mm_maddubs_epi16(res_temp6_8x16b, nt_row2_16x8b);
    840             res_temp7_8x16b = _mm_maddubs_epi16(res_temp7_8x16b, nt_row3_16x8b);
    841 
    842             res_temp_8x16b  = _mm_add_epi16(res_temp_8x16b, res_temp4_8x16b);
    843             res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp5_8x16b);
    844             res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, res_temp6_8x16b);
    845             res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp7_8x16b);
    846             /*res_temp + nt)*/
    847             res_temp_8x16b  = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b);
    848             res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, const_temp6_4x32b);
    849             res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, const_temp6_4x32b);
    850             res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, const_temp6_4x32b);
    851 
    852             res_temp_8x16b  = _mm_srli_epi16(res_temp_8x16b, 5); //log2(16)+1
    853             res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, 5);
    854             res_temp2_8x16b = _mm_srli_epi16(res_temp2_8x16b, 5);
    855             res_temp3_8x16b = _mm_srli_epi16(res_temp3_8x16b, 5);
    856 
    857             res_temp_8x16b  = _mm_packus_epi16(res_temp_8x16b, res_temp2_8x16b);
    858             res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, res_temp3_8x16b);
    859 
    860             _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd)), res_temp_8x16b);
    861             _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), res_temp1_8x16b);
    862 
    863             nt_row_16x8b = _mm_sub_epi16(nt_row1_16x8b, const_temp1_4x32b);
    864             row_16x8b    = _mm_add_epi16(row1_16x8b,    const_temp1_4x32b);
    865         }
    866     }
    867     else if(nt == 8)
    868     {
    869 
    870 
    871         const_temp4_4x32b = _mm_set1_epi16(0x0400);
    872         const_temp1_4x32b = _mm_set1_epi16(0x0100);
    873         zero_8x16b = _mm_set1_epi32(0);
    874 
    875         //(nt-1-y) (nt-1-x) ; x= 0..7 , y = row
    876         //const_temp5_4x32b = _mm_set_epi8(nt_row, 0,nt_row, 1,nt_row, 2,nt_row, 3,nt_row, 4,nt_row, 5,nt_row, 6,nt_row, 7);
    877         nt_row_16x8b = _mm_set_epi16(0x0700, 0x0701, 0x0702, 0x0703, 0x0704, 0x0705, 0x0706, 0x0707);
    878         //(y+1) (x+1) ; x= 0..7 , y = row
    879         //const_temp3_4x32b = _mm_set_epi16(row1,8,row1, 7,row1, 6, row1, 5,row1, 4, row1, 3, row1, 2, row1, 1);
    880         row_16x8b = _mm_set_epi16(0x0108, 0x0107, 0x0106, 0x0105, 0x0104, 0x0103, 0x0102, 0x0101);
    881 
    882         for(row = 0; row < nt; row += 4)
    883         {
    884             __m128i res_temp_8x16b, res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b;
    885             __m128i res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b, res_temp7_8x16b;
    886 
    887             __m128i src_temp_8x16b;
    888 
    889 
    890             res_temp4_8x16b  = _mm_set1_epi8(pu1_ref[two_nt - 1 - row]);
    891             res_temp5_8x16b  = _mm_set1_epi8(pu1_ref[two_nt - 2 - row]);
    892             res_temp6_8x16b  = _mm_set1_epi8(pu1_ref[two_nt - 3 - row]);
    893             res_temp7_8x16b  = _mm_set1_epi8(pu1_ref[two_nt - 4 - row]);
    894 
    895             nt_row1_16x8b = _mm_sub_epi16(nt_row_16x8b,  const_temp1_4x32b);
    896             row1_16x8b    = _mm_add_epi16(row_16x8b,     const_temp1_4x32b);
    897             nt_row2_16x8b = _mm_sub_epi16(nt_row1_16x8b, const_temp1_4x32b);
    898             row2_16x8b    = _mm_add_epi16(row1_16x8b,    const_temp1_4x32b);
    899             nt_row3_16x8b = _mm_sub_epi16(nt_row2_16x8b, const_temp1_4x32b);
    900             row3_16x8b    = _mm_add_epi16(row2_16x8b,    const_temp1_4x32b);
    901             /* loding 8bit 16 pixles*/
    902             src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
    903 
    904             res_temp4_8x16b =  _mm_unpacklo_epi8(res_temp4_8x16b, src_temp_8x16b); /* row=0*/
    905             res_temp5_8x16b =  _mm_unpacklo_epi8(res_temp5_8x16b, src_temp_8x16b); /* row=1*/
    906             res_temp6_8x16b =  _mm_unpacklo_epi8(res_temp6_8x16b, src_temp_8x16b); /* row=2*/
    907             res_temp7_8x16b =  _mm_unpacklo_epi8(res_temp7_8x16b, src_temp_8x16b); /* row=3*/
    908 
    909             /*(row + 1) * pu1_ref[nt - 1] + (col + 1) * pu1_ref[three_nt + 1] */
    910             res_temp_8x16b  = _mm_maddubs_epi16(pu1_ref_16x8b, row_16x8b);
    911             res_temp1_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row1_16x8b);
    912             res_temp2_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row2_16x8b);
    913             res_temp3_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row3_16x8b);
    914             /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] + (nt - 1 - col)* pu1_ref[two_nt - 1 - row] */
    915             res_temp4_8x16b = _mm_maddubs_epi16(res_temp4_8x16b, nt_row_16x8b);
    916             res_temp5_8x16b = _mm_maddubs_epi16(res_temp5_8x16b, nt_row1_16x8b);
    917             res_temp6_8x16b = _mm_maddubs_epi16(res_temp6_8x16b, nt_row2_16x8b);
    918             res_temp7_8x16b = _mm_maddubs_epi16(res_temp7_8x16b, nt_row3_16x8b);
    919 
    920             res_temp_8x16b  = _mm_add_epi16(res_temp_8x16b, res_temp4_8x16b);
    921             res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp5_8x16b);
    922             res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, res_temp6_8x16b);
    923             res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp7_8x16b);
    924             /*res_temp + nt)*/
    925             res_temp_8x16b  = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b);
    926             res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, const_temp6_4x32b);
    927             res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, const_temp6_4x32b);
    928             res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, const_temp6_4x32b);
    929 
    930             res_temp_8x16b  = _mm_srli_epi16(res_temp_8x16b, 4); //log2(16)+1
    931             res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, 4);
    932             res_temp2_8x16b = _mm_srli_epi16(res_temp2_8x16b, 4);
    933             res_temp3_8x16b = _mm_srli_epi16(res_temp3_8x16b, 4);
    934 
    935             res_temp_8x16b  = _mm_packus_epi16(res_temp_8x16b, zero_8x16b);
    936             res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, zero_8x16b);
    937             res_temp2_8x16b = _mm_packus_epi16(res_temp2_8x16b, zero_8x16b);
    938             res_temp3_8x16b = _mm_packus_epi16(res_temp3_8x16b, zero_8x16b);
    939 
    940             _mm_storel_epi64((__m128i *)(pu1_dst + (row * dst_strd)), res_temp_8x16b);
    941             _mm_storel_epi64((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), res_temp1_8x16b);
    942             _mm_storel_epi64((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), res_temp2_8x16b);
    943             _mm_storel_epi64((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), res_temp3_8x16b);
    944 
    945             nt_row_16x8b = _mm_sub_epi16(nt_row3_16x8b, const_temp1_4x32b);
    946             row_16x8b    = _mm_add_epi16(row3_16x8b,    const_temp1_4x32b);
    947         }
    948     }
    949     else
    950     {
    951 
    952         /* for nt multiple of 4*/
    953         const_temp7_4x32b = _mm_set1_epi16(4);
    954         const_temp4_4x32b = _mm_set1_epi16(nt - 1);
    955         const_temp_4x32b  = _mm_set1_epi16(pu1_ref[three_nt + 1]);
    956         const_temp1_4x32b = _mm_set1_epi16(pu1_ref[nt - 1]);
    957         zero_8x16b = _mm_set1_epi32(0);
    958 
    959         for(row = 0; row < nt; row++)
    960         {
    961             __m128i res_temp_8x16b, row_8x16b, res_temp1_8x16b, res_temp2_8x16b;
    962             __m128i res_temp3_8x16b;
    963 
    964             const_temp2_4x32b  = _mm_set1_epi16(pu1_ref[two_nt - 1 - row]);
    965             const_temp3_4x32b  = _mm_set1_epi16((row + 1));
    966 
    967 
    968             row_8x16b = _mm_set1_epi16((nt - 1 - row));
    969 
    970             const_temp5_4x32b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0);
    971             col_8x16b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
    972 
    973             const_temp5_4x32b = _mm_sub_epi16(const_temp4_4x32b, const_temp5_4x32b);
    974 
    975             /*(row + 1) * pu1_ref[nt - 1]*/
    976             res_temp_8x16b  = _mm_mullo_epi16(const_temp3_4x32b,  const_temp1_4x32b);
    977 
    978             /*(row + 1) * pu1_ref[nt - 1] + nt)*/
    979             res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b);
    980 
    981             for(col = 0; col < nt; col += 4)
    982             {
    983                 __m128i src_temp_8x16b;
    984                 int temp1;
    985 
    986                 /* loding 8bit 16 pixles*/
    987                 src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + col));
    988 
    989                 src_temp_8x16b =  _mm_unpacklo_epi8(src_temp_8x16b, zero_8x16b); /* row=0*/
    990 
    991                 /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] */
    992                 res_temp1_8x16b  = _mm_mullo_epi16(src_temp_8x16b,  row_8x16b);
    993 
    994                 /*(col + 1) * pu1_ref[three_nt + 1]*/
    995                 res_temp2_8x16b  = _mm_mullo_epi16(const_temp_4x32b,  col_8x16b);
    996 
    997                 /*(nt - 1 - col)* pu1_ref[two_nt - 1 - row]*/
    998                 res_temp3_8x16b  = _mm_mullo_epi16(const_temp2_4x32b,  const_temp5_4x32b);
    999 
   1000                 res_temp1_8x16b = _mm_add_epi16(res_temp_8x16b, res_temp1_8x16b);
   1001                 res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
   1002                 res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp3_8x16b);
   1003 
   1004                 res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, 3); //log2(16)+1
   1005                 res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, zero_8x16b);
   1006 
   1007                 temp1 = _mm_cvtsi128_si32(res_temp1_8x16b);
   1008 
   1009                 *(WORD32 *)(&pu1_dst[(row * dst_strd) + col]) = temp1;
   1010 
   1011                 const_temp5_4x32b = _mm_sub_epi16(const_temp5_4x32b, const_temp7_4x32b);
   1012                 col_8x16b = _mm_add_epi16(col_8x16b, const_temp7_4x32b);
   1013             } /* inner loop ends here */
   1014         }
   1015     }
   1016 
   1017 
   1018 }
   1019 
   1020 /**
   1021 *******************************************************************************
   1022 *
   1023 * @brief
   1024 *    Intra prediction interpolation filter for luma dc
   1025 *
   1026 * @par Description:
    1027 *   Intra prediction for DC mode, with reference neighboring samples located
    1028 *   at 'pu1_ref', predicting the TU block located at 'pu1_dst'. Refer
    1029 *   to section 8.4.4.2.5 in the standard.
   1030 *
    1031 * @param[in] pu1_ref
    1032 *  UWORD8 pointer to the source (reference neighboring samples)
   1033 *
   1034 * @param[out] pu1_dst
   1035 *  UWORD8 pointer to the destination
   1036 *
   1037 * @param[in] src_strd
   1038 *  integer source stride
   1039 *
   1040 * @param[in] dst_strd
   1041 *  integer destination stride
   1042 *
   1043 * @param[in] nt
   1044 *  integer Transform Block size
   1045 *
   1046 * @param[in] mode
   1047 *  integer intraprediction mode
   1048 *
   1049 * @returns
   1050 *
   1051 * @remarks
   1052 *  None
   1053 *
   1054 *******************************************************************************
   1055 */
   1056 
   1057 void ihevc_intra_pred_luma_dc_ssse3(UWORD8 *pu1_ref,
   1058                                     WORD32 src_strd,
   1059                                     UWORD8 *pu1_dst,
   1060                                     WORD32 dst_strd,
   1061                                     WORD32 nt,
   1062                                     WORD32 mode)
   1063 {
   1064 
   1065     WORD32 acc_dc;
   1066     WORD32 dc_val, two_dc_val, three_dc_val;
   1067     WORD32 row;
   1068     WORD32 log2nt = 5;
   1069     WORD32 two_nt, three_nt;
   1070     __m128i src_temp1, src_temp7, src_temp3, src_temp4, src_temp5, src_temp6;
   1071     __m128i src_temp8, src_temp10, src_temp2;
   1072     __m128i m_zero = _mm_setzero_si128();
   1073     __m128i sm = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASK5[0]);
   1074     UNUSED(src_strd);
   1075     UNUSED(mode);
   1076 
   1077 
   1078     switch(nt)
   1079     {
   1080         case 32:
   1081             log2nt = 5;
   1082             break;
   1083         case 16:
   1084             log2nt = 4;
   1085             break;
   1086         case 8:
   1087             log2nt = 3;
   1088             break;
   1089         case 4:
   1090             log2nt = 2;
   1091             break;
   1092         default:
   1093             break;
   1094     }
   1095     two_nt = 2 * nt;
   1096     three_nt = 3 * nt;
   1097 
   1098     acc_dc = 0;
   1099     /* Calculate DC value for the transform block */
   1100 
   1101 
   1102 
   1103     if(nt == 32)
   1104     {
   1105         __m128i temp;
   1106         WORD32 itr_count;
   1107 
   1108         src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt));
   1109         src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 16));
   1110         src_temp7 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 32));
   1111         src_temp8 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 48));
   1112 
   1113         src_temp3 = _mm_sad_epu8(src_temp3, m_zero);
   1114         src_temp4 = _mm_sad_epu8(src_temp4, m_zero);
   1115         src_temp7 = _mm_sad_epu8(src_temp7, m_zero);
   1116         src_temp8 = _mm_sad_epu8(src_temp8, m_zero);
   1117 
   1118         src_temp4 = _mm_add_epi16(src_temp3, src_temp4);
   1119         src_temp8 = _mm_add_epi16(src_temp7, src_temp8);
   1120         src_temp4 = _mm_add_epi16(src_temp4, src_temp8);
   1121 
   1122         src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
   1123         src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
   1124 
   1125         acc_dc = _mm_cvtsi128_si32(src_temp4);
   1126 
   1127         acc_dc += pu1_ref[three_nt];
   1128         acc_dc -= pu1_ref[two_nt];
   1129 
   1130         /* computing acc_dc value */
   1131         dc_val = (acc_dc + nt) >> (log2nt + 1);
   1132 
   1133         two_dc_val = 2 * dc_val;
   1134         three_dc_val = 3 * dc_val;
   1135 
   1136         temp = _mm_set1_epi8(dc_val);
   1137 
   1138         for(itr_count = 0; itr_count < 2; itr_count++)
   1139         {
   1140             /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
   1141             _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), temp);
   1142             _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), temp);
   1143             _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), temp);
   1144             _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), temp);
   1145             _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), temp);
   1146             _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), temp);
   1147             _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), temp);
   1148             _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), temp);
   1149 
   1150             _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), temp);
   1151             _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), temp);
   1152             _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), temp);
   1153             _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), temp);
   1154             _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), temp);
   1155             _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), temp);
   1156             _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), temp);
   1157             _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), temp);
   1158 
   1159             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((0) * dst_strd)), temp);
   1160             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((1) * dst_strd)), temp);
   1161             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((2) * dst_strd)), temp);
   1162             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((3) * dst_strd)), temp);
   1163             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((4) * dst_strd)), temp);
   1164             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((5) * dst_strd)), temp);
   1165             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((6) * dst_strd)), temp);
   1166             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((7) * dst_strd)), temp);
   1167 
   1168             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((8) * dst_strd)), temp);
   1169             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((9) * dst_strd)), temp);
   1170             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((10) * dst_strd)), temp);
   1171             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((11) * dst_strd)), temp);
   1172             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((12) * dst_strd)), temp);
   1173             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((13) * dst_strd)), temp);
   1174             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((14) * dst_strd)), temp);
   1175             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((15) * dst_strd)), temp);
   1176 
   1177             pu1_dst += 16 * dst_strd;
   1178         }
   1179     }
   1180     else
   1181 
   1182     {
   1183         __m128i sm1 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASK4[0]);
   1184 
   1185         /* DC filtering for the first top row and first left column */
   1186 
   1187 
   1188 
   1189         if(nt == 4) /* nt multiple of 4*/
   1190         {
   1191             WORD32 temp1, temp2, temp3;
   1192 
   1193             src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt));
   1194             src_temp2 =  _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
   1195 
   1196             src_temp4 =  _mm_unpacklo_epi8(src_temp3, m_zero);
   1197             src_temp2 =  _mm_unpacklo_epi8(src_temp2, m_zero);
   1198 
   1199             src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
   1200             src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
   1201             src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
   1202 
   1203             acc_dc = _mm_cvtsi128_si32(src_temp4);
   1204             acc_dc += pu1_ref[three_nt];
   1205             acc_dc -= pu1_ref[two_nt];
   1206 
   1207 /* computing acc_dc value */
   1208 
   1209             dc_val = (acc_dc + nt) >> (log2nt + 1);
   1210 
   1211             three_dc_val = 3 * dc_val;
   1212 
   1213             /* loding 8-bit 16 pixel */
   1214             src_temp1 = _mm_set1_epi16(three_dc_val + 2);
   1215             two_dc_val = 2 * dc_val;
   1216 
   1217             /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */
   1218             src_temp2 = _mm_add_epi16(src_temp2, src_temp1);
   1219 
   1220             /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2) >> 2 */
   1221             src_temp2 = _mm_srli_epi16(src_temp2, 2);
   1222 
   1223             src_temp2 = _mm_packus_epi16(src_temp2, m_zero);
   1224 
   1225             temp1 = _mm_cvtsi128_si32(src_temp2);
   1226 
   1227             *(WORD32 *)(&pu1_dst[0]) = temp1;
   1228 
   1229             src_temp2 = _mm_insert_epi16(src_temp2, dc_val, 0);
   1230 
   1231             src_temp2 =  _mm_shuffle_epi8(src_temp2, sm1);
   1232             src_temp3 =  _mm_shuffle_epi8(src_temp2, sm1);
   1233             src_temp4 =  _mm_shuffle_epi8(src_temp2, sm1);
   1234 
   1235             temp1 = _mm_cvtsi128_si32(src_temp2);
   1236             temp2 = _mm_cvtsi128_si32(src_temp3);
   1237             temp3 = _mm_cvtsi128_si32(src_temp4);
   1238 
   1239             *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp1;
   1240             *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp2;
   1241             *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp3;
   1242 
   1243             /*  retore  first value*/
   1244             pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
   1245                             >> 2);
   1246 
   1247             for(row = 1; row < nt; row++)
   1248                 pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
   1249                                 >> 2;
   1250 
   1251         }
   1252         else if(nt == 8) /* if nt%8==0*/
   1253         {
   1254 
   1255             src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt));
   1256 
   1257             src_temp4 = _mm_sad_epu8(src_temp3, m_zero);
   1258             src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
   1259             src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
   1260 
   1261             acc_dc = _mm_cvtsi128_si32(src_temp4);
   1262 
   1263             acc_dc += pu1_ref[three_nt];
   1264             acc_dc -= pu1_ref[two_nt];
   1265 
   1266             /* computing acc_dc value */
   1267 
   1268             dc_val = (acc_dc + nt) >> (log2nt + 1);
   1269 
   1270             three_dc_val = 3 * dc_val;
   1271             src_temp1 = _mm_set1_epi16(three_dc_val + 2);
   1272             two_dc_val = 2 * dc_val;
   1273 
   1274             /* loding 8-bit 16 pixel */
   1275             src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
   1276             src_temp2 =  _mm_unpacklo_epi8(src_temp2, m_zero);
   1277 
   1278             /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */
   1279             src_temp2 = _mm_add_epi16(src_temp2, src_temp1);
   1280 
   1281             /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2)>>2 */
   1282             src_temp2 = _mm_srli_epi16(src_temp2, 2);
   1283             src_temp2 = _mm_packus_epi16(src_temp2, m_zero);
   1284 
   1285             _mm_storel_epi64((__m128i *)(pu1_dst), src_temp2);
   1286 
   1287             /* Fill the remaining rows with DC value*/
   1288 
   1289             src_temp1 = _mm_set1_epi8(dc_val);
   1290             src_temp2 = _mm_set1_epi8(dc_val);
   1291             src_temp3 = _mm_set1_epi8(dc_val);
   1292             src_temp4 = _mm_set1_epi8(dc_val);
   1293             src_temp5 = _mm_set1_epi8(dc_val);
   1294             src_temp6 = _mm_set1_epi8(dc_val);
   1295             src_temp7 = _mm_set1_epi8(dc_val);
   1296 
   1297             _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
   1298             _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
   1299             _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
   1300             _mm_storel_epi64((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
   1301             _mm_storel_epi64((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
   1302             _mm_storel_epi64((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
   1303             _mm_storel_epi64((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);
   1304 
   1305             /*  retore  first value*/
   1306             pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
   1307                             >> 2);
   1308 
   1309             for(row = 1; row < nt; row++)
   1310                 pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
   1311                                 >> 2;
   1312 
   1313         }
   1314         else /* if nt == 16*/
   1315         {
   1316 
   1317             src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt));
   1318             src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + nt + 16));
   1319 
   1320             src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
   1321             src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 8));
   1322 
   1323             src_temp3 = _mm_sad_epu8(src_temp3, m_zero);
   1324             src_temp4 = _mm_sad_epu8(src_temp4, m_zero);
   1325 
   1326             src_temp2  =  _mm_unpacklo_epi8(src_temp2, m_zero);
   1327             src_temp10 =  _mm_unpacklo_epi8(src_temp10, m_zero);
   1328 
   1329             src_temp4 = _mm_add_epi16(src_temp3, src_temp4);
   1330             src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
   1331             src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
   1332 
   1333             acc_dc = _mm_cvtsi128_si32(src_temp4);
   1334 
   1335             acc_dc += pu1_ref[three_nt];
   1336             acc_dc -= pu1_ref[two_nt];
   1337 
   1338             /* computing acc_dc value */
   1339 
   1340             dc_val = (acc_dc + nt) >> (log2nt + 1);
   1341 
   1342             three_dc_val = 3 * dc_val;
   1343             src_temp1 = _mm_set1_epi16(three_dc_val + 2);
   1344             two_dc_val = 2 * dc_val;
   1345 
   1346             /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */
   1347             src_temp2 = _mm_add_epi16(src_temp2, src_temp1);
   1348             src_temp10 = _mm_add_epi16(src_temp10, src_temp1);
   1349             /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2)>>2 */
   1350             src_temp2 = _mm_srli_epi16(src_temp2, 2);
   1351             src_temp10 = _mm_srli_epi16(src_temp10, 2);
   1352 
   1353             src_temp2 = _mm_packus_epi16(src_temp2, src_temp10);
   1354 
   1355             _mm_storeu_si128((__m128i *)(pu1_dst), src_temp2);
   1356 
   1357             /* Fill the remaining rows with DC value*/
   1358             src_temp1 =  _mm_set1_epi8(dc_val);
   1359             src_temp2 =  _mm_set1_epi8(dc_val);
   1360             src_temp3 =  _mm_set1_epi8(dc_val);
   1361             src_temp4 =  _mm_set1_epi8(dc_val);
   1362             src_temp5 =  _mm_set1_epi8(dc_val);
   1363             src_temp6 =  _mm_set1_epi8(dc_val);
   1364             src_temp7 =  _mm_set1_epi8(dc_val);
   1365 
   1366             for(row = 1; row < nt; row += 8)
   1367             {
   1368 
   1369                 _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
   1370                 _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
   1371                 _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
   1372                 _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
   1373                 _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
   1374                 _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
   1375                 _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);
   1376 
   1377                 _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), src_temp1);
   1378                 _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), src_temp2);
   1379                 _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), src_temp3);
   1380 
   1381                 _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), src_temp4);
   1382                 _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), src_temp5);
   1383                 _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), src_temp6);
   1384                 _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), src_temp7);
   1385 
   1386                 _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), src_temp1);
   1387 
   1388             }
   1389 
            /*  restore first value */
   1391             pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
   1392                             >> 2);
   1393 
   1394             for(row = 1; row < nt; row++)
   1395                 pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
   1396                                 >> 2;
   1397 
   1398         }
   1399     }
   1400 }
   1401 
   1402 /**
   1403 *******************************************************************************
   1404 *
   1405 * @brief
   1406 *     Intra prediction interpolation filter for horizontal luma variable.
   1407 *
   1408 * @par Description:
   1409 *      Horizontal intraprediction(mode 10) with reference  samples location
   1410 *      pointed by 'pu1_ref' to the TU block  location pointed by 'pu1_dst'  Refer
   1411 *      to section 8.4.4.2.6 in the standard (Special case)
   1412 *
* @param[in] pu1_ref
*  UWORD8 pointer to the reference samples
   1415 *
   1416 * @param[out] pu1_dst
   1417 *  UWORD8 pointer to the destination
   1418 *
   1419 * @param[in] src_strd
   1420 *  integer source stride
   1421 *
   1422 * @param[in] dst_strd
   1423 *  integer destination stride
   1424 *
   1425 * @param[in] nt
   1426 *  integer Transform Block size
   1427 *
   1428 * @param[in] mode
   1429 *  integer intraprediction mode
   1430 *
   1431 * @returns
   1432 *
   1433 * @remarks
   1434 *  None
   1435 *
   1436 *******************************************************************************
   1437 */
   1438 
void ihevc_intra_pred_luma_horz_ssse3(UWORD8 *pu1_ref,
                                      WORD32 src_strd,
                                      UWORD8 *pu1_dst,
                                      WORD32 dst_strd,
                                      WORD32 nt,
                                      WORD32 mode)
{
    /* Horizontal (mode 10) luma prediction: every output row is filled with
     * the left-neighbour reference sample pu1_ref[two_nt - 1 - row].
     * For nt < 32 the first row is additionally gradient-filtered from the
     * top reference row (special case of section 8.4.4.2.6); for nt == 32
     * no filtering is applied. */

    WORD32 row;
    WORD32 two_nt;
    UNUSED(src_strd);
    UNUSED(mode);

    two_nt = 2 * nt;


    if(nt == 32)
    {
        __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8;
        __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
        /* NOTE(review): IHEVCE_SHUFFLEMASK4 appears to broadcast lane 0 across
         * the whole register (each shifted copy below has its target sample in
         * byte 0) -- confirm against the mask's definition. */
        __m128i sm = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASK4[0]);

        for(row = 0; row < nt; row += 16)
        {
            {
                /* 16 left-neighbour samples for rows row..row+15; the sample
                 * for the topmost of these rows is the highest loaded byte. */
                src_temp1 =  _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 1 - row - 15));

                /* Byte-shift so each row's sample lands in lane 0 */
                src_temp2 =  _mm_srli_si128(src_temp1, 1);
                src_temp3 =  _mm_srli_si128(src_temp1, 2);
                src_temp4 =  _mm_srli_si128(src_temp1, 3);
                src_temp5 =  _mm_srli_si128(src_temp1, 4);
                src_temp6 =  _mm_srli_si128(src_temp1, 5);
                src_temp7 =  _mm_srli_si128(src_temp1, 6);
                src_temp8 =  _mm_srli_si128(src_temp1, 7);

                src_temp9 =  _mm_srli_si128(src_temp1, 8);
                src_temp10 =  _mm_srli_si128(src_temp1, 9);
                src_temp11 =  _mm_srli_si128(src_temp1, 10);
                src_temp12 =  _mm_srli_si128(src_temp1, 11);
                src_temp13 =  _mm_srli_si128(src_temp1, 12);
                src_temp14 =  _mm_srli_si128(src_temp1, 13);
                src_temp15 =  _mm_srli_si128(src_temp1, 14);
                src_temp16 =  _mm_srli_si128(src_temp1, 15);

                /* Broadcast each isolated sample across all 16 lanes */
                src_temp8 =  _mm_shuffle_epi8(src_temp8, sm);
                src_temp7 =  _mm_shuffle_epi8(src_temp7, sm);
                src_temp6 =  _mm_shuffle_epi8(src_temp6, sm);
                src_temp5 =  _mm_shuffle_epi8(src_temp5, sm);
                src_temp4 =  _mm_shuffle_epi8(src_temp4, sm);
                src_temp3 =  _mm_shuffle_epi8(src_temp3, sm);
                src_temp2 =  _mm_shuffle_epi8(src_temp2, sm);
                src_temp1 =  _mm_shuffle_epi8(src_temp1, sm);

                src_temp16 =  _mm_shuffle_epi8(src_temp16, sm);
                src_temp15 =  _mm_shuffle_epi8(src_temp15, sm);
                src_temp14 =  _mm_shuffle_epi8(src_temp14, sm);
                src_temp13 =  _mm_shuffle_epi8(src_temp13, sm);
                src_temp12 =  _mm_shuffle_epi8(src_temp12, sm);
                src_temp11 =  _mm_shuffle_epi8(src_temp11, sm);
                src_temp10 =  _mm_shuffle_epi8(src_temp10, sm);
                src_temp9 =  _mm_shuffle_epi8(src_temp9, sm);

                /* Columns 0-15: rows written in reverse of load order, since
                 * pu1_ref holds the left column bottom-to-top */
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp16);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp15);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp14);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp13);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp12);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp11);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp10);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp9);

                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 8) * dst_strd)), src_temp8);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 9) * dst_strd)), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 10) * dst_strd)), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 11) * dst_strd)), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 12) * dst_strd)), src_temp4);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 13) * dst_strd)), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 14) * dst_strd)), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 15) * dst_strd)), src_temp1);

                /* Columns 16-31: same broadcast value per row */
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 0) * dst_strd)), src_temp16);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 1) * dst_strd)), src_temp15);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 2) * dst_strd)), src_temp14);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 3) * dst_strd)), src_temp13);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 4) * dst_strd)), src_temp12);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 5) * dst_strd)), src_temp11);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 6) * dst_strd)), src_temp10);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 7) * dst_strd)), src_temp9);

                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 8) * dst_strd)), src_temp8);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 9) * dst_strd)), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 10) * dst_strd)), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 11) * dst_strd)), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 12) * dst_strd)), src_temp4);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 13) * dst_strd)), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 14) * dst_strd)), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 15) * dst_strd)), src_temp1);

            }

        }

    }
    else

    {
        __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6;
        __m128i src_temp10, zero_8x16b, src_temp7;

        /* DC filtering for the first top row and first left column */

        zero_8x16b = _mm_set1_epi16(0);

        /* Filtering done for the 1st row:
         * pu1_dst[col] = pu1_ref[two_nt - 1]
         *              + ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1) */

        src_temp2 =  _mm_set1_epi16(pu1_ref[two_nt - 1]);
        src_temp10 =  _mm_set1_epi16(pu1_ref[two_nt]);

        /* loading 16 8-bit pixels of the top reference row */
        src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));

        /* widen low 8 pixels to 16-bit for signed arithmetic */
        src_temp4 =  _mm_unpacklo_epi8(src_temp4, zero_8x16b);

        /*(pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt])*/
        src_temp3 = _mm_sub_epi16(src_temp4, src_temp10);

        /* ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1)*/
        src_temp3 = _mm_srai_epi16(src_temp3, 1);

        /* pu1_ref[two_nt - 1]+((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1)*/
        src_temp3 = _mm_add_epi16(src_temp2, src_temp3);

        if(nt == 4)
        {
            int temp1, temp2, temp3;
            /* pack the 4 filtered first-row pixels back to bytes and store */
            src_temp3 = _mm_packus_epi16(src_temp3, zero_8x16b);
            temp1 = _mm_cvtsi128_si32(src_temp3);

            *(WORD32 *)(&pu1_dst[0]) = temp1;

            /* broadcast the left neighbour of each remaining row */
            src_temp2 =  _mm_set1_epi8(pu1_ref[two_nt - 2]);
            src_temp3 =  _mm_set1_epi8(pu1_ref[two_nt - 3]);
            src_temp4 =  _mm_set1_epi8(pu1_ref[two_nt - 4]);

            temp1 = _mm_cvtsi128_si32(src_temp2);
            temp2 = _mm_cvtsi128_si32(src_temp3);
            temp3 = _mm_cvtsi128_si32(src_temp4);

            /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp1;
            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp2;
            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp3;

        }
        else if(nt == 8)
        {
            /* filtered first row, packed to 8 bytes */
            src_temp10 = _mm_packus_epi16(src_temp3, zero_8x16b);


            /* broadcast the left neighbour of each of rows 1..7 */
            src_temp1 =  _mm_set1_epi8(pu1_ref[two_nt - 2]);
            src_temp2 =  _mm_set1_epi8(pu1_ref[two_nt - 3]);
            src_temp3 =  _mm_set1_epi8(pu1_ref[two_nt - 4]);
            src_temp4 =  _mm_set1_epi8(pu1_ref[two_nt - 5]);
            src_temp5 =  _mm_set1_epi8(pu1_ref[two_nt - 6]);
            src_temp6 =  _mm_set1_epi8(pu1_ref[two_nt - 7]);
            src_temp7 =  _mm_set1_epi8(pu1_ref[two_nt - 8]);

            _mm_storel_epi64((__m128i *)(pu1_dst), src_temp10);

            /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
            _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
            _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp2);
            _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp3);
            _mm_storel_epi64((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp4);
            _mm_storel_epi64((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp5);
            _mm_storel_epi64((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp6);
            _mm_storel_epi64((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp7);

        }
        else if(nt == 16)
        {
            /* filter the second 8 pixels of the first row the same way */
            src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 8));
            src_temp4 =  _mm_unpacklo_epi8(src_temp4, zero_8x16b);
            //src_temp4 =  _mm_cvtepu8_epi16 (src_temp4);

            src_temp10 = _mm_sub_epi16(src_temp4, src_temp10);
            src_temp10 = _mm_srai_epi16(src_temp10, 1);
            src_temp10 = _mm_add_epi16(src_temp2, src_temp10);

            /* both filtered halves packed into one 16-byte first row */
            src_temp3 = _mm_packus_epi16(src_temp3, src_temp10);
            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp3);

            /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
            src_temp1 =  _mm_set1_epi8(pu1_ref[two_nt - 2]);
            src_temp2 =  _mm_set1_epi8(pu1_ref[two_nt - 3]);
            src_temp3 =  _mm_set1_epi8(pu1_ref[two_nt - 4]);
            src_temp4 =  _mm_set1_epi8(pu1_ref[two_nt - 5]);
            src_temp5 =  _mm_set1_epi8(pu1_ref[two_nt - 6]);
            src_temp6 =  _mm_set1_epi8(pu1_ref[two_nt - 7]);
            src_temp7 =  _mm_set1_epi8(pu1_ref[two_nt - 8]);
            src_temp10 =  _mm_set1_epi8(pu1_ref[two_nt - 9]);

            _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), src_temp10);

            src_temp1 =  _mm_set1_epi8(pu1_ref[two_nt - 10]);
            src_temp2 =  _mm_set1_epi8(pu1_ref[two_nt - 11]);
            src_temp3 =  _mm_set1_epi8(pu1_ref[two_nt - 12]);
            src_temp4 =  _mm_set1_epi8(pu1_ref[two_nt - 13]);
            src_temp5 =  _mm_set1_epi8(pu1_ref[two_nt - 14]);
            src_temp6 =  _mm_set1_epi8(pu1_ref[two_nt - 15]);
            src_temp7 =  _mm_set1_epi8(pu1_ref[two_nt - 16]);

            _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), src_temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), src_temp3);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), src_temp4);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), src_temp5);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), src_temp6);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), src_temp7);

        }
    }
}
   1669 
   1670 
   1671 /**
   1672 *******************************************************************************
   1673 *
   1674 * @brief
   1675 *     Intra prediction interpolation filter for vertical luma variable.
   1676 *
   1677 * @par Description:
*    Vertical intraprediction with reference neighboring samples location
   1679 *    pointed by 'pu1_ref' to the TU block  location pointed by 'pu1_dst'  Refer
   1680 *    to section 8.4.4.2.6 in the standard (Special case)
   1681 *
* @param[in] pu1_ref
*  UWORD8 pointer to the reference samples
   1684 *
   1685 * @param[out] pu1_dst
   1686 *  UWORD8 pointer to the destination
   1687 *
   1688 * @param[in] src_strd
   1689 *  integer source stride
   1690 *
   1691 * @param[in] dst_strd
   1692 *  integer destination stride
   1693 *
   1694 * @param[in] nt
   1695 *  integer Transform Block size
   1696 *
   1697 * @param[in] mode
   1698 *  integer intraprediction mode
   1699 *
   1700 * @returns
   1701 *
   1702 * @remarks
   1703 *  None
   1704 *
   1705 *******************************************************************************
   1706 */
   1707 
   1708 
   1709 void ihevc_intra_pred_luma_ver_ssse3(UWORD8 *pu1_ref,
   1710                                      WORD32 src_strd,
   1711                                      UWORD8 *pu1_dst,
   1712                                      WORD32 dst_strd,
   1713                                      WORD32 nt,
   1714                                      WORD32 mode)
   1715 {
   1716     WORD32 row;
   1717     WORD16 s2_predpixel;
   1718     WORD32 two_nt = 2 * nt;
   1719     __m128i src_temp0, src_temp2;
   1720     UNUSED(src_strd);
   1721     UNUSED(mode);
   1722 
   1723 
   1724     if(nt == 32)
   1725     {
   1726         __m128i temp1, temp2;
   1727         WORD32 itr_count;
   1728 
   1729         temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
   1730         temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 16));
   1731 
   1732         for(itr_count = 0; itr_count < 2; itr_count++)
   1733         {
   1734             /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
   1735             _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), temp1);
   1736             _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), temp1);
   1737             _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), temp1);
   1738             _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), temp1);
   1739             _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), temp1);
   1740             _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), temp1);
   1741             _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), temp1);
   1742             _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), temp1);
   1743 
   1744             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((0) * dst_strd)), temp2);
   1745             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((1) * dst_strd)), temp2);
   1746             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((2) * dst_strd)), temp2);
   1747             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((3) * dst_strd)), temp2);
   1748             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((4) * dst_strd)), temp2);
   1749             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((5) * dst_strd)), temp2);
   1750             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((6) * dst_strd)), temp2);
   1751             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((7) * dst_strd)), temp2);
   1752 
   1753             _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), temp1);
   1754             _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), temp1);
   1755             _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), temp1);
   1756             _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), temp1);
   1757             _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), temp1);
   1758             _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), temp1);
   1759             _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), temp1);
   1760             _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), temp1);
   1761 
   1762             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((8) * dst_strd)), temp2);
   1763             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((9) * dst_strd)), temp2);
   1764             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((10) * dst_strd)), temp2);
   1765             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((11) * dst_strd)), temp2);
   1766             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((12) * dst_strd)), temp2);
   1767             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((13) * dst_strd)), temp2);
   1768             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((14) * dst_strd)), temp2);
   1769             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((15) * dst_strd)), temp2);
   1770 
   1771             pu1_dst += 16 * dst_strd;
   1772         }
   1773     }
   1774     else
   1775     {
   1776         /* Replication to next columns*/
   1777 
   1778         if(nt == 4)
   1779         {
   1780             int temp1;
   1781 
   1782             src_temp2 =   _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
   1783 
   1784             temp1 = _mm_cvtsi128_si32(src_temp2);
   1785 
   1786             /* loding 4-bit 8 pixels values */
   1787             *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp1;
   1788             *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp1;
   1789             *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp1;
   1790             *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp1;
   1791 
   1792         }
   1793         else if(nt == 8)
   1794         {
   1795 
   1796             src_temp0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
   1797 
   1798             _mm_storel_epi64((__m128i *)(pu1_dst + ((0) * dst_strd)), src_temp0);
   1799             _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp0);
   1800             _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp0);
   1801             _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp0);
   1802             _mm_storel_epi64((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp0);
   1803             _mm_storel_epi64((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp0);
   1804             _mm_storel_epi64((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp0);
   1805             _mm_storel_epi64((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp0);
   1806 
   1807 
   1808         }
   1809         else if(nt == 16)
   1810         {
   1811             for(row = 0; row < nt; row += 8)
   1812             {
   1813 
   1814                 src_temp0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
   1815 
   1816                 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp0);
   1817                 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp0);
   1818                 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp0);
   1819                 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp0);
   1820                 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp0);
   1821                 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp0);
   1822                 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp0);
   1823                 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp0);
   1824 
   1825             }
   1826 
   1827         }
   1828 
   1829         /*Filtering done for the 1st column */
   1830         for(row = nt - 1; row >= 0; row--)
   1831         {
   1832             s2_predpixel = pu1_ref[two_nt + 1]
   1833                             + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1);
   1834             pu1_dst[row * dst_strd] = CLIP_U8(s2_predpixel);
   1835         }
   1836 
   1837 
   1838     }
   1839 }
   1840 
   1841 /**
   1842 *******************************************************************************
   1843 *
   1844 * @brief
   1845 *     Intra prediction interpolation filter for luma mode2.
   1846 *
   1847 * @par Description:
   1848 *    Intraprediction for mode 2 (sw angle) with reference  neighboring samples
   1849 *    location pointed by 'pu1_ref' to the  TU block location pointed by
   1850 *    'pu1_dst'  Refer to section 8.4.4.2.6 in the standard
   1851 *
* @param[in] pu1_ref
*  UWORD8 pointer to the reference samples
   1854 *
   1855 * @param[out] pu1_dst
   1856 *  UWORD8 pointer to the destination
   1857 *
   1858 * @param[in] src_strd
   1859 *  integer source stride
   1860 *
   1861 * @param[in] dst_strd
   1862 *  integer destination stride
   1863 *
   1864 * @param[in] nt
   1865 *  integer Transform Block size
   1866 *
   1867 * @param[in] mode
   1868 *  integer intraprediction mode
   1869 *
   1870 * @returns
   1871 *
   1872 * @remarks
   1873 *  None
   1874 *
   1875 *******************************************************************************
   1876 */
   1877 
   1878 void ihevc_intra_pred_luma_mode2_ssse3(UWORD8 *pu1_ref,
   1879                                        WORD32 src_strd,
   1880                                        UWORD8 *pu1_dst,
   1881                                        WORD32 dst_strd,
   1882                                        WORD32 nt,
   1883                                        WORD32 mode)
   1884 {
   1885     WORD32 row, col;
   1886     WORD32 two_nt = 2 * nt;
   1887 
   1888     __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8;
   1889     __m128i   sm1, sm2, sm3;
   1890     UNUSED(src_strd);
   1891     UNUSED(mode);
   1892 
   1893 
   1894     sm1 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY1[0]);
   1895     sm2 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY2[0]);
   1896     sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY3[0]);
   1897 
   1898     /* For the angle 45, replication is done from the corresponding angle */
   1899     /* intra_pred_ang = tan(angle) in q5 format */
   1900 
   1901     if(nt == 4)
   1902     {
   1903         int temp1, temp2, temp3, temp4;
   1904 
   1905         /*pu1_ref[two_nt - row - (col+1) - 1]*/
   1906         src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 8));
   1907         src_temp2 = _mm_srli_si128(src_temp1, 1);
   1908         src_temp3 = _mm_srli_si128(src_temp1, 2);
   1909         src_temp4 = _mm_srli_si128(src_temp1, 3);
   1910 
   1911         src_temp4 = _mm_shuffle_epi8(src_temp4, sm1);
   1912         src_temp3 = _mm_shuffle_epi8(src_temp3, sm1);
   1913         src_temp2 = _mm_shuffle_epi8(src_temp2, sm1);
   1914         src_temp1 = _mm_shuffle_epi8(src_temp1, sm1);
   1915 
   1916         temp1 = _mm_cvtsi128_si32(src_temp4);
   1917         temp2 = _mm_cvtsi128_si32(src_temp3);
   1918         temp3 = _mm_cvtsi128_si32(src_temp2);
   1919         temp4 = _mm_cvtsi128_si32(src_temp1);
   1920 
   1921         /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
   1922         *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp1;
   1923         *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp2;
   1924         *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp3;
   1925         *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp4;
   1926 
   1927 
   1928     }
   1929     else if(nt == 8)
   1930     {
   1931         /*pu1_ref[two_nt - row - (col+1) - 1]*/
   1932         src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 16));
   1933         src_temp2 = _mm_srli_si128(src_temp1, 1);
   1934         src_temp3 = _mm_srli_si128(src_temp1, 2);
   1935         src_temp4 = _mm_srli_si128(src_temp1, 3);
   1936         src_temp5 = _mm_srli_si128(src_temp1, 4);
   1937         src_temp6 = _mm_srli_si128(src_temp1, 5);
   1938         src_temp7 = _mm_srli_si128(src_temp1, 6);
   1939         src_temp8 = _mm_srli_si128(src_temp1, 7);
   1940 
   1941         src_temp1 = _mm_shuffle_epi8(src_temp1, sm2);
   1942         src_temp2 = _mm_shuffle_epi8(src_temp2, sm2);
   1943         src_temp3 = _mm_shuffle_epi8(src_temp3, sm2);
   1944         src_temp4 = _mm_shuffle_epi8(src_temp4, sm2);
   1945         src_temp5 = _mm_shuffle_epi8(src_temp5, sm2);
   1946         src_temp6 = _mm_shuffle_epi8(src_temp6, sm2);
   1947         src_temp7 = _mm_shuffle_epi8(src_temp7, sm2);
   1948         src_temp8 = _mm_shuffle_epi8(src_temp8, sm2);
   1949 
   1950         _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp8);
   1951         _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp7);
   1952         _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp6);
   1953         _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp5);
   1954         _mm_storel_epi64((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp4);
   1955         _mm_storel_epi64((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp3);
   1956         _mm_storel_epi64((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp2);
   1957         _mm_storel_epi64((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);
   1958 
   1959     }
   1960     else
   1961     {
   1962         for(row = 0; row < nt; row += 8)
   1963         {
   1964             for(col = 0; col < nt; col += 16)
   1965             {   /*pu1_ref[two_nt - row - (col+1) - 1]*/
   1966 
   1967                 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 0) - (col + 16) - 1));
   1968                 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 1) - (col + 16) - 1));
   1969                 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 2) - (col + 16) - 1));
   1970                 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 3) - (col + 16) - 1));
   1971                 src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 4) - (col + 16) - 1));
   1972                 src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 5) - (col + 16) - 1));
   1973                 src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 6) - (col + 16) - 1));
   1974                 src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 7) - (col + 16) - 1));
   1975 
   1976                 src_temp1 = _mm_shuffle_epi8(src_temp1, sm3);
   1977                 src_temp2 = _mm_shuffle_epi8(src_temp2, sm3);
   1978                 src_temp3 = _mm_shuffle_epi8(src_temp3, sm3);
   1979                 src_temp4 = _mm_shuffle_epi8(src_temp4, sm3);
   1980                 src_temp5 = _mm_shuffle_epi8(src_temp5, sm3);
   1981                 src_temp6 = _mm_shuffle_epi8(src_temp6, sm3);
   1982                 src_temp7 = _mm_shuffle_epi8(src_temp7, sm3);
   1983                 src_temp8 = _mm_shuffle_epi8(src_temp8, sm3);
   1984 
   1985                 _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 0) * dst_strd)), src_temp1);
   1986                 _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 1) * dst_strd)), src_temp2);
   1987                 _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 2) * dst_strd)), src_temp3);
   1988                 _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 3) * dst_strd)), src_temp4);
   1989                 _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 4) * dst_strd)), src_temp5);
   1990                 _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 5) * dst_strd)), src_temp6);
   1991                 _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 6) * dst_strd)), src_temp7);
   1992                 _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 7) * dst_strd)), src_temp8);
   1993             }
   1994         }
   1995     }
   1996 
   1997 }
   1998 
   1999 /**
   2000 *******************************************************************************
   2001 *
   2002 * @brief
   2003 *    Intra prediction interpolation filter for luma mode 18 & mode 34.
   2004 *
   2005 * @par Description:
   2006 *    Intraprediction for mode 34 (ne angle) and  mode 18 (nw angle) with
   2007 *    reference  neighboring samples location pointed by 'pu1_ref' to the  TU
   2008 *    block location pointed by 'pu1_dst'
   2009 *
* @param[in] pu1_ref
*  UWORD8 pointer to the reference (neighbouring samples) array
   2012 *
   2013 * @param[out] pu1_dst
   2014 *  UWORD8 pointer to the destination
   2015 *
   2016 * @param[in] src_strd
   2017 *  integer source stride
   2018 *
   2019 * @param[in] dst_strd
   2020 *  integer destination stride
   2021 *
   2022 * @param[in] nt
   2023 *  integer Transform Block size
   2024 *
   2025 * @param[in] mode
   2026 *  integer intraprediction mode
   2027 *
   2028 * @returns
   2029 *
   2030 * @remarks
   2031 *  None
   2032 *
   2033 *******************************************************************************
   2034 */
   2035 
/* Copies the 45-degree diagonal reference samples into the nt x nt block.
 * Mode 34 (NE diagonal): destination row r is the nt bytes starting at
 * pu1_ref[two_nt + r + 2] — the source window slides one byte forward per row.
 * Mode 18 (NW diagonal): destination row r is the nt bytes starting at
 * pu1_ref[two_nt - r] — the window slides one byte backward per row.
 * No interpolation is required for either mode, so each nt (4/8/16/32) gets a
 * dedicated fully-unrolled copy path. */
void ihevc_intra_pred_luma_mode_18_34_ssse3(UWORD8 *pu1_ref,
                                            WORD32 src_strd,
                                            UWORD8 *pu1_dst,
                                            WORD32 dst_strd,
                                            WORD32 nt,
                                            WORD32 mode)
{
    WORD32 row;
    WORD32 two_nt = 2 * nt; /* index of the top-left neighbour inside pu1_ref */
    __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8;
    UNUSED(src_strd);
    if(mode == 34)
    {
        if(nt == 4)
        {

            int temp1, temp2, temp3, temp4;

            /* Row r is pu1_ref[two_nt + r + 2 .. two_nt + r + 5]; only the
             * low 4 bytes of each 16-byte load are used. */
            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 2));
            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 3));
            src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 4));
            src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 5));

            temp1 = _mm_cvtsi128_si32(src_temp1);
            temp2 = _mm_cvtsi128_si32(src_temp2);
            temp3 = _mm_cvtsi128_si32(src_temp3);
            temp4 = _mm_cvtsi128_si32(src_temp4);

            /* 4-byte row stores via type-punned WORD32 pointers; relies on
             * x86 tolerating unaligned 32-bit accesses. */
            *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp1;
            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp2;
            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp3;
            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp4;

        }
        else if(nt == 8)
        {
            /* Row r is the 8 bytes starting at pu1_ref[two_nt + r + 2]. */
            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 2));
            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 3));
            src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 4));
            src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 5));
            src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 6));
            src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 7));
            src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 8));
            src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 9));

            /* Low 8 bytes of each register -> one destination row. */
            _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
            _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
            _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
            _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);
            _mm_storel_epi64((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp5);
            _mm_storel_epi64((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp6);
            _mm_storel_epi64((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp7);
            _mm_storel_epi64((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp8);

        }
        else if(nt == 16)
        {
            /* 8 rows per iteration; row (row+i) is the 16 bytes starting at
             * pu1_ref[two_nt + (row+i) + 2]. */
            for(row = 0; row < nt; row += 8)
            {
                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 0) + 2));
                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 1) + 2));
                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 2) + 2));
                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 3) + 2));
                src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 4) + 2));
                src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 5) + 2));
                src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 6) + 2));
                src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 7) + 2));

                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp4);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp8);


            }
        }
        else
        {
            /* nt == 32: each row needs two 16-byte stores (columns 0..15 and
             * 16..31). Row offsets are fixed at 0..7 inside the loop; the
             * pointers themselves advance by 8 rows per iteration. */
            __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
            for(row = 0; row < nt; row += 8)
            {
                /* Rows 0..3 of this 8-row strip. */
                src_temp1  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (0 + 0) + 2));
                src_temp9  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (0 + 16) + 2));
                src_temp2  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (1 + 0) + 2));
                src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (1 + 16) + 2));
                src_temp3  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (2 + 0) + 2));
                src_temp11 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (2 + 16) + 2));
                src_temp4  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (3 + 0) + 2));
                src_temp12 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (3 + 16) + 2));

                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (0 * dst_strd)), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp9);
                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (1 * dst_strd)), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp10);
                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (2 * dst_strd)), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp11);
                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (3 * dst_strd)), src_temp4);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp12);

                /* Rows 4..7 of this 8-row strip. */
                src_temp5  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (4 + 0) + 2));
                src_temp13 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (4 + 16) + 2));
                src_temp6  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (5 + 0) + 2));
                src_temp14 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (5 + 16) + 2));
                src_temp7  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (6 + 0) + 2));
                src_temp15 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (6 + 16) + 2));
                src_temp8  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (7 + 0) + 2));
                src_temp16 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (7 + 16) + 2));

                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (4 * dst_strd)), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp13);
                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (5 * dst_strd)), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp14);
                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (6 * dst_strd)), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp15);
                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (7 * dst_strd)), src_temp8);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp16);

                /* Advance to the next 8-row strip: source window moves 8
                 * bytes forward, destination 8 rows down. */
                pu1_ref += 8;
                pu1_dst += 8 * dst_strd;
            }
        }
    }
    else
    {
        /* Mode 18: row r copies the nt bytes starting at pu1_ref[two_nt - r]. */
        if(nt == 4)
        {
            int temp1, temp2, temp3, temp4;

            /* One load covers pu1_ref[two_nt-3 .. two_nt+12]; byte shifts of
             * 1/2/3 move the start of each row's 4-byte window forward, so
             * src_temp4 (shift 3) starts at two_nt (row 0) and src_temp1
             * (shift 0) starts at two_nt-3 (row 3). */
            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 3));
            src_temp2 = _mm_srli_si128(src_temp1, 1);
            src_temp3 = _mm_srli_si128(src_temp1, 2);
            src_temp4 = _mm_srli_si128(src_temp1, 3);

            temp1 = _mm_cvtsi128_si32(src_temp4);
            temp2 = _mm_cvtsi128_si32(src_temp3);
            temp3 = _mm_cvtsi128_si32(src_temp2);
            temp4 = _mm_cvtsi128_si32(src_temp1);

            /* 4-byte row stores via type-punned WORD32 pointers; relies on
             * x86 tolerating unaligned 32-bit accesses. */
            *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp1;
            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp2;
            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp3;
            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp4;

        }
        else if(nt == 8)
        {
            /* One load covers pu1_ref[two_nt-7 ..]; shift k gives the window
             * for row (7-k), i.e. start two_nt - (7-k). */
            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 7));
            src_temp2 = _mm_srli_si128(src_temp1, 1);
            src_temp3 = _mm_srli_si128(src_temp1, 2);
            src_temp4 = _mm_srli_si128(src_temp1, 3);
            src_temp5 = _mm_srli_si128(src_temp1, 4);
            src_temp6 = _mm_srli_si128(src_temp1, 5);
            src_temp7 = _mm_srli_si128(src_temp1, 6);
            src_temp8 = _mm_srli_si128(src_temp1, 7);

            _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp8);
            _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp7);
            _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp6);
            _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp5);
            _mm_storel_epi64((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp4);
            _mm_storel_epi64((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp3);
            _mm_storel_epi64((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp2);
            _mm_storel_epi64((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);


        }
        else if(nt == 16)
        {
            /* 8 rows per iteration; row (row+i) is the 16 bytes starting at
             * pu1_ref[two_nt - (row+i)]. */
            for(row = 0; row < nt; row += 8)
            {
                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 0)));
                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 1)));
                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 2)));
                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 3)));
                src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 4)));
                src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 5)));
                src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 6)));
                src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 7)));

                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp4);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp8);

            }

        }
        else
        {
            /* nt == 32: two 16-byte stores per row; row offsets fixed at
             * 0..7 inside the loop, pointers advanced by 8 rows (source
             * window moves 8 bytes backward) per iteration. */
            __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
            for(row = 0; row < nt; row += 8)
            {
                /* Rows 0..3 of this 8-row strip. */
                src_temp1  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 0 + 0));
                src_temp9  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 0 + 16));
                src_temp2  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 1 + 0));
                src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 1 + 16));
                src_temp3  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 2 + 0));
                src_temp11 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 2 + 16));
                src_temp4  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 3 + 0));
                src_temp12 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 3 + 16));

                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (0 * dst_strd)), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp9);
                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (1 * dst_strd)), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp10);
                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (2 * dst_strd)), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp11);
                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (3 * dst_strd)), src_temp4);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp12);

                /* Rows 4..7 of this 8-row strip. */
                src_temp5  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 4 + 0));
                src_temp13 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 4 + 16));
                src_temp6  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 5 + 0));
                src_temp14 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 5 + 16));
                src_temp7  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 6 + 0));
                src_temp15 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 6 + 16));
                src_temp8  = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 7 + 0));
                src_temp16 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 7 + 16));

                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (4 * dst_strd)), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp13);
                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (5 * dst_strd)), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp14);
                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (6 * dst_strd)), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp15);
                _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (7 * dst_strd)), src_temp8);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp16);

                pu1_ref -= 8;
                pu1_dst += 8 * dst_strd;
            }
        }
    }
}
   2288 
   2289 
   2290 /**
   2291 *******************************************************************************
   2292 *
   2293 * @brief
   2294 *    Intra prediction interpolation filter for luma mode 3 to mode 9
   2295 *
   2296 * @par Description:
   2297 *    Intraprediction for mode 3 to 9  (positive angle, horizontal mode ) with
   2298 *    reference  neighboring samples location pointed by 'pu1_ref' to the  TU
   2299 *    block location pointed by 'pu1_dst'
   2300 *
* @param[in] pu1_ref
*  UWORD8 pointer to the reference (neighbouring samples) array
   2303 *
   2304 * @param[out] pu1_dst
   2305 *  UWORD8 pointer to the destination
   2306 *
   2307 * @param[in] src_strd
   2308 *  integer source stride
   2309 *
   2310 * @param[in] dst_strd
   2311 *  integer destination stride
   2312 *
   2313 * @param[in] nt
   2314 *  integer Transform Block size
   2315 *
   2316 * @param[in] mode
   2317 *  integer intraprediction mode
   2318 *
   2319 * @returns
   2320 *
   2321 * @remarks
   2322 *  None
   2323 *
   2324 *******************************************************************************
   2325 */
   2326 
   2327 void ihevc_intra_pred_luma_mode_3_to_9_ssse3(UWORD8 *pu1_ref,
   2328                                              WORD32 src_strd,
   2329                                              UWORD8 *pu1_dst,
   2330                                              WORD32 dst_strd,
   2331                                              WORD32 nt,
   2332                                              WORD32 mode)
   2333 {
   2334     WORD32 row, col;
   2335     WORD32 two_nt = 2 * nt;
   2336     WORD32 intra_pred_ang;
   2337 
   2338 
   2339     __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b, zero_8x16b;
   2340     __m128i fract_4x32b, intra_pred_ang_4x32b;
   2341     __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b, sm3;
   2342     UNUSED(src_strd);
   2343 
   2344     /* Intra Pred Angle according to the mode */
   2345     intra_pred_ang = gai4_ihevc_ang_table[mode];
   2346 
   2347     /* For the angles other then 45 degree, interpolation btw 2 neighboring */
   2348     /* samples dependent on distance to obtain destination sample */
   2349 
   2350     /* For the angles other then 45 degree, interpolation btw 2 neighboring */
   2351     /* samples dependent on distance to obtain destination sample */
   2352 
   2353     const_temp_4x32b  = _mm_set1_epi16(16);
   2354     const_temp2_4x32b = _mm_set1_epi32(31);
   2355     const_temp3_4x32b = _mm_set1_epi32(32);
   2356     const_temp4_4x32b = _mm_set1_epi32(4);
   2357 
   2358     two_nt_4x32b = _mm_set1_epi32(two_nt - nt);
   2359 
   2360 
   2361     sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
   2362 
   2363     /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
   2364     intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
   2365 
   2366     row_4x32b = _mm_set_epi32(4, 3, 2, 1);
   2367 
   2368     if(nt == 4)
   2369     {
   2370 
   2371         WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
   2372         int temp11, temp21, temp31, temp41;
   2373         // WORD8  ai1_fract_temp_val[16], ai1_row_temp_val[16];
   2374 
   2375         __m128i fract1_8x16b, fract2_8x16b, sign_8x16b;
   2376         __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
   2377 
   2378         __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
   2379         __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b; //, src_temp8_8x16b;
   2380         __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2;
   2381 
   2382         row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1);
   2383         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   2384 
   2385         /* pos = ((row + 1) * intra_pred_ang); */
   2386         res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   2387         zero_8x16b      = _mm_setzero_si128();
   2388         sign_8x16b      = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b);
   2389         res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b);
   2390 
   2391         /* idx = pos >> 5; */
   2392         fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   2393 
   2394         /* fract = pos & (31); */
   2395         ref_main_idx_4x32b = _mm_sub_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b,  5));
   2396 
   2397         /*(32 - fract) */
   2398         row_4x32b = _mm_sub_epi32(const_temp3_4x32b, fract_4x32b);
   2399 
   2400         fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
   2401         fract2_8x16b = _mm_slli_epi16(row_4x32b, 8);
   2402 
   2403         fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
   2404         row_4x32b = _mm_or_si128(row_4x32b, fract2_8x16b); /*(32 - fract) */
   2405 
   2406         fract2_8x16b = _mm_unpackhi_epi8(row_4x32b, fract_4x32b);
   2407         fract1_8x16b = _mm_unpacklo_epi8(row_4x32b, fract_4x32b);
   2408 
   2409         temp1_8x16b =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
   2410         temp2_8x16b =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
   2411         temp3_8x16b =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
   2412         temp4_8x16b =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
   2413 
   2414         ref_main_temp0 = _mm_srli_si128(ref_main_idx_4x32b, 4);  /* next 32 bit values */
   2415         ref_main_temp1 = _mm_srli_si128(ref_main_idx_4x32b, 8);  /* next 32 bit values */
   2416         ref_main_temp2 = _mm_srli_si128(ref_main_idx_4x32b, 12); /* next 32 bit values */
   2417         ref_main_idx1  = _mm_cvtsi128_si32(ref_main_idx_4x32b);    /* col=0*/
   2418         ref_main_idx2  = _mm_cvtsi128_si32(ref_main_temp0);  /* col=1*/
   2419         ref_main_idx3  = _mm_cvtsi128_si32(ref_main_temp1);  /* col=2*/
   2420         ref_main_idx4  = _mm_cvtsi128_si32(ref_main_temp2);  /* col=3*/
   2421 
   2422         /* loding 8-bit 16 pixels */
   2423         src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1 - 1)); /* col=0*/
   2424         src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2 - 1)); /* col=1*/
   2425         src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3 - 1)); /* col=2*/
   2426         src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4 - 1)); /* col=3*/
   2427 
   2428         src_temp1_8x16b =  _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/
   2429         src_temp2_8x16b =  _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/
   2430         src_temp3_8x16b =  _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/
   2431         src_temp4_8x16b =  _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/
   2432 
   2433         /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   2434         src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
   2435         src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
   2436         src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
   2437         src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
   2438 
   2439         /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   2440         src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
   2441         src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
   2442         src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
   2443         src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
   2444 
   2445         /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   2446         src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
   2447         src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
   2448         src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
   2449         src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
   2450 
   2451         /* converting 16 bit to 8 bit */
   2452         src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
   2453         src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
   2454 
   2455 
   2456         src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
   2457         src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
   2458 
   2459         src_temp3_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
   2460         src_temp2_8x16b = _mm_srli_si128(src_temp3_8x16b, 4);
   2461         src_temp1_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
   2462         src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 12);
   2463 
   2464         temp11 = _mm_cvtsi128_si32(src_temp7_8x16b);
   2465         temp21 = _mm_cvtsi128_si32(src_temp1_8x16b);
   2466         temp31 = _mm_cvtsi128_si32(src_temp2_8x16b);
   2467         temp41 = _mm_cvtsi128_si32(src_temp3_8x16b);
   2468 
   2469         /* loding 4-bit 8 pixels values */
   2470         *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
   2471         *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
   2472         *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
   2473         *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;
   2474 
   2475     }
   2476 
   2477     else if(nt == 16 || nt == 32)
   2478     {
   2479         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   2480         row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
   2481         const_temp2_4x32b = _mm_set1_epi16(31);
   2482         const_temp4_4x32b = _mm_set1_epi16(8);
   2483         const_temp3_4x32b = _mm_set1_epi16(32);
   2484         two_nt_4x32b = _mm_set1_epi16(two_nt);
   2485 
   2486         for(col = 0; col < nt; col += 8)
   2487         {
   2488             WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
   2489             WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
   2490             //WORD8  ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
   2491 
   2492             __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
   2493 
   2494             __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
   2495             __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
   2496 
   2497             /* pos = ((row + 1) * intra_pred_ang); */
   2498             res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   2499 
   2500             /* idx = pos >> 5; */
   2501             fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   2502 
   2503             /*(32 - fract) */
   2504             fract2_8x16b =  _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
   2505 
   2506             fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
   2507             fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
   2508 
   2509             fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
   2510             fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
   2511 
   2512 
   2513             fract8_8x16b = _mm_unpackhi_epi8(fract2_8x16b, fract_4x32b);
   2514             fract_4x32b = _mm_unpacklo_epi8(fract2_8x16b, fract_4x32b);
   2515 
   2516             temp1_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x00);
   2517             temp2_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x55);
   2518             temp3_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xaa);
   2519             temp4_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xff);
   2520 
   2521             temp11_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x00);
   2522             temp12_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x55);
   2523             temp13_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xaa);
   2524             temp14_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xff);
   2525 
   2526             /* fract = pos & (31); */
   2527             ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
   2528 
   2529             row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
   2530 
   2531             pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
   2532             pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
   2533             pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
   2534             pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
   2535 
   2536             pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=5*/
   2537             pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=6*/
   2538             pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=7*/
   2539             pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=8*/
   2540 
   2541             for(row = 0; row < nt; row += 8)
   2542             {
   2543                 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
   2544                 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
   2545 
   2546 
   2547                 __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
   2548                 __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
   2549 
   2550                 /* loding 8-bit 16 pixels */
   2551                 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 1 - (8 + row))); /* col=0*/
   2552                 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 1 - (8 + row))); /* col=1*/
   2553                 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 1 - (8 + row))); /* col=2*/
   2554                 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 1 - (8 + row))); /* col=3*/
   2555 
   2556                 /* loding 8-bit 16 pixels */
   2557                 src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - 1 - (8 + row))); /* col=5*/
   2558                 src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - 1 - (8 + row))); /* col=6*/
   2559                 src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - 1 - (8 + row))); /* col=7*/
   2560                 src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - 1 - (8 + row))); /* col=8*/
   2561 
   2562                 src_temp1_8x16b =  _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/
   2563                 src_temp2_8x16b =  _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/
   2564                 src_temp3_8x16b =  _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/
   2565                 src_temp4_8x16b =  _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/
   2566 
   2567                 src_temp11_8x16b =  _mm_shuffle_epi8(src_temp11_8x16b, sm3); /* col=0*/
   2568                 src_temp12_8x16b =  _mm_shuffle_epi8(src_temp12_8x16b, sm3); /* col=1*/
   2569                 src_temp13_8x16b =  _mm_shuffle_epi8(src_temp13_8x16b, sm3); /* col=2*/
   2570                 src_temp14_8x16b =  _mm_shuffle_epi8(src_temp14_8x16b, sm3); /* col=3*/
   2571 
   2572                 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   2573                 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
   2574                 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
   2575                 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
   2576                 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
   2577 
   2578                 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   2579                 src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
   2580                 src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
   2581                 src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
   2582                 src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
   2583 
   2584                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   2585                 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
   2586                 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
   2587                 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
   2588                 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
   2589 
   2590                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   2591                 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
   2592                 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
   2593                 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
   2594                 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
   2595 
   2596                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   2597                 src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
   2598                 src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
   2599                 src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
   2600                 src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
   2601 
   2602                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   2603                 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=5*/
   2604                 src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=6*/
   2605                 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=7*/
   2606                 src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=8*/
   2607 
   2608                 /* converting 16 bit to 8 bit */
   2609                 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
   2610                 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
   2611 
   2612                 /* converting 16 bit to 8 bit */
   2613                 src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/
   2614                 src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/
   2615 
   2616                 src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
   2617                 src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
   2618 
   2619                 src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
   2620                 src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
   2621 
   2622                 src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
   2623                 src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
   2624 
   2625                 src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
   2626                 src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
   2627 
   2628                 src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
   2629                 src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
   2630 
   2631                 src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
   2632                 src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
   2633 
   2634                 src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
   2635                 src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
   2636 
   2637                 src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
   2638                 src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
   2639 
   2640                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp1_8x16b);       /* row=7*/
   2641 
   2642                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp5_8x16b);       /* row=6*/
   2643 
   2644                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp2_8x16b);       /* row=5*/
   2645 
   2646                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp6_8x16b);       /* row=4*/
   2647 
   2648                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp3_8x16b);       /* row=3*/
   2649 
   2650                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp7_8x16b);       /* row=2*/
   2651 
   2652                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp4_8x16b);       /* row=1*/
   2653 
   2654                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 0))), src_temp8_8x16b);       /* row=0*/
   2655 
   2656             }
   2657         }
   2658     }
   2659     else
   2660     {
   2661         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   2662         row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
   2663         const_temp2_4x32b = _mm_set1_epi16(31);
   2664         const_temp4_4x32b = _mm_set1_epi16(8);
   2665         const_temp3_4x32b = _mm_set1_epi16(32);
   2666         two_nt_4x32b = _mm_set1_epi16(two_nt - nt);
   2667         {
   2668             WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
   2669             WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
   2670 
   2671             __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
   2672 
   2673             __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
   2674             __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
   2675 
   2676             /* pos = ((row + 1) * intra_pred_ang); */
   2677             res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   2678 
   2679             /* idx = pos >> 5; */
   2680             fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   2681 
   2682             /* fract = pos & (31); */
   2683             ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
   2684 
   2685             /*(32 - fract) */
   2686             fract2_8x16b =  _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
   2687 
   2688             fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
   2689             fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
   2690 
   2691             fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
   2692             fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
   2693 
   2694 
   2695             fract8_8x16b = _mm_unpackhi_epi8(fract2_8x16b, fract_4x32b);
   2696             fract_4x32b = _mm_unpacklo_epi8(fract2_8x16b, fract_4x32b);
   2697 
   2698             temp1_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x00);
   2699             temp2_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x55);
   2700             temp3_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xaa);
   2701             temp4_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xff);
   2702 
   2703             temp11_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x00);
   2704             temp12_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x55);
   2705             temp13_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xaa);
   2706             temp14_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xff);
   2707 
   2708             pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
   2709             pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
   2710             pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
   2711             pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
   2712 
   2713             pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=5*/
   2714             pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=6*/
   2715             pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=7*/
   2716             pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=8*/
   2717 
   2718             {
   2719                 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
   2720                 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
   2721 
   2722                 __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
   2723                 __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
   2724 
   2725                 /* loding 8-bit 16 pixels */
   2726                 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 1)); /* col=0*/
   2727                 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 1)); /* col=1*/
   2728                 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 1)); /* col=2*/
   2729                 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 1)); /* col=3*/
   2730 
   2731                 /* loding 8-bit 16 pixels */
   2732                 src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - 1)); /* col=5*/
   2733                 src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - 1)); /* col=6*/
   2734                 src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - 1)); /* col=7*/
   2735                 src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - 1)); /* col=8*/
   2736 
   2737                 src_temp1_8x16b =  _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/
   2738                 src_temp2_8x16b =  _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/
   2739                 src_temp3_8x16b =  _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/
   2740                 src_temp4_8x16b =  _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/
   2741 
   2742                 src_temp11_8x16b =  _mm_shuffle_epi8(src_temp11_8x16b, sm3); /* col=0*/
   2743                 src_temp12_8x16b =  _mm_shuffle_epi8(src_temp12_8x16b, sm3); /* col=1*/
   2744                 src_temp13_8x16b =  _mm_shuffle_epi8(src_temp13_8x16b, sm3); /* col=2*/
   2745                 src_temp14_8x16b =  _mm_shuffle_epi8(src_temp14_8x16b, sm3); /* col=3*/
   2746 
   2747                 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   2748                 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
   2749                 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
   2750                 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
   2751                 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
   2752 
   2753                 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   2754                 src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
   2755                 src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
   2756                 src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
   2757                 src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
   2758 
   2759                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   2760                 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
   2761                 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
   2762                 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
   2763                 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
   2764 
   2765                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   2766                 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
   2767                 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
   2768                 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
   2769                 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
   2770 
   2771                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   2772                 src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
   2773                 src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
   2774                 src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
   2775                 src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
   2776 
   2777                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   2778                 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=5*/
   2779                 src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=6*/
   2780                 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=7*/
   2781                 src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=8*/
   2782 
   2783                 /* converting 16 bit to 8 bit */
   2784                 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
   2785                 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
   2786 
   2787                 /* converting 16 bit to 8 bit */
   2788                 src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/
   2789                 src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/
   2790 
   2791                 src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
   2792                 src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
   2793 
   2794                 src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
   2795                 src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
   2796 
   2797                 src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
   2798                 src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
   2799 
   2800                 src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
   2801                 src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
   2802 
   2803                 src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
   2804                 src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
   2805 
   2806                 src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
   2807                 src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
   2808 
   2809                 src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
   2810                 src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
   2811 
   2812                 src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
   2813                 src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
   2814 
   2815                 _mm_storel_epi64((__m128i *)(pu1_dst), src_temp8_8x16b);       /* row=0*/
   2816                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 1)), src_temp4_8x16b);       /* row=1*/
   2817                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 2)), src_temp7_8x16b);       /* row=2*/
   2818                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 3)), src_temp3_8x16b);       /* row=3*/
   2819                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 4)), src_temp6_8x16b);       /* row=4*/
   2820                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 5)), src_temp2_8x16b);       /* row=5*/
   2821 
   2822                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 6)), src_temp5_8x16b);       /* row=6*/
   2823                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 7)), src_temp1_8x16b);       /* row=7*/
   2824 
   2825             }
   2826         }
   2827     }
   2828 
   2829 }
   2830 
   2831 
   2832 
   2833 /**
   2834 *******************************************************************************
   2835 *
   2836 * @brief
   2837 *   Intra prediction interpolation filter for luma mode 11 to mode 17
   2838 *
   2839 * @par Description:
    2840 *    Intra prediction for modes 11 to 17 (negative angle, horizontal modes)
   2841 *    with reference  neighboring samples location pointed by 'pu1_ref' to the
   2842 *    TU block location pointed by 'pu1_dst'
   2843 *
    2844 * @param[in] pu1_ref
    2845 *  UWORD8 pointer to the source reference samples
   2846 *
   2847 * @param[out] pu1_dst
   2848 *  UWORD8 pointer to the destination
   2849 *
   2850 * @param[in] src_strd
   2851 *  integer source stride
   2852 *
   2853 * @param[in] dst_strd
   2854 *  integer destination stride
   2855 *
   2856 * @param[in] nt
   2857 *  integer Transform Block size
   2858 *
   2859 * @param[in] mode
   2860 *  integer intraprediction mode
   2861 *
   2862 * @returns
   2863 *
   2864 * @remarks
   2865 *  None
   2866 *
   2867 *******************************************************************************
   2868 */
   2869 
   2870 
   2871 void ihevc_intra_pred_luma_mode_11_to_17_ssse3(UWORD8 *pu1_ref,
   2872                                                WORD32 src_strd,
   2873                                                UWORD8 *pu1_dst,
   2874                                                WORD32 dst_strd,
   2875                                                WORD32 nt,
   2876                                                WORD32 mode)
   2877 {
   2878 
   2879     /* This function and ihevc_intra_pred_luma_mode_19_to_25 are same except*/
   2880     /* for ref main & side samples assignment,can be combined for */
   2881     /* optimzation*/
   2882 
   2883     WORD32 row, col, k;
   2884     WORD32 two_nt;
   2885     WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
   2886     WORD32 ref_idx;
   2887 
   2888     __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
   2889     __m128i fract_4x32b,  intra_pred_ang_4x32b;
   2890     __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b, sm3;
   2891 
   2892 
   2893     UWORD8 ref_tmp[2 * MAX_CU_SIZE + 2];
   2894     UWORD8 *ref_main;
   2895     UWORD8 *ref_temp;
   2896     UNUSED(src_strd);
   2897     inv_ang_sum = 128;
   2898     two_nt    = 2 * nt;
   2899     ref_temp = ref_tmp + 1;
   2900     ref_main = ref_temp + nt - 1;
   2901     intra_pred_ang = gai4_ihevc_ang_table[mode];
   2902 
   2903     /* For the angles other then 45 degree, interpolation btw 2 neighboring */
   2904     /* samples dependent on distance to obtain destination sample */
   2905     const_temp_4x32b  = _mm_set1_epi16(16);
   2906     const_temp2_4x32b = _mm_set1_epi32(31);
   2907     const_temp3_4x32b = _mm_set1_epi32(32);
   2908     const_temp4_4x32b = _mm_set1_epi32(4);
   2909 
   2910     two_nt_4x32b = _mm_set1_epi32(1);
   2911 
   2912 
   2913     sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
   2914 
   2915     /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
   2916     intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
   2917 
   2918     row_4x32b = _mm_set_epi32(4, 3, 2, 1);
   2919 
   2920     if(nt == 4)
   2921     {
   2922 
   2923         WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
   2924         int temp11, temp21, temp31, temp41;
   2925 //        WORD8  ai1_fract_temp_val[16], ai1_row_temp_val[16];
   2926 
   2927         __m128i fract1_8x16b, fract2_8x16b;
   2928         __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
   2929 
   2930         __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
   2931         __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
   2932         __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2, zero_8x16b, sign_8x16b;
   2933 
   2934         /* Intermediate reference samples for negative angle modes */
   2935         /* This have to be removed during optimization*/
   2936         /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
   2937         inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
   2938 
   2939         ref_main = ref_temp + nt - 1;
   2940         for(k = 0; k < nt + 1; k++)
   2941             ref_temp[k + nt - 1] = pu1_ref[two_nt - k];
   2942 
   2943         ref_main = ref_temp + nt - 1;
   2944         ref_idx = (nt * intra_pred_ang) >> 5;
   2945         zero_8x16b = _mm_setzero_si128();
   2946 
   2947         row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1);
   2948         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   2949         /* SIMD Optimization can be done using look-up table for the loop */
   2950         /* For negative angled derive the main reference samples from side */
   2951         /*  reference samples refer to section 8.4.4.2.6 */
   2952         for(k = -1; k > ref_idx; k--)
   2953         {
   2954             inv_ang_sum += inv_ang;
   2955             ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
   2956         }
   2957 
   2958 
   2959         /* pos = ((row + 1) * intra_pred_ang); */
   2960         res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   2961 
   2962         sign_8x16b      = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b);
   2963         res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b);
   2964 
   2965         /* idx = pos >> 5; */
   2966         fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   2967 
   2968         /* fract = pos & (31); */
   2969         ref_main_idx_4x32b = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b,  5));
   2970 
   2971         /*(32 - fract) */
   2972         row_4x32b = _mm_sub_epi32(const_temp3_4x32b, fract_4x32b);
   2973 
   2974         fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
   2975         fract2_8x16b = _mm_slli_epi16(row_4x32b, 8);
   2976 
   2977         fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
   2978         row_4x32b = _mm_or_si128(row_4x32b, fract2_8x16b); /*(32 - fract) */
   2979 
   2980         fract2_8x16b = _mm_unpackhi_epi8(fract_4x32b, row_4x32b);
   2981         fract1_8x16b = _mm_unpacklo_epi8(fract_4x32b, row_4x32b);
   2982 
   2983         temp1_8x16b =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
   2984         temp2_8x16b =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
   2985         temp3_8x16b =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
   2986         temp4_8x16b =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
   2987 
   2988         ref_main_temp0 = _mm_srli_si128(ref_main_idx_4x32b, 4);  /* next 32 bit values */
   2989         ref_main_temp1 = _mm_srli_si128(ref_main_idx_4x32b, 8);  /* next 32 bit values */
   2990         ref_main_temp2 = _mm_srli_si128(ref_main_idx_4x32b, 12); /* next 32 bit values */
   2991         ref_main_idx1  = _mm_cvtsi128_si32(ref_main_idx_4x32b);    /* col=0*/
   2992         ref_main_idx2  = _mm_cvtsi128_si32(ref_main_temp0);  /* col=1*/
   2993         ref_main_idx3  = _mm_cvtsi128_si32(ref_main_temp1);  /* col=2*/
   2994         ref_main_idx4  = _mm_cvtsi128_si32(ref_main_temp2);  /* col=3*/
   2995 
   2996         /* loding 8-bit 16 pixels */
   2997         src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1)); /* col=0*/
   2998         src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2)); /* col=1*/
   2999         src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3)); /* col=2*/
   3000         src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4)); /* col=3*/
   3001 
   3002         src_temp1_8x16b =  _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
   3003         src_temp2_8x16b =  _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
   3004         src_temp3_8x16b =  _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
   3005         src_temp4_8x16b =  _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
   3006 
   3007         /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   3008         src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
   3009         src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
   3010         src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
   3011         src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
   3012 
   3013         /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   3014         src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
   3015         src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
   3016         src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
   3017         src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
   3018 
   3019         /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   3020         src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
   3021         src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
   3022         src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
   3023         src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
   3024 
   3025         /* converting 16 bit to 8 bit */
   3026         src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
   3027         src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
   3028 
   3029 
   3030         src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
   3031         src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
   3032 
   3033         src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
   3034         src_temp1_8x16b = _mm_srli_si128(src_temp7_8x16b, 4);
   3035         src_temp2_8x16b = _mm_srli_si128(src_temp7_8x16b, 8);
   3036         src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 12);
   3037 
   3038         temp11 = _mm_cvtsi128_si32(src_temp7_8x16b);
   3039         temp21 = _mm_cvtsi128_si32(src_temp1_8x16b);
   3040         temp31 = _mm_cvtsi128_si32(src_temp2_8x16b);
   3041         temp41 = _mm_cvtsi128_si32(src_temp3_8x16b);
   3042 
   3043         /* loding 8-bit 4 pixels values */
   3044         *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
   3045         *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
   3046         *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
   3047         *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;
   3048     }
   3049 
   3050     else if(nt == 32)
   3051     {
   3052 
   3053 
   3054         __m128i temp1, temp2, temp3, temp11, temp12;
   3055         __m128i src_values0, src_values1;
   3056         /* Intermediate reference samples for negative angle modes */
   3057 
   3058         ref_temp[two_nt - 1] = pu1_ref[two_nt - nt];
   3059         temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 1));
   3060         temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 17));
   3061         temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3);
   3062 
   3063         /* For negative angled derive the main reference samples from side */
   3064 
   3065         src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/
   3066         src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 17)); /*(nt+16)-(two_nt-1)*/
   3067 
   3068         temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode]));
   3069         temp12 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16));
   3070 
   3071         src_values0 = _mm_shuffle_epi8(src_values0, temp2);
   3072         src_values1 = _mm_shuffle_epi8(src_values1, temp2);
   3073         src_values0 = _mm_shuffle_epi8(src_values0, temp12);
   3074         src_values1 = _mm_shuffle_epi8(src_values1, temp11);
   3075 
   3076         temp1 = _mm_shuffle_epi8(temp1, temp2);
   3077         temp3 = _mm_shuffle_epi8(temp3, temp2);
   3078 
   3079         _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp3);
   3080         _mm_storeu_si128((__m128i *)(ref_temp + nt - 1 + 16), temp1);
   3081         _mm_storeu_si128((__m128i *)(ref_main - 16), src_values0);
   3082         _mm_storeu_si128((__m128i *)(ref_main - nt + inv_angle_shuffle[17 - mode][0]), src_values1);
   3083 
   3084 
   3085         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   3086         row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
   3087         const_temp2_4x32b = _mm_set1_epi16(31);
   3088         const_temp4_4x32b = _mm_set1_epi16(8);
   3089         const_temp3_4x32b = _mm_set1_epi16(32);
   3090         two_nt_4x32b = _mm_set1_epi16(1);
   3091 
   3092         for(col = 0; col < nt; col += 8)
   3093         {
   3094             WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
   3095             WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
   3096             // WORD8  ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
   3097 
   3098             __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
   3099 
   3100             __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
   3101             __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
   3102 
   3103             /* pos = ((row + 1) * intra_pred_ang); */
   3104             res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   3105 
   3106             /* idx = pos >> 5; */
   3107             fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   3108 
   3109             /* fract = pos & (31); */
   3110             ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
   3111 
   3112             row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
   3113             /*(32 - fract) */
   3114             fract2_8x16b =  _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
   3115 
   3116             fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
   3117             fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
   3118 
   3119             fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
   3120             fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
   3121 
   3122 
   3123             fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b);
   3124             fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b);
   3125 
   3126             temp1_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x00);
   3127             temp2_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x55);
   3128             temp3_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xaa);
   3129             temp4_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xff);
   3130 
   3131             temp11_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x00);
   3132             temp12_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x55);
   3133             temp13_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xaa);
   3134             temp14_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xff);
   3135 
   3136             pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
   3137             pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
   3138             pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
   3139             pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
   3140 
   3141             pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=5*/
   3142             pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=6*/
   3143             pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=7*/
   3144             pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=8*/
   3145 
   3146             for(row = 0; row < nt; row += 8)
   3147             {
   3148                 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
   3149                 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
   3150 
   3151 
   3152                 __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
   3153                 __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
   3154 
   3155                 /* loding 8-bit 16 pixels */
   3156                 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row)); /* col=0*/
   3157                 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row)); /* col=1*/
   3158                 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row)); /* col=2*/
   3159                 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row)); /* col=3*/
   3160 
   3161                 src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 1); /* col=0*/
   3162                 src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 1); /* col=1*/
   3163                 src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 1); /* col=2*/
   3164                 src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 1); /* col=3*/
   3165 
   3166                 /* loding 8-bit 16 pixels */
   3167                 src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row)); /* col=5*/
   3168                 src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row)); /* col=6*/
   3169                 src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row)); /* col=7*/
   3170                 src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row)); /* col=8*/
   3171 
   3172                 src_temp1_8x16b =  _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
   3173                 src_temp2_8x16b =  _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
   3174                 src_temp3_8x16b =  _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
   3175                 src_temp4_8x16b =  _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
   3176 
   3177                 src_temp11_8x16b =  _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=0*/
   3178                 src_temp12_8x16b =  _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=1*/
   3179                 src_temp13_8x16b =  _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=2*/
   3180                 src_temp14_8x16b =  _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=3*/
   3181 
   3182                 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   3183                 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
   3184                 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
   3185                 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
   3186                 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
   3187 
   3188                 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   3189                 src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
   3190                 src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
   3191                 src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
   3192                 src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
   3193 
   3194                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   3195                 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
   3196                 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
   3197                 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
   3198                 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
   3199 
   3200                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   3201                 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
   3202                 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
   3203                 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
   3204                 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
   3205 
   3206                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   3207                 src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
   3208                 src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
   3209                 src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
   3210                 src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
   3211 
   3212                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   3213                 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=5*/
   3214                 src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=6*/
   3215                 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=7*/
   3216                 src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=8*/
   3217 
   3218                 /* converting 16 bit to 8 bit */
   3219                 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
   3220                 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
   3221 
   3222                 /* converting 16 bit to 8 bit */
   3223                 src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/
   3224                 src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/
   3225 
   3226                 src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
   3227                 src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
   3228 
   3229                 src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
   3230                 src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
   3231 
   3232                 src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
   3233                 src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
   3234 
   3235                 src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
   3236                 src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
   3237 
   3238 
   3239                 src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
   3240                 src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
   3241 
   3242                 src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
   3243                 src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
   3244 
   3245                 src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
   3246                 src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
   3247                 src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
   3248                 src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
   3249 
   3250                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp1_8x16b);          /* row=0*/
   3251 
   3252                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp5_8x16b);       /* row=1*/
   3253 
   3254                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp2_8x16b);       /* row=2*/
   3255 
   3256                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp6_8x16b);       /* row=4*/
   3257 
   3258                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp3_8x16b);       /* row=5*/
   3259 
   3260                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp7_8x16b);       /* row=6*/
   3261 
   3262                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp4_8x16b);       /* row=7*/
   3263 
   3264                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp8_8x16b);       /* row=8*/
   3265 
   3266             }
   3267         }
   3268     }
   3269     else if(nt == 16)
   3270     {
   3271 
   3272         __m128i temp1, temp2, temp11, src_values0;
   3273         /* Intermediate reference samples for negative angle modes */
   3274         /* For horizontal modes, (ref main = ref above) (ref side = ref left) */
   3275         ref_temp[two_nt - 1] = pu1_ref[two_nt - nt];
   3276         temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 1));
   3277         temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3);
   3278         src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/
   3279 
   3280         temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16));
   3281 
   3282         src_values0 = _mm_shuffle_epi8(src_values0, temp2);
   3283         temp1 = _mm_shuffle_epi8(temp1, temp2);
   3284         src_values0 = _mm_shuffle_epi8(src_values0, temp11);
   3285 
   3286         _mm_storeu_si128((__m128i *)(ref_main - nt), src_values0);
   3287         _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1);
   3288 
   3289         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   3290         row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
   3291         const_temp2_4x32b = _mm_set1_epi16(31);
   3292         const_temp4_4x32b = _mm_set1_epi16(8);
   3293         const_temp3_4x32b = _mm_set1_epi16(32);
   3294         two_nt_4x32b = _mm_set1_epi16(1);
   3295 
   3296         for(col = 0; col < nt; col += 8)
   3297         {
   3298             WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
   3299             WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
   3300             // WORD8  ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
   3301 
   3302             __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
   3303 
   3304             __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
   3305             __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
   3306 
   3307             /* pos = ((row + 1) * intra_pred_ang); */
   3308             res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   3309 
   3310             /* idx = pos >> 5; */
   3311             fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   3312 
   3313             /* fract = pos & (31); */
   3314             ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
   3315 
   3316             row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
   3317             /*(32 - fract) */
   3318             fract2_8x16b =  _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
   3319 
   3320             fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
   3321             fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
   3322 
   3323             fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
   3324             fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
   3325 
   3326 
   3327             fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b);
   3328             fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b);
   3329 
   3330             temp1_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x00);
   3331             temp2_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x55);
   3332             temp3_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xaa);
   3333             temp4_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xff);
   3334 
   3335             temp11_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x00);
   3336             temp12_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x55);
   3337             temp13_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xaa);
   3338             temp14_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xff);
   3339 
   3340             pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
   3341             pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
   3342             pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
   3343             pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
   3344 
   3345             pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=5*/
   3346             pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=6*/
   3347             pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=7*/
   3348             pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=8*/
   3349 
   3350             for(row = 0; row < nt; row += 8)
   3351             {
   3352                 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
   3353                 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
   3354 
   3355 
   3356                 __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
   3357                 __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
   3358 
   3359                 /* loding 8-bit 16 pixels */
   3360                 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row)); /* col=0*/
   3361                 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row)); /* col=1*/
   3362                 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row)); /* col=2*/
   3363                 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row)); /* col=3*/
   3364 
   3365                 src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 1); /* col=0*/
   3366                 src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 1); /* col=1*/
   3367                 src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 1); /* col=2*/
   3368                 src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 1); /* col=3*/
   3369 
   3370                 /* loding 8-bit 16 pixels */
   3371                 src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row)); /* col=5*/
   3372                 src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row)); /* col=6*/
   3373                 src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row)); /* col=7*/
   3374                 src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row)); /* col=8*/
   3375 
   3376                 src_temp1_8x16b =  _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
   3377                 src_temp2_8x16b =  _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
   3378                 src_temp3_8x16b =  _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
   3379                 src_temp4_8x16b =  _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
   3380 
   3381                 src_temp11_8x16b =  _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=0*/
   3382                 src_temp12_8x16b =  _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=1*/
   3383                 src_temp13_8x16b =  _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=2*/
   3384                 src_temp14_8x16b =  _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=3*/
   3385 
   3386                 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   3387                 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
   3388                 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
   3389                 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
   3390                 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
   3391 
   3392                 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   3393                 src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
   3394                 src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
   3395                 src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
   3396                 src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
   3397 
   3398                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   3399                 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
   3400                 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
   3401                 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
   3402                 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
   3403 
   3404                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   3405                 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
   3406                 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
   3407                 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
   3408                 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
   3409 
   3410                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   3411                 src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
   3412                 src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
   3413                 src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
   3414                 src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
   3415 
   3416                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   3417                 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=5*/
   3418                 src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=6*/
   3419                 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=7*/
   3420                 src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=8*/
   3421 
   3422                 /* converting 16 bit to 8 bit */
   3423                 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
   3424                 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
   3425 
   3426                 /* converting 16 bit to 8 bit */
   3427                 src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/
   3428                 src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/
   3429 
   3430                 src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
   3431                 src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
   3432 
   3433                 src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
   3434                 src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
   3435 
   3436                 src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
   3437                 src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
   3438 
   3439                 src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
   3440                 src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
   3441 
   3442 
   3443                 src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
   3444                 src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
   3445 
   3446                 src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
   3447                 src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
   3448 
   3449                 src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
   3450                 src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
   3451                 src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
   3452                 src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
   3453 
   3454                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp1_8x16b);          /* row=0*/
   3455 
   3456                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp5_8x16b);       /* row=1*/
   3457 
   3458                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp2_8x16b);       /* row=2*/
   3459 
   3460                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp6_8x16b);       /* row=4*/
   3461 
   3462                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp3_8x16b);       /* row=5*/
   3463 
   3464                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp7_8x16b);       /* row=6*/
   3465 
   3466                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp4_8x16b);       /* row=7*/
   3467 
   3468                 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp8_8x16b);       /* row=8*/
   3469 
   3470             }
   3471         }
   3472     }
   3473     else
   3474     {
   3475 
   3476 
   3477         __m128i temp1, temp2, temp11, src_values0;
   3478         /* Intermediate reference samples for negative angle modes */
   3479         /* For horizontal modes, (ref main = ref above) (ref side = ref left) */
   3480         ref_temp[two_nt - 1] = pu1_ref[nt];
   3481         temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 1));
   3482 
   3483         /* For negative angled derive the main reference samples from side */
   3484 
   3485         src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/
   3486         temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3);
   3487         temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16));
   3488 
   3489         src_values0 = _mm_shuffle_epi8(src_values0, temp2);
   3490         temp1 = _mm_shuffle_epi8(temp1, temp2);
   3491         src_values0 = _mm_shuffle_epi8(src_values0, temp11);
   3492         src_values0 = _mm_srli_si128(src_values0, 8);
   3493 
   3494         _mm_storel_epi64((__m128i *)(ref_temp + nt - 1), temp1);
   3495         _mm_storel_epi64((__m128i *)(ref_main - nt), src_values0);
   3496 
   3497 
   3498         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   3499         row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
   3500         const_temp2_4x32b = _mm_set1_epi16(31);
   3501         const_temp4_4x32b = _mm_set1_epi16(8);
   3502         const_temp3_4x32b = _mm_set1_epi16(32);
   3503         two_nt_4x32b = _mm_set1_epi16(1);
   3504 
   3505         {
   3506             WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
   3507             WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
   3508             //WORD8  ai1_fract_temp0_val[16], ai1_fract_temp1_val[16];
   3509 
   3510             __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b;
   3511 
   3512             __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
   3513             __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
   3514 
   3515             /* pos = ((row + 1) * intra_pred_ang); */
   3516             res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   3517 
   3518             /* idx = pos >> 5; */
   3519             fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   3520 
   3521             /* fract = pos & (31); */
   3522             ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
   3523 
   3524             /*(32 - fract) */
   3525             fract2_8x16b =  _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
   3526 
   3527             fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8);
   3528             fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */
   3529 
   3530             fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b);
   3531             fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */
   3532 
   3533             fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b);
   3534             fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b);
   3535 
   3536             temp1_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x00);
   3537             temp2_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0x55);
   3538             temp3_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xaa);
   3539             temp4_8x16b =  _mm_shuffle_epi32(fract_4x32b, 0xff);
   3540 
   3541             temp11_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x00);
   3542             temp12_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0x55);
   3543             temp13_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xaa);
   3544             temp14_8x16b =  _mm_shuffle_epi32(fract8_8x16b, 0xff);
   3545 
   3546             pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
   3547             pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
   3548             pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
   3549             pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
   3550 
   3551             pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=5*/
   3552             pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=6*/
   3553             pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=7*/
   3554             pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=8*/
   3555 
   3556             {
   3557                 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
   3558                 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
   3559 
   3560                 __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
   3561                 __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
   3562 
   3563                 /* loding 8-bit 16 pixels */
   3564                 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1)); /* col=0*/
   3565                 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2)); /* col=1*/
   3566                 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3)); /* col=2*/
   3567                 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4)); /* col=3*/
   3568 
   3569                 /* loding 8-bit 16 pixels */
   3570                 src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5)); /* col=5*/
   3571                 src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6)); /* col=6*/
   3572                 src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7)); /* col=7*/
   3573                 src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8)); /* col=8*/
   3574 
   3575                 src_temp1_8x16b =  _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/
   3576                 src_temp2_8x16b =  _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/
   3577                 src_temp3_8x16b =  _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/
   3578                 src_temp4_8x16b =  _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/
   3579 
   3580                 src_temp11_8x16b =  _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=0*/
   3581                 src_temp12_8x16b =  _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=1*/
   3582                 src_temp13_8x16b =  _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=2*/
   3583                 src_temp14_8x16b =  _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=3*/
   3584 
   3585                 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   3586                 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
   3587                 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
   3588                 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
   3589                 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
   3590 
   3591                 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   3592                 src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
   3593                 src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
   3594                 src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
   3595                 src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
   3596 
   3597                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   3598                 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
   3599                 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
   3600                 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
   3601                 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
   3602 
   3603                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   3604                 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* row=0*/
   3605                 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* row=1*/
   3606                 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* row=2*/
   3607                 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* row=3*/
   3608 
   3609                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   3610                 src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
   3611                 src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
   3612                 src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
   3613                 src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
   3614 
   3615                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   3616                 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=5*/
   3617                 src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=6*/
   3618                 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=7*/
   3619                 src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=8*/
   3620 
   3621                 /* converting 16 bit to 8 bit */
   3622                 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/
   3623                 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/
   3624 
   3625                 /* converting 16 bit to 8 bit */
   3626                 src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=4*/
   3627                 src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=5*/
   3628 
   3629                 src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b);
   3630                 src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b);
   3631 
   3632                 src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b);
   3633                 src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b);
   3634 
   3635                 src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b);
   3636                 src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b);
   3637 
   3638                 src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b);
   3639                 src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b);
   3640 
   3641 
   3642                 src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b);
   3643                 src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b);
   3644 
   3645                 src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b);
   3646                 src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b);
   3647 
   3648                 src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
   3649                 src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);
   3650                 src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);
   3651                 src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8);
   3652 
   3653                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 0)), src_temp1_8x16b);       /* row=0*/
   3654 
   3655                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (1))), src_temp5_8x16b);       /* row=1*/
   3656 
   3657                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (2))), src_temp2_8x16b);       /* row=2*/
   3658 
   3659                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (3))), src_temp6_8x16b);       /* row=3*/
   3660 
   3661                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (4))), src_temp3_8x16b);       /* row=4*/
   3662 
   3663                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (5))), src_temp7_8x16b);       /* row=5*/
   3664 
   3665                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (6))), src_temp4_8x16b);       /* row=6*/
   3666 
   3667                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (7))), src_temp8_8x16b);       /* row=7*/
   3668 
   3669             }
   3670         }
   3671     }
   3672 
   3673 }
   3674 
   3675 
   3676 /**
   3677 *******************************************************************************
   3678 *
   3679 * @brief
   3680 *   Intra prediction interpolation filter for luma mode 19 to mode 25
   3681 *
   3682 * @par Description:
   3683 *    Intraprediction for mode 19 to 25  (negative angle, vertical mode ) with
   3684 *    reference  neighboring samples location pointed by 'pu1_ref' to the  TU
   3685 *    block location pointed by 'pu1_dst'
   3686 *
    3687 * @param[in] pu1_ref
    3688 *  UWORD8 pointer to the source (reference neighboring samples)
   3689 *
   3690 * @param[out] pu1_dst
   3691 *  UWORD8 pointer to the destination
   3692 *
   3693 * @param[in] src_strd
   3694 *  integer source stride
   3695 *
   3696 * @param[in] dst_strd
   3697 *  integer destination stride
   3698 *
   3699 * @param[in] nt
   3700 *  integer Transform Block size
   3701 *
   3702 * @param[in] mode
   3703 *  integer intraprediction mode
   3704 *
   3705 * @returns
   3706 *
   3707 * @remarks
   3708 *  None
   3709 *
   3710 *******************************************************************************
   3711 */
   3712 
   3713 void ihevc_intra_pred_luma_mode_19_to_25_ssse3(UWORD8 *pu1_ref,
   3714                                                WORD32 src_strd,
   3715                                                UWORD8 *pu1_dst,
   3716                                                WORD32 dst_strd,
   3717                                                WORD32 nt,
   3718                                                WORD32 mode)
   3719 {
   3720 
   3721     WORD32 row, k;
   3722     WORD32 two_nt, intra_pred_ang;
   3723     WORD32 inv_ang, inv_ang_sum;
   3724     //WORD32 ref_main_idx, pos, fract, idx;
   3725     WORD32 ref_idx;
   3726     UWORD8 ref_tmp[(2 * MAX_CU_SIZE) + 2];
   3727     UWORD8 *ref_main, *ref_temp;
   3728 
   3729     __m128i  /*fract_8x16b,*/ const_temp_8x16b, sm3;
   3730     __m128i temp1, temp2, temp3, temp4;
   3731     __m128i temp11, temp12, temp13, temp14;
   3732     UNUSED(src_strd);
   3733     two_nt = 2 * nt;
   3734     intra_pred_ang = gai4_ihevc_ang_table[mode];
   3735     inv_ang = gai4_ihevc_inv_ang_table[mode - 12];
   3736 
   3737     /* Intermediate reference samples for negative angle modes */
   3738     /* This have to be removed during optimization*/
   3739     /* For horizontal modes, (ref main = ref above) (ref side = ref left) */
   3740     ref_temp = ref_tmp + 1;
   3741     ref_main = ref_temp + nt - 1;
   3742 
   3743 
   3744     sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
   3745 
   3746 
   3747 
   3748     const_temp_8x16b = _mm_set1_epi16(16);
   3749 
   3750     if(nt == 32)
   3751     {
   3752 
   3753         __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
   3754         __m128i src_values10, src_values11, intra_pred_ang_4x32b;
   3755         __m128i row_4x32b, two_nt_4x32b, src_values12;
   3756 
   3757         __m128i src_values0, src_values1, src_values2, src_values3;
   3758         __m128i  src_values4, src_values5, src_values6, src_values7;
   3759         WORD32 col = 0;
   3760 
   3761         /* Intermediate reference samples for negative angle modes */
   3762         /* This have to be removed during optimization*/
   3763         /* For horizontal modes, (ref main = ref above) (ref side = ref left) */
   3764         ref_temp[two_nt - 1] = pu1_ref[two_nt + nt];
   3765         temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt));
   3766         temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 16));
   3767 
   3768         /* SIMD Optimization can be done using look-up table for the loop */
   3769         /* For negative angled derive the main reference samples from side */
   3770         /*  reference samples refer to section 8.4.4.2.6 */
   3771         src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - nt)); /*nt-(nt+15)*/
   3772         src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 16)); /*(nt+16)-(two_nt-1)*/
   3773 
   3774         temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19]));
   3775         temp12 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16));
   3776 
   3777         src_values0 = _mm_shuffle_epi8(src_values0, temp11);
   3778         src_values1 = _mm_shuffle_epi8(src_values1, temp12);
   3779 
   3780         _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1);
   3781         _mm_storeu_si128((__m128i *)(ref_temp + nt - 1 + 16), temp3);
   3782         _mm_storeu_si128((__m128i *)(ref_main - 16), src_values1);
   3783         _mm_storeu_si128((__m128i *)(ref_main - nt + inv_angle_shuffle[mode - 19][0]), src_values0);
   3784 
   3785         const_temp2_4x32b = _mm_set1_epi16(31);
   3786         const_temp3_4x32b = _mm_set1_epi16(32);
   3787         const_temp8_4x32b = _mm_set1_epi16(8);
   3788 
   3789         two_nt_4x32b = _mm_set1_epi16(1);
   3790 
   3791         /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
   3792         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   3793 
   3794         row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
   3795 
   3796         for(row = 0; row < nt; row += 8)
   3797         {
   3798 
   3799             WORD16 ref_main_idx[9];
   3800 
   3801             __m128i res_temp5_4x32b;
   3802             __m128i fract1_8x16b, fract2_8x16b;
   3803 
   3804             /* pos = ((row + 1) * intra_pred_ang); */
   3805             res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   3806 
   3807             /* fract = pos & (31); */
   3808             src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
   3809 
   3810             /* idx = pos >> 5; */
   3811             src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   3812 
   3813             /*(32 - fract) */
   3814             src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
   3815 
   3816             fract1_8x16b = _mm_slli_epi16(src_values11, 8);
   3817             fract2_8x16b = _mm_slli_epi16(src_values10, 8);
   3818 
   3819             src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
   3820             src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
   3821 
   3822             fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
   3823             fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
   3824 
   3825             temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
   3826             temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
   3827             temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
   3828             temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
   3829 
   3830             temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
   3831             temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
   3832             temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
   3833             temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
   3834 
   3835             row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
   3836             _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
   3837             for(col = 0; col < nt; col += 16)
   3838             {
   3839                 src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + col));
   3840                 src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + col));
   3841                 src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + col));
   3842                 src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + col));
   3843                 src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + 8 + col));
   3844                 src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + 8 + col));
   3845                 src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + 8 + col));
   3846                 src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + 8 + col));
   3847 
   3848                 src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
   3849                 src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
   3850                 src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
   3851                 src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
   3852                 src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
   3853                 src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
   3854                 src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
   3855                 src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
   3856 
   3857 
   3858                 src_values0 = _mm_maddubs_epi16(src_values0, temp1);
   3859                 src_values1 = _mm_maddubs_epi16(src_values1, temp2);
   3860                 src_values2 = _mm_maddubs_epi16(src_values2, temp3);
   3861                 src_values3 = _mm_maddubs_epi16(src_values3, temp4);
   3862                 src_values4 = _mm_maddubs_epi16(src_values4, temp1);
   3863                 src_values5 = _mm_maddubs_epi16(src_values5, temp2);
   3864                 src_values6 = _mm_maddubs_epi16(src_values6, temp3);
   3865                 src_values7 = _mm_maddubs_epi16(src_values7, temp4);
   3866 
   3867                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   3868                 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
   3869                 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
   3870                 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
   3871                 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
   3872                 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
   3873                 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
   3874                 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
   3875                 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
   3876 
   3877                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   3878                 src_values0 = _mm_srai_epi16(src_values0,  5);
   3879                 src_values1 = _mm_srai_epi16(src_values1,  5);
   3880                 src_values2 = _mm_srai_epi16(src_values2,  5);
   3881                 src_values3 = _mm_srai_epi16(src_values3,  5);
   3882                 src_values4 = _mm_srai_epi16(src_values4,  5);
   3883                 src_values5 = _mm_srai_epi16(src_values5,  5);
   3884                 src_values6 = _mm_srai_epi16(src_values6,  5);
   3885                 src_values7 = _mm_srai_epi16(src_values7,  5);
   3886 
   3887                 /* converting 16 bit to 8 bit */
   3888                 src_values0 = _mm_packus_epi16(src_values0, src_values4);
   3889                 src_values1 = _mm_packus_epi16(src_values1, src_values5);
   3890                 src_values2 = _mm_packus_epi16(src_values2, src_values6);
   3891                 src_values3 = _mm_packus_epi16(src_values3, src_values7);
   3892 
   3893                 /* loading 8-bit 8 pixels values */
   3894                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (0) * dst_strd), src_values0);       /* row=0*/
   3895                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (1) * dst_strd), src_values1);   /* row=1*/
   3896                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (2) * dst_strd), src_values2);   /* row=2*/
   3897                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (3) * dst_strd), src_values3);   /* row=3*/
   3898 
   3899 
   3900                 src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + col));
   3901                 src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + col));
   3902                 src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + col));
   3903                 src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + col));
   3904                 src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + 8 + col));
   3905                 src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + 8 + col));
   3906                 src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + 8 + col));
   3907                 src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + 8 + col));
   3908 
   3909                 src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
   3910                 src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
   3911                 src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
   3912                 src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
   3913                 src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
   3914                 src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
   3915                 src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
   3916                 src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
   3917 
   3918 
   3919                 src_values0 = _mm_maddubs_epi16(src_values0, temp11);
   3920                 src_values1 = _mm_maddubs_epi16(src_values1, temp12);
   3921                 src_values2 = _mm_maddubs_epi16(src_values2, temp13);
   3922                 src_values3 = _mm_maddubs_epi16(src_values3, temp14);
   3923                 src_values4 = _mm_maddubs_epi16(src_values4, temp11);
   3924                 src_values5 = _mm_maddubs_epi16(src_values5, temp12);
   3925                 src_values6 = _mm_maddubs_epi16(src_values6, temp13);
   3926                 src_values7 = _mm_maddubs_epi16(src_values7, temp14);
   3927 
   3928                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   3929                 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
   3930                 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
   3931                 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
   3932                 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
   3933                 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
   3934                 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
   3935                 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
   3936                 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
   3937 
   3938                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   3939                 src_values0 = _mm_srai_epi16(src_values0,  5);
   3940                 src_values1 = _mm_srai_epi16(src_values1,  5);
   3941                 src_values2 = _mm_srai_epi16(src_values2,  5);
   3942                 src_values3 = _mm_srai_epi16(src_values3,  5);
   3943                 src_values4 = _mm_srai_epi16(src_values4,  5);
   3944                 src_values5 = _mm_srai_epi16(src_values5,  5);
   3945                 src_values6 = _mm_srai_epi16(src_values6,  5);
   3946                 src_values7 = _mm_srai_epi16(src_values7,  5);
   3947 
   3948                 /* converting 16 bit to 8 bit */
   3949                 src_values0 = _mm_packus_epi16(src_values0, src_values4);
   3950                 src_values1 = _mm_packus_epi16(src_values1, src_values5);
   3951                 src_values2 = _mm_packus_epi16(src_values2, src_values6);
   3952                 src_values3 = _mm_packus_epi16(src_values3, src_values7);
   3953 
   3954                 /* loading 8-bit 8 pixels values */
   3955                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (4) * dst_strd), src_values0);   /* row=4*/
   3956                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (5) * dst_strd), src_values1);   /* row=5*/
   3957                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (6) * dst_strd), src_values2);   /* row=6*/
   3958                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (7) * dst_strd), src_values3);   /* row=7*/
   3959 
   3960             }
   3961             pu1_dst += 8 * dst_strd;
   3962         }
   3963 
   3964     }
   3965     else if(nt == 16) /* for nt = 16 case */
   3966     {
   3967 
   3968         __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
   3969         __m128i src_values10, src_values11, intra_pred_ang_4x32b;
   3970         __m128i row_4x32b, two_nt_4x32b, src_values12;
   3971         __m128i src_values0, src_values1, src_values2, src_values3;
   3972         __m128i  src_values4, src_values5, src_values6, src_values7;
   3973 
   3974 
   3975         /* Intermediate reference samples for negative angle modes */
   3976         /* For horizontal modes, (ref main = ref above) (ref side = ref left) */
   3977         ref_temp[two_nt - 1] = pu1_ref[two_nt + nt];
   3978         temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt));
   3979 
   3980         src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - nt)); /*nt-(nt+15)*/
   3981 
   3982         temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16));
   3983 
   3984         src_values0 = _mm_shuffle_epi8(src_values0, temp11);
   3985 
   3986         _mm_storeu_si128((__m128i *)(ref_main - nt), src_values0);
   3987         _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1);
   3988 
   3989         const_temp2_4x32b = _mm_set1_epi16(31);
   3990         const_temp3_4x32b = _mm_set1_epi16(32);
   3991         const_temp8_4x32b = _mm_set1_epi16(8);
   3992 
   3993         two_nt_4x32b = _mm_set1_epi16(1);
   3994 
   3995         /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
   3996         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   3997 
   3998         row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
   3999 
   4000         for(row = 0; row < nt; row += 8)
   4001         {
   4002 
   4003             WORD16 ref_main_idx[9];
   4004 
   4005             __m128i res_temp5_4x32b;
   4006             __m128i fract1_8x16b, fract2_8x16b;
   4007 
   4008             /* pos = ((row + 1) * intra_pred_ang); */
   4009             res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   4010 
   4011             /* fract = pos & (31); */
   4012             src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
   4013 
   4014             /* idx = pos >> 5; */
   4015             src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   4016 
   4017             /*(32 - fract) */
   4018             src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
   4019 
   4020             fract1_8x16b = _mm_slli_epi16(src_values11, 8);
   4021             fract2_8x16b = _mm_slli_epi16(src_values10, 8);
   4022 
   4023             src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
   4024             src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
   4025 
   4026             fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
   4027             fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
   4028 
   4029             temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
   4030             temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
   4031             temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
   4032             temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
   4033 
   4034             temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
   4035             temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
   4036             temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
   4037             temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
   4038 
   4039             row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
   4040             _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
   4041 
   4042             {
   4043                 src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0]));
   4044                 src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1]));
   4045                 src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2]));
   4046                 src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3]));
   4047                 src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + 8));
   4048                 src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + 8));
   4049                 src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + 8));
   4050                 src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + 8));
   4051 
   4052                 src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
   4053                 src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
   4054                 src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
   4055                 src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
   4056                 src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
   4057                 src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
   4058                 src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
   4059                 src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
   4060 
   4061 
   4062                 src_values0 = _mm_maddubs_epi16(src_values0, temp1);
   4063                 src_values1 = _mm_maddubs_epi16(src_values1, temp2);
   4064                 src_values2 = _mm_maddubs_epi16(src_values2, temp3);
   4065                 src_values3 = _mm_maddubs_epi16(src_values3, temp4);
   4066                 src_values4 = _mm_maddubs_epi16(src_values4, temp1);
   4067                 src_values5 = _mm_maddubs_epi16(src_values5, temp2);
   4068                 src_values6 = _mm_maddubs_epi16(src_values6, temp3);
   4069                 src_values7 = _mm_maddubs_epi16(src_values7, temp4);
   4070 
   4071                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   4072                 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
   4073                 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
   4074                 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
   4075                 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
   4076                 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
   4077                 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
   4078                 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
   4079                 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
   4080 
   4081                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   4082                 src_values0 = _mm_srai_epi16(src_values0,  5);
   4083                 src_values1 = _mm_srai_epi16(src_values1,  5);
   4084                 src_values2 = _mm_srai_epi16(src_values2,  5);
   4085                 src_values3 = _mm_srai_epi16(src_values3,  5);
   4086                 src_values4 = _mm_srai_epi16(src_values4,  5);
   4087                 src_values5 = _mm_srai_epi16(src_values5,  5);
   4088                 src_values6 = _mm_srai_epi16(src_values6,  5);
   4089                 src_values7 = _mm_srai_epi16(src_values7,  5);
   4090 
   4091                 /* converting 16 bit to 8 bit */
   4092                 src_values0 = _mm_packus_epi16(src_values0, src_values4);
   4093                 src_values1 = _mm_packus_epi16(src_values1, src_values5);
   4094                 src_values2 = _mm_packus_epi16(src_values2, src_values6);
   4095                 src_values3 = _mm_packus_epi16(src_values3, src_values7);
   4096 
   4097                 /* loading 8-bit 8 pixels values */
   4098                 _mm_storeu_si128((__m128i *)(pu1_dst + (0) * dst_strd), src_values0);       /* row=0*/
   4099                 _mm_storeu_si128((__m128i *)(pu1_dst + (1) * dst_strd), src_values1);   /* row=1*/
   4100                 _mm_storeu_si128((__m128i *)(pu1_dst + (2) * dst_strd), src_values2);   /* row=2*/
   4101                 _mm_storeu_si128((__m128i *)(pu1_dst + (3) * dst_strd), src_values3);   /* row=3*/
   4102 
   4103 
   4104                 src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4]));
   4105                 src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5]));
   4106                 src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6]));
   4107                 src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7]));
   4108                 src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + 8));
   4109                 src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + 8));
   4110                 src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + 8));
   4111                 src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + 8));
   4112 
   4113                 src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
   4114                 src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
   4115                 src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
   4116                 src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
   4117                 src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
   4118                 src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
   4119                 src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
   4120                 src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
   4121 
   4122 
   4123                 src_values0 = _mm_maddubs_epi16(src_values0, temp11);
   4124                 src_values1 = _mm_maddubs_epi16(src_values1, temp12);
   4125                 src_values2 = _mm_maddubs_epi16(src_values2, temp13);
   4126                 src_values3 = _mm_maddubs_epi16(src_values3, temp14);
   4127                 src_values4 = _mm_maddubs_epi16(src_values4, temp11);
   4128                 src_values5 = _mm_maddubs_epi16(src_values5, temp12);
   4129                 src_values6 = _mm_maddubs_epi16(src_values6, temp13);
   4130                 src_values7 = _mm_maddubs_epi16(src_values7, temp14);
   4131 
   4132                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   4133                 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
   4134                 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
   4135                 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
   4136                 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
   4137                 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
   4138                 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
   4139                 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
   4140                 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
   4141 
   4142                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   4143                 src_values0 = _mm_srai_epi16(src_values0,  5);
   4144                 src_values1 = _mm_srai_epi16(src_values1,  5);
   4145                 src_values2 = _mm_srai_epi16(src_values2,  5);
   4146                 src_values3 = _mm_srai_epi16(src_values3,  5);
   4147                 src_values4 = _mm_srai_epi16(src_values4,  5);
   4148                 src_values5 = _mm_srai_epi16(src_values5,  5);
   4149                 src_values6 = _mm_srai_epi16(src_values6,  5);
   4150                 src_values7 = _mm_srai_epi16(src_values7,  5);
   4151 
   4152                 /* converting 16 bit to 8 bit */
   4153                 src_values0 = _mm_packus_epi16(src_values0, src_values4);
   4154                 src_values1 = _mm_packus_epi16(src_values1, src_values5);
   4155                 src_values2 = _mm_packus_epi16(src_values2, src_values6);
   4156                 src_values3 = _mm_packus_epi16(src_values3, src_values7);
   4157 
   4158                 /* loading 8-bit 8 pixels values */
   4159                 _mm_storeu_si128((__m128i *)(pu1_dst + (4) * dst_strd), src_values0);   /* row=4*/
   4160                 _mm_storeu_si128((__m128i *)(pu1_dst + (5) * dst_strd), src_values1);   /* row=5*/
   4161                 _mm_storeu_si128((__m128i *)(pu1_dst + (6) * dst_strd), src_values2);   /* row=6*/
   4162                 _mm_storeu_si128((__m128i *)(pu1_dst + (7) * dst_strd), src_values3);   /* row=7*/
   4163 
   4164             }
   4165             pu1_dst += 8 * dst_strd;
   4166         }
   4167     }
   4168     else if(nt == 8)
   4169     {
   4170 
   4171 
   4172         __m128i const_temp2_4x32b, const_temp3_4x32b;
   4173         __m128i src_values10, src_values11, intra_pred_ang_4x32b;
   4174 
   4175         __m128i row_4x32b, two_nt_4x32b, src_values12;
   4176         __m128i src_values0, src_values1, src_values2, src_values3;
   4177         __m128i  src_values4, src_values5, src_values6, src_values7;
   4178 
   4179 
   4180         /* Intermediate reference samples for negative angle modes */
   4181         /* For horizontal modes, (ref main = ref above) (ref side = ref left) */
   4182         ref_temp[two_nt - 1] = pu1_ref[two_nt + nt];
   4183         temp1 = _mm_loadl_epi64((__m128i *)(pu1_ref + two_nt));
   4184 
   4185         /* For negative angled derive the main reference samples from side */
   4186 
   4187         src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref)); /*nt-(nt+15)*/
   4188 
   4189         temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16));
   4190 
   4191         src_values0 = _mm_shuffle_epi8(src_values0, temp11);
   4192         src_values0 = _mm_srli_si128(src_values0, 8);
   4193         _mm_storel_epi64((__m128i *)(ref_temp + nt - 1), temp1);
   4194         _mm_storel_epi64((__m128i *)(ref_main - nt), src_values0);
   4195 
   4196 
   4197 
   4198         const_temp2_4x32b = _mm_set1_epi16(31);
   4199         const_temp3_4x32b = _mm_set1_epi16(32);
   4200 
   4201 
   4202         two_nt_4x32b = _mm_set1_epi16(1);
   4203 
   4204 
   4205         /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
   4206         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   4207 
   4208         row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
   4209 
   4210         {
   4211 
   4212             WORD16 ref_main_idx[9];
   4213 
   4214             __m128i res_temp5_4x32b;
   4215             __m128i fract1_8x16b, fract2_8x16b;
   4216 
   4217             /* pos = ((row + 1) * intra_pred_ang); */
   4218             res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   4219 
   4220             /* fract = pos & (31); */
   4221             src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
   4222 
   4223             /* idx = pos >> 5; */
   4224             src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   4225 
   4226             /*(32 - fract) */
   4227             src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
   4228 
   4229             fract1_8x16b = _mm_slli_epi16(src_values11, 8);
   4230             fract2_8x16b = _mm_slli_epi16(src_values10, 8);
   4231 
   4232             src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
   4233             src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
   4234 
   4235             fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
   4236             fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
   4237 
   4238             temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
   4239             temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
   4240             temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
   4241             temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
   4242 
   4243             temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
   4244             temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
   4245             temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
   4246             temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
   4247 
   4248             _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
   4249 
   4250             src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0]));  /* col = 0-7   */
   4251             src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1]));  /* col = 8-15  */
   4252             src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2]));  /* col = 16-23 */
   4253             src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3]));  /* col = 24-31 */
   4254             src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4]));  /* col = 32-39   */
   4255             src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5]));  /* col = 40-47  */
   4256             src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6]));  /* col = 48-55 */
   4257             src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7]));  /* col = 56-63*/
   4258 
   4259             src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
   4260             src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
   4261             src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
   4262             src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
   4263             src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
   4264             src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
   4265             src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
   4266             src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
   4267 
   4268 
   4269             src_values0 = _mm_maddubs_epi16(src_values0, temp1);
   4270             src_values1 = _mm_maddubs_epi16(src_values1, temp2);
   4271             src_values2 = _mm_maddubs_epi16(src_values2, temp3);
   4272             src_values3 = _mm_maddubs_epi16(src_values3, temp4);
   4273             src_values4 = _mm_maddubs_epi16(src_values4, temp11);
   4274             src_values5 = _mm_maddubs_epi16(src_values5, temp12);
   4275             src_values6 = _mm_maddubs_epi16(src_values6, temp13);
   4276             src_values7 = _mm_maddubs_epi16(src_values7, temp14);
   4277 
   4278             /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   4279             src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
   4280             src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
   4281             src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
   4282             src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
   4283             src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
   4284             src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
   4285             src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
   4286             src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
   4287 
   4288             /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   4289             src_values0 = _mm_srai_epi16(src_values0,  5);
   4290             src_values1 = _mm_srai_epi16(src_values1,  5);
   4291             src_values2 = _mm_srai_epi16(src_values2,  5);
   4292             src_values3 = _mm_srai_epi16(src_values3,  5);
   4293             src_values4 = _mm_srai_epi16(src_values4,  5);
   4294             src_values5 = _mm_srai_epi16(src_values5,  5);
   4295             src_values6 = _mm_srai_epi16(src_values6,  5);
   4296             src_values7 = _mm_srai_epi16(src_values7,  5);
   4297 
   4298             /* converting 16 bit to 8 bit */
   4299             src_values0 = _mm_packus_epi16(src_values0, src_values1);
   4300             src_values2 = _mm_packus_epi16(src_values2, src_values3);
   4301             src_values1 = _mm_srli_si128(src_values0, 8);
   4302             src_values3 = _mm_srli_si128(src_values2, 8);
   4303             src_values4 = _mm_packus_epi16(src_values4, src_values5);
   4304             src_values6 = _mm_packus_epi16(src_values6, src_values7);
   4305             src_values5 = _mm_srli_si128(src_values4, 8);
   4306             src_values7 = _mm_srli_si128(src_values6, 8);
   4307 
   4308             /* loading 8-bit 8 pixels values */
   4309             _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_values0);       /* row=0*/
   4310             _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_values1);   /* row=1*/
   4311             _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_values2);   /* row=2*/
   4312             _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_values3);   /* row=3*/
   4313             _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), src_values4);   /* row=4*/
   4314             _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), src_values5);   /* row=5*/
   4315             _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), src_values6);   /* row=6*/
   4316             _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), src_values7);   /* row=7*/
   4317         }
   4318     }
   4319     else /* if nt =4*/
   4320     {
   4321 
   4322         __m128i const_temp2_4x32b, const_temp3_4x32b, zero_8x16b;
   4323         __m128i src_values10, src_values11, intra_pred_ang_4x32b, sign_8x16b;
   4324 
   4325         __m128i row_4x32b, two_nt_4x32b, src_values12;
   4326 
   4327 
   4328         for(k = 0; k < (nt + 1); k++)
   4329             ref_temp[k + nt - 1] = pu1_ref[two_nt + k];
   4330         ref_idx = (nt * intra_pred_ang) >> 5;
   4331         inv_ang_sum = 128;
   4332 
   4333         for(k = -1; k > ref_idx; k--)
   4334         {
   4335             inv_ang_sum += inv_ang;
   4336             ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
   4337         }
   4338 
   4339 
   4340         const_temp2_4x32b = _mm_set1_epi32(31);
   4341         const_temp3_4x32b = _mm_set1_epi32(32);
   4342         zero_8x16b = _mm_setzero_si128();
   4343         two_nt_4x32b = _mm_set1_epi32(1);
   4344 
   4345 
   4346         /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
   4347         row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1);
   4348         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   4349 
   4350         {
   4351             WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
   4352             int temp11, temp21, temp31, temp41;
   4353 
   4354 
   4355             __m128i fract1_8x16b, fract2_8x16b,  res_temp5_4x32b;
   4356             __m128i src_values0, src_values1, src_values2, src_values3;
   4357             __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2;
   4358 
   4359             /* pos = ((row + 1) * intra_pred_ang); */
   4360             res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   4361             sign_8x16b      = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b);
   4362             res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b);
   4363 
   4364             /* fract = pos & (31); */
   4365             src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b,  5));
   4366 
   4367             ref_main_temp0 = _mm_srli_si128(src_values12, 4);  /* next 32 bit values */
   4368             ref_main_temp1 = _mm_srli_si128(src_values12, 8);  /* next 32 bit values */
   4369             ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */
   4370             ref_main_idx1  = _mm_cvtsi128_si32(src_values12);    /* row=0*/
   4371             ref_main_idx2  = _mm_cvtsi128_si32(ref_main_temp0);  /* row=1*/
   4372             ref_main_idx3  = _mm_cvtsi128_si32(ref_main_temp1);  /* row=2*/
   4373             ref_main_idx4  = _mm_cvtsi128_si32(ref_main_temp2);  /* row=3*/
   4374 
   4375             /* idx = pos >> 5; */
   4376             src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   4377 
   4378             /*(32 - fract) */
   4379             src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);
   4380 
   4381             fract1_8x16b = _mm_slli_epi16(src_values11, 8);
   4382             fract2_8x16b = _mm_slli_epi16(src_values10, 8);
   4383 
   4384             src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
   4385             src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
   4386 
   4387             fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
   4388             fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
   4389 
   4390             temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
   4391             temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
   4392             temp3 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
   4393             temp4 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
   4394 
   4395             src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1));     /* col = 0-7   */
   4396             src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2));   /* col = 8-15  */
   4397             src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3));  /* col = 16-23 */
   4398             src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4));  /* col = 24-31 */
   4399 
   4400             src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
   4401             src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
   4402             src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
   4403             src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
   4404 
   4405 
   4406             src_values0 = _mm_maddubs_epi16(src_values0, temp1);
   4407             src_values1 = _mm_maddubs_epi16(src_values1, temp2);
   4408             src_values2 = _mm_maddubs_epi16(src_values2, temp3);
   4409             src_values3 = _mm_maddubs_epi16(src_values3, temp4);
   4410 
   4411             /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   4412             src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
   4413             src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
   4414             src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
   4415             src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
   4416 
   4417             /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   4418             src_values0 = _mm_srai_epi16(src_values0,  5);
   4419             src_values1 = _mm_srai_epi16(src_values1,  5);
   4420             src_values2 = _mm_srai_epi16(src_values2,  5);
   4421             src_values3 = _mm_srai_epi16(src_values3,  5);
   4422 
   4423             /* converting 16 bit to 8 bit */
   4424             src_values0 = _mm_packus_epi16(src_values0, src_values1);
   4425             src_values2 = _mm_packus_epi16(src_values2, src_values3);
   4426             src_values1 = _mm_srli_si128(src_values0, 8);
   4427             src_values3 = _mm_srli_si128(src_values2, 8);
   4428 
   4429             temp11 = _mm_cvtsi128_si32(src_values0);
   4430             temp21 = _mm_cvtsi128_si32(src_values1);
   4431             temp31 = _mm_cvtsi128_si32(src_values2);
   4432             temp41 = _mm_cvtsi128_si32(src_values3);
   4433 
   4434             /* loding 4-bit 8 pixels values */
   4435             *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
   4436             *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
   4437             *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
   4438             *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;
   4439 
   4440         }
   4441     }
   4442 }
   4443 
   4444 
   4445 
   4446 
   4447 /**
   4448 *******************************************************************************
   4449 *
   4450 * @brief
   4451 *    Intra prediction interpolation filter for luma mode 27 to mode 33
   4452 *
   4453 * @par Description:
    4454 *    Intra prediction for modes 27 to 33 (positive angle, vertical modes) with
    4455 *    reference  neighboring samples location pointed by 'pu1_ref' to the  TU
   4456 *    block location pointed by 'pu1_dst'
   4457 *
    4458 * @param[in] pu1_ref
    4459 *  UWORD8 pointer to the reference neighboring samples
   4460 *
   4461 * @param[out] pu1_dst
   4462 *  UWORD8 pointer to the destination
   4463 *
   4464 * @param[in] src_strd
   4465 *  integer source stride
   4466 *
   4467 * @param[in] dst_strd
   4468 *  integer destination stride
   4469 *
   4470 * @param[in] nt
   4471 *  integer Transform Block size
   4472 *
   4473 * @param[in] mode
   4474 *  integer intraprediction mode
   4475 *
   4476 * @returns
   4477 *
   4478 * @remarks
   4479 *  None
   4480 *
   4481 *******************************************************************************
   4482 */
   4483 
   4484 
   4485 void ihevc_intra_pred_luma_mode_27_to_33_ssse3(UWORD8 *pu1_ref,
   4486                                                WORD32 src_strd,
   4487                                                UWORD8 *pu1_dst,
   4488                                                WORD32 dst_strd,
   4489                                                WORD32 nt,
   4490                                                WORD32 mode)
   4491 {
   4492     WORD32 row;
   4493     WORD32 two_nt;
   4494     WORD32 intra_pred_ang;
   4495 
   4496     __m128i temp11, temp12, temp13, temp14;
   4497 
   4498     __m128i     const_temp_8x16b;
   4499     __m128i temp1, temp2, temp3, temp4, sm3;
   4500     UNUSED(src_strd);
   4501     two_nt = 2 * nt;
   4502     intra_pred_ang = gai4_ihevc_ang_table[mode];
   4503 
   4504     const_temp_8x16b = _mm_set1_epi16(16);
   4505     sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]);
   4506     if(nt == 32)
   4507     {
   4508 
   4509         __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
   4510         __m128i src_values10, src_values11, intra_pred_ang_4x32b;
   4511         __m128i row_4x32b, two_nt_4x32b, src_values12;
   4512         int col = 0;
   4513 
   4514         const_temp2_4x32b = _mm_set1_epi16(31);
   4515         const_temp3_4x32b = _mm_set1_epi16(32);
   4516         const_temp8_4x32b = _mm_set1_epi16(8);
   4517 
   4518         two_nt_4x32b = _mm_set1_epi16(two_nt + 1);
   4519 
   4520         /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
   4521         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   4522 
   4523         row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
   4524 
   4525         for(row = 0; row < nt; row += 8)
   4526         {
   4527 
   4528             WORD16 ref_main_idx[9];
   4529 
   4530             __m128i res_temp5_4x32b;
   4531             __m128i fract1_8x16b, fract2_8x16b;
   4532             __m128i src_values0, src_values1, src_values2, src_values3;
   4533             __m128i  src_values4, src_values5, src_values6, src_values7;
   4534 
   4535             /* pos = ((row + 1) * intra_pred_ang); */
   4536             res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   4537 
   4538             /* fract = pos & (31); */
   4539             src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
   4540 
   4541             /* idx = pos >> 5; */
   4542             src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   4543 
   4544             /*(32 - fract) */
   4545             src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);
   4546 
   4547             fract1_8x16b = _mm_slli_epi16(src_values11, 8);
   4548             fract2_8x16b = _mm_slli_epi16(src_values10, 8);
   4549 
   4550             src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
   4551             src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */
   4552 
   4553             fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
   4554             fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);
   4555 
   4556             temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
   4557             temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
   4558             temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
   4559             temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);
   4560 
   4561             temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
   4562             temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
   4563             temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
   4564             temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);
   4565 
   4566             row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
   4567             _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);
   4568             for(col = 0; col < nt; col += 16)
   4569             {
   4570                 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + col));
   4571                 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + col));
   4572                 src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + col));
   4573                 src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + col));
   4574                 src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + 8 + col));
   4575                 src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + 8 + col));
   4576                 src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + 8 + col));
   4577                 src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + 8 + col));
   4578 
   4579                 src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
   4580                 src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
   4581                 src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
   4582                 src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
   4583                 src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
   4584                 src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
   4585                 src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
   4586                 src_values7 =  _mm_shuffle_epi8(src_values7, sm3);
   4587 
   4588 
   4589                 src_values0 = _mm_maddubs_epi16(src_values0, temp1);
   4590                 src_values1 = _mm_maddubs_epi16(src_values1, temp2);
   4591                 src_values2 = _mm_maddubs_epi16(src_values2, temp3);
   4592                 src_values3 = _mm_maddubs_epi16(src_values3, temp4);
   4593                 src_values4 = _mm_maddubs_epi16(src_values4, temp1);
   4594                 src_values5 = _mm_maddubs_epi16(src_values5, temp2);
   4595                 src_values6 = _mm_maddubs_epi16(src_values6, temp3);
   4596                 src_values7 = _mm_maddubs_epi16(src_values7, temp4);
   4597 
   4598                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   4599                 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
   4600                 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
   4601                 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
   4602                 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
   4603                 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
   4604                 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
   4605                 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
   4606                 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);
   4607 
   4608                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   4609                 src_values0 = _mm_srai_epi16(src_values0,  5);
   4610                 src_values1 = _mm_srai_epi16(src_values1,  5);
   4611                 src_values2 = _mm_srai_epi16(src_values2,  5);
   4612                 src_values3 = _mm_srai_epi16(src_values3,  5);
   4613                 src_values4 = _mm_srai_epi16(src_values4,  5);
   4614                 src_values5 = _mm_srai_epi16(src_values5,  5);
   4615                 src_values6 = _mm_srai_epi16(src_values6,  5);
   4616                 src_values7 = _mm_srai_epi16(src_values7,  5);
   4617 
   4618                 /* converting 16 bit to 8 bit */
   4619                 src_values0 = _mm_packus_epi16(src_values0, src_values4);
   4620                 src_values1 = _mm_packus_epi16(src_values1, src_values5);
                src_values2 = _mm_packus_epi16(src_values2, src_values6);
                src_values3 = _mm_packus_epi16(src_values3, src_values7);

                /* storing 8-bit 16 pixel values (rows 0-3 of this 8-row strip) */
                _mm_storeu_si128((__m128i *)(pu1_dst + col + (0) * dst_strd), src_values0);       /* row=0*/
                _mm_storeu_si128((__m128i *)(pu1_dst + col + (1) * dst_strd), src_values1);   /* row=1*/
                _mm_storeu_si128((__m128i *)(pu1_dst + col + (2) * dst_strd), src_values2);   /* row=2*/
                _mm_storeu_si128((__m128i *)(pu1_dst + col + (3) * dst_strd), src_values3);   /* row=3*/


                /* load 16 reference pixels per row for rows 4-7;
                   ref_main_idx[] holds each row's offset into pu1_ref */
                src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + col));
                src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + col));
                src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + col));
                src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + col));
                src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + 8 + col));
                src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + 8 + col));
                src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + 8 + col));
                src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + 8 + col));

                /* pair adjacent reference pixels so each 16-bit lane feeds
                   one (ref[idx], ref[idx + 1]) pair to _mm_maddubs_epi16 */
                src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
                src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
                src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
                src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
                src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
                src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
                src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
                src_values7 =  _mm_shuffle_epi8(src_values7, sm3);


                /* weighted sum per pair; temp11..temp14 carry the packed
                   (fract, 32 - fract) byte weights for rows 4-7 */
                src_values0 = _mm_maddubs_epi16(src_values0, temp11);
                src_values1 = _mm_maddubs_epi16(src_values1, temp12);
                src_values2 = _mm_maddubs_epi16(src_values2, temp13);
                src_values3 = _mm_maddubs_epi16(src_values3, temp14);
                src_values4 = _mm_maddubs_epi16(src_values4, temp11);
                src_values5 = _mm_maddubs_epi16(src_values5, temp12);
                src_values6 = _mm_maddubs_epi16(src_values6, temp13);
                src_values7 = _mm_maddubs_epi16(src_values7, temp14);

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
                src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
                src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
                src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
                src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
                src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
                src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
                src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
                src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
                src_values0 = _mm_srai_epi16(src_values0,  5);
                src_values1 = _mm_srai_epi16(src_values1,  5);
                src_values2 = _mm_srai_epi16(src_values2,  5);
                src_values3 = _mm_srai_epi16(src_values3,  5);
                src_values4 = _mm_srai_epi16(src_values4,  5);
                src_values5 = _mm_srai_epi16(src_values5,  5);
                src_values6 = _mm_srai_epi16(src_values6,  5);
                src_values7 = _mm_srai_epi16(src_values7,  5);

                /* converting 16 bit to 8 bit (unsigned saturation) */
                src_values0 = _mm_packus_epi16(src_values0, src_values4);
                src_values1 = _mm_packus_epi16(src_values1, src_values5);
                src_values2 = _mm_packus_epi16(src_values2, src_values6);
                src_values3 = _mm_packus_epi16(src_values3, src_values7);

                /* storing 8-bit 16 pixel values (rows 4-7 of this strip) */
                _mm_storeu_si128((__m128i *)(pu1_dst + col + (4) * dst_strd), src_values0);   /* row=4*/
                _mm_storeu_si128((__m128i *)(pu1_dst + col + (5) * dst_strd), src_values1);   /* row=5*/
                _mm_storeu_si128((__m128i *)(pu1_dst + col + (6) * dst_strd), src_values2);   /* row=6*/
                _mm_storeu_si128((__m128i *)(pu1_dst + col + (7) * dst_strd), src_values3);   /* row=7*/

            }
            pu1_dst += 8 * dst_strd;   /* advance to the next strip of 8 rows */
        }

    }
    else if(nt == 16) /* for nt = 16 case */
    {

        __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b;
        __m128i src_values10, src_values11, intra_pred_ang_4x32b;
        __m128i row_4x32b, two_nt_4x32b, src_values12;


        const_temp2_4x32b = _mm_set1_epi16(31);  /* mask for fract = pos & 31 */
        const_temp3_4x32b = _mm_set1_epi16(32);  /* used to form (32 - fract) */
        const_temp8_4x32b = _mm_set1_epi16(8);   /* row increment per 8-row strip */

        two_nt_4x32b = _mm_set1_epi16(two_nt + 1);

        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);

        /* (row + 1) for rows 0..7 of the first strip */
        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);

        /* process the 16x16 block as two strips of 8 rows x 16 columns */
        for(row = 0; row < nt; row += 8)
        {

            WORD16 ref_main_idx[9];

            __m128i res_temp5_4x32b;
            __m128i fract1_8x16b, fract2_8x16b;
            __m128i src_values0, src_values1, src_values2, src_values3;
            __m128i  src_values4, src_values5, src_values6, src_values7;

            /* pos = ((row + 1) * intra_pred_ang); */
            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);

            /* ref_main_idx = (two_nt + 1) + (pos >> 5), one index per row */
            src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));

            /* fract = pos & (31); */
            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);

            /*(32 - fract) */
            src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);

            /* duplicate each weight into both bytes of its 16-bit lane */
            fract1_8x16b = _mm_slli_epi16(src_values11, 8);
            fract2_8x16b = _mm_slli_epi16(src_values10, 8);

            src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
            src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */

            /* interleave to (fract, 32 - fract) byte pairs per row */
            fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
            fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);

            /* broadcast each row's weight pair: temp1..temp4 -> rows 0-3 */
            temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
            temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
            temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
            temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);

            /* temp11..temp14 -> rows 4-7 */
            temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
            temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
            temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
            temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);

            row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b);
            /* spill the 8 per-row reference indices for scalar addressing */
            _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);

            {
                /* 16 reference pixels per row, rows 0-3 (low / high 8 split) */
                src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0]));
                src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1]));
                src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2]));
                src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3]));
                src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + 8));
                src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + 8));
                src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + 8));
                src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + 8));

                /* pair adjacent reference pixels for _mm_maddubs_epi16 */
                src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
                src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
                src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
                src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
                src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
                src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
                src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
                src_values7 =  _mm_shuffle_epi8(src_values7, sm3);


                src_values0 = _mm_maddubs_epi16(src_values0, temp1);
                src_values1 = _mm_maddubs_epi16(src_values1, temp2);
                src_values2 = _mm_maddubs_epi16(src_values2, temp3);
                src_values3 = _mm_maddubs_epi16(src_values3, temp4);
                src_values4 = _mm_maddubs_epi16(src_values4, temp1);
                src_values5 = _mm_maddubs_epi16(src_values5, temp2);
                src_values6 = _mm_maddubs_epi16(src_values6, temp3);
                src_values7 = _mm_maddubs_epi16(src_values7, temp4);

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
                src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
                src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
                src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
                src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
                src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
                src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
                src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
                src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
                src_values0 = _mm_srai_epi16(src_values0,  5);
                src_values1 = _mm_srai_epi16(src_values1,  5);
                src_values2 = _mm_srai_epi16(src_values2,  5);
                src_values3 = _mm_srai_epi16(src_values3,  5);
                src_values4 = _mm_srai_epi16(src_values4,  5);
                src_values5 = _mm_srai_epi16(src_values5,  5);
                src_values6 = _mm_srai_epi16(src_values6,  5);
                src_values7 = _mm_srai_epi16(src_values7,  5);

                /* converting 16 bit to 8 bit (unsigned saturation) */
                src_values0 = _mm_packus_epi16(src_values0, src_values4);
                src_values1 = _mm_packus_epi16(src_values1, src_values5);
                src_values2 = _mm_packus_epi16(src_values2, src_values6);
                src_values3 = _mm_packus_epi16(src_values3, src_values7);

                /* storing 8-bit 16 pixel values (rows 0-3) */
                _mm_storeu_si128((__m128i *)(pu1_dst + (0) * dst_strd), src_values0);       /* row=0*/
                _mm_storeu_si128((__m128i *)(pu1_dst + (1) * dst_strd), src_values1);   /* row=1*/
                _mm_storeu_si128((__m128i *)(pu1_dst + (2) * dst_strd), src_values2);   /* row=2*/
                _mm_storeu_si128((__m128i *)(pu1_dst + (3) * dst_strd), src_values3);   /* row=3*/


                /* 16 reference pixels per row, rows 4-7 */
                src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4]));
                src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5]));
                src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6]));
                src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7]));
                src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + 8));
                src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + 8));
                src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + 8));
                src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + 8));

                /* pair adjacent reference pixels for _mm_maddubs_epi16 */
                src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
                src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
                src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
                src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
                src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
                src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
                src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
                src_values7 =  _mm_shuffle_epi8(src_values7, sm3);


                src_values0 = _mm_maddubs_epi16(src_values0, temp11);
                src_values1 = _mm_maddubs_epi16(src_values1, temp12);
                src_values2 = _mm_maddubs_epi16(src_values2, temp13);
                src_values3 = _mm_maddubs_epi16(src_values3, temp14);
                src_values4 = _mm_maddubs_epi16(src_values4, temp11);
                src_values5 = _mm_maddubs_epi16(src_values5, temp12);
                src_values6 = _mm_maddubs_epi16(src_values6, temp13);
                src_values7 = _mm_maddubs_epi16(src_values7, temp14);

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
                src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
                src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
                src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
                src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
                src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
                src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
                src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
                src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
                src_values0 = _mm_srai_epi16(src_values0,  5);
                src_values1 = _mm_srai_epi16(src_values1,  5);
                src_values2 = _mm_srai_epi16(src_values2,  5);
                src_values3 = _mm_srai_epi16(src_values3,  5);
                src_values4 = _mm_srai_epi16(src_values4,  5);
                src_values5 = _mm_srai_epi16(src_values5,  5);
                src_values6 = _mm_srai_epi16(src_values6,  5);
                src_values7 = _mm_srai_epi16(src_values7,  5);

                /* converting 16 bit to 8 bit (unsigned saturation) */
                src_values0 = _mm_packus_epi16(src_values0, src_values4);
                src_values1 = _mm_packus_epi16(src_values1, src_values5);
                src_values2 = _mm_packus_epi16(src_values2, src_values6);
                src_values3 = _mm_packus_epi16(src_values3, src_values7);

                /* storing 8-bit 16 pixel values (rows 4-7) */
                _mm_storeu_si128((__m128i *)(pu1_dst + (4) * dst_strd), src_values0);   /* row=4*/
                _mm_storeu_si128((__m128i *)(pu1_dst + (5) * dst_strd), src_values1);   /* row=5*/
                _mm_storeu_si128((__m128i *)(pu1_dst + (6) * dst_strd), src_values2);   /* row=6*/
                _mm_storeu_si128((__m128i *)(pu1_dst + (7) * dst_strd), src_values3);   /* row=7*/

            }
            pu1_dst += 8 * dst_strd;   /* advance to the next strip of 8 rows */
        }

    }
    else if(nt == 8)
    {

        __m128i const_temp2_4x32b, const_temp3_4x32b;
        __m128i src_values10, src_values11, intra_pred_ang_4x32b;
        __m128i row_4x32b, two_nt_4x32b, src_values12;


        const_temp2_4x32b = _mm_set1_epi16(31);  /* mask for fract = pos & 31 */
        const_temp3_4x32b = _mm_set1_epi16(32);  /* used to form (32 - fract) */

        two_nt_4x32b = _mm_set1_epi16(two_nt + 1);


        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);

        /* (row + 1) for rows 0..7; the whole 8x8 block is done in one pass */
        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);

        //for(row = 0; row < nt; row +=4)
        {

            WORD16 ref_main_idx[9];

            __m128i res_temp5_4x32b;
            __m128i fract1_8x16b, fract2_8x16b;
            __m128i src_values0, src_values1, src_values2, src_values3;
            __m128i  src_values4, src_values5, src_values6, src_values7;

            /* pos = ((row + 1) * intra_pred_ang); */
            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);

            /* ref_main_idx = (two_nt + 1) + (pos >> 5), one index per row */
            src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));

            /* fract = pos & (31); */
            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);

            /*(32 - fract) */
            src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11);

            /* duplicate each weight into both bytes of its 16-bit lane */
            fract1_8x16b = _mm_slli_epi16(src_values11, 8);
            fract2_8x16b = _mm_slli_epi16(src_values10, 8);

            src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
            src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */

            /* interleave to (fract, 32 - fract) byte pairs per row */
            fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
            fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);

            /* broadcast each row's weight pair: temp1..temp4 -> rows 0-3 */
            temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
            temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0x55);
            temp3 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
            temp4 =  _mm_shuffle_epi32(fract1_8x16b, 0xff);

            /* temp11..temp14 -> rows 4-7 */
            temp11 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
            temp12 =  _mm_shuffle_epi32(fract2_8x16b, 0x55);
            temp13 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);
            temp14 =  _mm_shuffle_epi32(fract2_8x16b, 0xff);

            /* spill the 8 per-row reference indices for scalar addressing */
            _mm_storeu_si128((__m128i *)ref_main_idx, src_values12);

            /* one 16-byte reference load per output row */
            src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0]));  /* row 0 */
            src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1]));  /* row 1 */
            src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2]));  /* row 2 */
            src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3]));  /* row 3 */
            src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4]));  /* row 4 */
            src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5]));  /* row 5 */
            src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6]));  /* row 6 */
            src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7]));  /* row 7 */

            /* pair adjacent reference pixels for _mm_maddubs_epi16 */
            src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
            src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
            src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
            src_values3 =  _mm_shuffle_epi8(src_values3, sm3);
            src_values4 =  _mm_shuffle_epi8(src_values4, sm3);
            src_values5 =  _mm_shuffle_epi8(src_values5, sm3);
            src_values6 =  _mm_shuffle_epi8(src_values6, sm3);
            src_values7 =  _mm_shuffle_epi8(src_values7, sm3);


            src_values0 = _mm_maddubs_epi16(src_values0, temp1);
            src_values1 = _mm_maddubs_epi16(src_values1, temp2);
            src_values2 = _mm_maddubs_epi16(src_values2, temp3);
            src_values3 = _mm_maddubs_epi16(src_values3, temp4);
            src_values4 = _mm_maddubs_epi16(src_values4, temp11);
            src_values5 = _mm_maddubs_epi16(src_values5, temp12);
            src_values6 = _mm_maddubs_epi16(src_values6, temp13);
            src_values7 = _mm_maddubs_epi16(src_values7, temp14);

            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
            src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b);
            src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b);
            src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b);
            src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b);

            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
            src_values0 = _mm_srai_epi16(src_values0,  5);
            src_values1 = _mm_srai_epi16(src_values1,  5);
            src_values2 = _mm_srai_epi16(src_values2,  5);
            src_values3 = _mm_srai_epi16(src_values3,  5);
            src_values4 = _mm_srai_epi16(src_values4,  5);
            src_values5 = _mm_srai_epi16(src_values5,  5);
            src_values6 = _mm_srai_epi16(src_values6,  5);
            src_values7 = _mm_srai_epi16(src_values7,  5);

            /* converting 16 bit to 8 bit; pack rows pairwise, then shift
               the odd row down from the upper 8 bytes */
            src_values0 = _mm_packus_epi16(src_values0, src_values1);
            src_values2 = _mm_packus_epi16(src_values2, src_values3);
            src_values1 = _mm_srli_si128(src_values0, 8);
            src_values3 = _mm_srli_si128(src_values2, 8);
            src_values4 = _mm_packus_epi16(src_values4, src_values5);
            src_values6 = _mm_packus_epi16(src_values6, src_values7);
            src_values5 = _mm_srli_si128(src_values4, 8);
            src_values7 = _mm_srli_si128(src_values6, 8);

            /* storing 8 8-bit pixel values per row */
            _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_values0);       /* row=0*/
            _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_values1);   /* row=1*/
            _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_values2);   /* row=2*/
            _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_values3);   /* row=3*/
            _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), src_values4);   /* row=4*/
            _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), src_values5);   /* row=5*/
            _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), src_values6);   /* row=6*/
            _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), src_values7);   /* row=7*/
        }

    }
    else /* if nt =4*/
    {

        __m128i const_temp2_4x32b, const_temp3_4x32b, zero_8x16b;
        __m128i src_values10, src_values11, intra_pred_ang_4x32b;

        __m128i row_4x32b, two_nt_4x32b, src_values12, sign_8x16b;


        const_temp2_4x32b = _mm_set1_epi32(31);  /* mask for fract = pos & 31 */
        const_temp3_4x32b = _mm_set1_epi32(32);  /* used to form (32 - fract) */
        zero_8x16b = _mm_setzero_si128();
        two_nt_4x32b = _mm_set1_epi32(two_nt + 1);


        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
        row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1);  /* (row + 1) for rows 0..3 */
        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
        {
            /* NOTE(review): this int temp11 shadows the outer __m128i temp11;
               harmless here since only temp1..temp4 carry weights below */
            int temp11, temp21, temp31, temp41;

            WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;

            __m128i fract1_8x16b, fract2_8x16b, res_temp5_4x32b;
            __m128i src_values0, src_values1, src_values2, src_values3;
            __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2;

            /* pos = ((row + 1) * intra_pred_ang); */
            res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
            /* sign-extend the 16-bit products to 32-bit lanes */
            sign_8x16b      = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b);
            res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b);

            /* ref_main_idx = (two_nt + 1) + (pos >> 5), one index per row */
            src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b,  5));

            /* extract the four per-row reference indices to scalars */
            ref_main_temp0 = _mm_srli_si128(src_values12, 4);  /* next 32 bit values */
            ref_main_temp1 = _mm_srli_si128(src_values12, 8);  /* next 32 bit values */
            ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */
            ref_main_idx1  = _mm_cvtsi128_si32(src_values12);    /* row=0*/
            ref_main_idx2  = _mm_cvtsi128_si32(ref_main_temp0);  /* row=1*/
            ref_main_idx3  = _mm_cvtsi128_si32(ref_main_temp1);  /* row=2*/
            ref_main_idx4  = _mm_cvtsi128_si32(ref_main_temp2);  /* row=3*/

            /* fract = pos & (31); */
            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);

            /*(32 - fract) */
            src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);

            /* duplicate each weight into both low bytes of its lane */
            fract1_8x16b = _mm_slli_epi16(src_values11, 8);
            fract2_8x16b = _mm_slli_epi16(src_values10, 8);

            src_values11 = _mm_or_si128(src_values11, fract1_8x16b);
            src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */

            /* interleave to (fract, 32 - fract) byte pairs per row */
            fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10);
            fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10);

            /* broadcast each row's weight pair: temp1..temp4 -> rows 0-3 */
            temp1 =  _mm_shuffle_epi32(fract1_8x16b, 0x00);
            temp2 =  _mm_shuffle_epi32(fract1_8x16b, 0xaa);
            temp3 =  _mm_shuffle_epi32(fract2_8x16b, 0x00);
            temp4 =  _mm_shuffle_epi32(fract2_8x16b, 0xaa);

            /* one reference load per output row */
            src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1));     /* row 0 */
            src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2));   /* row 1 */
            src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3));  /* row 2 */
            src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4));  /* row 3 */

            /* pair adjacent reference pixels for _mm_maddubs_epi16 */
            src_values0 =  _mm_shuffle_epi8(src_values0, sm3);
            src_values1 =  _mm_shuffle_epi8(src_values1, sm3);
            src_values2 =  _mm_shuffle_epi8(src_values2, sm3);
            src_values3 =  _mm_shuffle_epi8(src_values3, sm3);

            src_values0 = _mm_maddubs_epi16(src_values0, temp1);
            src_values1 = _mm_maddubs_epi16(src_values1, temp2);
            src_values2 = _mm_maddubs_epi16(src_values2, temp3);
            src_values3 = _mm_maddubs_epi16(src_values3, temp4);

            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);

            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
            src_values0 = _mm_srai_epi16(src_values0,  5);
            src_values1 = _mm_srai_epi16(src_values1,  5);
            src_values2 = _mm_srai_epi16(src_values2,  5);
            src_values3 = _mm_srai_epi16(src_values3,  5);

            /* converting 16 bit to 8 bit; odd rows shifted down from the
               upper half after the pairwise pack */
            src_values0 = _mm_packus_epi16(src_values0, src_values1);
            src_values2 = _mm_packus_epi16(src_values2, src_values3);
            src_values1 = _mm_srli_si128(src_values0, 8);
            src_values3 = _mm_srli_si128(src_values2, 8);

            temp11 = _mm_cvtsi128_si32(src_values0);
            temp21 = _mm_cvtsi128_si32(src_values1);
            temp31 = _mm_cvtsi128_si32(src_values2);
            temp41 = _mm_cvtsi128_si32(src_values3);

            /* storing 4 8-bit pixel values per row (NOTE(review): unaligned
               WORD32 store — assumed acceptable on x86, matches file style) */
            *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11;
            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21;
            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31;
            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41;

        }
    }
   5127 }
   5128