      1 /******************************************************************************
      2 *
      3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 *
      5 * Licensed under the Apache License, Version 2.0 (the "License");
      6 * you may not use this file except in compliance with the License.
      7 * You may obtain a copy of the License at:
      8 *
      9 * http://www.apache.org/licenses/LICENSE-2.0
     10 *
     11 * Unless required by applicable law or agreed to in writing, software
     12 * distributed under the License is distributed on an "AS IS" BASIS,
     13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 * See the License for the specific language governing permissions and
     15 * limitations under the License.
     16 *
     17 ******************************************************************************/
     18 /**
     19 *******************************************************************************
     20 * @file
     21 *  ihevc_chroma_intra_pred_filters_atom_intr.c
     22 *
     23 * @brief
      24 *  Contains function definitions for intra prediction interpolation filters
     25 *
     26 *
     27 * @author
     28 *  Ittiam
     29 *
     30 * @par List of Functions:
     31 *  ihevc_intra_pred_chroma_planar_ssse3()
     32 *
     33 *  ihevc_intra_pred_chroma_dc_ssse3()
     34 *
     35 *  ihevc_intra_pred_chroma_horz_ssse3()
     36 *
     37 *  ihevc_intra_pred_chroma_ver_ssse3()
     38 *
     39 *  ihevc_intra_pred_chroma_mode2_ssse3()
     40 *
     41 *  ihevc_intra_pred_chroma_mode_18_34_ssse3()
     42 *
     43 *  ihevc_intra_pred_chroma_mode_3_to_9_ssse3()
     44 *
     45 *  ihevc_intra_pred_chroma_mode_11_to_17_ssse3()
     46 *
     47 *  ihevc_intra_pred_chroma_mode_19_to_25_ssse3()
     48 *
     49 *  ihevc_intra_pred_chroma_mode_27_to_33_ssse3()
     50 *
     51 *
     52 *
     53 * @remarks
     54 *  None
     55 *
     56 *******************************************************************************
     57 */
     58 
     59 
     60 /*****************************************************************************/
     61 /* File Includes                                                             */
     62 /*****************************************************************************/
     63 
     64 #include "ihevc_typedefs.h"
     65 #include "ihevc_platform_macros.h"
     66 #include "ihevc_macros.h"
     67 #include "ihevc_func_selector.h"
     68 #include "ihevc_intra_pred.h"
     69 
     70 #include "ihevc_chroma_intra_pred.h"
     71 #include "ihevc_common_tables.h"
     72 #include "ihevc_tables_x86_intr.h"
     73 
     74 #include <mmintrin.h>
     75 #include <xmmintrin.h>
     76 #include <emmintrin.h>
     77 
     78 #include <immintrin.h>
     79 
     80 
     81 /****************************************************************************/
     82 /* Constant Macros                                                          */
     83 /****************************************************************************/
     84 #define MAX_CU_SIZE 64
     85 #define BIT_DEPTH 8
     86 #define T32_4NT 128
     87 #define T16_4NT 64
     88 #define T16C_4NT 64
     89 #define T8C_4NT 32
     90 /****************************************************************************/
     91 /* Function Macros                                                          */
     92 /****************************************************************************/
     93 
      94 #define GET_BIT(y,x) (((y) & (1 << (x))) != 0)
     95 
     96 /* tables to shuffle 8-bit values */
     97 
     98 /*****************************************************************************/
     99 /* Function Definition                                                      */
    100 /*****************************************************************************/
    101 
    102 
    103 
    104 /**
    105 *******************************************************************************
    106 *
    107 * @brief
     108 *  Planar intra prediction using the neighboring reference samples pointed
     109 * to by 'pu1_ref' to predict the TU block pointed to by 'pu1_dst'. Refer
     110 * to section 8.4.4.2.4 of the standard.
    111 *
    112 * @par Description:
    113 *
    114 *
     115 * @param[in] pu1_ref
     116 *  UWORD8 pointer to the reference samples
    117 *
    118 * @param[in] pu1_dst
    119 *  UWORD8 pointer to the destination
    120 *
    121 * @param[in] src_strd
    122 *  integer source stride
    123 *
    124 * @param[in] dst_strd
    125 *  integer destination stride
    126 *
    127 * @param[in] nt
    128 *  integer Transform Block size
    129 *
    130 * @param[in] mode
    131 *  integer intraprediction mode
    132 *
    133 * @returns
    134 *
    135 * @remarks
    136 *  None
    137 *
    138 *******************************************************************************
    139 */
    140 
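/*
 * Illustrative scalar sketch of the planar formula that the SIMD loop below
 * vectorises (U and V interleaved in pu1_ref/pu1_dst; variable names follow
 * the function body, assembled from the per-step comments in the code):
 *
 *   for(row = 0; row < nt; row++)
 *       for(col = 0; col < 2 * nt; col++)
 *           pu1_dst[row * dst_strd + col] = (UWORD8)
 *               (((nt - 1 - row)        * pu1_ref[2 * (two_nt + 1) + col]
 *               + (nt - 1 - (col >> 1)) * pu1_ref[2 * (two_nt - 1 - row) + (col & 1)]
 *               + ((col >> 1) + 1)      * pu1_ref[2 * (three_nt + 1) + (col & 1)]
 *               + (row + 1)             * pu1_ref[2 * (nt - 1) + (col & 1)]
 *               + nt) >> (log2nt + 1));
 */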
    141 void ihevc_intra_pred_chroma_planar_ssse3(UWORD8 *pu1_ref,
    142                                           WORD32 src_strd,
    143                                           UWORD8 *pu1_dst,
    144                                           WORD32 dst_strd,
    145                                           WORD32 nt,
    146                                           WORD32 mode)
    147 {
    148 
    149     WORD32 row, col;
    150     WORD32 log2nt = 5;
    151     WORD32 two_nt, three_nt;
    152 
    153     __m128i const_temp_4x32b, const_temp1_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
    154     __m128i col_8x16b, const_temp5_4x32b, const_temp6_4x32b, zero_8x16b, const_temp7_4x32b;
    155     UNUSED(src_strd);
    156     UNUSED(mode);
    157     switch(nt)
    158     {
    159         case 16:
    160             log2nt = 4;
    161             break;
    162         case 8:
    163             log2nt = 3;
    164             break;
    165         case 4:
    166             log2nt = 2;
    167             break;
    168         default:
    169             break;
    170     }
    171     two_nt = 2 * nt;
    172     three_nt = 3 * nt;
    173 
    174     /* Planar filtering */
    175 
     176 /* setting values in registers */
    177 
    178 //  pu1_ref[2*(two_nt - 1 - row)]
    179 //  pu1_ref[2 * (three_nt + 1)]
    180 //  pu1_ref[2 * (two_nt + 1) + col]
    181 //  pu1_ref[2 * (nt - 1)]
    182 
    183     const_temp_4x32b  = _mm_set_epi16(pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1],
    184                                       pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)],
    185                                       pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)]);
    186 
    187     const_temp1_4x32b = _mm_set_epi16(pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)],
    188                                       pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)]);
    189 
    190     const_temp4_4x32b = _mm_set1_epi16(nt - 1);
    191     const_temp6_4x32b = _mm_set1_epi16(nt);
    192     const_temp7_4x32b = _mm_set1_epi16(4);
    193 
    194     zero_8x16b = _mm_set1_epi32(0);
    195 
    196 
    197     if(nt % 4 == 0)
    198     {
    199         const_temp7_4x32b = _mm_set1_epi16(4);
    200 
    201         for(row = 0; row < nt; row++)
    202         {
    203             __m128i res_temp_8x16b, row_8x16b, res_temp1_8x16b, res_temp2_8x16b;
    204             __m128i res_temp3_8x16b;
    205 
    206             const_temp2_4x32b  = _mm_set_epi16(pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1],
    207                                                pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)],
    208                                                pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)]);
    209 
    210             const_temp3_4x32b  = _mm_set1_epi16((row + 1));
    211             row_8x16b = _mm_set1_epi16((nt - 1 - row));
    212 
    213             const_temp5_4x32b = _mm_set_epi16(3, 3, 2, 2, 1, 1, 0, 0);
    214             col_8x16b = _mm_set_epi16(4, 4, 3, 3, 2, 2, 1, 1);
    215 
    216             const_temp5_4x32b = _mm_sub_epi16(const_temp4_4x32b, const_temp5_4x32b);
    217 
    218             /*(row + 1) * pu1_ref[nt - 1]*/
    219             res_temp_8x16b  = _mm_mullo_epi16(const_temp3_4x32b,  const_temp1_4x32b);
    220 
    221             /*(row + 1) * pu1_ref[nt - 1] + nt)*/
    222             res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b);
    223 
    224             for(col = 0; col < 2 * nt; col += 8)
    225             {
    226                 __m128i src_temp_8x16b;
    227 
     228                 /* loading 16 8-bit pixels */
    229                 src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (two_nt + 1) + col));
    230 
    231                 //src_temp_8x16b =  _mm_cvtepu8_epi16 (src_temp_8x16b); /* row=0*/
    232                 src_temp_8x16b = _mm_unpacklo_epi8(src_temp_8x16b, zero_8x16b);
    233 
    234                 /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] */
    235                 res_temp1_8x16b  = _mm_mullo_epi16(src_temp_8x16b,  row_8x16b);
    236 
    237                 /*(col + 1) * pu1_ref[three_nt + 1]*/
    238                 res_temp2_8x16b  = _mm_mullo_epi16(const_temp_4x32b,  col_8x16b);
    239 
    240                 /*(nt - 1 - col)* pu1_ref[two_nt - 1 - row]*/
    241                 res_temp3_8x16b  = _mm_mullo_epi16(const_temp2_4x32b,  const_temp5_4x32b);
    242 
    243                 res_temp1_8x16b = _mm_add_epi16(res_temp_8x16b, res_temp1_8x16b);
    244                 res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
    245                 res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp3_8x16b);
    246 
    247                 res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, (log2nt + 1));
    248                 res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, zero_8x16b);
    249 
    250                 _mm_storel_epi64((__m128i *)(pu1_dst + (row * dst_strd) + col), res_temp1_8x16b);
    251 
    252                 const_temp5_4x32b = _mm_sub_epi16(const_temp5_4x32b, const_temp7_4x32b);
    253                 col_8x16b = _mm_add_epi16(col_8x16b, const_temp7_4x32b);
    254             } /* inner loop ends here */
    255         }
    256     }
    257 }
    258 
    259 
    260 /**
    261 *******************************************************************************
    262 *
    263 * @brief
     264 *  Intra prediction for DC mode using the neighboring reference samples
     265 * pointed to by 'pu1_ref' to predict the TU block pointed to by 'pu1_dst'.
     266 * Refer to section 8.4.4.2.5 of the standard.
    267 *
    268 * @par Description:
    269 *
    270 *
     271 * @param[in] pu1_ref
     272 *  UWORD8 pointer to the reference samples
    273 *
    274 * @param[in] pu1_dst
    275 *  UWORD8 pointer to the destination
    276 *
    277 * @param[in] src_strd
    278 *  integer source stride
    279 *
    280 * @param[in] dst_strd
    281 *  integer destination stride
    282 *
    283 * @param[in] nt
    284 *  integer Transform Block size (Chroma)
    285 *
    286 * @param[in] mode
    287 *  integer intraprediction mode
    288 *
    289 * @returns
    290 *
    291 * @remarks
    292 *  None
    293 *
    294 *******************************************************************************
    295 */
    296 
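/*
 * Illustrative scalar sketch of the chroma DC computation done with SIMD
 * below (U and V interleaved in pu1_ref; the top-left pair is excluded,
 * which is what the add/subtract of pu1_ref[6*nt] / pu1_ref[4*nt] achieves):
 *
 *   acc_dc_u = acc_dc_v = 0;
 *   for(row = 0; row < nt; row++)               // left neighboring column
 *   {
 *       acc_dc_u += pu1_ref[2 * nt + 2 * row];
 *       acc_dc_v += pu1_ref[2 * nt + 2 * row + 1];
 *   }
 *   for(col = 0; col < nt; col++)               // top neighboring row
 *   {
 *       acc_dc_u += pu1_ref[4 * nt + 2 + 2 * col];
 *       acc_dc_v += pu1_ref[4 * nt + 2 + 2 * col + 1];
 *   }
 *   dc_val_u = (acc_dc_u + nt) >> (log2nt + 1);
 *   dc_val_v = (acc_dc_v + nt) >> (log2nt + 1);
 *   // every output (U, V) pair is then (dc_val_u, dc_val_v)
 */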
    297 void ihevc_intra_pred_chroma_dc_ssse3(UWORD8 *pu1_ref,
    298                                       WORD32 src_strd,
    299                                       UWORD8 *pu1_dst,
    300                                       WORD32 dst_strd,
    301                                       WORD32 nt,
    302                                       WORD32 mode)
    303 {
    304 
    305     WORD32 acc_dc_u, acc_dc_v;
    306     WORD32 dc_val_u, dc_val_v;
    307     WORD32 row;
    308     WORD32 log2nt = 5;
    309     __m128i src_temp1, src_temp3, src_temp4, src_temp5, src_temp6, m_mask;
    310     __m128i src_temp7, src_temp8, src_temp9, src_temp10;
    311     __m128i m_zero = _mm_set1_epi32(0);
    312     UNUSED(src_strd);
    313     UNUSED(mode);
    314 
    315     switch(nt)
    316     {
    317         case 32:
    318             log2nt = 5;
    319             break;
    320         case 16:
    321             log2nt = 4;
    322             break;
    323         case 8:
    324             log2nt = 3;
    325             break;
    326         case 4:
    327             log2nt = 2;
    328             break;
    329         default:
    330             break;
    331     }
    332 
    333     acc_dc_u = 0;
    334     acc_dc_v = 0;
    335 
    336     /* Calculate DC value for the transform block */
    337 
    338     m_mask = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY9[0]);
    339 
    340     if(nt == 16)
    341     {
    342         __m128i temp_sad, sign_8x16b;
    343 
    344         src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
    345         src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16));
    346         src_temp7 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 32));
    347         src_temp8 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 48));
    348 
    349         src_temp5  = _mm_unpacklo_epi8(src_temp3, m_zero);
    350         src_temp6  = _mm_unpacklo_epi8(src_temp4, m_zero);
    351         src_temp9  = _mm_unpacklo_epi8(src_temp7, m_zero);
    352         src_temp10 = _mm_unpacklo_epi8(src_temp8, m_zero);
    353 
    354         src_temp3 = _mm_srli_si128(src_temp3, 8);
    355         src_temp4 = _mm_srli_si128(src_temp4, 8);
    356         src_temp7 = _mm_srli_si128(src_temp7, 8);
    357         src_temp8 = _mm_srli_si128(src_temp8, 8);
    358 
    359         src_temp3 = _mm_unpacklo_epi8(src_temp3, m_zero);
    360         src_temp4 = _mm_unpacklo_epi8(src_temp4, m_zero);
    361         src_temp7 = _mm_unpacklo_epi8(src_temp7, m_zero);
    362         src_temp8 = _mm_unpacklo_epi8(src_temp8, m_zero);
    363 
    364         src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
    365         src_temp6 = _mm_add_epi16(src_temp3, src_temp5);
    366         src_temp8 = _mm_add_epi16(src_temp7, src_temp8);
    367         src_temp10 = _mm_add_epi16(src_temp9, src_temp10);
    368 
    369         src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
    370         src_temp8 = _mm_add_epi16(src_temp8, src_temp10);
    371 
    372         src_temp4 = _mm_add_epi16(src_temp4, src_temp8);
    373         src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
    374         src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
    375         src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
    376 
    377         sign_8x16b = _mm_cmpgt_epi16(m_zero, src_temp4);
    378         src_temp4  = _mm_unpacklo_epi16(src_temp4, sign_8x16b);
    379 
    380         temp_sad  = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
    381         acc_dc_u  = _mm_cvtsi128_si32(src_temp4);
    382         acc_dc_v  = _mm_cvtsi128_si32(temp_sad);
    383     }
    384 
    385     else if(nt == 8)
    386     {
    387         __m128i temp_sad, sign_8x16b;
    388         src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
    389         src_temp4 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16));
    390 
    391         src_temp5 = _mm_unpacklo_epi8(src_temp3, m_zero);
    392         src_temp6 = _mm_unpacklo_epi8(src_temp4, m_zero);
    393 
    394         src_temp3 = _mm_srli_si128(src_temp3, 8);
    395         src_temp4 = _mm_srli_si128(src_temp4, 8);
    396 
    397         src_temp3 = _mm_unpacklo_epi8(src_temp3, m_zero);
    398         src_temp4 = _mm_unpacklo_epi8(src_temp4, m_zero);
    399 
    400         src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
    401         src_temp6 = _mm_add_epi16(src_temp3, src_temp5);
    402 
    403         src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
    404         src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
    405         src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
    406         src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
    407 
    408         sign_8x16b = _mm_cmpgt_epi16(m_zero, src_temp4);
    409         src_temp4  = _mm_unpacklo_epi16(src_temp4, sign_8x16b);
    410 
    411         temp_sad  = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
    412         acc_dc_u  = _mm_cvtsi128_si32(src_temp4);
    413         acc_dc_v  = _mm_cvtsi128_si32(temp_sad);
    414     }
    415 
    416     else if(nt == 4)
    417     {
    418         __m128i temp_sad, sign_8x16b;
    419         src_temp3 =  _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
    420 
    421         src_temp5 =  _mm_unpacklo_epi8(src_temp3, m_zero);
    422         src_temp4 = _mm_srli_si128(src_temp3, 8);
    423 
    424         src_temp4 =  _mm_unpacklo_epi8(src_temp4, m_zero);
    425 
    426         src_temp4 = _mm_add_epi16(src_temp4, src_temp5);
    427 
    428         src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
    429         src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
    430         src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
    431 
    432         sign_8x16b = _mm_cmpgt_epi16(m_zero, src_temp4);
    433         src_temp4  = _mm_unpacklo_epi16(src_temp4, sign_8x16b);
    434 
    435         temp_sad  = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
    436         acc_dc_u  = _mm_cvtsi128_si32(src_temp4);
    437         acc_dc_v  = _mm_cvtsi128_si32(temp_sad);
    438     }
    439 
    440 
    441     acc_dc_u += pu1_ref[6 * nt];
    442     acc_dc_v += pu1_ref[6 * nt + 1];
    443 
    444     acc_dc_u -= pu1_ref[4 * nt];
    445     acc_dc_v -= pu1_ref[4 * nt + 1];
    446 
    447     dc_val_u = (acc_dc_u + nt) >> (log2nt + 1);
    448     dc_val_v = (acc_dc_v + nt) >> (log2nt + 1);
    449 
    450     dc_val_u = dc_val_u | (dc_val_v << 8);
    451 
     452     /* Fill the prediction block with the DC value */
    453 
    454     if(nt == 4)
    455     {
    456         src_temp1 = _mm_set1_epi16(dc_val_u);
    457 
    458         /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
    459         _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
    460         _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
    461         _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
    462         _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
    463 
    464     }
    465     else if(nt == 8)
    466     {
    467         src_temp1 = _mm_set1_epi16(dc_val_u);
    468 
    469         /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
    470         _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
    471         _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
    472         _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
    473         _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
    474 
    475         _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp1);
    476         _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1);
    477         _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1);
    478         _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);
    479 
    480     }
    481 
    482     else /* nt == 16 */
    483     {
    484         src_temp1 = _mm_set1_epi16(dc_val_u);
    485 
    486         for(row = 0; row < nt; row += 8)
    487         {
    488             /*  pu1_dst[(row * dst_strd) + col] = dc_val;*/
    489             _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
    490             _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
    491             _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
    492             _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
    493             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp1);
    494             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp1);
    495             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp1);
    496             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp1);
    497 
    498             _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp1);
    499             _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1);
    500             _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1);
    501             _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);
    502             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp1);
    503             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp1);
    504             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp1);
    505             _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp1);
    506 
    507             pu1_dst += 8 * dst_strd;
    508         }
    509     }
    510 
    511 }
    512 
    513 
    514 /**
    515 *******************************************************************************
    516 *
    517 * @brief
     518 *  Horizontal intra prediction (mode 10) using the reference samples pointed
     519 * to by 'pu1_ref' to predict the TU block pointed to by 'pu1_dst'. Refer
     520 * to section 8.4.4.2.6 of the standard (special case).
    521 *
    522 * @par Description:
    523 *
    524 *
     525 * @param[in] pu1_ref
     526 *  UWORD8 pointer to the reference samples
    527 *
    528 * @param[in] pu1_dst
    529 *  UWORD8 pointer to the destination
    530 *
    531 * @param[in] src_strd
    532 *  integer source stride
    533 *
    534 * @param[in] dst_strd
    535 *  integer destination stride
    536 *
    537 * @param[in] nt
    538 *  integer Transform Block size
    539 *
    540 * @param[in] mode
    541 *  integer intraprediction mode
    542 *
    543 * @returns
    544 *
    545 * @remarks
    546 *  None
    547 *
    548 *******************************************************************************
    549 */
    550 
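/*
 * Illustrative scalar sketch of the horizontal prediction performed below:
 * each output row is filled with the left neighboring (U, V) pair of that
 * row (indices as used in the SIMD code):
 *
 *   for(row = 0; row < nt; row++)
 *       for(col = 0; col < 2 * nt; col += 2)
 *       {
 *           pu1_dst[row * dst_strd + col]     = pu1_ref[4 * nt - 2 - 2 * row];  // U
 *           pu1_dst[row * dst_strd + col + 1] = pu1_ref[4 * nt - 1 - 2 * row];  // V
 *       }
 */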
    551 void ihevc_intra_pred_chroma_horz_ssse3(UWORD8 *pu1_ref,
    552                                         WORD32 src_strd,
    553                                         UWORD8 *pu1_dst,
    554                                         WORD32 dst_strd,
    555                                         WORD32 nt,
    556                                         WORD32 mode)
    557 {
    558 
    559     WORD32 row;
    560     __m128i temp1, temp2, temp3, temp4, temp5, temp6,  temp7, temp8;
    561     UNUSED(src_strd);
    562     UNUSED(mode);
    563 
    564     /* Replication to next rows*/
    565 
    566     if(nt == 8)
    567     {
    568         for(row = 0; row < nt; row += 4)
    569         {
    570             temp1 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 0)]);
    571             temp2 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 0)]);
    572             temp3 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 1)]);
    573             temp4 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 1)]);
    574             temp5 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 2)]);
    575             temp6 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 2)]);
    576             temp7 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 3)]);
    577             temp8 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 3)]);
    578 
    579             temp2 = _mm_unpacklo_epi8(temp1, temp2);
    580             temp4 = _mm_unpacklo_epi8(temp3, temp4);
    581             temp6 = _mm_unpacklo_epi8(temp5, temp6);
    582             temp8 = _mm_unpacklo_epi8(temp7, temp8);
    583 
    584             _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), temp2);
    585             _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), temp4);
    586             _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), temp6);
    587             _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), temp8);
    588 
    589         }
    590     }
    591     else if(nt == 16)
    592     {
    593         for(row = 0; row < nt; row += 4)
    594         {
    595             temp1 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 0)]);
    596             temp2 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 0)]);
    597 
    598             temp3 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 1)]);
    599             temp4 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 1)]);
    600 
    601             temp5 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 2)]);
    602             temp6 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 2)]);
    603 
    604             temp7 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 3)]);
    605             temp8 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 3)]);
    606 
    607             temp2 = _mm_unpacklo_epi8(temp1, temp2);
    608             temp4 = _mm_unpacklo_epi8(temp3, temp4);
    609             temp6 = _mm_unpacklo_epi8(temp5, temp6);
    610             temp8 = _mm_unpacklo_epi8(temp7, temp8);
    611 
    612             _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd) + 0), temp2);
    613             _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd) + 16), temp2);
    614 
    615             _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd) + 0), temp4);
    616             _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd) + 16), temp4);
    617 
    618             _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd) + 0), temp6);
    619             _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd) + 16), temp6);
    620 
    621             _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd) + 0), temp8);
    622             _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd) + 16), temp8);
    623 
    624 
    625         }
    626     }
    627     else
    628     {
    629         temp1 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * 0]);
    630         temp2 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * 0]);
    631 
    632         temp3 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * 1]);
    633         temp4 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * 1]);
    634 
    635         temp5 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * 2]);
    636         temp6 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * 2]);
    637 
    638         temp7 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * 3]);
    639         temp8 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * 3]);
    640 
    641         temp2 = _mm_unpacklo_epi8(temp1, temp2);
    642         temp4 = _mm_unpacklo_epi8(temp3, temp4);
    643         temp6 = _mm_unpacklo_epi8(temp5, temp6);
    644         temp8 = _mm_unpacklo_epi8(temp7, temp8);
    645 
    646         _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), temp2);
    647         _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), temp4);
    648         _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), temp6);
    649         _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), temp8);
    650     }
    651 }
    652 
    653 
    654 /**
    655 *******************************************************************************
    656 *
    657 * @brief
     658 *  Vertical intra prediction (mode 26) using the neighboring reference samples
     659 * pointed to by 'pu1_ref' to predict the TU block pointed to by 'pu1_dst'.
     660 * Refer to section 8.4.4.2.6 of the standard (special case).
    661 *
    662 * @par Description:
    663 *
    664 *
     665 * @param[in] pu1_ref
     666 *  UWORD8 pointer to the reference samples
    667 *
    668 * @param[in] pu1_dst
    669 *  UWORD8 pointer to the destination
    670 *
    671 * @param[in] src_strd
    672 *  integer source stride
    673 *
    674 * @param[in] dst_strd
    675 *  integer destination stride
    676 *
    677 * @param[in] nt
    678 *  integer Transform Block size
    679 *
    680 * @param[in] mode
    681 *  integer intraprediction mode
    682 *
    683 * @returns
    684 *
    685 * @remarks
    686 *  None
    687 *
    688 *******************************************************************************
    689 */
    690 
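/*
 * Illustrative scalar sketch of the vertical prediction performed below:
 * every output row is a copy of the 2*nt bytes of the top neighboring row,
 * which starts at pu1_ref[4 * nt + 2]:
 *
 *   for(row = 0; row < nt; row++)
 *       memcpy(pu1_dst + row * dst_strd, pu1_ref + 4 * nt + 2, 2 * nt);
 */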
    691 void ihevc_intra_pred_chroma_ver_ssse3(UWORD8 *pu1_ref,
    692                                        WORD32 src_strd,
    693                                        UWORD8 *pu1_dst,
    694                                        WORD32 dst_strd,
    695                                        WORD32 nt,
    696                                        WORD32 mode)
    697 {
    698     __m128i src_temp1;
    699     UNUSED(src_strd);
    700     UNUSED(mode);
    701 
    702     /* Replication to next columns*/
    703     if(nt == 8)
    704     {
    705         src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 0));
    706 
    707         _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), src_temp1);
    708         _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
    709         _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp1);
    710         _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp1);
    711 
    712         _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp1);
    713         _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp1);
    714         _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp1);
    715         _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp1);
    716 
    717     }
     718     else if(nt == 16)
    719     {
    720         __m128i temp1, temp2;
    721 
    722         temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 0));
    723         temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 16));
    724 
     725         /*  pu1_dst[(row * dst_strd) + col] = pu1_ref[(4 * nt) + 2 + col];*/
    726         _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), temp1);
    727         _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), temp1);
    728         _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), temp1);
    729         _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), temp1);
    730         _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), temp1);
    731         _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), temp1);
    732         _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), temp1);
    733         _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), temp1);
    734 
    735         _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((0) * dst_strd)), temp2);
    736         _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((1) * dst_strd)), temp2);
    737         _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((2) * dst_strd)), temp2);
    738         _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((3) * dst_strd)), temp2);
    739         _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((4) * dst_strd)), temp2);
    740         _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((5) * dst_strd)), temp2);
    741         _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((6) * dst_strd)), temp2);
    742         _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((7) * dst_strd)), temp2);
    743 
    744         _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), temp1);
    745         _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), temp1);
    746         _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), temp1);
    747         _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), temp1);
    748         _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), temp1);
    749         _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), temp1);
    750         _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), temp1);
    751         _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), temp1);
    752 
    753         _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((8) * dst_strd)), temp2);
    754         _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((9) * dst_strd)), temp2);
    755         _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((10) * dst_strd)), temp2);
    756         _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((11) * dst_strd)), temp2);
    757         _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((12) * dst_strd)), temp2);
    758         _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((13) * dst_strd)), temp2);
    759         _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((14) * dst_strd)), temp2);
    760         _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((15) * dst_strd)), temp2);
    761 
    762     }
    763     else
    764     {
    765         src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 0));
    766 
    767         _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
    768         _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
    769         _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
    770         _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
    771 
    772 
    773     }
    774 
    775 }
    776 
    777 /**
    778 *******************************************************************************
    779 *
    780 * @brief
     781 *  Intra prediction for mode 2 (south-west angle) using the neighboring
     782 * reference samples pointed to by 'pu1_ref' to predict the TU block pointed
     783 * to by 'pu1_dst'. Refer to section 8.4.4.2.6 of the standard.
    784 *
    785 * @par Description:
    786 *
    787 *
     788 * @param[in] pu1_ref
     789 *  UWORD8 pointer to the reference samples
    790 *
    791 * @param[in] pu1_dst
    792 *  UWORD8 pointer to the destination
    793 *
    794 * @param[in] src_strd
    795 *  integer source stride
    796 *
    797 * @param[in] dst_strd
    798 *  integer destination stride
    799 *
    800 * @param[in] nt
    801 *  integer Transform Block size
    802 *
    803 * @param[in] mode
    804 *  integer intraprediction mode
    805 *
    806 * @returns
    807 *
    808 * @remarks
    809 *  None
    810 *
    811 *******************************************************************************
    812 */
    813 
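/*
 * Illustrative scalar sketch of mode 2 (45 degree south-west) as implemented
 * below: each output (U, V) pair is copied from the left reference, moving
 * one pair further down the reference for every row and every column:
 *
 *   for(row = 0; row < nt; row++)
 *       for(col = 0; col < nt; col++)
 *       {
 *           pu1_dst[row * dst_strd + 2 * col]     = pu1_ref[2 * (2 * nt - row - col - 2)];
 *           pu1_dst[row * dst_strd + 2 * col + 1] = pu1_ref[2 * (2 * nt - row - col - 2) + 1];
 *       }
 */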
    814 void ihevc_intra_pred_chroma_mode2_ssse3(UWORD8 *pu1_ref,
    815                                          WORD32 src_strd,
    816                                          UWORD8 *pu1_dst,
    817                                          WORD32 dst_strd,
    818                                          WORD32 nt,
    819                                          WORD32 mode)
    820 {
    821     WORD32 row, col;
    822 
    823 
    824     __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8, sm2, sm3;
    825     UNUSED(src_strd);
    826     UNUSED(mode);
    827 
    828     sm2 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY7[0]);
    829     sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY8[0]);
    830 
     831     /* For the 45 degree angle (mode 2), samples are copied directly along the diagonal */
     832     /* intra_pred_ang = tan(angle) in Q5 format */
    833 
    834     if(nt == 4)
    835     {
    836         /*pu1_ref[two_nt - row - (col+1) - 1]*/
    837         src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 0 - 8 - 2));
    838         src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 1 - 8 - 2));
    839         src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 2 - 8 - 2));
    840         src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 3 - 8 - 2));
    841 
    842         _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), _mm_shuffle_epi8(src_temp1, sm2));
    843         _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), _mm_shuffle_epi8(src_temp2, sm2));
    844         _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), _mm_shuffle_epi8(src_temp3, sm2));
    845         _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), _mm_shuffle_epi8(src_temp4, sm2));
    846 
    847     }
    848     else if(nt == 8)
    849     {
    850         /*pu1_ref[two_nt - row - (col+1) - 1]*/
    851         src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 0 - 16 - 2));
    852         src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 1 - 16 - 2));
    853         src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 2 - 16 - 2));
    854         src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 3 - 16 - 2));
    855         src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 4 - 16 - 2));
    856         src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 5 - 16 - 2));
    857         src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 6 - 16 - 2));
    858         src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 7 - 16 - 2));
    859 
    860         _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), _mm_shuffle_epi8(src_temp1, sm3));
    861         _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), _mm_shuffle_epi8(src_temp2, sm3));
    862         _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), _mm_shuffle_epi8(src_temp3, sm3));
    863         _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), _mm_shuffle_epi8(src_temp4, sm3));
    864         _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), _mm_shuffle_epi8(src_temp5, sm3));
    865         _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), _mm_shuffle_epi8(src_temp6, sm3));
    866         _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), _mm_shuffle_epi8(src_temp7, sm3));
    867         _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), _mm_shuffle_epi8(src_temp8, sm3));
    868 
    869 
    870     }
    871     else
    872     {
    873         for(row = 0; row < nt; row += 8)
    874         {
    875             for(col = 0; col < 2 * nt; col += 16)
    876             {   /*pu1_ref[two_nt - row - (col+1) - 1]*/
    877                 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 0) - (col + 16) - 2));
    878                 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 1) - (col + 16) - 2));
    879                 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 2) - (col + 16) - 2));
    880                 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 3) - (col + 16) - 2));
    881                 src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 4) - (col + 16) - 2));
    882                 src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 5) - (col + 16) - 2));
    883                 src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 6) - (col + 16) - 2));
    884                 src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 7) - (col + 16) - 2));
    885 
    886                 _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 0) * dst_strd)), _mm_shuffle_epi8(src_temp1, sm3));
    887                 _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 1) * dst_strd)), _mm_shuffle_epi8(src_temp2, sm3));
    888                 _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 2) * dst_strd)), _mm_shuffle_epi8(src_temp3, sm3));
    889                 _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 3) * dst_strd)), _mm_shuffle_epi8(src_temp4, sm3));
    890                 _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 4) * dst_strd)), _mm_shuffle_epi8(src_temp5, sm3));
    891                 _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 5) * dst_strd)), _mm_shuffle_epi8(src_temp6, sm3));
    892                 _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 6) * dst_strd)), _mm_shuffle_epi8(src_temp7, sm3));
    893                 _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 7) * dst_strd)), _mm_shuffle_epi8(src_temp8, sm3));
    894             }
    895         }
    896     }
    897 }
    898 
    899 /**
    900 *******************************************************************************
    901 *
    902 * @brief
     903 *  Intra prediction for mode 34 (north-east angle) and mode 18 (north-west
     904 * angle) using the neighboring reference samples pointed to by 'pu1_ref' to
     905 * predict the TU block pointed to by 'pu1_dst'.
    906 *
    907 * @par Description:
    908 *
    909 *
     910 * @param[in] pu1_ref
     911 *  UWORD8 pointer to the reference samples
    912 *
    913 * @param[in] pu1_dst
    914 *  UWORD8 pointer to the destination
    915 *
    916 * @param[in] src_strd
    917 *  integer source stride
    918 *
    919 * @param[in] dst_strd
    920 *  integer destination stride
    921 *
    922 * @param[in] nt
    923 *  integer Transform Block size
    924 *
    925 * @param[in] mode
    926 *  integer intraprediction mode
    927 *
    928 * @returns
    929 *
    930 * @remarks
    931 *  None
    932 *
    933 *******************************************************************************
    934 */
    935 
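/*
 * Illustrative scalar sketch of modes 34 and 18 as implemented below: both
 * are pure diagonal copies (intra_pred_ang = +32 / -32), so each row simply
 * shifts the top reference row by one (U, V) pair in either direction:
 *
 *   for(row = 0; row < nt; row++)
 *       for(col = 0; col < 2 * nt; col++)
 *           pu1_dst[row * dst_strd + col] =
 *               pu1_ref[4 * nt + 2 + col + ((mode == 34) ? 2 : -2) * (row + 1)];
 */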
    936 void ihevc_intra_pred_chroma_mode_18_34_ssse3(UWORD8 *pu1_ref,
    937                                               WORD32 src_strd,
    938                                               UWORD8 *pu1_dst,
    939                                               WORD32 dst_strd,
    940                                               WORD32 nt,
    941                                               WORD32 mode)
    942 {
    943     WORD32 row;
    944     WORD32 idx = 0;
    945 
    946     __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8;
    947     UNUSED(src_strd);
    948 
    949     if(mode == 34)
    950     {
    951         if(nt == 4)
    952         {
    953             /*pu1_ref[two_nt + col + idx + 1]*/
    954             src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) + (4 * nt) + 2 * idx + 2));
    955             src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) + (4 * nt) + 2 * idx + 2));
    956             src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) + (4 * nt) + 2 * idx + 2));
    957             src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) + (4 * nt) + 2 * idx + 2));
    958 
    959             _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
    960             _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
    961             _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
    962             _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);
    963 
    964         }
    965         else if(nt == 8)
    966         {
    967             /*pu1_ref[two_nt + col + idx + 1]*/
    968             src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) + (4 * nt) + 2 * idx + 2));
    969             src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) + (4 * nt) + 2 * idx + 2));
    970             src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) + (4 * nt) + 2 * idx + 2));
    971             src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) + (4 * nt) + 2 * idx + 2));
    972             src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (4 + 1) + (4 * nt) + 2 * idx + 2));
    973             src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (5 + 1) + (4 * nt) + 2 * idx + 2));
    974             src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (6 + 1) + (4 * nt) + 2 * idx + 2));
    975             src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (7 + 1) + (4 * nt) + 2 * idx + 2));
    976 
    977             _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
    978             _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
    979             _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
    980             _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);
    981             _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp5);
    982             _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp6);
    983             _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp7);
    984             _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp8);
    985 
    986 
    987         }
    988         else
    989         {
    990             __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
    991             for(row = 0; row < nt; row += 8)
    992             {
    993                 /*pu1_ref[two_nt + col + idx + 1]*/
    994                 src_temp1  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) +  0 + (4 * nt) + 2 * idx + 2));
    995                 src_temp9  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) + 16 + (4 * nt) + 2 * idx + 2));
    996                 src_temp2  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) +  0 + (4 * nt) + 2 * idx + 2));
    997                 src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) + 16 + (4 * nt) + 2 * idx + 2));
    998                 src_temp3  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) +  0 + (4 * nt) + 2 * idx + 2));
    999                 src_temp11 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) + 16 + (4 * nt) + 2 * idx + 2));
   1000                 src_temp4  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) +  0 + (4 * nt) + 2 * idx + 2));
   1001                 src_temp12 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) + 16 + (4 * nt) + 2 * idx + 2));
   1002 
   1003                 _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (0 * dst_strd)), src_temp1);
   1004                 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp9);
   1005                 _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (1 * dst_strd)), src_temp2);
   1006                 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp10);
   1007                 _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (2 * dst_strd)), src_temp3);
   1008                 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp11);
   1009                 _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (3 * dst_strd)), src_temp4);
   1010                 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp12);
   1011 
   1012                 src_temp5  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (4 + 1) +  0 + (4 * nt) + 2 * idx + 2));
   1013                 src_temp13 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (4 + 1) + 16 + (4 * nt) + 2 * idx + 2));
   1014                 src_temp6  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (5 + 1) +  0 + (4 * nt) + 2 * idx + 2));
   1015                 src_temp14 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (5 + 1) + 16 + (4 * nt) + 2 * idx + 2));
   1016                 src_temp7  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (6 + 1) +  0 + (4 * nt) + 2 * idx + 2));
   1017                 src_temp15 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (6 + 1) + 16 + (4 * nt) + 2 * idx + 2));
   1018                 src_temp8  = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (7 + 1) +  0 + (4 * nt) + 2 * idx + 2));
   1019                 src_temp16 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (7 + 1) + 16 + (4 * nt) + 2 * idx + 2));
   1020 
   1021                 _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (4 * dst_strd)), src_temp5);
   1022                 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp13);
   1023                 _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (5 * dst_strd)), src_temp6);
   1024                 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp14);
   1025                 _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (6 * dst_strd)), src_temp7);
   1026                 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp15);
   1027                 _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (7 * dst_strd)), src_temp8);
   1028                 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp16);
   1029 
   1030                 pu1_ref += 2 * 8;
   1031                 pu1_dst += 8 * dst_strd;
   1032             }
   1033         }
   1034     }
   1035     else
   1036     {
   1037         if(nt == 4)
   1038         {
   1039             /*pu1_ref[two_nt + col + idx + 1]*/
   1040             src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) + (4 * nt) + 2 * idx + 2));
   1041             src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) + (4 * nt) + 2 * idx + 2));
   1042             src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) + (4 * nt) + 2 * idx + 2));
   1043             src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) + (4 * nt) + 2 * idx + 2));
   1044 
   1045             _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
   1046             _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
   1047             _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
   1048             _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);
   1049 
   1050 
   1051         }
   1052         else if(nt == 8)
   1053         {
   1054             /*pu1_ref[two_nt + col + idx + 1]*/
   1055             src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) + (4 * nt) + 2 * idx + 2));
   1056             src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) + (4 * nt) + 2 * idx + 2));
   1057             src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) + (4 * nt) + 2 * idx + 2));
   1058             src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) + (4 * nt) + 2 * idx + 2));
   1059             src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (4 + 1) + (4 * nt) + 2 * idx + 2));
   1060             src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (5 + 1) + (4 * nt) + 2 * idx + 2));
   1061             src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (6 + 1) + (4 * nt) + 2 * idx + 2));
   1062             src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (7 + 1) + (4 * nt) + 2 * idx + 2));
   1063 
   1064             _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
   1065             _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
   1066             _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
   1067             _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);
   1068             _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp5);
   1069             _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp6);
   1070             _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp7);
   1071             _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp8);
   1072 
   1073 
   1074         }
   1075         else
   1076         {
   1077             __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
   1078             for(row = 0; row < nt; row += 8)
   1079             {
   1080                 /*pu1_ref[two_nt + col + idx + 1]*/
   1081                 src_temp1  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) +  0 + (4 * nt) + 2 * idx + 2));
   1082                 src_temp9  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) + 16 + (4 * nt) + 2 * idx + 2));
   1083                 src_temp2  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) +  0 + (4 * nt) + 2 * idx + 2));
   1084                 src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) + 16 + (4 * nt) + 2 * idx + 2));
   1085                 src_temp3  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) +  0 + (4 * nt) + 2 * idx + 2));
   1086                 src_temp11 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) + 16 + (4 * nt) + 2 * idx + 2));
   1087                 src_temp4  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) +  0 + (4 * nt) + 2 * idx + 2));
   1088                 src_temp12 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) + 16 + (4 * nt) + 2 * idx + 2));
   1089 
   1090                 _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (0 * dst_strd)), src_temp1);
   1091                 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp9);
   1092                 _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (1 * dst_strd)), src_temp2);
   1093                 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp10);
   1094                 _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (2 * dst_strd)), src_temp3);
   1095                 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp11);
   1096                 _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (3 * dst_strd)), src_temp4);
   1097                 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp12);
   1098 
   1099                 src_temp5  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (4 + 1) +  0 + (4 * nt) + 2 * idx + 2));
   1100                 src_temp13 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (4 + 1) + 16 + (4 * nt) + 2 * idx + 2));
   1101                 src_temp6  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (5 + 1) +  0 + (4 * nt) + 2 * idx + 2));
   1102                 src_temp14 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (5 + 1) + 16 + (4 * nt) + 2 * idx + 2));
   1103                 src_temp7  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (6 + 1) +  0 + (4 * nt) + 2 * idx + 2));
   1104                 src_temp15 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (6 + 1) + 16 + (4 * nt) + 2 * idx + 2));
   1105                 src_temp8  = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (7 + 1) +  0 + (4 * nt) + 2 * idx + 2));
   1106                 src_temp16 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (7 + 1) + 16 + (4 * nt) + 2 * idx + 2));
   1107 
   1108                 _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (4 * dst_strd)), src_temp5);
   1109                 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp13);
   1110                 _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (5 * dst_strd)), src_temp6);
   1111                 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp14);
   1112                 _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (6 * dst_strd)), src_temp7);
   1113                 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp15);
   1114                 _mm_storeu_si128((__m128i *)(pu1_dst +  0 + (7 * dst_strd)), src_temp8);
   1115                 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp16);
   1116 
   1117                 pu1_ref -= 2 * 8;
   1118                 pu1_dst += 8 * dst_strd;
   1119             }
   1120         }
   1121     }
   1122 
   1123 }
   1124 
   1125 /**
   1126 *******************************************************************************
   1127 *
   1128 * @brief
    1129 *  Intra prediction for modes 3 to 9 (positive angle, horizontal modes) using
    1130 * the neighboring reference samples pointed to by 'pu1_ref' to predict the
    1131 * TU block pointed to by 'pu1_dst'.
   1132 *
   1133 * @par Description:
   1134 *
   1135 *
    1136 * @param[in] pu1_ref
    1137 *  UWORD8 pointer to the reference samples
   1138 *
   1139 * @param[in] pu1_dst
   1140 *  UWORD8 pointer to the destination
   1141 *
   1142 * @param[in] src_strd
   1143 *  integer source stride
   1144 *
   1145 * @param[in] dst_strd
   1146 *  integer destination stride
   1147 *
   1148 * @param[in] nt
   1149 *  integer Transform Block size
   1150 *
   1151 * @param[in] mode
   1152 *  integer intraprediction mode
   1153 *
   1154 * @returns
   1155 *
   1156 * @remarks
   1157 *  None
   1158 *
   1159 *******************************************************************************
   1160 */
   1161 
   1162 void ihevc_intra_pred_chroma_mode_3_to_9_ssse3(UWORD8 *pu1_ref,
   1163                                                WORD32 src_strd,
   1164                                                UWORD8 *pu1_dst,
   1165                                                WORD32 dst_strd,
   1166                                                WORD32 nt,
   1167                                                WORD32 mode)
   1168 {
   1169     WORD32 row, col;
   1170 
   1171     WORD32 intra_pred_ang;
   1172 
   1173     __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
   1174     __m128i fract_4x32b, zero_8x16b, intra_pred_ang_4x32b;
   1175     __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b, sm1;
   1176     UNUSED(src_strd);
   1177 
   1178     /* Intra Pred Angle according to the mode */
   1179     intra_pred_ang = gai4_ihevc_ang_table[mode];
   1180 
    1181     /* For angles other than 45 degrees, interpolate between the 2 neighboring */
    1182     /* samples, weighted by the fractional distance, to obtain each destination sample */
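    /* Illustrative restatement (not part of the original code) of what the SSSE3
     * kernels below compute per destination sample:
     *   pos          = (row + 1) * intra_pred_ang;   -- position in 1/32 sample units
     *   fract        = pos & 31;                     -- 5-bit fractional part
     *   ref_main_idx = byte offset of the projected Cb/Cr pair (2 bytes per pair)
     *   pred         = ((32 - fract) * pu1_ref[ref_main_idx]
     *                   + fract * pu1_ref[ref_main_idx + 2] + 16) >> 5;
     * evaluated separately for the interleaved Cb and Cr bytes; the '+ 1' of the
     * copied luma comments corresponds to '+ 2' bytes here because of the
     * interleaving. */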
   1183 
   1184     sm1 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY7[0]);
   1185     const_temp_4x32b  = _mm_set1_epi16(16);
   1186     const_temp2_4x32b = _mm_set1_epi32(31);
   1187     const_temp3_4x32b = _mm_set1_epi16(32);
   1188     const_temp4_4x32b = _mm_set1_epi32(4);
   1189 
   1190     two_nt_4x32b = _mm_set1_epi32(1);
   1191 
   1192     zero_8x16b = _mm_set1_epi16(0);
   1193 
   1194 
   1195     /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
   1196     intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
   1197 
   1198     row_4x32b = _mm_set_epi32(4, 3, 2, 1);
   1199 
   1200     if(nt == 4)
   1201     {
   1202         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   1203         row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
   1204         const_temp2_4x32b = _mm_set1_epi16(31);
   1205         const_temp4_4x32b = _mm_set1_epi16(4);
   1206         two_nt_4x32b = _mm_set1_epi16((4 * nt) - 2);
   1207 
   1208         {
   1209             WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
   1210             WORD8  ai1_fract_temp_val[16], ai1_src_temp_val[16];
   1211 
   1212             __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
   1213             __m128i src_values10;
   1214 
   1215             __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
   1216 
   1217             /* pos = ((row + 1) * intra_pred_ang); */
   1218             res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   1219 
   1220             /* fract = pos & (31); */
   1221             fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   1222 
   1223             ref_main_idx_4x32b = _mm_srai_epi16(res_temp5_4x32b,  5);
   1224 
   1225             ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b,  ref_main_idx_4x32b);
   1226 
   1227             ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, ref_main_idx_4x32b);
   1228 
   1229             row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
   1230 
   1231             /*(32 - fract) */
   1232             src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
   1233 
   1234             _mm_storel_epi64((__m128i *)(ai1_fract_temp_val), fract_4x32b);
   1235             _mm_storel_epi64((__m128i *)(ai1_src_temp_val),  src_values10);
   1236 
   1237             fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]);  /* col=0*/
   1238             fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]);  /* col=1*/
   1239             fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]);  /* col=2*/
   1240             fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]);  /* col=3*/
   1241 
   1242             temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]);  /* col=0*/
   1243             temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]);  /* col=1*/
   1244             temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]);  /* col=2*/
   1245             temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]);  /* col=3*/
   1246 
   1247             temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
   1248             temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
   1249             temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
   1250             temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
   1251 
   1252             pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
   1253             pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
   1254             pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
   1255             pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
   1256 
   1257             {
   1258                 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
   1259                 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
   1260 
    1261                 /* loading 16 8-bit pixels */
   1262                 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 8)); /* col=0*/
   1263                 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 8)); /* col=1*/
   1264                 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 8)); /* col=2*/
   1265                 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 8)); /* col=3*/
   1266 
   1267                 src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/
   1268                 src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/
   1269                 src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/
   1270                 src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/
   1271 
   1272                 src_temp1_8x16b =  _mm_unpacklo_epi8(src_temp1_8x16b, src_temp5_8x16b); /* col=0*/
   1273                 src_temp2_8x16b =  _mm_unpacklo_epi8(src_temp2_8x16b, src_temp6_8x16b); /* col=1*/
   1274                 src_temp3_8x16b =  _mm_unpacklo_epi8(src_temp3_8x16b, src_temp7_8x16b); /* col=2*/
   1275                 src_temp4_8x16b =  _mm_unpacklo_epi8(src_temp4_8x16b, src_temp8_8x16b); /* col=3*/
   1276 
   1277                 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   1278                 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
   1279                 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
   1280                 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
   1281                 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
   1282 
   1283                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   1284                 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
   1285                 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
   1286                 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
   1287                 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
   1288 
   1289                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   1290                 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
   1291                 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
   1292                 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
   1293                 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
   1294 
   1295                 /* converting 16 bit to 8 bit */
   1296                 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/
   1297                 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/
   1298                 src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/
   1299                 src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/
   1300 
   1301                 src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm1);
   1302                 src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm1);
   1303                 src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm1);
   1304                 src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm1);
   1305 
   1306                 src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
   1307                 src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);
   1308 
   1309                 src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b);
   1310                 src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b);
   1311 
   1312                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 0)), src_temp8_8x16b);             /* row=0*/
   1313 
   1314                 src_temp2_8x16b  = _mm_shuffle_epi32(src_temp8_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
   1315                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (1))), src_temp2_8x16b);       /* row=1*/
   1316 
   1317                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (2))), src_temp7_8x16b);       /* row=2*/
   1318 
   1319                 src_temp4_8x16b  = _mm_shuffle_epi32(src_temp7_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
    1320                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (3))), src_temp4_8x16b);       /* row=3*/
   1321 
   1322             }
   1323         }
   1324     }
   1325     else
   1326     {
   1327         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   1328         row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
   1329         const_temp2_4x32b = _mm_set1_epi16(31);
   1330         const_temp4_4x32b = _mm_set1_epi16(8);
   1331         two_nt_4x32b = _mm_set1_epi16((4 * nt) - 2);
   1332 
   1333         for(col = 0; col < 2 * nt; col += 16)
   1334         {
   1335             WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
   1336             WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
   1337             WORD8  ai1_fract_temp_val[16], ai1_src_temp_val[16];
   1338 
   1339             __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
   1340             __m128i fract5_8x16b, fract6_8x16b, fract7_8x16b, fract8_8x16b, src_values10;
   1341 
   1342             __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
   1343             __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
   1344 
   1345             /* pos = ((row + 1) * intra_pred_ang); */
   1346             res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   1347 
   1348             /* fract = pos & (31); */
   1349             fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   1350 
   1351             ref_main_idx_4x32b = _mm_srai_epi16(res_temp5_4x32b,  5);
   1352 
   1353             ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b,  ref_main_idx_4x32b);
   1354 
   1355             ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, ref_main_idx_4x32b);
   1356 
   1357             row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
   1358 
   1359             /*(32 - fract) */
   1360             src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
   1361 
   1362             _mm_storeu_si128((__m128i *)(ai1_fract_temp_val), fract_4x32b);
   1363             _mm_storeu_si128((__m128i *)(ai1_src_temp_val),  src_values10);
   1364 
   1365             fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]);  /* col=0*/
   1366             fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]);  /* col=1*/
   1367             fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]);  /* col=2*/
   1368             fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]);  /* col=3*/
   1369 
   1370             temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]);  /* col=0*/
   1371             temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]);  /* col=1*/
   1372             temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]);  /* col=2*/
   1373             temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]);  /* col=3*/
   1374 
   1375             temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
   1376             temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
   1377             temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
   1378             temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
   1379 
   1380             pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
   1381             pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
   1382             pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
   1383             pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
   1384 
    1385             fract5_8x16b = _mm_set1_epi8(ai1_fract_temp_val[8]);  /* col=4*/
    1386             fract6_8x16b = _mm_set1_epi8(ai1_fract_temp_val[10]);  /* col=5*/
    1387             fract7_8x16b = _mm_set1_epi8(ai1_fract_temp_val[12]);  /* col=6*/
    1388             fract8_8x16b = _mm_set1_epi8(ai1_fract_temp_val[14]);  /* col=7*/
    1389 
    1390             temp11_8x16b = _mm_set1_epi8(ai1_src_temp_val[8]);  /* col=4*/
    1391             temp12_8x16b = _mm_set1_epi8(ai1_src_temp_val[10]);  /* col=5*/
    1392             temp13_8x16b = _mm_set1_epi8(ai1_src_temp_val[12]);  /* col=6*/
    1393             temp14_8x16b = _mm_set1_epi8(ai1_src_temp_val[14]);  /* col=7*/
    1394 
    1395             temp11_8x16b = _mm_unpacklo_epi8(temp11_8x16b, fract5_8x16b);
    1396             temp12_8x16b = _mm_unpacklo_epi8(temp12_8x16b, fract6_8x16b);
    1397             temp13_8x16b = _mm_unpacklo_epi8(temp13_8x16b, fract7_8x16b);
    1398             temp14_8x16b = _mm_unpacklo_epi8(temp14_8x16b, fract8_8x16b);
    1399 
    1400             pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=4*/
    1401             pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=5*/
    1402             pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=6*/
    1403             pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=7*/
   1404 
   1405             for(row = 0; row < nt; row += 4)
   1406             {
   1407                 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
   1408                 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
   1409 
   1410                 __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
   1411                 __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
   1412 
    1413                 /* loading 16 8-bit pixels */
   1414                 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - row - (8 + row))); /* col=0*/
   1415                 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - row - (8 + row))); /* col=1*/
   1416                 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - row - (8 + row))); /* col=2*/
   1417                 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - row - (8 + row))); /* col=3*/
   1418 
   1419                 src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/
   1420                 src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/
   1421                 src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/
   1422                 src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/
   1423 
   1424                 src_temp1_8x16b =  _mm_unpacklo_epi8(src_temp1_8x16b, src_temp5_8x16b); /* col=0*/
   1425                 src_temp2_8x16b =  _mm_unpacklo_epi8(src_temp2_8x16b, src_temp6_8x16b); /* col=1*/
   1426                 src_temp3_8x16b =  _mm_unpacklo_epi8(src_temp3_8x16b, src_temp7_8x16b); /* col=2*/
   1427                 src_temp4_8x16b =  _mm_unpacklo_epi8(src_temp4_8x16b, src_temp8_8x16b); /* col=3*/
   1428 
    1429                 /* loading 16 8-bit pixels */
    1430                 src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - row - row - 8)); /* col=4*/
    1431                 src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - row - row - 8)); /* col=5*/
    1432                 src_temp17_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - row - row - 8)); /* col=6*/
    1433                 src_temp18_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - row - row - 8)); /* col=7*/
    1434 
    1435                 src_temp11_8x16b = _mm_srli_si128(src_temp15_8x16b, 2); /* col=4*/
    1436                 src_temp12_8x16b = _mm_srli_si128(src_temp16_8x16b, 2); /* col=5*/
    1437                 src_temp13_8x16b = _mm_srli_si128(src_temp17_8x16b, 2); /* col=6*/
    1438                 src_temp14_8x16b = _mm_srli_si128(src_temp18_8x16b, 2); /* col=7*/
    1439 
    1440                 src_temp11_8x16b =  _mm_unpacklo_epi8(src_temp11_8x16b, src_temp15_8x16b); /* col=4*/
    1441                 src_temp12_8x16b =  _mm_unpacklo_epi8(src_temp12_8x16b, src_temp16_8x16b); /* col=5*/
    1442                 src_temp13_8x16b =  _mm_unpacklo_epi8(src_temp13_8x16b, src_temp17_8x16b); /* col=6*/
    1443                 src_temp14_8x16b =  _mm_unpacklo_epi8(src_temp14_8x16b, src_temp18_8x16b); /* col=7*/
   1444 
   1445                 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   1446                 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
   1447                 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
   1448                 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
   1449                 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
   1450 
   1451                 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   1452                 src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
   1453                 src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
   1454                 src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
   1455                 src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
   1456 
   1457                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   1458                 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
   1459                 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
   1460                 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
   1461                 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
   1462 
   1463                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   1464                 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
   1465                 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
   1466                 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
   1467                 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
   1468 
   1469                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   1470                 src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
   1471                 src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
   1472                 src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
   1473                 src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
   1474 
   1475                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
    1476                 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=4*/
    1477                 src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=5*/
    1478                 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=6*/
    1479                 src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=7*/
   1480 
   1481                 /* converting 16 bit to 8 bit */
   1482                 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/
   1483                 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/
   1484                 src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/
   1485                 src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/
   1486 
   1487                 src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm1);
   1488                 src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm1);
   1489                 src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm1);
   1490                 src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm1);
   1491 
   1492                 /* converting 16 bit to 8 bit */
    1493                 src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, zero_8x16b); /* col=4*/
    1494                 src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, zero_8x16b); /* col=5*/
    1495                 src_temp13_8x16b = _mm_packus_epi16(src_temp13_8x16b, zero_8x16b); /* col=6*/
    1496                 src_temp14_8x16b = _mm_packus_epi16(src_temp14_8x16b, zero_8x16b); /* col=7*/
   1497 
   1498                 src_temp11_8x16b = _mm_shuffle_epi8(src_temp11_8x16b, sm1);
   1499                 src_temp12_8x16b = _mm_shuffle_epi8(src_temp12_8x16b, sm1);
   1500                 src_temp13_8x16b = _mm_shuffle_epi8(src_temp13_8x16b, sm1);
   1501                 src_temp14_8x16b = _mm_shuffle_epi8(src_temp14_8x16b, sm1);
   1502 
   1503                 src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
   1504                 src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);
   1505 
   1506                 src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b);
   1507                 src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b);
   1508 
   1509                 src_temp15_8x16b = _mm_unpacklo_epi16(src_temp11_8x16b, src_temp12_8x16b);
   1510                 src_temp16_8x16b = _mm_unpacklo_epi16(src_temp13_8x16b, src_temp14_8x16b);
   1511 
   1512                 src_temp18_8x16b = _mm_unpacklo_epi32(src_temp15_8x16b, src_temp16_8x16b);
   1513                 src_temp17_8x16b = _mm_unpackhi_epi32(src_temp15_8x16b, src_temp16_8x16b);
   1514 
   1515                 src_temp11_8x16b = _mm_unpacklo_epi64(src_temp8_8x16b, src_temp18_8x16b);
   1516                 src_temp12_8x16b = _mm_unpackhi_epi64(src_temp8_8x16b, src_temp18_8x16b);
   1517                 src_temp13_8x16b = _mm_unpacklo_epi64(src_temp7_8x16b, src_temp17_8x16b);
   1518                 src_temp14_8x16b = _mm_unpackhi_epi64(src_temp7_8x16b, src_temp17_8x16b);
   1519 
   1520                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * row)),    src_temp11_8x16b);          /* row=0*/
   1521                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp12_8x16b);       /* row=1*/
   1522                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp13_8x16b);       /* row=2*/
    1523                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp14_8x16b);       /* row=3*/
   1524 
   1525             }
   1526         }
   1527     }
   1528 }
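
/* The helper below is an illustrative sketch only: it is not part of the
 * original implementation and is not called anywhere in the codec.  It spells
 * out, in scalar C, the 2-tap blend that the SSSE3 kernels in this file
 * vectorise for interleaved Cb/Cr samples.  'pu1_ref_pair' is assumed to point
 * at the reference pair selected by the integer part of the projected
 * position, and 'fract' is the 5-bit fractional part (pos & 31). */
static void ihevc_chroma_two_tap_blend_sketch(const UWORD8 *pu1_ref_pair,
                                              WORD32 fract,
                                              UWORD8 *pu1_out)
{
    WORD32 c;

    for(c = 0; c < 2; c++) /* c = 0 -> Cb byte, c = 1 -> Cr byte */
    {
        /* ((32 - fract) * ref[idx] + fract * ref[idx + 1] + 16) >> 5; the next
         * sample of the same plane sits 2 bytes away because Cb and Cr interleave */
        pu1_out[c] = (UWORD8)(((32 - fract) * pu1_ref_pair[c]
                               + fract * pu1_ref_pair[c + 2] + 16) >> 5);
    }
}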
   1529 
   1530 /**
   1531 *******************************************************************************
   1532 *
    1533 *  Intra prediction for modes 11 to 17 (negative angle, horizontal modes),
    1534 * using the reference neighboring samples pointed to by 'pu1_ref' to predict
    1535 * the TU block pointed to by 'pu1_dst'
   1536 * TU block location pointed by 'pu1_dst'
   1537 *
   1538 * @par Description:
   1539 *
   1540 *
    1541 * @param[in] pu1_ref
    1542 *  UWORD8 pointer to the reference (neighboring) samples
   1543 *
   1544 * @param[in] pu1_dst
   1545 *  UWORD8 pointer to the destination
   1546 *
   1547 * @param[in] src_strd
   1548 *  integer source stride
   1549 *
   1550 * @param[in] dst_strd
   1551 *  integer destination stride
   1552 *
   1553 * @param[in] nt
   1554 *  integer Transform Block size
   1555 *
   1556 * @param[in] mode
   1557 *  integer intraprediction mode
   1558 *
   1559 * @returns
   1560 *
   1561 * @remarks
   1562 *  None
   1563 *
   1564 *******************************************************************************
   1565 */
   1566 
   1567 
   1568 void ihevc_intra_pred_chroma_mode_11_to_17_ssse3(UWORD8 *pu1_ref,
   1569                                                  WORD32 src_strd,
   1570                                                  UWORD8 *pu1_dst,
   1571                                                  WORD32 dst_strd,
   1572                                                  WORD32 nt,
   1573                                                  WORD32 mode)
   1574 {
    1575     /* This function and ihevc_intra_pred_chroma_mode_19_to_25 are the same except */
    1576     /* for the ref main & side sample assignment; they can be combined for */
    1577     /* optimization */
   1578 
   1579     WORD32 row, col, k;
   1580     WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
   1581     WORD32 ref_idx;
   1582 
   1583 
   1584     __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
   1585     __m128i fract_4x32b, zero_8x16b, intra_pred_ang_4x32b;
   1586     __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b;
   1587 
   1588     UWORD8 ref_temp[2 * MAX_CU_SIZE + 2];
   1589     UWORD8 *ref_main;
   1590     UNUSED(src_strd);
   1591 
   1592     inv_ang_sum = 128;
   1593 
   1594     intra_pred_ang = gai4_ihevc_ang_table[mode];
   1595 
   1596     inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
   1597     /* Intermediate reference samples for negative angle modes */
    1598     /* This has to be removed during optimization */
   1599 
   1600     /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
   1601 
   1602 
   1603     ref_main = ref_temp + 2 * nt;
   1604     for(k = 0; k < (2 * (nt + 1)); k += 2)
   1605     {
   1606         ref_temp[k + (2 * (nt - 1))] = pu1_ref[(4 * nt) - k];
   1607         ref_temp[k + 1 + (2 * (nt - 1))] = pu1_ref[(4 * nt) - k + 1];
   1608     }
   1609 
   1610     ref_main = ref_temp + (2 * (nt - 1));
   1611     ref_idx = (nt * intra_pred_ang) >> 5;
   1612 
    1613     /* SIMD optimization can be done using a look-up table for the loop */
    1614     /* For negative angles, derive the main reference samples from the side */
    1615     /* reference samples; refer to section 8.4.4.2.6 */
   1616 
   1617     for(k = -2; k > (2 * ref_idx); k -= 2)
   1618     {
   1619         inv_ang_sum += inv_ang;
   1620         ref_main[k] = pu1_ref[(4 * nt) + ((inv_ang_sum >> 8) << 1)];
   1621         ref_main[k + 1] = pu1_ref[((4 * nt) + 1) + ((inv_ang_sum >> 8) << 1)];
   1622     }
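    /* Scalar relation implemented by the loop above (illustrative restatement):
     * for the x-th extended pair (k = -2 * x, x = 1, 2, ...), the source pair is
     * taken ((x * inv_ang + 128) >> 8) pairs into the side (top) reference,
     * starting from pu1_ref[4 * nt], i.e. a rounded projection through the
     * inverse angle of section 8.4.4.2.6. */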
   1623 
    1624     /* For angles other than 45 degrees, interpolate between the 2 neighboring */
    1625     /* samples, weighted by the fractional distance, to obtain each destination sample */
   1626 
   1627     const_temp_4x32b  = _mm_set1_epi16(16);
   1628     const_temp2_4x32b = _mm_set1_epi32(31);
   1629     const_temp3_4x32b = _mm_set1_epi16(32);
   1630     const_temp4_4x32b = _mm_set1_epi32(4);
   1631 
   1632     two_nt_4x32b = _mm_set1_epi32(1);
   1633 
   1634     zero_8x16b = _mm_set1_epi16(0);
   1635 
   1636 
   1637     /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
   1638     intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);
   1639 
   1640     row_4x32b = _mm_set_epi32(4, 3, 2, 1);
   1641 
   1642     if(nt == 4)
   1643     {
   1644         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   1645         row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
   1646         const_temp2_4x32b = _mm_set1_epi16(31);
   1647         const_temp4_4x32b = _mm_set1_epi16(4);
   1648         two_nt_4x32b = _mm_set1_epi16(1);
   1649 
   1650         {
   1651             WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
   1652             WORD8  ai1_fract_temp_val[16], ai1_src_temp_val[16];
   1653 
   1654             __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
   1655             __m128i src_values10;
   1656 
   1657             __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
   1658 
   1659             /* pos = ((row + 1) * intra_pred_ang); */
   1660             res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   1661 
   1662             /* fract = pos & (31); */
   1663             fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   1664 
   1665             ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
   1666             ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b, ref_main_idx_4x32b);
   1667 
   1668             row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
   1669 
   1670             /*(32 - fract) */
   1671             src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
   1672 
   1673             _mm_storel_epi64((__m128i *)(ai1_fract_temp_val), fract_4x32b);
   1674             _mm_storel_epi64((__m128i *)(ai1_src_temp_val),  src_values10);
   1675 
   1676             fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]);  /* col=0*/
   1677             fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]);  /* col=1*/
   1678             fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]);  /* col=2*/
   1679             fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]);  /* col=3*/
   1680 
   1681             temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]);  /* col=0*/
   1682             temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]);  /* col=1*/
   1683             temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]);  /* col=2*/
   1684             temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]);  /* col=3*/
   1685 
   1686             temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
   1687             temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
   1688             temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
   1689             temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
   1690 
   1691             pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
   1692             pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
   1693             pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
   1694             pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
   1695 
   1696             {
   1697                 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
   1698                 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
   1699 
    1700                 /* loading 16 8-bit pixels */
   1701                 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1)); /* col=0*/
   1702                 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2)); /* col=1*/
   1703                 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3)); /* col=2*/
   1704                 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4)); /* col=3*/
   1705 
   1706                 src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/
   1707                 src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/
   1708                 src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/
   1709                 src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/
   1710 
   1711                 src_temp1_8x16b =  _mm_unpacklo_epi8(src_temp5_8x16b, src_temp1_8x16b); /* col=0*/
   1712                 src_temp2_8x16b =  _mm_unpacklo_epi8(src_temp6_8x16b, src_temp2_8x16b); /* col=1*/
   1713                 src_temp3_8x16b =  _mm_unpacklo_epi8(src_temp7_8x16b, src_temp3_8x16b); /* col=2*/
   1714                 src_temp4_8x16b =  _mm_unpacklo_epi8(src_temp8_8x16b, src_temp4_8x16b); /* col=3*/
   1715 
   1716                 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   1717                 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
   1718                 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
   1719                 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
   1720                 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
   1721 
   1722                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   1723                 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
   1724                 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
   1725                 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
   1726                 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
   1727 
   1728                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   1729                 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
   1730                 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
   1731                 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
   1732                 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
   1733 
   1734                 /* converting 16 bit to 8 bit */
   1735                 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/
   1736                 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/
   1737                 src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/
   1738                 src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/
   1739 
   1740                 src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
   1741                 src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);
   1742 
   1743                 src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b);
   1744                 src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b);
   1745 
   1746                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 0)), src_temp8_8x16b);             /* row=0*/
   1747 
   1748                 src_temp2_8x16b  = _mm_shuffle_epi32(src_temp8_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
   1749                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (1))), src_temp2_8x16b);       /* row=1*/
   1750 
   1751                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (2))), src_temp7_8x16b);       /* row=2*/
   1752 
   1753                 src_temp4_8x16b  = _mm_shuffle_epi32(src_temp7_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
    1754                 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (3))), src_temp4_8x16b);       /* row=3*/
   1755 
   1756             }
   1757         }
   1758     }
   1759     else
   1760     {
   1761         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   1762         row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
   1763         const_temp2_4x32b = _mm_set1_epi16(31);
   1764         const_temp4_4x32b = _mm_set1_epi16(8);
   1765         two_nt_4x32b = _mm_set1_epi16(1);
   1766 
   1767         for(col = 0; col < 2 * nt; col += 16)
   1768         {
   1769             WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
   1770             WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
   1771             WORD8  ai1_fract_temp_val[16], ai1_src_temp_val[16];
   1772 
   1773             __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
   1774             __m128i fract5_8x16b, fract6_8x16b, fract7_8x16b, fract8_8x16b, src_values10;
   1775 
   1776             __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
   1777             __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;
   1778 
   1779             /* pos = ((row + 1) * intra_pred_ang); */
   1780             res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   1781 
   1782             /* fract = pos & (31); */
   1783             fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   1784 
   1785             ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b,  5));
   1786             ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b, ref_main_idx_4x32b);
   1787 
   1788             row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);
   1789 
   1790             /*(32 - fract) */
   1791             src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);
   1792 
   1793             _mm_storeu_si128((__m128i *)(ai1_fract_temp_val), fract_4x32b);
   1794             _mm_storeu_si128((__m128i *)(ai1_src_temp_val),  src_values10);
   1795 
   1796             fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]);  /* col=0*/
   1797             fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]);  /* col=1*/
   1798             fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]);  /* col=2*/
   1799             fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]);  /* col=3*/
   1800 
   1801             temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]);  /* col=0*/
   1802             temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]);  /* col=1*/
   1803             temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]);  /* col=2*/
   1804             temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]);  /* col=3*/
   1805 
   1806             temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
   1807             temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
   1808             temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
   1809             temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
   1810 
   1811             pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);    /* col=0*/
   1812             pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);    /* col=1*/
   1813             pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);    /* col=2*/
   1814             pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);    /* col=3*/
   1815 
    1816             fract5_8x16b = _mm_set1_epi8(ai1_fract_temp_val[8]);  /* col=4*/
    1817             fract6_8x16b = _mm_set1_epi8(ai1_fract_temp_val[10]);  /* col=5*/
    1818             fract7_8x16b = _mm_set1_epi8(ai1_fract_temp_val[12]);  /* col=6*/
    1819             fract8_8x16b = _mm_set1_epi8(ai1_fract_temp_val[14]);  /* col=7*/
    1820 
    1821             temp11_8x16b = _mm_set1_epi8(ai1_src_temp_val[8]);  /* col=4*/
    1822             temp12_8x16b = _mm_set1_epi8(ai1_src_temp_val[10]);  /* col=5*/
    1823             temp13_8x16b = _mm_set1_epi8(ai1_src_temp_val[12]);  /* col=6*/
    1824             temp14_8x16b = _mm_set1_epi8(ai1_src_temp_val[14]);  /* col=7*/
    1825 
    1826             temp11_8x16b = _mm_unpacklo_epi8(temp11_8x16b, fract5_8x16b);
    1827             temp12_8x16b = _mm_unpacklo_epi8(temp12_8x16b, fract6_8x16b);
    1828             temp13_8x16b = _mm_unpacklo_epi8(temp13_8x16b, fract7_8x16b);
    1829             temp14_8x16b = _mm_unpacklo_epi8(temp14_8x16b, fract8_8x16b);
    1830 
    1831             pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);    /* col=4*/
    1832             pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);    /* col=5*/
    1833             pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);    /* col=6*/
    1834             pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);    /* col=7*/
   1835 
   1836             for(row = 0; row < nt; row += 4)
   1837             {
   1838                 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
   1839                 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
   1840 
   1841                 __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
   1842                 __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;
   1843 
    1844                 /* loading 16 8-bit pixels */
   1845                 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row + row)); /* col=0*/
   1846                 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row + row)); /* col=1*/
   1847                 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row + row)); /* col=2*/
   1848                 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row + row)); /* col=3*/
   1849 
   1850                 src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/
   1851                 src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/
   1852                 src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/
   1853                 src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/
   1854 
   1855                 src_temp1_8x16b =  _mm_unpacklo_epi8(src_temp5_8x16b, src_temp1_8x16b); /* col=0*/
   1856                 src_temp2_8x16b =  _mm_unpacklo_epi8(src_temp6_8x16b, src_temp2_8x16b); /* col=1*/
   1857                 src_temp3_8x16b =  _mm_unpacklo_epi8(src_temp7_8x16b, src_temp3_8x16b); /* col=2*/
   1858                 src_temp4_8x16b =  _mm_unpacklo_epi8(src_temp8_8x16b, src_temp4_8x16b); /* col=3*/
   1859 
    1860                 /* loading 16 8-bit pixels */
    1861                 src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row + row)); /* col=4*/
    1862                 src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row + row)); /* col=5*/
    1863                 src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row + row)); /* col=6*/
    1864                 src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row + row)); /* col=7*/
    1865 
    1866                 src_temp11_8x16b = _mm_srli_si128(src_temp15_8x16b, 2); /* col=4*/
    1867                 src_temp12_8x16b = _mm_srli_si128(src_temp16_8x16b, 2); /* col=5*/
    1868                 src_temp13_8x16b = _mm_srli_si128(src_temp17_8x16b, 2); /* col=6*/
    1869                 src_temp14_8x16b = _mm_srli_si128(src_temp18_8x16b, 2); /* col=7*/
    1870 
    1871                 src_temp11_8x16b =  _mm_unpacklo_epi8(src_temp15_8x16b, src_temp11_8x16b); /* col=4*/
    1872                 src_temp12_8x16b =  _mm_unpacklo_epi8(src_temp16_8x16b, src_temp12_8x16b); /* col=5*/
    1873                 src_temp13_8x16b =  _mm_unpacklo_epi8(src_temp17_8x16b, src_temp13_8x16b); /* col=6*/
    1874                 src_temp14_8x16b =  _mm_unpacklo_epi8(src_temp18_8x16b, src_temp14_8x16b); /* col=7*/
   1875 
   1876                 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   1877                 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
   1878                 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
   1879                 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
   1880                 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);
   1881 
   1882                 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
   1883                 src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
   1884                 src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
   1885                 src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
   1886                 src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);
   1887 
   1888                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   1889                 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
   1890                 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
   1891                 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
   1892                 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);
   1893 
   1894                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   1895                 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  5);   /* col=0*/
   1896                 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b,  5);   /* col=1*/
   1897                 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  5);   /* col=2*/
   1898                 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b,  5);   /* col=3*/
   1899 
   1900                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   1901                 src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
   1902                 src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
   1903                 src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
   1904                 src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);
   1905 
   1906                 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
    1907                 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  5);   /* col=4*/
    1908                 src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b,  5);   /* col=5*/
    1909                 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  5);   /* col=6*/
    1910                 src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b,  5);   /* col=7*/
   1911 
   1912                 /* converting 16 bit to 8 bit */
   1913                 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/
   1914                 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/
   1915                 src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/
   1916                 src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/
   1917 
   1918                 /* converting 16 bit to 8 bit */
    1919                 src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, zero_8x16b); /* col=4*/
    1920                 src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, zero_8x16b); /* col=5*/
    1921                 src_temp13_8x16b = _mm_packus_epi16(src_temp13_8x16b, zero_8x16b); /* col=6*/
    1922                 src_temp14_8x16b = _mm_packus_epi16(src_temp14_8x16b, zero_8x16b); /* col=7*/
   1923 
   1924                 src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
   1925                 src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);
   1926 
   1927                 src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b);
   1928                 src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b);
   1929 
   1930                 src_temp15_8x16b = _mm_unpacklo_epi16(src_temp11_8x16b, src_temp12_8x16b);
   1931                 src_temp16_8x16b = _mm_unpacklo_epi16(src_temp13_8x16b, src_temp14_8x16b);
   1932 
   1933                 src_temp18_8x16b = _mm_unpacklo_epi32(src_temp15_8x16b, src_temp16_8x16b);
   1934                 src_temp17_8x16b = _mm_unpackhi_epi32(src_temp15_8x16b, src_temp16_8x16b);
   1935 
   1936                 src_temp11_8x16b = _mm_unpacklo_epi64(src_temp8_8x16b, src_temp18_8x16b);
   1937                 src_temp12_8x16b = _mm_unpackhi_epi64(src_temp8_8x16b, src_temp18_8x16b);
   1938                 src_temp13_8x16b = _mm_unpacklo_epi64(src_temp7_8x16b, src_temp17_8x16b);
   1939                 src_temp14_8x16b = _mm_unpackhi_epi64(src_temp7_8x16b, src_temp17_8x16b);
   1940 
   1941                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * row)),    src_temp11_8x16b);          /* row=0*/
   1942                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp12_8x16b);       /* row=1*/
   1943                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp13_8x16b);       /* row=2*/
    1944                 _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp14_8x16b);       /* row=3*/
   1945 
   1946             }
   1947         }
   1948     }
   1949 }
   1950 
   1951 /**
   1952 *******************************************************************************
   1953 *
   1954 * @brief
    1955 *  Intra prediction for modes 19 to 25 (negative angle, vertical modes), using
    1956 * the reference neighboring samples pointed to by 'pu1_ref' to predict the TU
    1957 * block pointed to by 'pu1_dst'
   1958 *
   1959 * @par Description:
   1960 *
   1961 *
    1962 * @param[in] pu1_ref
    1963 *  UWORD8 pointer to the reference (neighboring) samples
   1964 *
   1965 * @param[in] pu1_dst
   1966 *  UWORD8 pointer to the destination
   1967 *
   1968 * @param[in] src_strd
   1969 *  integer source stride
   1970 *
   1971 * @param[in] dst_strd
   1972 *  integer destination stride
   1973 *
   1974 * @param[in] nt
   1975 *  integer Transform Block size
   1976 *
   1977 * @param[in] mode
   1978 *  integer intraprediction mode
   1979 *
   1980 * @returns
   1981 *
   1982 * @remarks
   1983 *  None
   1984 *
   1985 *******************************************************************************
   1986 */
   1987 
   1988 void ihevc_intra_pred_chroma_mode_19_to_25_ssse3(UWORD8 *pu1_ref,
   1989                                                  WORD32 src_strd,
   1990                                                  UWORD8 *pu1_dst,
   1991                                                  WORD32 dst_strd,
   1992                                                  WORD32 nt,
   1993                                                  WORD32 mode)
   1994 {
   1995     WORD32 row, k;
   1996     WORD32 intra_pred_ang, idx;
   1997     WORD32 inv_ang, inv_ang_sum, pos, fract;
   1998     WORD32 ref_main_idx, ref_idx;
   1999     UWORD8 ref_temp[(2 * MAX_CU_SIZE) + 2];
   2000     UWORD8 *ref_main;
   2001 
   2002     __m128i zero_8x16b, fract_8x16b, const_temp_8x16b;
   2003     UNUSED(src_strd);
   2004 
   2005     intra_pred_ang = gai4_ihevc_ang_table_chroma[mode];
   2006     inv_ang = gai4_ihevc_inv_ang_table_chroma[mode - 12];
   2007 
   2008     /* Intermediate reference samples for negative angle modes */
    2009     /* This has to be removed during optimization */
    2010     /* For vertical modes, (ref main = ref above) (ref side = ref left) */
   2011     ref_main = ref_temp + 2 * nt;
   2012     for(k = 0; k < (2 * (nt + 1)); k += 2)
   2013     {
   2014         ref_temp[k + (2 * (nt - 1))] = pu1_ref[(4 * nt) + k];
   2015         ref_temp[k + 1 + (2 * (nt - 1))] = pu1_ref[(4 * nt) + k + 1];
   2016     }
   2017 
   2018     ref_idx = (nt * intra_pred_ang) >> 5;
   2019     inv_ang_sum = 128;
   2020     ref_main = ref_temp + (2 * (nt - 1));
    2021     /* SIMD optimization can be done using a look-up table for the loop */
    2022     /* For negative angles, derive the main reference samples from the side */
    2023     /* reference samples; refer to section 8.4.4.2.6 */
   2024     for(k = -2; k > (2 * ref_idx); k -= 2)
   2025     {
   2026         inv_ang_sum += inv_ang;
   2027         ref_main[k] = pu1_ref[(4 * nt) - (inv_ang_sum >> 8) * 2];
   2028         ref_main[k + 1] = pu1_ref[((4 * nt) + 1) - (inv_ang_sum >> 8) * 2];
   2029     }
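    /* Scalar relation implemented by the loop above (illustrative restatement):
     * for the x-th extended pair (k = -2 * x, x = 1, 2, ...), the source pair is
     * taken ((x * inv_ang + 128) >> 8) pairs into the side (left) reference,
     * moving backwards from pu1_ref[4 * nt], i.e. a rounded projection through
     * the inverse angle of section 8.4.4.2.6. */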
   2030 
   2031     const_temp_8x16b = _mm_set1_epi16(16);
   2032 
   2033     if(nt == 4) /* if nt =4*/
   2034     {
   2035         __m128i const_temp2_4x32b, const_temp3_4x32b;
   2036         __m128i src_values10, src_values11, zero_8x16b, intra_pred_ang_4x32b;
   2037         __m128i row_4x32b, two_nt_4x32b, src_values12;
   2038 
   2039 
   2040         const_temp2_4x32b = _mm_set1_epi32(31);
   2041         const_temp3_4x32b = _mm_set1_epi32(32);
   2042 
   2043         two_nt_4x32b = _mm_set1_epi32(2);
   2044 
   2045         zero_8x16b = _mm_set1_epi16(0);
   2046 
   2047         /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
   2048         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   2049 
   2050         row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1);
   2051         {
   2052             WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
   2053             WORD8  ai1_src_temp0_val[16], ai1_src_temp1_val[16];
   2054 
   2055             __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b, res_temp5_4x32b;
   2056             __m128i src_values0, src_values1, src_values2, src_values3, src_values13;
   2057             __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
   2058             __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2, sign_8x16b;
   2059 
   2060             /* pos = ((row + 1) * intra_pred_ang); */
   2061             res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   2062             sign_8x16b      = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b);
   2063             res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b);
   2064 
   2065             src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b,  5));
   2066             src_values12 = _mm_add_epi32(src_values12, _mm_srai_epi32(res_temp5_4x32b,  5));
   2067 
   2068             ref_main_temp0 = _mm_srli_si128(src_values12, 4);  /* next 32 bit values */
   2069             ref_main_temp1 = _mm_srli_si128(src_values12, 8);  /* next 32 bit values */
   2070             ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */
   2071             ref_main_idx1  = _mm_cvtsi128_si32(src_values12);    /* row=0*/
   2072             ref_main_idx2  = _mm_cvtsi128_si32(ref_main_temp0);  /* row=1*/
   2073             ref_main_idx3  = _mm_cvtsi128_si32(ref_main_temp1);  /* row=2*/
   2074             ref_main_idx4  = _mm_cvtsi128_si32(ref_main_temp2);  /* row=3*/
   2075 
   2076             /* fract = pos & (31); */
   2077             src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   2078 
   2079             /*(32 - fract) */
   2080             src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);
   2081 
   2082             _mm_storeu_si128((__m128i *)(ai1_src_temp1_val), src_values11);
   2083             _mm_storeu_si128((__m128i *)(ai1_src_temp0_val), src_values10);
   2084 
   2085             fract1_8x16b = _mm_set1_epi8(ai1_src_temp1_val[0]);  /* row=0*/
   2086             fract2_8x16b = _mm_set1_epi8(ai1_src_temp1_val[4]);  /* row=1*/
   2087             fract3_8x16b = _mm_set1_epi8(ai1_src_temp1_val[8]);  /* row=2*/
   2088             fract4_8x16b = _mm_set1_epi8(ai1_src_temp1_val[12]);  /* row=3*/
   2089 
   2090             temp1_8x16b = _mm_set1_epi8(ai1_src_temp0_val[0]);  /* row=0*/
   2091             temp2_8x16b = _mm_set1_epi8(ai1_src_temp0_val[4]);  /* row=1*/
   2092             temp3_8x16b = _mm_set1_epi8(ai1_src_temp0_val[8]);  /* row=2*/
   2093             temp4_8x16b = _mm_set1_epi8(ai1_src_temp0_val[12]);  /* row=3*/
   2094 
   2095             temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
   2096             temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
   2097             temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
   2098             temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
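                     /* interleave (32 - fract) with fract per row so that the
                        _mm_maddubs_epi16 below forms (32 - fract)*ref[x] + fract*ref[x + 2]
                        in a single step */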
   2099 
   2100 // inner loop starts from here
    2101             src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1));  /* row=0 */
    2102             src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2));  /* row=1 */
    2103             src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3));  /* row=2 */
    2104             src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4));  /* row=3 */
   2105 
   2106             src_values10 = _mm_srli_si128(src_values0, 2);
   2107             src_values11 = _mm_srli_si128(src_values1, 2);
   2108             src_values12 = _mm_srli_si128(src_values2, 2);
   2109             src_values13 = _mm_srli_si128(src_values3, 2);
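                     /* shift by 2 bytes so each Cb/Cr sample gets paired with the same
                        component of the next reference pair in the unpack below */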
   2110 
   2111             src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
   2112             src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
   2113             src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
   2114             src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
   2115 
   2116             src_values0 = _mm_maddubs_epi16(src_values0, temp1_8x16b);
   2117             src_values1 = _mm_maddubs_epi16(src_values1, temp2_8x16b);
   2118             src_values2 = _mm_maddubs_epi16(src_values2, temp3_8x16b);
   2119             src_values3 = _mm_maddubs_epi16(src_values3, temp4_8x16b);
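                     /* the weighted sums are at most 255*32 = 8160, so the signed 16-bit
                        saturation of _mm_maddubs_epi16 never triggers here */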
   2120 
   2121             /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   2122             src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
   2123             src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
   2124             src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
   2125             src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
   2126 
   2127             /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   2128             src_values0 = _mm_srai_epi16(src_values0,  5);
   2129             src_values1 = _mm_srai_epi16(src_values1,  5);
   2130             src_values2 = _mm_srai_epi16(src_values2,  5);
   2131             src_values3 = _mm_srai_epi16(src_values3,  5);
   2132 
   2133             /* converting 16 bit to 8 bit */
   2134             src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
   2135             src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
   2136             src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
   2137             src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
   2138 
   2139             _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_values0);       /* row=0*/
   2140             _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_values1);   /* row=1*/
   2141             _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_values2);   /* row=2*/
   2142             _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_values3);   /* row=3*/
   2143 
   2144         }
   2145     }
    2146     else if(nt == 8) /* for nt = 8 case */
   2147     {
   2148         WORD32 ref_main_idx1, fract1, temp, temp1;
   2149         __m128i fract1_8x16b, temp_8x16b, temp1_8x16b;
   2150 
   2151         zero_8x16b = _mm_set1_epi16(0);
   2152 
   2153         for(row = 0; row < nt; row += 2)
   2154         {
   2155             __m128i src_values0, src_values1, src_values2, src_values3;
   2156             __m128i  src_values10, src_values11, src_values12, src_values13;
   2157 
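                     /* pos is in 1/32-pel units: idx = pos >> 5 selects the reference
                        sample pair and fract = pos & 31 is the bilinear weight between
                        it and the next pair */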
   2158             pos = ((row + 1) * intra_pred_ang);
   2159             idx = pos >> 5;
   2160             fract = pos & (31);
   2161             temp = 32 - fract;
   2162             ref_main_idx = 2 * idx + 2; /* col from 0-15 */
   2163 
   2164             pos = ((row + 2) * intra_pred_ang);
   2165             idx = pos >> 5;
   2166             fract1 = pos & (31);
   2167             temp1 = 32 - fract1;
   2168             ref_main_idx1 = 2 * idx + 2; /* col from 0-15 */
   2169 
   2170             fract_8x16b  = _mm_set1_epi8(fract);
   2171             fract1_8x16b = _mm_set1_epi8(fract1);
   2172             temp_8x16b   = _mm_set1_epi8(temp);
   2173             temp1_8x16b  = _mm_set1_epi8(temp1);
   2174 
   2175             temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);
   2176             temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
   2177 
    2178             /* first row of the pair */
   2179             src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx));     /* col = 0-7   */
   2180             src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 8));   /* col = 8-15  */
   2181 
    2182             /* second row of the pair */
   2183             src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1));   /* col = 0-7  */
   2184             src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1 + 8));  /* col = 8-15 */
   2185 
   2186             src_values10 = _mm_srli_si128(src_values0, 2);
   2187             src_values11 = _mm_srli_si128(src_values1, 2);
   2188             src_values12 = _mm_srli_si128(src_values2, 2);
   2189             src_values13 = _mm_srli_si128(src_values3, 2);
   2190 
   2191             src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
   2192             src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
   2193             src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
   2194             src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
   2195 
   2196             src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
   2197             src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);
   2198 
   2199             src_values2 = _mm_maddubs_epi16(src_values2, temp1_8x16b);
   2200             src_values3 = _mm_maddubs_epi16(src_values3, temp1_8x16b);
   2201 
   2202             /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   2203             src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
   2204             src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
   2205 
   2206             src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
   2207             src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
   2208 
   2209             /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   2210             src_values0 = _mm_srai_epi16(src_values0,  5);
   2211             src_values1 = _mm_srai_epi16(src_values1,  5);
   2212 
   2213             src_values2 = _mm_srai_epi16(src_values2,  5);
   2214             src_values3 = _mm_srai_epi16(src_values3,  5);
   2215 
   2216             /* converting 16 bit to 8 bit */
   2217             src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
   2218             src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
   2219 
   2220             src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
   2221             src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
   2222 
    2223             /* storing 8-bit pixel values */
   2224             _mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
   2225             _mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);
   2226 
   2227             _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), src_values2);
   2228             _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + 8), src_values3);
   2229 
   2230             pu1_dst += 2 * dst_strd;
   2231         }
   2232     }
   2233     else if(nt == 16)
   2234     {
   2235         WORD32 temp;
   2236         /* unroll the col loop (inner) */
   2237         zero_8x16b = _mm_set1_epi16(0);
   2238 
   2239         for(row = 0; row < nt; row += 1)
   2240         {
   2241             __m128i  src_values0, src_values1, src_values2, src_values3, temp_8x16b;
   2242             __m128i  src_values10, src_values11, src_values12, src_values13;
   2243 
   2244             pos = ((row + 1) * intra_pred_ang);
   2245             idx = pos >> 5;
   2246             fract = pos & (31);
   2247             temp = 32 - fract;
   2248             ref_main_idx = 2 * idx + 2; /* col from 0-31 */
   2249 
   2250             fract_8x16b = _mm_set1_epi8(fract);
   2251             temp_8x16b  = _mm_set1_epi8(temp);
   2252 
   2253             temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);
   2254 
   2255             src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx));     /* col = 0-7   */
   2256             src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 8));   /* col = 8-15  */
   2257             src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 16));  /* col = 16-23 */
   2258             src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 24));  /* col = 24-31 */
   2259 
   2260             src_values10 = _mm_srli_si128(src_values0, 2);
   2261             src_values11 = _mm_srli_si128(src_values1, 2);
   2262             src_values12 = _mm_srli_si128(src_values2, 2);
   2263             src_values13 = _mm_srli_si128(src_values3, 2);
   2264 
   2265             src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
   2266             src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
   2267             src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
   2268             src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
   2269 
    2270             /* (32 - fract) * pu1_ref[ref_main_idx] + fract * pu1_ref[ref_main_idx + 1] */
   2271             src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
   2272             src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);
   2273             src_values2 = _mm_maddubs_epi16(src_values2, temp_8x16b);
   2274             src_values3 = _mm_maddubs_epi16(src_values3, temp_8x16b);
   2275 
   2276             /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   2277             src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
   2278             src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
   2279             src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
   2280             src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
   2281 
   2282             /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   2283             src_values0 = _mm_srai_epi16(src_values0,  5);
   2284             src_values1 = _mm_srai_epi16(src_values1,  5);
   2285             src_values2 = _mm_srai_epi16(src_values2,  5);
   2286             src_values3 = _mm_srai_epi16(src_values3,  5);
   2287 
   2288             /* converting 16 bit to 8 bit */
   2289             src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
   2290             src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
   2291             src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
   2292             src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
   2293 
    2294             /* storing 8-bit pixel values */
   2295             _mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
   2296             _mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);
   2297             _mm_storel_epi64((__m128i *)(pu1_dst + 16), src_values2);
   2298             _mm_storel_epi64((__m128i *)(pu1_dst + 24), src_values3);
   2299 
   2300             pu1_dst += dst_strd;
   2301 
   2302         }
   2303     }
   2304 }
   2305 
   2306 
   2307 /**
   2308 *******************************************************************************
   2309 *
   2310 * @brief
    2311 *  Intra prediction for modes 27 to 33 (positive angle, vertical modes) from
    2312 * the neighbouring reference samples pointed to by 'pu1_ref' to the TU block
    2313 * pointed to by 'pu1_dst'
   2314 *
   2315 * @par Description:
    2316 *  For each row, a 1/32-pel position is derived from the prediction angle; its
    2317 *  integer part selects the Cb/Cr reference pair, its fractional part the weight
    2318 * @param[in] pu1_ref
    2319 *  UWORD8 pointer to the source reference samples
   2320 *
   2321 * @param[in] pu1_dst
   2322 *  UWORD8 pointer to the destination
   2323 *
   2324 * @param[in] src_strd
   2325 *  integer source stride
   2326 *
   2327 * @param[in] dst_strd
   2328 *  integer destination stride
   2329 *
   2330 * @param[in] nt
   2331 *  integer Transform Block size
   2332 *
   2333 * @param[in] mode
   2334 *  integer intraprediction mode
   2335 *
   2336 * @returns
   2337 *
   2338 * @remarks
   2339 *  None
   2340 *
   2341 *******************************************************************************
   2342 */
   2343 
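         /* A minimal scalar sketch (kept disabled with #if 0) of the per-row
          * interpolation that the SSSE3 routine below vectorizes. It assumes the
          * interleaved Cb/Cr sample layout and the (4 * nt) + 2 offset of the top
          * reference row used by the intrinsic code; the function name is
          * illustrative only and is not part of the library API. */
         #if 0
         static void chroma_angular_scalar_sketch(UWORD8 *pu1_ref,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 dst_strd,
                                                  WORD32 nt,
                                                  WORD32 intra_pred_ang)
         {
             WORD32 row, col;
             for(row = 0; row < nt; row++)
             {
                 WORD32 pos   = (row + 1) * intra_pred_ang; /* 1/32-pel position    */
                 WORD32 idx   = pos >> 5;                   /* integer ref offset   */
                 WORD32 fract = pos & 31;                   /* interpolation weight */
                 WORD32 ref_main_idx = (4 * nt) + 2 * idx + 2;

                 for(col = 0; col < 2 * nt; col++) /* 2*nt bytes: interleaved Cb/Cr */
                 {
                     pu1_dst[row * dst_strd + col] =
                         (UWORD8)(((32 - fract) * pu1_ref[ref_main_idx + col]
                                   + fract * pu1_ref[ref_main_idx + col + 2] + 16) >> 5);
                 }
             }
         }
         #endif
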
   2344 void ihevc_intra_pred_chroma_mode_27_to_33_ssse3(UWORD8 *pu1_ref,
   2345                                                  WORD32 src_strd,
   2346                                                  UWORD8 *pu1_dst,
   2347                                                  WORD32 dst_strd,
   2348                                                  WORD32 nt,
   2349                                                  WORD32 mode)
   2350 {
   2351     WORD32 row;
   2352     WORD32 pos, fract;
   2353     WORD32 intra_pred_ang;
   2354     WORD32 idx, ref_main_idx;
   2355 
   2356     __m128i zero_8x16b, fract_8x16b, const_temp_8x16b;
   2357     UNUSED(src_strd);
   2358 
   2359     intra_pred_ang = gai4_ihevc_ang_table_chroma[mode];
   2360     const_temp_8x16b = _mm_set1_epi16(16);
   2361 
    2362     if(nt == 4) /* if nt = 4 */
   2363     {
   2364         __m128i const_temp2_4x32b, const_temp3_4x32b;
   2365         __m128i src_values10, src_values11, zero_8x16b, intra_pred_ang_4x32b;
   2366         __m128i row_4x32b, two_nt_4x32b, src_values12;
   2367 
   2368         const_temp2_4x32b = _mm_set1_epi32(31);
   2369         const_temp3_4x32b = _mm_set1_epi32(32);
   2370 
   2371         two_nt_4x32b = _mm_set1_epi32((4 * nt) + 2);
   2372 
   2373         zero_8x16b = _mm_set1_epi16(0);
   2374 
   2375         /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
   2376         intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
   2377         row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1);
   2378 
   2379         {
   2380             WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
   2381             WORD8  ai1_src_temp0_val[16], ai1_src_temp1_val[16];
   2382 
   2383             __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b, res_temp5_4x32b;
   2384             __m128i src_values0, src_values1, src_values2, src_values3, src_values13;
   2385             __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
   2386             __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2, sign_8x16b;
   2387 
   2388             /* pos = ((row + 1) * intra_pred_ang); */
   2389             res_temp5_4x32b  = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
   2390             sign_8x16b      = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b);
   2391             res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b);
   2392 
   2393             src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b,  5));
   2394             src_values12 = _mm_add_epi32(src_values12, _mm_srai_epi32(res_temp5_4x32b,  5));
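                     /* with two_nt_4x32b == (4 * nt) + 2, adding (pos >> 5) twice forms
                        (4 * nt) + 2*idx + 2, the offset of the top-row reference pair
                        for this row (positive vertical angles read above the block) */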
   2395 
   2396             ref_main_temp0 = _mm_srli_si128(src_values12, 4);  /* next 32 bit values */
   2397             ref_main_temp1 = _mm_srli_si128(src_values12, 8);  /* next 32 bit values */
   2398             ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */
   2399             ref_main_idx1  = _mm_cvtsi128_si32(src_values12);    /* row=0*/
   2400             ref_main_idx2  = _mm_cvtsi128_si32(ref_main_temp0);  /* row=1*/
   2401             ref_main_idx3  = _mm_cvtsi128_si32(ref_main_temp1);  /* row=2*/
   2402             ref_main_idx4  = _mm_cvtsi128_si32(ref_main_temp2);  /* row=3*/
   2403 
   2404             /* fract = pos & (31); */
   2405             src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);
   2406 
   2407             /*(32 - fract) */
   2408             src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);
   2409 
   2410             _mm_storeu_si128((__m128i *)(ai1_src_temp1_val), src_values11);
   2411             _mm_storeu_si128((__m128i *)(ai1_src_temp0_val), src_values10);
   2412 
   2413             fract1_8x16b = _mm_set1_epi8(ai1_src_temp1_val[0]);  /* row=0*/
   2414             fract2_8x16b = _mm_set1_epi8(ai1_src_temp1_val[4]);  /* row=1*/
   2415             fract3_8x16b = _mm_set1_epi8(ai1_src_temp1_val[8]);  /* row=2*/
   2416             fract4_8x16b = _mm_set1_epi8(ai1_src_temp1_val[12]);  /* row=3*/
   2417 
   2418             temp1_8x16b = _mm_set1_epi8(ai1_src_temp0_val[0]);  /* row=0*/
   2419             temp2_8x16b = _mm_set1_epi8(ai1_src_temp0_val[4]);  /* row=1*/
   2420             temp3_8x16b = _mm_set1_epi8(ai1_src_temp0_val[8]);  /* row=2*/
   2421             temp4_8x16b = _mm_set1_epi8(ai1_src_temp0_val[12]);  /* row=3*/
   2422 
   2423             temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
   2424             temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
   2425             temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
   2426             temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);
   2427 
   2428 // inner loop starts from here
    2429             src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1));  /* row=0 */
    2430             src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2));  /* row=1 */
    2431             src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3));  /* row=2 */
    2432             src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4));  /* row=3 */
   2433 
   2434             src_values10 = _mm_srli_si128(src_values0, 2);
   2435             src_values11 = _mm_srli_si128(src_values1, 2);
   2436             src_values12 = _mm_srli_si128(src_values2, 2);
   2437             src_values13 = _mm_srli_si128(src_values3, 2);
   2438 
   2439             src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
   2440             src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
   2441             src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
   2442             src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
   2443 
   2444             src_values0 = _mm_maddubs_epi16(src_values0, temp1_8x16b);
   2445             src_values1 = _mm_maddubs_epi16(src_values1, temp2_8x16b);
   2446             src_values2 = _mm_maddubs_epi16(src_values2, temp3_8x16b);
   2447             src_values3 = _mm_maddubs_epi16(src_values3, temp4_8x16b);
   2448 
   2449             /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   2450             src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
   2451             src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
   2452             src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
   2453             src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
   2454 
   2455             /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   2456             src_values0 = _mm_srai_epi16(src_values0,  5);
   2457             src_values1 = _mm_srai_epi16(src_values1,  5);
   2458             src_values2 = _mm_srai_epi16(src_values2,  5);
   2459             src_values3 = _mm_srai_epi16(src_values3,  5);
   2460 
   2461             /* converting 16 bit to 8 bit */
   2462             src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
   2463             src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
   2464             src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
   2465             src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
   2466 
   2467             _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_values0);       /* row=0*/
   2468             _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_values1);   /* row=1*/
   2469             _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_values2);   /* row=2*/
   2470             _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_values3);   /* row=3*/
   2471 
   2472         }
   2473     }
   2474 
    2475     else if(nt == 8) /* for nt = 8 case */
   2476     {
   2477         WORD32 ref_main_idx1, fract1, temp, temp1;
   2478         __m128i fract1_8x16b, temp_8x16b, temp1_8x16b;
   2479 
   2480         zero_8x16b = _mm_set1_epi16(0);
   2481 
   2482         for(row = 0; row < nt; row += 2)
   2483         {
   2484             __m128i src_values0, src_values1, src_values2, src_values3;
   2485             __m128i  src_values10, src_values11, src_values12, src_values13;
   2486 
   2487             pos = ((row + 1) * intra_pred_ang);
   2488             idx = pos >> 5;
   2489             fract = pos & (31);
   2490             temp = 32 - fract;
   2491             ref_main_idx = (4 * nt) + 2 * idx + 2; /* col from 0-15 */
   2492 
   2493             pos = ((row + 2) * intra_pred_ang);
   2494             idx = pos >> 5;
   2495             fract1 = pos & (31);
   2496             temp1 = 32 - fract1;
   2497             ref_main_idx1 = (4 * nt) + 2 * idx + 2; /* col from 0-15 */
   2498 
   2499             fract_8x16b  = _mm_set1_epi8(fract);
   2500             fract1_8x16b = _mm_set1_epi8(fract1);
   2501             temp_8x16b   = _mm_set1_epi8(temp);
   2502             temp1_8x16b  = _mm_set1_epi8(temp1);
   2503 
   2504             temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);
   2505             temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
   2506 
    2507             /* first row of the pair */
   2508             src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx));     /* col = 0-7   */
   2509             src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 8));   /* col = 8-15  */
   2510 
    2511             /* second row of the pair */
   2512             src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1));    /* col = 0-7  */
   2513             src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1 + 8));  /* col = 8-15 */
   2514 
   2515             src_values10 = _mm_srli_si128(src_values0, 2);
   2516             src_values11 = _mm_srli_si128(src_values1, 2);
   2517             src_values12 = _mm_srli_si128(src_values2, 2);
   2518             src_values13 = _mm_srli_si128(src_values3, 2);
   2519 
   2520             src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
   2521             src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
   2522             src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
   2523             src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
   2524 
   2525             src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
   2526             src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);
   2527 
   2528             src_values2 = _mm_maddubs_epi16(src_values2, temp1_8x16b);
   2529             src_values3 = _mm_maddubs_epi16(src_values3, temp1_8x16b);
   2530 
   2531             /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   2532             src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
   2533             src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
   2534 
   2535             src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
   2536             src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
   2537 
   2538             /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   2539             src_values0 = _mm_srai_epi16(src_values0,  5);
   2540             src_values1 = _mm_srai_epi16(src_values1,  5);
   2541 
   2542             src_values2 = _mm_srai_epi16(src_values2,  5);
   2543             src_values3 = _mm_srai_epi16(src_values3,  5);
   2544 
   2545             /* converting 16 bit to 8 bit */
   2546             src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
   2547             src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
   2548 
   2549             src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
   2550             src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
   2551 
    2552             /* storing 8-bit pixel values */
   2553             _mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
   2554             _mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);
   2555 
   2556             _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), src_values2);
   2557             _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + 8), src_values3);
   2558 
   2559             pu1_dst += 2 * dst_strd;
   2560         }
   2561     }
   2562     else if(nt == 16)
   2563     {
   2564         WORD32 temp;
   2565         /* unroll the col loop (inner) */
   2566         zero_8x16b = _mm_set1_epi16(0);
   2567 
   2568         for(row = 0; row < nt; row += 1)
   2569         {
   2570             __m128i  src_values0, src_values1, src_values2, src_values3, temp_8x16b;
   2571             __m128i  src_values10, src_values11, src_values12, src_values13;
   2572 
   2573             pos = ((row + 1) * intra_pred_ang);
   2574             idx = pos >> 5;
   2575             fract = pos & (31);
   2576             temp = 32 - fract;
   2577             ref_main_idx = (4 * nt) + 2 * idx + 2; /* col from 0-31 */
   2578 
   2579             fract_8x16b = _mm_set1_epi8(fract);
   2580             temp_8x16b  = _mm_set1_epi8(temp);
   2581 
   2582             temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);
   2583 
   2584             src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx));     /* col = 0-7   */
   2585             src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 8));   /* col = 8-15  */
   2586             src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 16));  /* col = 16-23 */
   2587             src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 24));  /* col = 24-31 */
   2588 
   2589             src_values10 = _mm_srli_si128(src_values0, 2);
   2590             src_values11 = _mm_srli_si128(src_values1, 2);
   2591             src_values12 = _mm_srli_si128(src_values2, 2);
   2592             src_values13 = _mm_srli_si128(src_values3, 2);
   2593 
   2594             src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
   2595             src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
   2596             src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
   2597             src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);
   2598 
    2599             /* (32 - fract) * pu1_ref[ref_main_idx] + fract * pu1_ref[ref_main_idx + 1] */
   2600             src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
   2601             src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);
   2602             src_values2 = _mm_maddubs_epi16(src_values2, temp_8x16b);
   2603             src_values3 = _mm_maddubs_epi16(src_values3, temp_8x16b);
   2604 
   2605             /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
   2606             src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
   2607             src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
   2608             src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
   2609             src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);
   2610 
   2611             /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
   2612             src_values0 = _mm_srai_epi16(src_values0,  5);
   2613             src_values1 = _mm_srai_epi16(src_values1,  5);
   2614             src_values2 = _mm_srai_epi16(src_values2,  5);
   2615             src_values3 = _mm_srai_epi16(src_values3,  5);
   2616 
   2617             /* converting 16 bit to 8 bit */
   2618             src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
   2619             src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
   2620             src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
   2621             src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);
   2622 
    2623             /* storing 8-bit pixel values */
   2624             _mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
   2625             _mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);
   2626             _mm_storel_epi64((__m128i *)(pu1_dst + 16), src_values2);
   2627             _mm_storel_epi64((__m128i *)(pu1_dst + 24), src_values3);
   2628 
   2629             pu1_dst += dst_strd;
   2630 
   2631         }
   2632     }
   2633 }
   2634