      1 /******************************************************************************
      2 *
      3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 *
      5 * Licensed under the Apache License, Version 2.0 (the "License");
      6 * you may not use this file except in compliance with the License.
      7 * You may obtain a copy of the License at:
      8 *
      9 * http://www.apache.org/licenses/LICENSE-2.0
     10 *
     11 * Unless required by applicable law or agreed to in writing, software
     12 * distributed under the License is distributed on an "AS IS" BASIS,
     13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 * See the License for the specific language governing permissions and
     15 * limitations under the License.
     16 *
     17 ******************************************************************************/
     18 /**
     19 *******************************************************************************
     20 * @file
     21 *  ihevc_weighted_pred_atom_intr.c
     22 *
     23 * @brief
     24 *  Contains function definitions for weighted prediction used in inter
     25 * prediction
     26 *
     27 * @author
     28 *
     29 *
     30 * @par List of Functions:
     31 *   - ihevc_weighted_pred_uni_ssse3()
     32 *   - ihevc_weighted_pred_bi_ssse3()
     33 *   - ihevc_weighted_pred_bi_default_ssse3()
     34 *   - ihevc_weighted_pred_chroma_uni_ssse3()
     35 *   - ihevc_weighted_pred_chroma_bi_ssse3()
     36 *   - ihevc_weighted_pred_chroma_bi_default_ssse3()
     37 *
     38 * @remarks
     39 *  None
     40 *
     41 *******************************************************************************
     42 */
     43 /*****************************************************************************/
     44 /* File Includes                                                             */
     45 /*****************************************************************************/
     46 #include <stdio.h>
     47 #include <assert.h>
     48 
     49 #include "ihevc_debug.h"
     50 #include "ihevc_typedefs.h"
     51 #include "ihevc_macros.h"
     52 #include "ihevc_platform_macros.h"
     53 #include "ihevc_func_selector.h"
     54 #include "ihevc_defs.h"
     55 #include "ihevc_weighted_pred.h"
     56 #include "ihevc_inter_pred.h"
     57 
     58 
     59 #include <immintrin.h>
     60 
     61 /**
     62 *******************************************************************************
     63 *
     64 * @brief
      65 *  Does uni-weighted prediction on the array pointed to by pi2_src and
      66 * stores the result at the location pointed to by pu1_dst
     67 *
     68 * @par Description:
      69 *  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift )
      70 * + offset
     71 *
     72 * @param[in] pi2_src
     73 *  Pointer to the source
     74 *
     75 * @param[out] pu1_dst
     76 *  Pointer to the destination
     77 *
     78 * @param[in] src_strd
     79 *  Source stride
     80 *
     81 * @param[in] dst_strd
     82 *  Destination stride
     83 *
     84 * @param[in] wgt0
     85 *  weight to be multiplied to the source
     86 *
     87 * @param[in] off0
     88 *  offset to be added after rounding and
      89 * shifting
      90 *
     91 *
     92 *
     93 * @param[in] shift
      94 *  (14 - bit depth) + log2_weight_denominator
     95 *
     96 * @param[in] lvl_shift
     97 *  added before shift and offset
     98 *
     99 * @param[in] ht
    100 *  height of the source
    101 *
    102 * @param[in] wd
    103 *  width of the source
    104 *
    105 * @returns
    106 *
    107 * @remarks
    108 *  None
    109 *
    110 *******************************************************************************
    111 */
    112 
    113 void ihevc_weighted_pred_uni_ssse3(WORD16 *pi2_src,
    114                                    UWORD8 *pu1_dst,
    115                                    WORD32 src_strd,
    116                                    WORD32 dst_strd,
    117                                    WORD32 wgt0,
    118                                    WORD32 off0,
    119                                    WORD32 shift,
    120                                    WORD32 lvl_shift,
    121                                    WORD32 ht,
    122                                    WORD32 wd)
    123 {
    124     WORD32 row, col, temp;
    125 
    126     /* all 128 bit registers are named with a suffix mxnb, where m is the */
     127     /* number of n-bit values packed in the register                      */
    128     __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
    129     __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_8x16b, off0_4x32b;
    130     __m128i res_temp0_4x32b, res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b;
    131 
    132     ASSERT(wd % 4 == 0); /* checking assumption*/
    133     ASSERT(ht % 4 == 0); /* checking assumption*/
    134 
    135     temp = 1 << (shift - 1);
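    /* 1 << (shift - 1) is the 0.5 rounding term added before the >> shift */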
    136 
     137     // setting values in registers
    138     lvl_shift_4x32b = _mm_set1_epi16(lvl_shift);
    139     wgt0_8x16b = _mm_set1_epi16(wgt0);
    140 
    141     /* lvl_shift * wgt0 */
    142     res_temp0_4x32b = _mm_mullo_epi16(lvl_shift_4x32b, wgt0_8x16b);
    143     res_temp1_4x32b = _mm_mulhi_epi16(lvl_shift_4x32b, wgt0_8x16b);
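    /* NOTE: _mm_mullo_epi16()/_mm_mulhi_epi16() return the low/high 16 bits */
    /* of each signed 16x16 product; interleaving the two results with       */
    /* unpacklo/unpackhi below rebuilds the full 32-bit products per lane.   */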
    144 
    145     const_temp_4x32b = _mm_set1_epi32(temp);
    146     off0_4x32b = _mm_set1_epi32(off0);
    147 
    148 
    149     /* lvl_shift * wgt0 */
    150     lvl_shift_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, res_temp1_4x32b);
    151     /* lvl_shift * wgt0 + 1 << (shift - 1) */
    152     lvl_shift_4x32b = _mm_add_epi32(lvl_shift_4x32b, const_temp_4x32b);
    153 
    154     if(0 == (wd & 7)) /* wd multiple of 8 case */
    155     {
    156         __m128i res_temp4_4x32b, res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b;
    157 
    158         /*  outer for loop starts from here */
    159         for(row = 0; row < ht; row += 4)
    160         {
    161             for(col = 0; col < wd; col += 8)
     162             {   /* for row = 0, 1, 2, 3 */
    163 
    164                 /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
    165                 src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src));
    166                 /* row = 1 */
    167                 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
    168                 /* row = 2 */
    169                 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd));
    170                 /* row = 3 */
    171                 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd));
    172 
    173                 /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
    174                 res_temp0_4x32b  = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
    175                 res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
    176                 res_temp2_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt0_8x16b);
    177                 res_temp3_4x32b  = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);
    178 
    179                 /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
    180                 src_temp0_8x16b  = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
    181                 src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
    182                 src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt0_8x16b);
    183                 src_temp3_8x16b  = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);
    184 
    185                 /* Get 32 bit Result */
    186                 res_temp4_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
    187                 res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
    188                 res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
    189                 res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b);
    190 
    191                 res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
    192                 res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
    193                 res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
    194                 res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b);
    195 
    196                 /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
    197                 res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift_4x32b);
    198                 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift_4x32b);
    199                 res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift_4x32b);
    200                 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift_4x32b);
    201                 res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
    202                 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
    203                 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
    204                 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);
    205 
    206                 /* (i4_tmp >> shift) */ /* First 4 pixels */
    207                 res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift);
    208                 res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
    209                 res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift);
    210                 res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift);
    211 
    212                 /* (i4_tmp >> shift) */ /* Last 4 pixels */
    213                 res_temp4_4x32b = _mm_srai_epi32(res_temp4_4x32b, shift);
    214                 res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b, shift);
    215                 res_temp6_4x32b = _mm_srai_epi32(res_temp6_4x32b, shift);
    216                 res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b, shift);
    217 
    218                 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* First 4 pixels */
    219                 res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
    220                 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
    221                 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
    222                 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);
    223 
    224                 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
    225                 res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, off0_4x32b);
    226                 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, off0_4x32b);
    227                 res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, off0_4x32b);
    228                 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, off0_4x32b);
    229 
    230                 res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp4_4x32b);
    231                 res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b);
    232                 res_temp2_4x32b = _mm_packs_epi32(res_temp2_4x32b, res_temp6_4x32b);
    233                 res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b);
    234                 /* pu1_dst[col] = CLIP_U8(i4_tmp); */
    235                 res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b);
    236                 res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
    237                 res_temp2_4x32b = _mm_packus_epi16(res_temp2_4x32b, res_temp2_4x32b);
    238                 res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b);
    239 
     240                 /* store eight 8-bit output values  */
    241                 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/
     242                 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 1*/
     243                 _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), res_temp2_4x32b); /* row = 2*/
    244                 _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), res_temp3_4x32b); /* row = 3*/
    245 
    246                 /* To update pointer */
    247                 pi2_src += 8;
    248                 pu1_dst += 8;
    249 
     250             } /* inner loop ends here (8x4 block processed per iteration) */
    251 
    252             pi2_src = pi2_src - wd + 4 * src_strd;    /* Pointer update */
    253             pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
    254 
    255         }
    256     }
    257     else  /* wd multiple of 4 case */
    258     {
    259         WORD32 dst0, dst1, dst2, dst3;
    260         /*  outer for loop starts from here */
    261         for(row = 0; row < ht; row += 4)
    262         {
    263             for(col = 0; col < wd; col += 4)
     264             {   /* for row = 0, 1, 2, 3 */
    265 
     266                 /* row = 0 */ /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
    267                 src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src));
    268                 /* row = 1 */
    269                 src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd));
    270                 /* row = 2 */
    271                 src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + 2 * src_strd));
    272                 /* row = 3 */
    273                 src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + 3 * src_strd));
    274 
    275                 /* 2 rows together */
    276                 src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp2_8x16b);
    277                 src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
    278 
    279                 /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
    280                 res_temp0_4x32b  = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
    281                 res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
    282                 /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Higher 16 bit */
    283                 src_temp0_8x16b  = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
    284                 src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
    285 
    286                 /* Get 32 bit Result */
    287                 res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
    288                 res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
    289 
    290                 res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
    291                 res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
    292 
    293                 /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
    294                 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
    295                 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);
    296                 res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
    297                 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
    298 
    299                 /* (i4_tmp >> shift) */
    300                 res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift);
    301                 res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift);
    302                 res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift);
    303                 res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift);
    304 
    305                 /*i4_tmp = (i4_tmp >> shift) + off0; */
    306                 res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
    307                 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
    308                 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
    309                 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);
    310 
    311                 res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp1_4x32b);
    312                 res_temp2_4x32b = _mm_packs_epi32(res_temp2_4x32b, res_temp3_4x32b);
    313 
    314                 /* pu1_dst[col] = CLIP_U8(i4_tmp); */
    315                 res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp2_4x32b);
    316 
    317                 dst0 = _mm_cvtsi128_si32(res_temp0_4x32b);
    318                 /* dst row = 1 to 3 */
    319                 res_temp1_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 1);
    320                 res_temp2_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 2);
    321                 res_temp3_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 3);
    322 
    323                 /* store four 8-bit output values  */
    324                 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
    325 
    326                 dst1 = _mm_cvtsi128_si32(res_temp1_4x32b);
    327                 dst2 = _mm_cvtsi128_si32(res_temp2_4x32b);
    328                 dst3 = _mm_cvtsi128_si32(res_temp3_4x32b);
    329 
    330                 /* row = 1 to row = 3 */
    331                 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
    332                 *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
    333                 *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;
    334 
    335                 /* To update pointer */
    336                 pi2_src += 4;
    337                 pu1_dst += 4;
    338 
     339             } /* inner loop ends here (4x4 block processed per iteration) */
    340 
    341             pi2_src = pi2_src - wd + 4 * src_strd;    /* Pointer update */
    342             pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
    343 
    344         }
    345     }
    346 }
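
/* A minimal scalar sketch of what the SIMD code above computes. The helper  */
/* name and the inline clip are illustrative only (not part of the library   */
/* API); it is kept under #if 0 purely as a readable reference.              */
#if 0
static void weighted_pred_uni_scalar(WORD16 *pi2_src, UWORD8 *pu1_dst,
                                     WORD32 src_strd, WORD32 dst_strd,
                                     WORD32 wgt0, WORD32 off0, WORD32 shift,
                                     WORD32 lvl_shift, WORD32 ht, WORD32 wd)
{
    WORD32 row, col;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            WORD32 i4_tmp = (pi2_src[col] + lvl_shift) * wgt0;
            i4_tmp += 1 << (shift - 1);             /* rounding term        */
            i4_tmp = (i4_tmp >> shift) + off0;      /* scale, add offset    */
            /* clip to 8 bit, as packs/packus do above */
            pu1_dst[col] = (UWORD8)(i4_tmp < 0 ? 0 :
                                    (i4_tmp > 255 ? 255 : i4_tmp));
        }
        pi2_src += src_strd;
        pu1_dst += dst_strd;
    }
}
#endif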
    347 
    348 /**
    349 *******************************************************************************
    350 *
    351 * @brief
     352 * Does chroma uni-weighted prediction on the array pointed to by pi2_src and
     353 * stores the result at the location pointed to by pu1_dst
    354 *
    355 * @par Description:
     356 *  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift )
     357 * + offset
    358 *
    359 * @param[in] pi2_src
    360 *  Pointer to the source
    361 *
    362 * @param[out] pu1_dst
    363 *  Pointer to the destination
    364 *
    365 * @param[in] src_strd
    366 *  Source stride
    367 *
    368 * @param[in] dst_strd
    369 *  Destination stride
    370 *
    371 * @param[in] wgt0
    372 *  weight to be multiplied to the source
    373 *
    374 * @param[in] off0
    375 *  offset to be added after rounding and
     376 * shifting
     377 *
    378 *
    379 *
    380 * @param[in] shift
     381 *  (14 - bit depth) + log2_weight_denominator
    382 *
    383 * @param[in] lvl_shift
    384 *  added before shift and offset
    385 *
    386 * @param[in] ht
    387 *  height of the source
    388 *
    389 * @param[in] wd
    390 *  width of the source (each colour component)
    391 *
    392 * @returns
    393 *
    394 * @remarks
    395 *  None
    396 *
    397 *******************************************************************************
    398 */
    399 
    400 
    401 void ihevc_weighted_pred_chroma_uni_ssse3(WORD16 *pi2_src,
    402                                           UWORD8 *pu1_dst,
    403                                           WORD32 src_strd,
    404                                           WORD32 dst_strd,
    405                                           WORD32 wgt0_cb,
    406                                           WORD32 wgt0_cr,
    407                                           WORD32 off0_cb,
    408                                           WORD32 off0_cr,
    409                                           WORD32 shift,
    410                                           WORD32 lvl_shift,
    411                                           WORD32 ht,
    412                                           WORD32 wd)
    413 {
    414     WORD32 row, col, temp, wdx2;
    415     /* all 128 bit registers are named with a suffix mxnb, where m is the */
     416     /* number of n-bit values packed in the register                      */
    417 
    418     __m128i src_temp0_8x16b, src_temp1_8x16b;
    419     __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_8x16b, off0_4x32b;
    420     __m128i res_temp0_4x32b, res_temp1_4x32b;
    421 
    422     ASSERT(wd % 2 == 0); /* checking assumption*/
    423     ASSERT(ht % 2 == 0); /* checking assumption*/
    424 
    425     temp = 1 << (shift - 1);
    426     wdx2 = 2 * wd;
    427 
     428     // setting values in registers
    429     lvl_shift_4x32b = _mm_set1_epi16(lvl_shift);
    430     wgt0_8x16b = _mm_set_epi16(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
    431 
    432     /* lvl_shift * wgt0 */
    433     res_temp0_4x32b = _mm_mullo_epi16(lvl_shift_4x32b, wgt0_8x16b);
    434     res_temp1_4x32b = _mm_mulhi_epi16(lvl_shift_4x32b, wgt0_8x16b);
    435 
    436     const_temp_4x32b = _mm_set1_epi32(temp);
    437     off0_4x32b = _mm_set_epi32(off0_cr, off0_cb, off0_cr, off0_cb);
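    /* Chroma samples are interleaved Cb/Cr in pi2_src, so the weight vector */
    /* holds the pattern {cb, cr, cb, cr, ...} and the offset vector         */
    /* {off_cb, off_cr, ...}; each lane applies its own component's values.  */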
    438 
    439     /* lvl_shift * wgt0 */
    440     lvl_shift_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, res_temp1_4x32b);
    441     /* lvl_shift * wgt0 + 1 << (shift - 1) */
    442     lvl_shift_4x32b = _mm_add_epi32(lvl_shift_4x32b, const_temp_4x32b);
    443 
    444     {
    445         if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
    446         {
    447             __m128i src_temp2_8x16b, src_temp3_8x16b;
    448             __m128i res_temp2_4x32b, res_temp3_4x32b;
    449             __m128i res_temp4_4x32b, res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b;
    450 
    451             /*  outer for loop starts from here */
    452             for(row = 0; row < ht; row += 2)
    453             {
    454                 for(col = 0; col < wdx2; col += 16)
    455                 {
    456                     /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
    457                     src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src));
    458                     /* row = 1 */
    459                     src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
    460                     /* row = 0 */ /* Next 8 pixels */
    461                     src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 8));
    462                     /* row = 1 */
    463                     src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 8));
    464 
    465                     /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
    466                     res_temp0_4x32b  = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
    467                     res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
    468                     res_temp4_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt0_8x16b);
    469                     res_temp5_4x32b  = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);
    470 
    471                     /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
    472                     src_temp0_8x16b  = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
    473                     src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
    474                     src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt0_8x16b);
    475                     src_temp3_8x16b  = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);
    476 
    477                     /* Get 32 bit Result */
    478                     res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
    479                     res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
    480                     res_temp6_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp2_8x16b);
    481                     res_temp7_4x32b = _mm_unpackhi_epi16(res_temp5_4x32b, src_temp3_8x16b);
    482 
    483                     res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
    484                     res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
    485                     res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp2_8x16b);
    486                     res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, src_temp3_8x16b);
    487 
    488                     /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
    489                     res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
    490                     res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
    491                     res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
    492                     res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);
    493                     res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift_4x32b);
    494                     res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift_4x32b);
    495                     res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift_4x32b);
    496                     res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift_4x32b);
    497 
    498                     /* (i4_tmp >> shift) */
    499                     res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b,  shift);
    500                     res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
    501                     res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b,  shift);
    502                     res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);
    503                     /*i4_tmp = (i4_tmp >> shift) + off0; */
    504                     res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
    505                     res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
    506                     /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */
    507                     res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
    508                     res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);
    509 
    510                     /* (i4_tmp >> shift) */
    511                     res_temp4_4x32b = _mm_srai_epi32(res_temp4_4x32b,  shift);
    512                     res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b,  shift);
    513                     res_temp6_4x32b = _mm_srai_epi32(res_temp6_4x32b,  shift);
    514                     res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b,  shift);
    515                     /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */
    516                     res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, off0_4x32b);
    517                     res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, off0_4x32b);
    518                     /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
    519                     res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, off0_4x32b);
    520                     res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, off0_4x32b);
    521 
    522                     res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp2_4x32b);
    523                     res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);
    524                     res_temp4_4x32b = _mm_packs_epi32(res_temp4_4x32b, res_temp6_4x32b);
    525                     res_temp5_4x32b = _mm_packs_epi32(res_temp5_4x32b, res_temp7_4x32b);
    526                     /* pu1_dst[col] = CLIP_U8(i4_tmp); */
    527                     res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp4_4x32b);
    528                     res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp5_4x32b);
    529 
    530                     /* store 16 8-bit output values  */
    531                     _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/
    532                     _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 1*/
    533 
    534                     pi2_src += 16;  /* Pointer update */
    535                     pu1_dst += 16; /* Pointer update */
    536 
     537                 } /* inner loop ends here (16x2 block processed per iteration) */
    538                 pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
    539                 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
    540             }
    541         }
    542         else if(0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */
    543         {
    544             __m128i res_temp2_4x32b, res_temp3_4x32b;
    545             /*  outer for loop starts from here */
    546             for(row = 0; row < ht; row += 2)
    547             {
    548                 for(col = 0; col < wdx2; col += 8)
    549                 {
    550                     /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
    551                     src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src));
    552                     /* row = 1 */
    553                     src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
    554 
    555                     /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
    556                     res_temp0_4x32b  = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
    557                     res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
    558                     /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
    559                     src_temp0_8x16b  = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
    560                     src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
    561 
    562                     /* Get 32 bit Result */
    563                     res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
    564                     res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
    565 
    566                     res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
    567                     res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
    568 
    569                     /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
    570                     res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
    571                     res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
    572                     res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b);
    573                     res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b);
    574 
    575                     /* (i4_tmp >> shift) */
    576                     res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b,  shift);
    577                     res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
    578                     res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b,  shift);
    579                     res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);
    580 
    581                     /*i4_tmp = (i4_tmp >> shift) + off0; */
    582                     res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
    583                     res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
    584                     /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
    585                     res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b);
    586                     res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b);
    587 
    588                     res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp2_4x32b);
    589                     res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);
    590 
    591                     /* pu1_dst[col] = CLIP_U8(i4_tmp); */
    592                     res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b);
    593                     res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
    594 
     595                     /* store eight 8-bit output values  */
    596                     _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/
    597                     _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 1*/
    598 
    599                     pi2_src += 8;   /* Pointer update */
    600                     pu1_dst += 8; /* Pointer update */
    601 
     602                 } /* inner loop ends here (8x2 block processed per iteration) */
    603                 pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
    604                 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
    605             }
    606         }
    607         else /* 2*wd multiple of 4 case */
    608         {
    609             WORD32 dst0, dst1;
    610             /*  outer for loop starts from here */
    611             for(row = 0; row < ht; row += 2)
    612             {
    613                 for(col = 0; col < wdx2; col += 4)
    614                 {
     615                     /* row = 0 */ /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
    616                     src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src));
    617                     /* row = 1 */
    618                     src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd));
    619 
    620                     /* 2 rows together */
    621                     src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
    622 
    623                     /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */
    624                     res_temp0_4x32b  = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b);
    625                     /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */
    626                     src_temp0_8x16b  = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b);
    627 
    628                     /* Get 32 bit Result */
    629                     res_temp1_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b);
    630                     res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b);
    631 
    632                     /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */
    633                     res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b);
    634                     res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b);
    635 
    636                     /* (i4_tmp >> shift) */
    637                     res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b,  shift);
    638                     res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
    639 
    640                     /*i4_tmp = (i4_tmp >> shift) + off0; */
    641                     res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b);
    642                     res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b);
    643 
    644                     res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp1_4x32b);
    645 
    646                     /* pu1_dst[col] = CLIP_U8(i4_tmp); */
    647                     res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b);
    648 
    649                     dst0 = _mm_cvtsi128_si32(res_temp0_4x32b);
     650                     /* dst row = 1 */
    651                     res_temp1_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 1);
    652 
    653                     /* store four 8-bit output values  */
    654                     *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
    655 
    656                     dst1 = _mm_cvtsi128_si32(res_temp1_4x32b);
    657                     /* row = 1 */
    658                     *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
    659 
    660                     pi2_src += 4;   /* Pointer update */
    661                     pu1_dst += 4; /* Pointer update */
    662 
     663                 } /* inner loop ends here (4x2 block processed per iteration) */
    664                 pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
    665                 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
    666             }
    667         }
    668     }
    669 }
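
/* A minimal scalar sketch of the interleaved-chroma path above. The helper  */
/* name and the inline clip are illustrative only; kept under #if 0 as a     */
/* readable reference for the Cb/Cr lane alternation.                        */
#if 0
static void weighted_pred_chroma_uni_scalar(WORD16 *pi2_src, UWORD8 *pu1_dst,
                                            WORD32 src_strd, WORD32 dst_strd,
                                            WORD32 wgt0_cb, WORD32 wgt0_cr,
                                            WORD32 off0_cb, WORD32 off0_cr,
                                            WORD32 shift, WORD32 lvl_shift,
                                            WORD32 ht, WORD32 wd)
{
    WORD32 row, col;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < 2 * wd; col++)   /* wd is per colour component   */
        {
            WORD32 wgt = (col & 1) ? wgt0_cr : wgt0_cb; /* even Cb, odd Cr  */
            WORD32 off = (col & 1) ? off0_cr : off0_cb;
            WORD32 i4_tmp = (pi2_src[col] + lvl_shift) * wgt;
            i4_tmp += 1 << (shift - 1);
            i4_tmp = (i4_tmp >> shift) + off;
            pu1_dst[col] = (UWORD8)(i4_tmp < 0 ? 0 :
                                    (i4_tmp > 255 ? 255 : i4_tmp));
        }
        pi2_src += src_strd;
        pu1_dst += dst_strd;
    }
}
#endif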
    670 
    671 /**
    672 *******************************************************************************
    673 *
    674 * @brief
     675 *  Does bi-weighted prediction on the arrays pointed to by pi2_src1 and
     676 * pi2_src2 and stores the result at the location pointed to by pu1_dst
    677 *
    678 * @par Description:
    679 *  dst = ( (src1 + lvl_shift1)*wgt0 +  (src2 + lvl_shift2)*wgt1 +  (off0 +
    680 * off1 + 1) << (shift - 1) ) >> shift
    681 *
    682 * @param[in] pi2_src1
    683 *  Pointer to source 1
    684 *
    685 * @param[in] pi2_src2
    686 *  Pointer to source 2
    687 *
    688 * @param[out] pu1_dst
    689 *  Pointer to destination
    690 *
    691 * @param[in] src_strd1
    692 *  Source stride 1
    693 *
    694 * @param[in] src_strd2
    695 *  Source stride 2
    696 *
    697 * @param[in] dst_strd
    698 *  Destination stride
    699 *
    700 * @param[in] wgt0
    701 *  weight to be multiplied to source 1
    702 *
    703 * @param[in] off0
    704 *  offset 0
    705 *
    706 * @param[in] wgt1
    707 *  weight to be multiplied to source 2
    708 *
    709 * @param[in] off1
    710 *  offset 1
    711 *
    712 * @param[in] shift
     713 *  (14 - bit depth) + log2_weight_denominator
    714 *
    715 * @param[in] lvl_shift1
    716 *  added before shift and offset
    717 *
    718 * @param[in] lvl_shift2
    719 *  added before shift and offset
    720 *
    721 * @param[in] ht
    722 *  height of the source
    723 *
    724 * @param[in] wd
    725 *  width of the source
    726 *
    727 * @returns
    728 *
    729 * @remarks
    730 *  None
    731 *
    732 *******************************************************************************
    733 */
    734 
    735 
    736 void ihevc_weighted_pred_bi_ssse3(WORD16 *pi2_src1,
    737                                   WORD16 *pi2_src2,
    738                                   UWORD8 *pu1_dst,
    739                                   WORD32 src_strd1,
    740                                   WORD32 src_strd2,
    741                                   WORD32 dst_strd,
    742                                   WORD32 wgt0,
    743                                   WORD32 off0,
    744                                   WORD32 wgt1,
    745                                   WORD32 off1,
    746                                   WORD32 shift,
    747                                   WORD32 lvl_shift1,
    748                                   WORD32 lvl_shift2,
    749                                   WORD32 ht,
    750                                   WORD32 wd)
    751 {
    752     WORD32 row, col, temp;
    753 
    754     __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
    755     __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_8x16b, wgt1_8x16b;
    756     __m128i res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b, res_temp4_4x32b;
    757 
    759     ASSERT(wd % 4 == 0); /* checking assumption*/
    760     ASSERT(ht % 4 == 0); /* checking assumption*/
    761 
    762     temp = (off0 + off1 + 1) << (shift - 1);
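    /* A single constant folds rounding and both offsets: adding             */
    /* (off0 + off1 + 1) << (shift - 1) before the >> shift effectively adds */
    /* the average of the two offsets plus the 0.5 rounding term.            */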
    763 
     764     // setting values in registers
    765     lvl_shift1_4x32b = _mm_set1_epi16(lvl_shift1);
    766     wgt0_8x16b = _mm_set1_epi16(wgt0);
    767     lvl_shift2_4x32b = _mm_set1_epi16(lvl_shift2);
    768     wgt1_8x16b = _mm_set1_epi16(wgt1);
    769 
    770     /* lvl_shift1 * wgt0 */
    771     res_temp1_4x32b = _mm_mullo_epi16(lvl_shift1_4x32b, wgt0_8x16b);
    772     res_temp2_4x32b = _mm_mulhi_epi16(lvl_shift1_4x32b, wgt0_8x16b);
    773     /* lvl_shift2 * wgt1 */
    774     res_temp3_4x32b = _mm_mullo_epi16(lvl_shift2_4x32b, wgt1_8x16b);
    775     res_temp4_4x32b = _mm_mulhi_epi16(lvl_shift2_4x32b, wgt1_8x16b);
    776 
    777     const_temp_4x32b = _mm_set1_epi32(temp);
    778 
    779     /* lvl_shift1 * wgt0 */
    780     lvl_shift1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, res_temp2_4x32b);
    781     /* lvl_shift2 * wgt1 */
    782     lvl_shift2_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, res_temp4_4x32b);
    783 
    784     if(0 == (wd & 7)) /* wd multiple of 8 case */
    785     {
    786         __m128i res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b, res_temp8_4x32b;
    787         /*  outer for loop starts from here */
    788         for(row = 0; row < ht; row += 2)
    789         {
    790             for(col = 0; col < wd; col += 8)
    791             {
    792                 /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
    793                 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
    794                 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
    795                 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
    796                 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
    797 
    798                 /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
    799                 res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
    800                 res_temp2_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
    801                 res_temp3_4x32b  = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);
    802                 res_temp4_4x32b  = _mm_mullo_epi16(src_temp4_8x16b, wgt1_8x16b);
    803                 /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */
    804                 src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
    805                 src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);
    806                 src_temp3_8x16b  = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);
    807                 src_temp4_8x16b  = _mm_mulhi_epi16(src_temp4_8x16b, wgt1_8x16b);
    808 
    809                 /* Get 32 bit Result */
    810                 res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
    811                 res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
    812                 res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b);
    813                 res_temp8_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp4_8x16b);
    814 
    815                 res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
    816                 res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
    817                 res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b);
    818                 res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp4_8x16b);
    819 
    820                 /* (pi2_src[col] + lvl_shift) * wgt */
    821                 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift1_4x32b);
    822                 res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift2_4x32b);
    823                 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift1_4x32b);
    824                 res_temp8_4x32b = _mm_add_epi32(res_temp8_4x32b, lvl_shift2_4x32b);
    825                 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
    826                 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);
    827                 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
    828                 res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);
    829 
    830                 /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
    831                 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
    832                 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);
    833                 /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
    834                 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
    835                 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);
    836                 /* (i4_tmp >> shift) */
    837                 res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
    838                 res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);
    839 
    840                 /* Next 4 Pixels */
    841                 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, res_temp6_4x32b);
    842                 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, res_temp8_4x32b);
    843                 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, const_temp_4x32b);
    844                 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, const_temp_4x32b);
    845                 res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b,  shift);
    846                 res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b,  shift);
    847 
    848                 res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b);
    849                 res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b);
    850 
    851                 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
    852                 res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
    853                 res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b);
    854 
     855                 /* store eight 8-bit output values  */
    856                 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp1_4x32b); /* row = 0*/
    857                 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp3_4x32b); /* row = 1*/
    858 
    859                 pi2_src1 += 8;  /* Pointer update */
    860                 pi2_src2 += 8;  /* Pointer update */
    861                 pu1_dst  += 8;  /* Pointer update */
    862 
     863             } /* inner loop ends here (8x2 block processed per iteration) */
    864 
    865             pi2_src1 = pi2_src1 - wd + 2 * src_strd1;  /* Pointer update */
    866             pi2_src2 = pi2_src2 - wd + 2 * src_strd2;  /* Pointer update */
    867             pu1_dst  = pu1_dst  - wd + 2 * dst_strd;   /* Pointer update */
    868 
    869         } /* outer loop ends */
    870     }
    871     else /* wd multiple of 4 case */
    872     {
    873         WORD32 dst0, dst1;
    874         /*  outer for loop starts from here */
    875         for(row = 0; row < ht; row += 2)
    876         {
    877             for(col = 0; col < wd; col += 4)
    878             {
     879                 /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
    880                 src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); /* row = 0 */
    881                 src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); /* row = 0 */
    882                 src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
    883                 src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
    884 
    885                 /* 2 rows together */
    886                 src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
    887                 src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
    888 
    889                 /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
    890                 res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
    891                 res_temp2_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
    892                 /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */
    893                 src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
    894                 src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);
    895 
    896                 /* Get 32 bit Result */
    897                 res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
    898                 res_temp4_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
    899 
    900                 res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
    901                 res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
    902 
    903                 /* (pi2_src[col] + lvl_shift) * wgt */
    904                 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
    905                 res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);
    906                 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
    907                 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);
    908 
    909                 /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
    910                 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
    911                 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);
    912 
    913                 /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
    914                 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
    915                 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);
    916 
    917                 /* (i4_tmp >> shift) */
    918                 res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
    919                 res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);
    920 
    921                 res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);
    922 
    923                 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
    924                 res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
    925 
    926                 dst0 = _mm_cvtsi128_si32(res_temp1_4x32b);
    927 
     928                 /* dst row = 1 */
    929                 res_temp2_4x32b = _mm_shuffle_epi32(res_temp1_4x32b, 1);
    930 
    931                 /* store four 8-bit output values  */
    932                 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
    933 
    934                 dst1 = _mm_cvtsi128_si32(res_temp2_4x32b);
    935 
    936                 /* row = 1 */
    937                 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
    938 
    939                 pi2_src1 += 4;  /* Pointer update */
    940                 pi2_src2 += 4;  /* Pointer update */
    941                 pu1_dst  += 4;  /* Pointer update */
    942 
     943             } /* inner loop ends here (4x2 block processed per iteration) */
    944 
    945             pi2_src1 = pi2_src1 - wd + 2 * src_strd1;  /* Pointer update */
    946             pi2_src2 = pi2_src2 - wd + 2 * src_strd2;  /* Pointer update */
    947             pu1_dst  = pu1_dst  - wd + 2 * dst_strd;   /* Pointer update */
    948 
    949         } /* outer loop ends */
    950     }
    951 
    952 }
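
/* A minimal scalar sketch of the bi-weighted computation above. The helper  */
/* name and the inline clip are illustrative only; kept under #if 0 purely   */
/* as a readable reference.                                                  */
#if 0
static void weighted_pred_bi_scalar(WORD16 *pi2_src1, WORD16 *pi2_src2,
                                    UWORD8 *pu1_dst, WORD32 src_strd1,
                                    WORD32 src_strd2, WORD32 dst_strd,
                                    WORD32 wgt0, WORD32 off0,
                                    WORD32 wgt1, WORD32 off1, WORD32 shift,
                                    WORD32 lvl_shift1, WORD32 lvl_shift2,
                                    WORD32 ht, WORD32 wd)
{
    WORD32 row, col;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            WORD32 i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0
                          + (pi2_src2[col] + lvl_shift2) * wgt1
                          + ((off0 + off1 + 1) << (shift - 1));
            i4_tmp >>= shift;
            pu1_dst[col] = (UWORD8)(i4_tmp < 0 ? 0 :
                                    (i4_tmp > 255 ? 255 : i4_tmp));
        }
        pi2_src1 += src_strd1;
        pi2_src2 += src_strd2;
        pu1_dst  += dst_strd;
    }
}
#endif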
    953 
    954 /**
    955 *******************************************************************************
    956 *
    957 * @brief
     958 * Does chroma bi-weighted prediction on the arrays pointed to by pi2_src1 and
     959 * pi2_src2 and stores the result at the location pointed to by pu1_dst
    960 *
    961 * @par Description:
    962 *  dst = ( (src1 + lvl_shift1)*wgt0 +  (src2 + lvl_shift2)*wgt1 +  (off0 +
    963 * off1 + 1) << (shift - 1) ) >> shift
    964 *
    965 * @param[in] pi2_src1
    966 *  Pointer to source 1
    967 *
    968 * @param[in] pi2_src2
    969 *  Pointer to source 2
    970 *
    971 * @param[out] pu1_dst
    972 *  Pointer to destination
    973 *
    974 * @param[in] src_strd1
    975 *  Source stride 1
    976 *
    977 * @param[in] src_strd2
    978 *  Source stride 2
    979 *
    980 * @param[in] dst_strd
    981 *  Destination stride
    982 *
    983 * @param[in] wgt0
    984 *  weight to be multiplied to source 1
    985 *
    986 * @param[in] off0
    987 *  offset 0
    988 *
    989 * @param[in] wgt1
    990 *  weight to be multiplied to source 2
    991 *
    992 * @param[in] off1
    993 *  offset 1
    994 *
    995 * @param[in] shift
     996 *  (14 - bit depth) + log2_weight_denominator
    997 *
    998 * @param[in] lvl_shift1
    999 *  added before shift and offset
   1000 *
   1001 * @param[in] lvl_shift2
   1002 *  added before shift and offset
   1003 *
   1004 * @param[in] ht
   1005 *  height of the source
   1006 *
   1007 * @param[in] wd
   1008 *  width of the source (each colour component)
   1009 *
   1010 * @returns
   1011 *
   1012 * @remarks
   1013 *  None
   1014 *
   1015 *******************************************************************************
   1016 */
   1017 
   1018 
   1019 void ihevc_weighted_pred_chroma_bi_ssse3(WORD16 *pi2_src1,
   1020                                          WORD16 *pi2_src2,
   1021                                          UWORD8 *pu1_dst,
   1022                                          WORD32 src_strd1,
   1023                                          WORD32 src_strd2,
   1024                                          WORD32 dst_strd,
   1025                                          WORD32 wgt0_cb,
   1026                                          WORD32 wgt0_cr,
   1027                                          WORD32 off0_cb,
   1028                                          WORD32 off0_cr,
   1029                                          WORD32 wgt1_cb,
   1030                                          WORD32 wgt1_cr,
   1031                                          WORD32 off1_cb,
   1032                                          WORD32 off1_cr,
   1033                                          WORD32 shift,
   1034                                          WORD32 lvl_shift1,
   1035                                          WORD32 lvl_shift2,
   1036                                          WORD32 ht,
   1037                                          WORD32 wd)
   1038 {
   1039     WORD32 row, col, temp1, temp2;
   1040     WORD32 wdx2;
   1041 
   1042     __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
   1043     __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_8x16b, wgt1_8x16b;
   1044     __m128i res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b, res_temp4_4x32b;
   1045 
   1046     ASSERT(wd % 2 == 0); /* checking assumption*/
   1047     ASSERT(ht % 2 == 0); /* checking assumption*/
   1048 
   1049     temp1 = (off0_cb + off1_cb + 1) << (shift - 1);
   1050     temp2 = (off0_cr + off1_cr + 1) << (shift - 1);
   1051 
    1052     // setting values in registers
   1053     lvl_shift1_4x32b = _mm_set1_epi16(lvl_shift1);
   1054     wgt0_8x16b = _mm_set_epi16(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
   1055     lvl_shift2_4x32b = _mm_set1_epi16(lvl_shift2);
   1056     wgt1_8x16b = _mm_set_epi16(wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb);
   1057 
   1058     /* lvl_shift1 * wgt0 */
   1059     res_temp1_4x32b = _mm_mullo_epi16(lvl_shift1_4x32b, wgt0_8x16b);
   1060     res_temp2_4x32b = _mm_mulhi_epi16(lvl_shift1_4x32b, wgt0_8x16b);
   1061     /* lvl_shift2 * wgt1 */
   1062     res_temp3_4x32b = _mm_mullo_epi16(lvl_shift2_4x32b, wgt1_8x16b);
   1063     res_temp4_4x32b = _mm_mulhi_epi16(lvl_shift2_4x32b, wgt1_8x16b);
   1064 
   1065     const_temp_4x32b = _mm_set_epi32(temp2, temp1, temp2, temp1);
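    /* temp1/temp2 alternate across lanes so that interleaved Cb/Cr samples  */
    /* each receive their own rounding/offset constant.                      */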
   1066     wdx2 = wd * 2;
   1067 
   1068     /* lvl_shift1 * wgt0 */
   1069     lvl_shift1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, res_temp2_4x32b);
   1070     /* lvl_shift2 * wgt1 */
   1071     lvl_shift2_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, res_temp4_4x32b);
   1072 
   1073     if(0 == (wdx2 & 7)) /* wdx2 multiple of 8 case */
   1074     {
   1075         __m128i res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b, res_temp8_4x32b;
   1076         /*  outer for loop starts from here */
   1077         for(row = 0; row < ht; row += 2)
   1078         {
   1079             for(col = 0; col < wdx2; col += 8)
   1080             {
   1081                 /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
   1082                 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
   1083                 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
   1084                 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
   1085                 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
   1086 
   1087                 /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
   1088                 res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
   1089                 res_temp2_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
   1090                 res_temp3_4x32b  = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b);
   1091                 res_temp4_4x32b  = _mm_mullo_epi16(src_temp4_8x16b, wgt1_8x16b);
   1092                 /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */
   1093                 src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
   1094                 src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);
   1095                 src_temp3_8x16b  = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b);
   1096                 src_temp4_8x16b  = _mm_mulhi_epi16(src_temp4_8x16b, wgt1_8x16b);
   1097 
   1098                 /* Get 32 bit Result */
   1099                 res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
   1100                 res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
   1101                 res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b);
   1102                 res_temp8_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp4_8x16b);
   1103 
   1104                 res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
   1105                 res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
   1106                 res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b);
   1107                 res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp4_8x16b);
   1108 
   1109                 /* (pi2_src[col] + lvl_shift) * wgt */
   1110                 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift1_4x32b);
   1111                 res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift2_4x32b);
   1112                 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift1_4x32b);
   1113                 res_temp8_4x32b = _mm_add_epi32(res_temp8_4x32b, lvl_shift2_4x32b);
   1114                 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
   1115                 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);
   1116                 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
   1117                 res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);
   1118 
   1119                 /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
   1120                 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
   1121                 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);
   1122                 /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
   1123                 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
   1124                 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);
   1125                 /* (i4_tmp >> shift) */
   1126                 res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
   1127                 res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);
   1128 
   1129                 /* Next 4 Pixels */
   1130                 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, res_temp6_4x32b);
   1131                 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, res_temp8_4x32b);
   1132                 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, const_temp_4x32b);
   1133                 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, const_temp_4x32b);
   1134                 res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b,  shift);
   1135                 res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b,  shift);
   1136 
   1137                 res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b);
   1138                 res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b);
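                         /* _mm_packs_epi32 narrows the two 4x32-bit halves to
                            eight 16-bit lanes with signed saturation; the
                            _mm_packus_epi16 below then saturates to unsigned
                            8 bit, so the pair implements CLIP_U8(). */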
   1139 
   1140                 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
   1141                 res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
   1142                 res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b);
   1143 
    1144                 /* store eight 8-bit output values  */
   1145                 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp1_4x32b); /* row = 0*/
   1146                 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp3_4x32b); /* row = 1*/
   1147 
   1148                 pi2_src1 += 8;  /* Pointer update */
   1149                 pi2_src2 += 8;  /* Pointer update */
   1150                 pu1_dst  += 8;  /* Pointer update */
   1151 
    1152             } /* inner loop ends here (8 output values per row, 2 rows per iteration) */
   1153 
   1154             pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;    /* Pointer update */
   1155             pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;    /* Pointer update */
   1156             pu1_dst  = pu1_dst  - wdx2 + 2 * dst_strd;   /* Pointer update */
   1157 
   1158         } /* outer loop ends */
   1159     }
   1160     else /* wdx2 multiple of 4 case */
   1161     {
   1162         WORD32 dst0, dst1;
   1163         /*  outer for loop starts from here */
   1164         for(row = 0; row < ht; row += 2)
   1165         {
   1166             for(col = 0; col < wdx2; col += 4)
   1167             {
    1168                 /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
   1169                 src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); /* row = 0 */
   1170                 src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); /* row = 0 */
   1171                 src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
   1172                 src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
   1173 
   1174                 /* 2 rows together */
   1175                 src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
   1176                 src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
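                         /* Packing two 4-pixel rows into one register keeps all
                            eight 16-bit lanes busy even though each row only
                            fills half of the vector. */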
   1177 
   1178                 /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */
   1179                 res_temp1_4x32b  = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b);
   1180                 res_temp2_4x32b  = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b);
   1181                 /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */
   1182                 src_temp1_8x16b  = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b);
   1183                 src_temp2_8x16b  = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b);
   1184 
   1185                 /* Get 32 bit Result */
   1186                 res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b);
   1187                 res_temp4_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b);
   1188 
   1189                 res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b);
   1190                 res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b);
   1191 
   1192                 /* (pi2_src[col] + lvl_shift) * wgt */
   1193                 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b);
   1194                 res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b);
   1195                 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b);
   1196                 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b);
   1197 
   1198                 /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
   1199                 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b);
   1200                 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b);
   1201 
   1202                 /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
   1203                 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b);
   1204                 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b);
   1205 
   1206                 /* (i4_tmp >> shift) */
   1207                 res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b,  shift);
   1208                 res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b,  shift);
   1209 
   1210                 res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b);
   1211 
   1212                 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
   1213                 res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b);
   1214 
   1215                 dst0 = _mm_cvtsi128_si32(res_temp1_4x32b);
   1216 
    1217                 /* dst row = 1 */
   1218                 res_temp2_4x32b = _mm_shuffle_epi32(res_temp1_4x32b, 1);
   1219 
   1220                 /* store four 8-bit output values  */
   1221                 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
   1222 
   1223                 dst1 = _mm_cvtsi128_si32(res_temp2_4x32b);
   1224 
   1225                 /* row = 1 */
   1226                 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
   1227 
   1228                 pi2_src1 += 4;  /* Pointer update */
   1229                 pi2_src2 += 4;  /* Pointer update */
   1230                 pu1_dst  += 4;  /* Pointer update */
   1231 
    1232             } /* inner loop ends here (4 output values per row, 2 rows per iteration) */
   1233 
   1234             pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;    /* Pointer update */
   1235             pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;    /* Pointer update */
   1236             pu1_dst  = pu1_dst  - wdx2 + 2 * dst_strd;   /* Pointer update */
   1237         }
   1238     }
   1239 
   1240 }
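
         /* A minimal scalar sketch of the computation above (an illustrative
            addition, not part of the original library; the helper name
            chroma_bi_wgt_ref is hypothetical). It spells out, per interleaved
            CbCr sample, the value the SIMD paths produce. Kept out of the
            build on purpose. */
         #if 0
         static void chroma_bi_wgt_ref(WORD16 *pi2_src1, WORD16 *pi2_src2,
                                       UWORD8 *pu1_dst, WORD32 src_strd1,
                                       WORD32 src_strd2, WORD32 dst_strd,
                                       WORD32 wgt0_cb, WORD32 wgt0_cr,
                                       WORD32 off0_cb, WORD32 off0_cr,
                                       WORD32 wgt1_cb, WORD32 wgt1_cr,
                                       WORD32 off1_cb, WORD32 off1_cr,
                                       WORD32 shift, WORD32 lvl_shift1,
                                       WORD32 lvl_shift2, WORD32 ht, WORD32 wd)
         {
             WORD32 row, col;
             for(row = 0; row < ht; row++)
             {
                 for(col = 0; col < 2 * wd; col++)
                 {
                     WORD32 is_cb = ((col & 1) == 0); /* even lane: Cb, odd lane: Cr */
                     WORD32 wgt0  = is_cb ? wgt0_cb : wgt0_cr;
                     WORD32 wgt1  = is_cb ? wgt1_cb : wgt1_cr;
                     WORD32 rnd   = ((is_cb ? (off0_cb + off1_cb) : (off0_cr + off1_cr)) + 1)
                                     << (shift - 1);
                     WORD32 i4_tmp = (pi2_src1[row * src_strd1 + col] + lvl_shift1) * wgt0
                                   + (pi2_src2[row * src_strd2 + col] + lvl_shift2) * wgt1
                                   + rnd;
                     i4_tmp >>= shift;
                     pu1_dst[row * dst_strd + col] =
                         (UWORD8)(i4_tmp < 0 ? 0 : (i4_tmp > 255 ? 255 : i4_tmp));
                 }
             }
         }
         #endif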
   1241 
   1242 /**
   1243 *******************************************************************************
   1244 *
   1245 * @brief
   1246 *  Does default bi-weighted prediction on the arrays pointed by pi2_src1 and
    1247 * pi2_src2 and stores the result at the location pointed by pu1_dst
   1248 *
   1249 * @par Description:
    1250 *  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
    1251 * >> shift, where shift = 15 - BitDepth
   1252 *
   1253 * @param[in] pi2_src1
   1254 *  Pointer to source 1
   1255 *
   1256 * @param[in] pi2_src2
   1257 *  Pointer to source 2
   1258 *
   1259 * @param[out] pu1_dst
   1260 *  Pointer to destination
   1261 *
   1262 * @param[in] src_strd1
   1263 *  Source stride 1
   1264 *
   1265 * @param[in] src_strd2
   1266 *  Source stride 2
   1267 *
   1268 * @param[in] dst_strd
   1269 *  Destination stride
   1270 *
   1271 * @param[in] lvl_shift1
   1272 *  added before shift and offset
   1273 *
   1274 * @param[in] lvl_shift2
   1275 *  added before shift and offset
   1276 *
   1277 * @param[in] ht
   1278 *  height of the source
   1279 *
   1280 * @param[in] wd
   1281 *  width of the source
   1282 *
   1283 * @returns
   1284 *
   1285 * @remarks
   1286 *  None
   1287 *
    1288 * Assumption : ht%2 == 0, wd%4 == 0
    1289 * shift == 7; (lvl_shift1 + lvl_shift2) can take values in {0, 8K, 16K}. In those
    1290 * cases the final result matches even though the intermediate precision is 16 bit.
   1291 *
   1292 *******************************************************************************
   1293 */
   1294 void ihevc_weighted_pred_bi_default_ssse3(WORD16 *pi2_src1,
   1295                                           WORD16 *pi2_src2,
   1296                                           UWORD8 *pu1_dst,
   1297                                           WORD32 src_strd1,
   1298                                           WORD32 src_strd2,
   1299                                           WORD32 dst_strd,
   1300                                           WORD32 lvl_shift1,
   1301                                           WORD32 lvl_shift2,
   1302                                           WORD32 ht,
   1303                                           WORD32 wd)
   1304 {
   1305     {
   1306         WORD32 row, col, temp;
   1307         WORD32 shift;
   1308 
   1309         __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
   1310         __m128i const_temp_8x16b, lvl_shift1_8x16b, lvl_shift2_8x16b;
   1311         __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
   1312 
    1313         ASSERT(wd % 4 == 0); /* checking assumption */
    1314         ASSERT(ht % 2 == 0); /* checking assumption */
   1315 
   1316         shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
   1317         temp = 1 << (shift - 1);
   1318 
    1319         // setting values in registers
   1320         lvl_shift1_8x16b = _mm_set1_epi16(lvl_shift1);
   1321         lvl_shift2_8x16b = _mm_set1_epi16(lvl_shift2);
   1322         const_temp_8x16b = _mm_set1_epi16(temp);
   1323 
   1324         lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, lvl_shift2_8x16b);
   1325         lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, const_temp_8x16b);
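                 /* lvl_shift1_8x16b now holds (lvl_shift1 + lvl_shift2 +
                    (1 << (shift - 1))) in every 16-bit lane, so one saturating
                    add per vector applies all three terms at once. */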
   1326 
   1327         if(0 == (ht & 3)) /* ht multiple of 4*/
   1328         {
   1329             if(0 == (wd & 15)) /* wd multiple of 16 case */
   1330             {
   1331                 __m128i src_temp9_8x16b,  src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
   1332                 __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b;
   1333                 /*  outer for loop starts from here */
   1334                 for(row = 0; row < ht; row += 4)
   1335                 {
   1336                     for(col = 0; col < wd; col += 16)
   1337                     {
   1338                         /*load 8 pixel values */ /* First 8 Values */
   1339                         src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
   1340                         src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
   1341                         /* row = 1 */
   1342                         src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
   1343                         src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
   1344                         /* row = 2 */
   1345                         src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
   1346                         src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
   1347                         /* row = 3 */
   1348                         src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
   1349                         src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
   1350 
   1351                         /*load 8 pixel values */ /* Second 8 Values */
   1352                         src_temp9_8x16b  = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
   1353                         src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
   1354                         /* row = 1 */
   1355                         src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
   1356                         src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
   1357                         /* row = 2 */
   1358                         src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8));
   1359                         src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8));
   1360 
   1361                         /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
   1362                         src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
   1363                         src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
   1364                         src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
   1365                         src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
   1366 
   1367                         /*load 8 pixel values */ /* Second 8 Values */
   1368                         /* row = 3 */
   1369                         src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8));
   1370                         src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8));
   1371 
   1372                         /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
   1373                         src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
   1374                         src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
   1375                         src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
   1376                         src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
   1377 
   1378                         /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
   1379                         src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b,  src_temp10_8x16b);
   1380                         src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
   1381                         src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b);
   1382                         src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b);
   1383 
   1384                         /* (i4_tmp >> shift) */ /* First 8 Values */
   1385                         src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
   1386                         src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
   1387                         src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
   1388                         src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);
   1389 
   1390                         /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
   1391                         src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
   1392                         src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
   1393                         src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b);
   1394                         src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b);
   1395 
   1396                         /* (i4_tmp >> shift) */ /* Second 8 Values */
   1397                         src_temp9_8x16b  = _mm_srai_epi16(src_temp9_8x16b,  shift);
   1398                         src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  shift);
   1399                         src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  shift);
   1400                         src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b,  shift);
   1401 
    1402                         /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* all 16 values */
   1403                         src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp9_8x16b);
   1404                         src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp11_8x16b);
   1405                         src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp13_8x16b);
   1406                         src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp15_8x16b);
   1407 
    1408                         /* store sixteen 8-bit output values  */
    1409                         _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
    1410                         _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
    1411                         _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
    1412                         _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
   1413 
   1414                         /* To update pointer */
   1415                         pi2_src1 += 16;
   1416                         pi2_src2 += 16;
   1417                         pu1_dst  += 16;
   1418 
    1419                     } /* inner loop ends here (16 output values per row, 4 rows per iteration) */
   1420 
   1421                     pi2_src1 = pi2_src1 - wd + 4 * src_strd1;  /* Pointer update */
   1422                     pi2_src2 = pi2_src2 - wd + 4 * src_strd2;  /* Pointer update */
   1423                     pu1_dst  = pu1_dst - wd + 4 * dst_strd;   /* Pointer update */
   1424 
   1425                 }
   1426             }
   1427             else if(0 == (wd & 7)) /* multiple of 8 case */
   1428             {
   1429                 /*  outer for loop starts from here */
   1430                 for(row = 0; row < ht; row += 4)
   1431                 {
   1432                     for(col = 0; col < wd; col += 8)
   1433                     {
   1434                         /*load 8 pixel values */
   1435                         src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
   1436                         src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
   1437                         /* row = 1 */
   1438                         src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
   1439                         src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
   1440                         /* row = 2 */
   1441                         src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
   1442                         src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
   1443                         /* row = 3 */
   1444                         src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
   1445                         src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
   1446 
   1447                         /* (pi2_src1[col] + pi2_src2[col]) */
   1448                         src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
   1449                         src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
   1450                         src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
   1451                         src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
   1452 
   1453                         /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
   1454                         src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
   1455                         src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
   1456                         src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
   1457                         src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
   1458 
   1459                         /* (i4_tmp >> shift) */
   1460                         src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
   1461                         src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
   1462                         src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
   1463                         src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);
   1464 
   1465                         /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
   1466                         src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
   1467                         src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
   1468                         src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
   1469                         src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);
   1470 
    1471                         /* store eight 8-bit output values  */
    1472                         _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
    1473                         _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
    1474                         _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
    1475                         _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
   1476 
   1477                         /* To update pointer */
   1478                         pi2_src1 += 8;
   1479                         pi2_src2 += 8;
   1480                         pu1_dst  += 8;
   1481 
    1482                     } /* inner loop ends here (8 output values per row, 4 rows per iteration) */
   1483 
   1484                     pi2_src1 = pi2_src1 - wd + 4 * src_strd1;  /* Pointer update */
   1485                     pi2_src2 = pi2_src2 - wd + 4 * src_strd2;  /* Pointer update */
   1486                     pu1_dst  = pu1_dst - wd + 4 * dst_strd;   /* Pointer update */
   1487 
   1488                 }
   1489             }
   1490             else /* wd multiple of 4 case*/
   1491             {
   1492                 WORD32 dst0, dst1, dst2, dst3;
   1493 
   1494                 /*  outer for loop starts from here */
   1495                 for(row = 0; row < ht; row += 4)
   1496                 {
   1497                     for(col = 0; col < wd; col += 4)
   1498                     {
    1499                         /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
    1500                         src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
    1501                         /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
    1502                         src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
   1503 
   1504                         /* row = 1 */
   1505                         src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
   1506                         src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
   1507                         /* row = 2 */
   1508                         src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1));
   1509                         src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2));
   1510                         /* row = 3 */
   1511                         src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1));
   1512                         src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2));
   1513 
   1514                         /* Pack two rows together */
   1515                         src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
   1516                         src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
   1517                         src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b);
   1518                         src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b);
   1519 
   1520                         /* (pi2_src1[col] + pi2_src2[col]) */
   1521                         src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
   1522                         src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
   1523 
   1524                         /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
   1525                         src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
   1526                         src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
   1527 
   1528                         /* (i4_tmp >> shift) */
   1529                         src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
   1530                         src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
   1531 
   1532                         /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
   1533                         src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
   1534                         src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
   1535 
   1536                         dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
   1537                         /* dst row = 1 to 3 */
   1538                         src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
   1539                         src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1);
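                                 /* _mm_shuffle_epi32(x, 1) moves dword 1 (the four
                                    packed bytes of row 1, resp. row 3) into lane 0
                                    so _mm_cvtsi128_si32 can extract it below. */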
   1540 
   1541                         /* store four 8-bit output values  */
   1542                         *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
   1543 
   1544                         dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
   1545                         dst2 = _mm_cvtsi128_si32(src_temp5_8x16b);
   1546                         dst3 = _mm_cvtsi128_si32(src_temp4_8x16b);
   1547 
   1548                         /* row = 1 to row = 3 */
   1549                         *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
   1550                         *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
   1551                         *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;
   1552 
   1553                         /* To update pointer */
   1554                         pi2_src1 += 4;
   1555                         pi2_src2 += 4;
   1556                         pu1_dst  += 4;
   1557 
    1558                     } /* inner loop ends here (4 output values per row, 4 rows per iteration) */
   1559 
   1560                     pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */
   1561                     pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */
   1562                     pu1_dst  = pu1_dst  - wd + 4 * dst_strd;  /* Pointer update */
   1563 
   1564                 }
   1565             }
   1566         }
   1567         else /* ht multiple of 2 case and wd multiple of 4 case*/
   1568         {
   1569 
   1570             WORD32 dst0, dst1;
   1571 
   1572             /*  outer for loop starts from here */
   1573             for(row = 0; row < ht; row += 2)
   1574             {
   1575                 for(col = 0; col < wd; col += 4)
   1576                 {
    1577                     /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
    1578                     src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
    1579                     /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
    1580                     src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
   1581 
   1582                     /* row = 1 */
   1583                     src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
   1584                     src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
   1585 
   1586                     /* Pack two rows together */
   1587                     src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
   1588                     src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
   1589 
   1590                     /* (pi2_src1[col] + pi2_src2[col]) */
   1591                     src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
   1592 
   1593                     /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
   1594                     src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
   1595 
   1596                     /* (i4_tmp >> shift) */
   1597                     src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
   1598 
   1599                     /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
   1600                     src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
   1601 
   1602                     dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
    1603                     /* dst row = 1 */
   1604                     src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
   1605 
   1606                     /* store four 8-bit output values  */
   1607                     *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
   1608 
   1609                     dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
   1610 
    1611                     /* row = 1 */
   1612                     *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
   1613 
   1614                     /* To update pointer */
   1615                     pi2_src1 += 4;
   1616                     pi2_src2 += 4;
   1617                     pu1_dst  += 4;
   1618 
    1619                 } /* inner loop ends here (4 output values per row, 2 rows per iteration) */
   1620 
   1621                 pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */
   1622                 pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */
   1623                 pu1_dst  = pu1_dst  - wd + 2 * dst_strd;  /* Pointer update */
   1624 
   1625             }
   1626 
   1627         }
   1628 
   1629     }
   1630 }
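
         /* A minimal scalar sketch of the computation above (an illustrative
            addition, not part of the original library; the helper name
            bi_default_ref is hypothetical). It shows the per-pixel value the
            three SIMD paths produce. Kept out of the build on purpose. */
         #if 0
         static void bi_default_ref(WORD16 *pi2_src1, WORD16 *pi2_src2,
                                    UWORD8 *pu1_dst, WORD32 src_strd1,
                                    WORD32 src_strd2, WORD32 dst_strd,
                                    WORD32 lvl_shift1, WORD32 lvl_shift2,
                                    WORD32 ht, WORD32 wd)
         {
             WORD32 row, col;
             WORD32 shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
             for(row = 0; row < ht; row++)
             {
                 for(col = 0; col < wd; col++)
                 {
                     WORD32 i4_tmp = pi2_src1[row * src_strd1 + col] + lvl_shift1
                                   + pi2_src2[row * src_strd2 + col] + lvl_shift2
                                   + (1 << (shift - 1));
                     i4_tmp >>= shift;
                     pu1_dst[row * dst_strd + col] =
                         (UWORD8)(i4_tmp < 0 ? 0 : (i4_tmp > 255 ? 255 : i4_tmp));
                 }
             }
         }
         #endif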
   1631 
   1632 
   1633 /**
   1634 *******************************************************************************
   1635 *
   1636 * @brief
   1637 *  Does chroma default bi-weighted prediction on arrays pointed by pi2_src1 and
    1638 * pi2_src2 and stores the result at the location pointed by pu1_dst
   1639 *
   1640 * @par Description:
    1641 *  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
    1642 * >> shift, where shift = 15 - BitDepth
   1643 *
   1644 * @param[in] pi2_src1
   1645 *  Pointer to source 1
   1646 *
   1647 * @param[in] pi2_src2
   1648 *  Pointer to source 2
   1649 *
   1650 * @param[out] pu1_dst
   1651 *  Pointer to destination
   1652 *
   1653 * @param[in] src_strd1
   1654 *  Source stride 1
   1655 *
   1656 * @param[in] src_strd2
   1657 *  Source stride 2
   1658 *
   1659 * @param[in] dst_strd
   1660 *  Destination stride
   1661 *
   1662 * @param[in] lvl_shift1
   1663 *  added before shift and offset
   1664 *
   1665 * @param[in] lvl_shift2
   1666 *  added before shift and offset
   1667 *
   1668 * @param[in] ht
   1669 *  height of the source
   1670 *
   1671 * @param[in] wd
   1672 *  width of the source (each colour component)
   1673 *
   1674 * @returns
   1675 *
   1676 * @remarks
   1677 *  None
   1678 *
    1679 * Assumption : ht%2 == 0, wd%2 == 0, lvl_shift1 == 0, lvl_shift2 == 0,
    1680 * shift == 7. With lvl_shift1 + lvl_shift2 == 0 the final result matches
    1681 * even though the intermediate precision is 16 bit.
   1682 *******************************************************************************
   1683 */
   1684 
   1685 void ihevc_weighted_pred_chroma_bi_default_ssse3(WORD16 *pi2_src1,
   1686                                                  WORD16 *pi2_src2,
   1687                                                  UWORD8 *pu1_dst,
   1688                                                  WORD32 src_strd1,
   1689                                                  WORD32 src_strd2,
   1690                                                  WORD32 dst_strd,
   1691                                                  WORD32 lvl_shift1,
   1692                                                  WORD32 lvl_shift2,
   1693                                                  WORD32 ht,
   1694                                                  WORD32 wd)
   1695 {
   1696     WORD32 row, col, temp;
   1697     WORD32 shift, wdx2;
   1698 
   1699     __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
   1700     __m128i lvl_shift1_8x16b;
   1701     __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
   1702 
    1703     ASSERT(wd % 2 == 0); /* checking assumption */
    1704     ASSERT(ht % 2 == 0); /* checking assumption */
   1705     UNUSED(lvl_shift1);
   1706     UNUSED(lvl_shift2);
   1707     shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
   1708     temp = 1 << (shift - 1);
   1709     wdx2 = wd * 2;
   1710 
    1711     // setting values in registers
   1712     lvl_shift1_8x16b = _mm_set1_epi16(temp);
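             /* lvl_shift1 and lvl_shift2 are assumed to be zero here (they are
                marked UNUSED above), so only the rounding term 1 << (shift - 1)
                has to be pre-loaded into the vector. */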
   1713 
   1714     if(0 == (ht & 3)) /* ht multiple of 4 case */
   1715     {
   1716         if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
   1717         {
   1718             __m128i src_temp9_8x16b,  src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
   1719             __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b;
   1720             /*  outer for loop starts from here */
   1721             for(row = 0; row < ht; row += 4)
   1722             {
   1723                 for(col = 0; col < wdx2; col += 16)
   1724                 {
   1725                     /*load 8 pixel values */ /* First 8 Values */
   1726                     src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
   1727                     src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
   1728                     /* row = 1 */
   1729                     src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
   1730                     src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
   1731                     /* row = 2 */
   1732                     src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
   1733                     src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
   1734                     /* row = 3 */
   1735                     src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
   1736                     src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
   1737 
   1738                     /*load 8 pixel values */ /* Second 8 Values */
   1739                     src_temp9_8x16b  = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
   1740                     src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
   1741                     /* row = 1 */
   1742                     src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
   1743                     src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
   1744                     /* row = 2 */
   1745                     src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8));
   1746                     src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8));
   1747 
   1748                     /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
   1749                     src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
   1750                     src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
   1751                     src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
   1752                     src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
   1753 
   1754                     /*load 8 pixel values */ /* Second 8 Values */
   1755                     /* row = 3 */
   1756                     src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8));
   1757                     src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8));
   1758 
   1759                     /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
   1760                     src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
   1761                     src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
   1762                     src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
   1763                     src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
   1764 
   1765                     /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
   1766                     src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b,  src_temp10_8x16b);
   1767                     src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
   1768                     src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b);
   1769                     src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b);
   1770 
   1771                     /* (i4_tmp >> shift) */ /* First 8 Values */
   1772                     src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
   1773                     src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
   1774                     src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
   1775                     src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);
   1776 
   1777                     /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
   1778                     src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
   1779                     src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
   1780                     src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b);
   1781                     src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b);
   1782 
   1783                     /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* First 8 Values */
   1784                     src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
   1785                     src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
   1786                     src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
   1787                     src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);
   1788 
   1789                     /* (i4_tmp >> shift) */ /* Second 8 Values */
   1790                     src_temp9_8x16b  = _mm_srai_epi16(src_temp9_8x16b,  shift);
   1791                     src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  shift);
   1792                     src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  shift);
   1793                     src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b,  shift);
   1794 
    1795                     /* store eight 8-bit output values  */ /* First 8 Values */
    1796                     _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
    1797                     _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
    1798                     _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
    1799                     _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
   1800 
   1801                     /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* Second 8 Values */
   1802                     src_temp9_8x16b  = _mm_packus_epi16(src_temp9_8x16b, src_temp9_8x16b);
   1803                     src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp11_8x16b);
   1804                     src_temp13_8x16b = _mm_packus_epi16(src_temp13_8x16b, src_temp13_8x16b);
   1805                     src_temp15_8x16b = _mm_packus_epi16(src_temp15_8x16b, src_temp15_8x16b);
   1806 
    1807                     /* store eight 8-bit output values  */ /* Second 8 Values */
    1808                     _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd + 8), src_temp9_8x16b); /* row = 0*/
    1809                     _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd + 8), src_temp11_8x16b); /* row = 1*/
    1810                     _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd + 8), src_temp13_8x16b); /* row = 2*/
    1811                     _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd + 8), src_temp15_8x16b); /* row = 3*/
   1812 
   1813                     /* To update pointer */
   1814                     pi2_src1 += 16;
   1815                     pi2_src2 += 16;
   1816                     pu1_dst  += 16;
   1817 
    1818                 } /* inner loop ends here (16 output values per row, 4 rows per iteration) */
   1819 
   1820                 pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1;    /* Pointer update */
   1821                 pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2;    /* Pointer update */
   1822                 pu1_dst  = pu1_dst - wdx2 + 4 * dst_strd; /* Pointer update */
   1823 
   1824             }
   1825         }
   1826         else if(0 == (wdx2 & 7)) /* multiple of 8 case */
   1827         {
   1828             /*  outer for loop starts from here */
   1829             for(row = 0; row < ht; row += 4)
   1830             {
   1831                 for(col = 0; col < wdx2; col += 8)
   1832                 {
   1833                     /*load 8 pixel values */
   1834                     src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
   1835                     src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
   1836                     /* row = 1 */
   1837                     src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
   1838                     src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
   1839                     /* row = 2 */
   1840                     src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
   1841                     src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
   1842                     /* row = 3 */
   1843                     src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
   1844                     src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
   1845 
   1846                     /* (pi2_src1[col] + pi2_src2[col]) */
   1847                     src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
   1848                     src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
   1849                     src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
   1850                     src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
   1851 
   1852                     /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
   1853                     src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
   1854                     src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
   1855                     src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
   1856                     src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
   1857 
   1858                     /* (i4_tmp >> shift) */
   1859                     src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
   1860                     src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
   1861                     src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
   1862                     src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);
   1863 
   1864                     /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
   1865                     src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
   1866                     src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
   1867                     src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
   1868                     src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);
   1869 
    1870                     /* store eight 8-bit output values  */
    1871                     _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
    1872                     _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
    1873                     _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
    1874                     _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
   1875 
   1876                     /* To update pointer */
   1877                     pi2_src1 += 8;
   1878                     pi2_src2 += 8;
   1879                     pu1_dst  += 8;
   1880 
    1881                 } /* inner loop ends here (8 output values per row, 4 rows per iteration) */
   1882 
   1883                 pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1;    /* Pointer update */
   1884                 pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2;    /* Pointer update */
   1885                 pu1_dst  = pu1_dst - wdx2 + 4 * dst_strd; /* Pointer update */
   1886 
   1887             }
   1888         }
   1889         else /* 2*wd multiple of 4 case */
   1890         {
   1891             WORD32 dst0, dst1, dst2, dst3;
   1892             /*  outer for loop starts from here */
   1893             for(row = 0; row < ht; row += 4)
   1894             {
   1895                 for(col = 0; col < wdx2; col += 4)
   1896                 {
    1897                     /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
    1898                     src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
    1899                     /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
    1900                     src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
   1901 
   1902                     /* row = 1 */
   1903                     src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
   1904                     src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
   1905                     /* row = 2 */
   1906                     src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1));
   1907                     src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2));
   1908                     /* row = 3 */
   1909                     src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1));
   1910                     src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2));
   1911 
   1912                     /* Pack two rows together */
   1913                     src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
   1914                     src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
   1915                     src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b);
   1916                     src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b);
   1917 
   1918                     /* (pi2_src1[col] + pi2_src2[col]) */
   1919                     src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
   1920                     src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
   1921 
   1922                     /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
   1923                     src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
   1924                     src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
   1925 
   1926                     /* (i4_tmp >> shift) */
   1927                     src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
   1928                     src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
   1929 
   1930                     /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
   1931                     src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
   1932                     src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
   1933 
   1934                     dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
   1935                     /* dst row = 1 to 3 */
   1936                     src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
   1937                     src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1);
   1938 
   1939                     /* store four 8-bit output values  */
   1940                     *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
   1941 
   1942                     dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
   1943                     dst2 = _mm_cvtsi128_si32(src_temp5_8x16b);
   1944                     dst3 = _mm_cvtsi128_si32(src_temp4_8x16b);
   1945 
   1946                     /* row = 1 to row = 3 */
   1947                     *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
   1948                     *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
   1949                     *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;
   1950 
   1951                     /* To update pointer */
   1952                     pi2_src1 += 4;
   1953                     pi2_src2 += 4;
   1954                     pu1_dst  += 4;
   1955 
    1956                 } /* inner loop ends here (4 output values per row, 4 rows per iteration) */
   1957 
   1958                 pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1;   /* Pointer update */
   1959                 pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2;   /* Pointer update */
   1960                 pu1_dst  = pu1_dst  - wdx2 + 4 * dst_strd;    /* Pointer update */
   1961 
   1962             }
   1963         }
   1964     }
   1965     else /* ht multiple of 2 case */
   1966     {
   1967         if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
   1968         {
   1969             __m128i src_temp9_8x16b,  src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row += 2)
            {
                for(col = 0; col < wdx2; col += 16)
                {
                    /*load 8 pixel values */ /* First 8 Values */
                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));

                    /*load 8 pixel values */ /* Second 8 Values */
                    src_temp9_8x16b  = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
                    src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
                    /* row = 1 */
                    src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
                    src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));

                    /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
                    src_temp1_8x16b  = _mm_adds_epi16(src_temp1_8x16b,  src_temp2_8x16b);
                    src_temp3_8x16b  = _mm_adds_epi16(src_temp3_8x16b,  src_temp4_8x16b);

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);

                    /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
                    src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b,  src_temp10_8x16b);
                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);

                    /* (i4_tmp >> shift) */ /* First 8 Values */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
                    src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* First 8 Values */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);

                    /* (i4_tmp >> shift) */ /* Second 8 Values */
                    src_temp9_8x16b  = _mm_srai_epi16(src_temp9_8x16b,  shift);
                    src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  shift);

                    /* store eight 8-bit output values  */ /* First 8 Values */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* Second 8 Values */
                    src_temp9_8x16b  = _mm_packus_epi16(src_temp9_8x16b, src_temp9_8x16b);
                    src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp11_8x16b);

                    /* store eight 8-bit output values  */ /* Second 8 Values */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd + 8), src_temp9_8x16b); /* row = 0*/
                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd + 8), src_temp11_8x16b); /* row = 1*/

                    /* To update pointer */
                    pi2_src1 += 16;
                    pi2_src2 += 16;
                    pu1_dst  += 16;

                } /* inner loop ends here (16 output values in single iteration) */

                pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;    /* Pointer update */
                pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;    /* Pointer update */
                pu1_dst  = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */

            }
        }
        else if(0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */
        {
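            /*
             * This path emits two rows of 8 output pixels per iteration: one
             * 128-bit load per source per row, the same add/round/shift/pack
             * sequence as above, then one 64-bit store per row.
             */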
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row += 2)
            {
                for(col = 0; col < wdx2; col += 8)
                {
                    /*load 8 pixel values */
                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));

                    /* (pi2_src1[col] + pi2_src2[col]) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);

                    /* (i4_tmp >> shift) */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);

                    /* store eight 8-bit output values  */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/

                    /* To update pointer */
                    pi2_src1 += 8;
                    pi2_src2 += 8;
                    pu1_dst  += 8;

                } /* inner loop ends here (8 output values in single iteration) */

                pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;    /* Pointer update */
                pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;    /* Pointer update */
                pu1_dst  = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */

            }
        }
        else /* 2*wd multiple of 4 case */
        {
            WORD32 dst0, dst1;
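            /*
             * Narrowest path: two 4-pixel rows are packed into a single
             * register with _mm_unpacklo_epi64 so that one add/round/shift/
             * pack sequence covers both rows; the packed result is then
             * written out as two 32-bit stores, one per row.
             */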
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row += 2)
            {
                for(col = 0; col < wdx2; col += 4)
                {
                    /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
                    src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
                    /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
                    src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));

                    /* Pack two rows together */
                    src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
                    src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);

                    /* (pi2_src1[col] + pi2_src2[col]) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);

                    /* (i4_tmp >> shift) */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);

                    dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
                    /* dst row = 1 */
                    src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);

                    /* store four 8-bit output values  */
                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                    dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
                    /* row = 1 */
                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;

                    /* To update pointer */
                    pi2_src1 += 4;
                    pi2_src2 += 4;
                    pu1_dst  += 4;
                } /* inner loop ends here (4 output values in single iteration) */

                pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;   /* Pointer update */
                pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;   /* Pointer update */
                pu1_dst  = pu1_dst  - wdx2 + 2 * dst_strd;    /* Pointer update */

            }
        }
    }
}