/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_weighted_pred_x86_intr.c
*
* @brief
*  Contains function definitions for weighted prediction used in inter
* prediction
*
* @author
*
*
* @par List of Functions:
*   - ihevc_weighted_pred_uni_sse42()
*   - ihevc_weighted_pred_bi_sse42()
*   - ihevc_weighted_pred_bi_default_sse42()
*   - ihevc_weighted_pred_chroma_uni_sse42()
*   - ihevc_weighted_pred_chroma_bi_sse42()
*
* @remarks
*  None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include <stdio.h>
#include <assert.h>

#include "ihevc_debug.h"
#include "ihevc_typedefs.h"
#include "ihevc_macros.h"
#include "ihevc_platform_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_defs.h"
#include "ihevc_weighted_pred.h"
#include "ihevc_inter_pred.h"

#include <immintrin.h>

/**
*******************************************************************************
*
* @brief
*  Does uni-weighted prediction on the array pointed to by pi2_src and stores
* the result at the location pointed to by pu1_dst
*
* @par Description:
*  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
* off0
*
* @param[in] pi2_src
*  Pointer to the source
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  Weight to be multiplied with the source
*
* @param[in] off0
*  Offset to be added after rounding and shifting
*
* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  Added to the source before weighting, rounding and offset
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

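/* For reference, a minimal scalar sketch of the same computation, kept under
 * #if 0 like the alternate path further below. It assumes CLIP_U8() from
 * ihevc_macros.h; the SSE4.2 routine that follows vectorizes exactly this
 * loop, four rows at a time. */
#if 0
static void ihevc_weighted_pred_uni_scalar_sketch(WORD16 *pi2_src,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 src_strd,
                                                  WORD32 dst_strd,
                                                  WORD32 wgt0,
                                                  WORD32 off0,
                                                  WORD32 shift,
                                                  WORD32 lvl_shift,
                                                  WORD32 ht,
                                                  WORD32 wd)
{
    WORD32 row, col;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            WORD32 i4_tmp = (pi2_src[col] + lvl_shift) * wgt0;
            i4_tmp += 1 << (shift - 1);        /* rounding term */
            i4_tmp = (i4_tmp >> shift) + off0; /* scale, then offset */
            pu1_dst[col] = CLIP_U8(i4_tmp);    /* clip to [0, 255] */
        }
        pi2_src += src_strd;
        pu1_dst += dst_strd;
    }
}
#endif
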
void ihevc_weighted_pred_uni_sse42(WORD16 *pi2_src,
                                   UWORD8 *pu1_dst,
                                   WORD32 src_strd,
                                   WORD32 dst_strd,
                                   WORD32 wgt0,
                                   WORD32 off0,
                                   WORD32 shift,
                                   WORD32 lvl_shift,
                                   WORD32 ht,
                                   WORD32 wd)
{
    WORD32 row, col, temp;
    WORD32 dst0, dst1, dst2, dst3;

    /* all 128 bit registers are named with a suffix mxnb, where m is the */
    /* number of n bits packed in the register                            */
    __m128i src_temp0_4x32b, src_temp1_4x32b, src_temp2_4x32b, src_temp3_4x32b;
    __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_4x32b, off0_4x32b;

    ASSERT(wd % 4 == 0); /* checking assumption */
    ASSERT(ht % 4 == 0); /* checking assumption */

    temp = 1 << (shift - 1);
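    /* temp is the rounding term for the arithmetic right shift below; e.g.
     * for shift == 6 it is 32, so (x + 32) >> 6 rounds to nearest. */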

    // setting values in registers
    const_temp_4x32b = _mm_set1_epi32(temp);
    lvl_shift_4x32b = _mm_set1_epi32(lvl_shift);
    wgt0_4x32b = _mm_set1_epi32(wgt0);
    off0_4x32b = _mm_set1_epi32(off0);

    if(0 == (wd & 7)) /* wd multiple of 8 case */
    {
        __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;

        /*  outer for loop starts from here */
        for(row = 0; row < ht; row += 4)
        {
            for(col = 0; col < wd; col += 8)
            {   /* for rows 0, 1, 2, 3 */

                /* row = 0 */ /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
                src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
                /* row = 1 */
                src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
                /* row = 2 */
                src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd));
                /* row = 3 */
                src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd));

                /* row = 0 */ /* Last 4 pixels */
                src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4));
                /* row = 1 */
                src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4));
                /* row = 2 */
                src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd + 4));
                /* row = 3 */
                src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd + 4));

                /* considering pix. 3:0 by converting 16-bit to 32-bit */ /* First 4 pixels */
                src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
                src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
                src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);

                /* (pi2_src[col] + lvl_shift) */ /* First 4 pixels */
                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);

                /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */ /* First 4 pixels */
                src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
                src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
                src_temp2_4x32b  = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
                src_temp3_4x32b  = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);

                /* considering pix. 7:4 by converting 16-bit to 32-bit */ /* Last 4 pixels */
                src_temp4_4x32b  = _mm_cvtepi16_epi32(src_temp4_4x32b);
                src_temp5_4x32b  = _mm_cvtepi16_epi32(src_temp5_4x32b);
                src_temp6_4x32b  = _mm_cvtepi16_epi32(src_temp6_4x32b);
                src_temp7_4x32b  = _mm_cvtepi16_epi32(src_temp7_4x32b);

                /* (pi2_src[col] + lvl_shift) */ /* Last 4 pixels */
                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift_4x32b);
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift_4x32b);
                src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift_4x32b);

                /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */ /* Last 4 pixels */
                src_temp4_4x32b  = _mm_mullo_epi32(src_temp4_4x32b, wgt0_4x32b);
                src_temp5_4x32b  = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);
                src_temp6_4x32b  = _mm_mullo_epi32(src_temp6_4x32b, wgt0_4x32b);
                src_temp7_4x32b  = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);

                /* i4_tmp += 1 << (shift - 1) */ /* First 4 pixels */
                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);

                /* (i4_tmp >> shift) */ /* First 4 pixels */
                src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
                src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
                src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift);
                src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);

                /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, const_temp_4x32b);
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
                src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, const_temp_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);

                /* (i4_tmp >> shift) */ /* Last 4 pixels */
                src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b, shift);
                src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift);
                src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b, shift);
                src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift);

                /* i4_tmp = (i4_tmp >> shift) + off0; */ /* First 4 pixels */
                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);

                /* i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, off0_4x32b);
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, off0_4x32b);
                src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, off0_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, off0_4x32b);

                src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp4_4x32b);
                src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp5_4x32b);
                src_temp2_4x32b = _mm_packs_epi32(src_temp2_4x32b, src_temp6_4x32b);
                src_temp3_4x32b = _mm_packs_epi32(src_temp3_4x32b, src_temp7_4x32b);
                /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b);
                src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
                src_temp2_4x32b = _mm_packus_epi16(src_temp2_4x32b, src_temp2_4x32b);
                src_temp3_4x32b = _mm_packus_epi16(src_temp3_4x32b, src_temp3_4x32b);
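                /* _mm_packs_epi32 narrows 32->16 with signed saturation and
                 * _mm_packus_epi16 narrows 16->8 with unsigned saturation,
                 * which together realize the CLIP_U8() of the scalar code. */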

                /* store eight 8-bit output values  */
                _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0 */
                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1 */
                _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp2_4x32b); /* row = 2 */
                _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp3_4x32b); /* row = 3 */

                /* To update pointer */
                pi2_src += 8;
                pu1_dst += 8;

            } /* inner loop ends here (8 output values per row in a single iteration) */

            pi2_src = pi2_src - wd + 4 * src_strd; /* Pointer update */
            pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */

        }
    }
    else /* wd multiple of 4 case */
    {
        /*  outer for loop starts from here */
        for(row = 0; row < ht; row += 4)
        {
            for(col = 0; col < wd; col += 4)
            {   /* for rows 0, 1, 2, 3 */

                /* row = 0 */ /* load 8 pixel values from 7:0 pos. relative to cur. pos.; only 3:0 are used */
                src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
                /* row = 1 */
                src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
                /* row = 2 */
                src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd));
                /* row = 3 */
                src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd));

                /* considering pix. 3:0 by converting 16-bit to 32-bit */
                src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
                src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
                src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);

                /* (pi2_src[col] + lvl_shift) */
                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);

                /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
                src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
                src_temp2_4x32b  = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
                src_temp3_4x32b  = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);

                /* i4_tmp += 1 << (shift - 1) */
                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);

                /* (i4_tmp >> shift) */
                src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
                src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
                src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift);
                src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);

                /* i4_tmp = (i4_tmp >> shift) + off0; */
                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);

                src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp1_4x32b);
                src_temp2_4x32b = _mm_packs_epi32(src_temp2_4x32b, src_temp3_4x32b);

                /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp2_4x32b);

                dst0 = _mm_cvtsi128_si32(src_temp0_4x32b);
                /* dst rows 1 to 3 */
                src_temp1_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 1);
                src_temp2_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 2);
                src_temp3_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 3);

                /* store four 8-bit output values  */
                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                dst1 = _mm_cvtsi128_si32(src_temp1_4x32b);
                dst2 = _mm_cvtsi128_si32(src_temp2_4x32b);
                dst3 = _mm_cvtsi128_si32(src_temp3_4x32b);

                /* rows 1 to 3 */
                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
                *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
                *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;

                /* To update pointer */
                pi2_src += 4;
                pu1_dst += 4;

            } /* inner loop ends here (4 output values per row in a single iteration) */

            pi2_src = pi2_src - wd + 4 * src_strd; /* Pointer update */
            pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */

        }
    }
}

/**
*******************************************************************************
*
* @brief
*  Does chroma uni-weighted prediction on the array pointed to by pi2_src and
* stores the result at the location pointed to by pu1_dst
*
* @par Description:
*  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
* off0
*
* @param[in] pi2_src
*  Pointer to the source
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0_cb
*  Weight to be multiplied with the Cb source
*
* @param[in] wgt0_cr
*  Weight to be multiplied with the Cr source
*
* @param[in] off0_cb
*  Offset for Cb, added after rounding and shifting
*
* @param[in] off0_cr
*  Offset for Cr, added after rounding and shifting
*
* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  Added to the source before weighting, rounding and offset
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source (each colour component)
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

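/* For reference, a minimal scalar sketch of the chroma computation, kept
 * disabled. The chroma plane is semi-planar with Cb and Cr samples
 * interleaved, so even columns use the Cb weight/offset and odd columns the
 * Cr ones; CLIP_U8() from ihevc_macros.h is assumed. */
#if 0
static void ihevc_weighted_pred_chroma_uni_scalar_sketch(WORD16 *pi2_src,
                                                         UWORD8 *pu1_dst,
                                                         WORD32 src_strd,
                                                         WORD32 dst_strd,
                                                         WORD32 wgt0_cb,
                                                         WORD32 wgt0_cr,
                                                         WORD32 off0_cb,
                                                         WORD32 off0_cr,
                                                         WORD32 shift,
                                                         WORD32 lvl_shift,
                                                         WORD32 ht,
                                                         WORD32 wd)
{
    WORD32 row, col;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < 2 * wd; col++) /* 2 * wd interleaved Cb/Cr samples per row */
        {
            WORD32 wgt = (col & 1) ? wgt0_cr : wgt0_cb;
            WORD32 off = (col & 1) ? off0_cr : off0_cb;
            WORD32 i4_tmp = (pi2_src[col] + lvl_shift) * wgt;
            i4_tmp += 1 << (shift - 1);
            i4_tmp = (i4_tmp >> shift) + off;
            pu1_dst[col] = CLIP_U8(i4_tmp);
        }
        pi2_src += src_strd;
        pu1_dst += dst_strd;
    }
}
#endif
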
void ihevc_weighted_pred_chroma_uni_sse42(WORD16 *pi2_src,
                                          UWORD8 *pu1_dst,
                                          WORD32 src_strd,
                                          WORD32 dst_strd,
                                          WORD32 wgt0_cb,
                                          WORD32 wgt0_cr,
                                          WORD32 off0_cb,
                                          WORD32 off0_cr,
                                          WORD32 shift,
                                          WORD32 lvl_shift,
                                          WORD32 ht,
                                          WORD32 wd)
{
    WORD32 row, col, temp, wdx2;
    /* all 128 bit registers are named with a suffix mxnb, where m is the */
    /* number of n bits packed in the register                            */

    __m128i src_temp0_4x32b, src_temp1_4x32b;
    __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_4x32b, off0_4x32b;

    ASSERT(wd % 2 == 0); /* checking assumption */
    ASSERT(ht % 2 == 0); /* checking assumption */

    temp = 1 << (shift - 1);
    wdx2 = 2 * wd;

    // setting values in registers
    const_temp_4x32b = _mm_set1_epi32(temp);
    lvl_shift_4x32b = _mm_set1_epi32(lvl_shift);
    wgt0_4x32b = _mm_set_epi32(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
    off0_4x32b = _mm_set_epi32(off0_cr, off0_cb, off0_cr, off0_cb);
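    /* Note: the weights/offsets above are laid out {cb, cr, cb, cr} from the
     * low 32-bit lane upwards (_mm_set_epi32 lists its arguments high lane
     * first), matching the Cb/Cr-interleaved semi-planar chroma samples. */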

#if 0 /* Enable this for the ht % 4 == 0 case; it degraded performance for smaller sizes while improving it for larger ones. */
    if( 0 == (ht & 3)) /* ht multiple of 4 case */
    {
        if( 0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
        {
            __m128i src_temp2_4x32b, src_temp3_4x32b;
            __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;
            __m128i src_temp8_4x32b, src_temp9_4x32b, src_temp10_4x32b, src_temp11_4x32b;
            __m128i src_temp12_4x32b, src_temp13_4x32b, src_temp14_4x32b, src_temp15_4x32b;
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row +=4)
            {
                for(col = 0; col < wdx2; col +=16)
                {
                    /* row = 0 */ /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
                    src_temp0_4x32b = _mm_loadu_si128((__m128i*)(pi2_src));
                    /* row = 1 */
                    src_temp1_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd));
                    /* row = 0 */ /* Second 4 pixels */
                    src_temp2_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+4));
                    /* row = 1 */
                    src_temp3_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+4));
                    /* row = 0 */ /* Third 4 pixels */
                    src_temp4_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+8));
                    /* row = 1 */
                    src_temp5_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+8));
                    /* row = 0 */ /* Last 4 pixels */
                    src_temp6_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+12));
                    /* row = 1 */
                    src_temp7_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+12));

                    /* considering pix. 3:0 by converting 16-bit to 32-bit */
                    src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                    src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, lvl_shift_4x32b);
                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp0_4x32b  = _mm_mullo_epi32 (src_temp0_4x32b, wgt0_4x32b);
                    src_temp1_4x32b  = _mm_mullo_epi32 (src_temp1_4x32b, wgt0_4x32b);

                    /* considering pix. 7:4 by converting 16-bit to 32-bit */ /* Second 4 pixels */
                    src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
                    src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, lvl_shift_4x32b);
                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp2_4x32b  = _mm_mullo_epi32 (src_temp2_4x32b, wgt0_4x32b);
                    src_temp3_4x32b  = _mm_mullo_epi32 (src_temp3_4x32b, wgt0_4x32b);

                    /* considering pix. 11:8 by converting 16-bit to 32-bit */ /* Third 4 pixels */
                    src_temp4_4x32b  = _mm_cvtepi16_epi32(src_temp4_4x32b);
                    src_temp5_4x32b  = _mm_cvtepi16_epi32(src_temp5_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, lvl_shift_4x32b);
                    src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp4_4x32b  = _mm_mullo_epi32 (src_temp4_4x32b, wgt0_4x32b);
                    src_temp5_4x32b  = _mm_mullo_epi32 (src_temp5_4x32b, wgt0_4x32b);

                    /* considering pix. 15:12 by converting 16-bit to 32-bit */ /* Last 4 pixels */
                    src_temp6_4x32b  = _mm_cvtepi16_epi32(src_temp6_4x32b);
                    src_temp7_4x32b  = _mm_cvtepi16_epi32(src_temp7_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, lvl_shift_4x32b);
                    src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp6_4x32b  = _mm_mullo_epi32 (src_temp6_4x32b, wgt0_4x32b);
                    src_temp7_4x32b  = _mm_mullo_epi32 (src_temp7_4x32b, wgt0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, const_temp_4x32b);
                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b,  shift);
                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Second 4 pixels */
                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, const_temp_4x32b);
                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b,  shift);
                    src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Third 4 pixels */
                    src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, const_temp_4x32b);
                    src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b,  shift);
                    src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
                    src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, const_temp_4x32b);
                    src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b,  shift);
                    src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b,  shift);

                    /* i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, off0_4x32b);
                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, off0_4x32b);
                    /* i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */
                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, off0_4x32b);
                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, off0_4x32b);
                    /* i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */
                    src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, off0_4x32b);
                    src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, off0_4x32b);
                    /* i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
                    src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, off0_4x32b);
                    src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, off0_4x32b);

                    src_temp0_4x32b = _mm_packs_epi32 (src_temp0_4x32b, src_temp2_4x32b);
                    src_temp1_4x32b = _mm_packs_epi32 (src_temp1_4x32b, src_temp3_4x32b);
                    src_temp4_4x32b = _mm_packs_epi32 (src_temp4_4x32b, src_temp6_4x32b);
                    src_temp5_4x32b = _mm_packs_epi32 (src_temp5_4x32b, src_temp7_4x32b);
                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    src_temp0_4x32b = _mm_packus_epi16 (src_temp0_4x32b, src_temp4_4x32b);
                    src_temp1_4x32b = _mm_packus_epi16 (src_temp1_4x32b, src_temp5_4x32b);

                    /* store 16 8-bit output values  */
                    _mm_storeu_si128((__m128i*)(pu1_dst+0*dst_strd), src_temp0_4x32b); /* row = 0 */
                    _mm_storeu_si128((__m128i*)(pu1_dst+1*dst_strd), src_temp1_4x32b); /* row = 1 */

                    /* row = 2 */ /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
                    src_temp8_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd));
                    /* row = 3 */
                    src_temp9_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd));
                    /* row = 2 */ /* Second 4 pixels */
                    src_temp10_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+4));
                    /* row = 3 */
                    src_temp11_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+4));
                    /* row = 2 */ /* Third 4 pixels */
                    src_temp12_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+8));
                    /* row = 3 */
                    src_temp13_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+8));
                    /* row = 2 */ /* Last 4 pixels */
                    src_temp14_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+12));
                    /* row = 3 */
                    src_temp15_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+12));

                    /* considering pix. 3:0 by converting 16-bit to 32-bit */
                    src_temp8_4x32b  = _mm_cvtepi16_epi32(src_temp8_4x32b);
                    src_temp9_4x32b  = _mm_cvtepi16_epi32(src_temp9_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp8_4x32b = _mm_add_epi32 (src_temp8_4x32b, lvl_shift_4x32b);
                    src_temp9_4x32b = _mm_add_epi32 (src_temp9_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp8_4x32b  = _mm_mullo_epi32 (src_temp8_4x32b, wgt0_4x32b);
                    src_temp9_4x32b  = _mm_mullo_epi32 (src_temp9_4x32b, wgt0_4x32b);

                    /* considering pix. 7:4 by converting 16-bit to 32-bit */ /* Second 4 pixels */
                    src_temp10_4x32b  = _mm_cvtepi16_epi32(src_temp10_4x32b);
                    src_temp11_4x32b  = _mm_cvtepi16_epi32(src_temp11_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp10_4x32b = _mm_add_epi32 (src_temp10_4x32b, lvl_shift_4x32b);
                    src_temp11_4x32b = _mm_add_epi32 (src_temp11_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp10_4x32b  = _mm_mullo_epi32 (src_temp10_4x32b, wgt0_4x32b);
                    src_temp11_4x32b  = _mm_mullo_epi32 (src_temp11_4x32b, wgt0_4x32b);

                    /* considering pix. 11:8 by converting 16-bit to 32-bit */ /* Third 4 pixels */
                    src_temp12_4x32b  = _mm_cvtepi16_epi32(src_temp12_4x32b);
                    src_temp13_4x32b  = _mm_cvtepi16_epi32(src_temp13_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp12_4x32b = _mm_add_epi32 (src_temp12_4x32b, lvl_shift_4x32b);
                    src_temp13_4x32b = _mm_add_epi32 (src_temp13_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp12_4x32b  = _mm_mullo_epi32 (src_temp12_4x32b, wgt0_4x32b);
                    src_temp13_4x32b  = _mm_mullo_epi32 (src_temp13_4x32b, wgt0_4x32b);

                    /* considering pix. 15:12 by converting 16-bit to 32-bit */ /* Last 4 pixels */
                    src_temp14_4x32b  = _mm_cvtepi16_epi32(src_temp14_4x32b);
                    src_temp15_4x32b  = _mm_cvtepi16_epi32(src_temp15_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp14_4x32b = _mm_add_epi32 (src_temp14_4x32b, lvl_shift_4x32b);
                    src_temp15_4x32b = _mm_add_epi32 (src_temp15_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp14_4x32b  = _mm_mullo_epi32 (src_temp14_4x32b, wgt0_4x32b);
                    src_temp15_4x32b  = _mm_mullo_epi32 (src_temp15_4x32b, wgt0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp8_4x32b = _mm_add_epi32 (src_temp8_4x32b, const_temp_4x32b);
                    src_temp9_4x32b = _mm_add_epi32 (src_temp9_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp8_4x32b = _mm_srai_epi32(src_temp8_4x32b,  shift);
                    src_temp9_4x32b = _mm_srai_epi32(src_temp9_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Second 4 pixels */
                    src_temp10_4x32b = _mm_add_epi32 (src_temp10_4x32b, const_temp_4x32b);
                    src_temp11_4x32b = _mm_add_epi32 (src_temp11_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp10_4x32b = _mm_srai_epi32(src_temp10_4x32b,  shift);
                    src_temp11_4x32b = _mm_srai_epi32(src_temp11_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Third 4 pixels */
                    src_temp12_4x32b = _mm_add_epi32 (src_temp12_4x32b, const_temp_4x32b);
                    src_temp13_4x32b = _mm_add_epi32 (src_temp13_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp12_4x32b = _mm_srai_epi32(src_temp12_4x32b,  shift);
                    src_temp13_4x32b = _mm_srai_epi32(src_temp13_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
                    src_temp14_4x32b = _mm_add_epi32 (src_temp14_4x32b, const_temp_4x32b);
                    src_temp15_4x32b = _mm_add_epi32 (src_temp15_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp14_4x32b = _mm_srai_epi32(src_temp14_4x32b,  shift);
                    src_temp15_4x32b = _mm_srai_epi32(src_temp15_4x32b,  shift);

                    /* i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp8_4x32b = _mm_add_epi32 (src_temp8_4x32b, off0_4x32b);
                    src_temp9_4x32b = _mm_add_epi32 (src_temp9_4x32b, off0_4x32b);
                    /* i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */
                    src_temp10_4x32b = _mm_add_epi32 (src_temp10_4x32b, off0_4x32b);
                    src_temp11_4x32b = _mm_add_epi32 (src_temp11_4x32b, off0_4x32b);
                    /* i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */
                    src_temp12_4x32b = _mm_add_epi32 (src_temp12_4x32b, off0_4x32b);
                    src_temp13_4x32b = _mm_add_epi32 (src_temp13_4x32b, off0_4x32b);
                    /* i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
                    src_temp14_4x32b = _mm_add_epi32 (src_temp14_4x32b, off0_4x32b);
                    src_temp15_4x32b = _mm_add_epi32 (src_temp15_4x32b, off0_4x32b);

                    src_temp8_4x32b = _mm_packs_epi32 (src_temp8_4x32b, src_temp10_4x32b);
                    src_temp9_4x32b = _mm_packs_epi32 (src_temp9_4x32b, src_temp11_4x32b);
                    src_temp12_4x32b = _mm_packs_epi32 (src_temp12_4x32b, src_temp14_4x32b);
                    src_temp13_4x32b = _mm_packs_epi32 (src_temp13_4x32b, src_temp15_4x32b);
                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    src_temp8_4x32b = _mm_packus_epi16 (src_temp8_4x32b, src_temp12_4x32b);
                    src_temp9_4x32b = _mm_packus_epi16 (src_temp9_4x32b, src_temp13_4x32b);

                    /* store 16 8-bit output values  */
                    _mm_storeu_si128((__m128i*)(pu1_dst+2*dst_strd), src_temp8_4x32b); /* row = 2 */
                    _mm_storeu_si128((__m128i*)(pu1_dst+3*dst_strd), src_temp9_4x32b); /* row = 3 */

                    pi2_src += 16;  /* Pointer update */
                    pu1_dst += 16; /* Pointer update */

                } /* inner loop ends here (16 output values per row in a single iteration) */
                pi2_src = pi2_src - wdx2 + 4*src_strd;  /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 4*dst_strd; /* Pointer update */
            }
        }
        else if( 0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */
        {
            __m128i src_temp2_4x32b, src_temp3_4x32b;
            __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row +=4)
            {
                for(col = 0; col < wdx2; col +=8)
                {
                    /* row = 0 */ /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
                    src_temp0_4x32b = _mm_loadu_si128((__m128i*)(pi2_src));
                    /* row = 1 */
                    src_temp1_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd));
                    /* row = 2 */
                    src_temp2_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd));
                    /* row = 3 */
                    src_temp3_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd));

                    /* row = 0 */ /* Last 4 pixels */
                    src_temp4_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+4));
                    /* row = 1 */
                    src_temp5_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+4));
                    /* row = 2 */
                    src_temp6_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+4));
                    /* row = 3 */
                    src_temp7_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+4));

                    /* considering pix. 3:0 by converting 16-bit to 32-bit */ /* rows 0, 1 */
                    src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                    src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, lvl_shift_4x32b);
                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp0_4x32b  = _mm_mullo_epi32 (src_temp0_4x32b, wgt0_4x32b);
                    src_temp1_4x32b  = _mm_mullo_epi32 (src_temp1_4x32b, wgt0_4x32b);

                    /* considering pix. 3:0 by converting 16-bit to 32-bit */ /* rows 2, 3 */
                    src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
                    src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, lvl_shift_4x32b);
                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp2_4x32b  = _mm_mullo_epi32 (src_temp2_4x32b, wgt0_4x32b);
                    src_temp3_4x32b  = _mm_mullo_epi32 (src_temp3_4x32b, wgt0_4x32b);

                    /* considering pix. 7:4 by converting 16-bit to 32-bit */ /* rows 0, 1 */
                    src_temp4_4x32b  = _mm_cvtepi16_epi32(src_temp4_4x32b);
                    src_temp5_4x32b  = _mm_cvtepi16_epi32(src_temp5_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, lvl_shift_4x32b);
                    src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp4_4x32b  = _mm_mullo_epi32 (src_temp4_4x32b, wgt0_4x32b);
                    src_temp5_4x32b  = _mm_mullo_epi32 (src_temp5_4x32b, wgt0_4x32b);

                    /* considering pix. 7:4 by converting 16-bit to 32-bit */ /* rows 2, 3 */
                    src_temp6_4x32b  = _mm_cvtepi16_epi32(src_temp6_4x32b);
                    src_temp7_4x32b  = _mm_cvtepi16_epi32(src_temp7_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, lvl_shift_4x32b);
                    src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp6_4x32b  = _mm_mullo_epi32 (src_temp6_4x32b, wgt0_4x32b);
                    src_temp7_4x32b  = _mm_mullo_epi32 (src_temp7_4x32b, wgt0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, const_temp_4x32b);
                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b,  shift);
                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* rows 2, 3 */
                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, const_temp_4x32b);
                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b,  shift);
                    src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, const_temp_4x32b);
                    src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b,  shift);
                    src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, const_temp_4x32b);
                    src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b,  shift);
                    src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b,  shift);

                    /* i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, off0_4x32b);
                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, off0_4x32b);
                    /* i4_tmp = (i4_tmp >> shift) + off0; */ /* rows 2, 3 */
                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, off0_4x32b);
                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, off0_4x32b);
                    /* i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, off0_4x32b);
                    src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, off0_4x32b);
                    /* i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, off0_4x32b);
                    src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, off0_4x32b);

                    src_temp0_4x32b = _mm_packs_epi32 (src_temp0_4x32b, src_temp4_4x32b);
                    src_temp1_4x32b = _mm_packs_epi32 (src_temp1_4x32b, src_temp5_4x32b);
                    src_temp2_4x32b = _mm_packs_epi32 (src_temp2_4x32b, src_temp6_4x32b);
                    src_temp3_4x32b = _mm_packs_epi32 (src_temp3_4x32b, src_temp7_4x32b);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    src_temp0_4x32b = _mm_packus_epi16 (src_temp0_4x32b, src_temp0_4x32b);
                    src_temp1_4x32b = _mm_packus_epi16 (src_temp1_4x32b, src_temp1_4x32b);
                    src_temp2_4x32b = _mm_packus_epi16 (src_temp2_4x32b, src_temp2_4x32b);
                    src_temp3_4x32b = _mm_packus_epi16 (src_temp3_4x32b, src_temp3_4x32b);

                    /* store eight 8-bit output values  */
                    _mm_storel_epi64((__m128i*)(pu1_dst+0*dst_strd), src_temp0_4x32b); /* row = 0 */
                    _mm_storel_epi64((__m128i*)(pu1_dst+1*dst_strd), src_temp1_4x32b); /* row = 1 */
                    _mm_storel_epi64((__m128i*)(pu1_dst+2*dst_strd), src_temp2_4x32b); /* row = 2 */
                    _mm_storel_epi64((__m128i*)(pu1_dst+3*dst_strd), src_temp3_4x32b); /* row = 3 */

                    pi2_src += 8;   /* Pointer update */
                    pu1_dst += 8; /* Pointer update */

                } /* inner loop ends here (8 output values per row in a single iteration) */
                pi2_src = pi2_src - wdx2 + 4*src_strd;  /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 4*dst_strd; /* Pointer update */
            }
        }
        else /* 2*wd multiple of 4 case */
        {
            WORD32 dst0, dst1, dst2, dst3;
            __m128i src_temp2_4x32b, src_temp3_4x32b;
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row +=4)
            {
                for(col = 0; col < wdx2; col +=4)
                {
                    /* row = 0 */ /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
                    src_temp0_4x32b = _mm_loadu_si128((__m128i*)(pi2_src));
                    /* row = 1 */
                    src_temp1_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+1*src_strd));
                    /* row = 2 */
                    src_temp2_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd));
                    /* row = 3 */
                    src_temp3_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd));

                    /* considering pix. 3:0 by converting 16-bit to 32-bit */
                    src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                    src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
                    src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
                    src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);

                    /* (pi2_src[col] + lvl_shift) */
                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, lvl_shift_4x32b);
                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp0_4x32b  = _mm_mullo_epi32 (src_temp0_4x32b, wgt0_4x32b);
                    src_temp1_4x32b  = _mm_mullo_epi32 (src_temp1_4x32b, wgt0_4x32b);

                    /* (pi2_src[col] + lvl_shift) */
                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, lvl_shift_4x32b);
                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp2_4x32b  = _mm_mullo_epi32 (src_temp2_4x32b, wgt0_4x32b);
                    src_temp3_4x32b  = _mm_mullo_epi32 (src_temp3_4x32b, wgt0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, const_temp_4x32b);
                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b,  shift);
                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
                    /* i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, off0_4x32b);
                    src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, off0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, const_temp_4x32b);
                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b,  shift);
                    src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);
                    /* i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, off0_4x32b);
                    src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, off0_4x32b);

                    src_temp0_4x32b = _mm_packs_epi32 (src_temp0_4x32b, src_temp1_4x32b);
                    src_temp2_4x32b = _mm_packs_epi32 (src_temp2_4x32b, src_temp3_4x32b);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    src_temp0_4x32b = _mm_packus_epi16 (src_temp0_4x32b, src_temp2_4x32b);

                    dst0 = _mm_cvtsi128_si32(src_temp0_4x32b);
                    /* dst rows 1 to 3 */
                    src_temp1_4x32b = _mm_shuffle_epi32 (src_temp0_4x32b, 1);
                    src_temp2_4x32b = _mm_shuffle_epi32 (src_temp0_4x32b, 2);
                    src_temp3_4x32b = _mm_shuffle_epi32 (src_temp0_4x32b, 3);

                    /* store four 8-bit output values  */
                    *(WORD32 *) (&pu1_dst[0*dst_strd]) = dst0;

                    dst1 = _mm_cvtsi128_si32(src_temp1_4x32b);
                    dst2 = _mm_cvtsi128_si32(src_temp2_4x32b);
                    dst3 = _mm_cvtsi128_si32(src_temp3_4x32b);
                    /* row = 1 */
                    *(WORD32 *) (&pu1_dst[1*dst_strd]) = dst1;
                    /* row = 2 */
                    *(WORD32 *) (&pu1_dst[2*dst_strd]) = dst2;
                    /* row = 3 */
                    *(WORD32 *) (&pu1_dst[3*dst_strd]) = dst3;

                    pi2_src += 4;   /* Pointer update */
                    pu1_dst += 4; /* Pointer update */

                } /* inner loop ends here (4 output values per row in a single iteration) */
                pi2_src = pi2_src - wdx2 + 4*src_strd;  /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 4*dst_strd; /* Pointer update */
            }
        }
    }
    else /* ht multiple of 2 case */
#endif
    911 
    912     {
    913         if(0 == (wdx2 & 15)) /* 2*wd multiple of 168 case */
    914         {
    915             __m128i src_temp2_4x32b, src_temp3_4x32b;
    916             __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;
    917             /*  outer for loop starts from here */
    918             for(row = 0; row < ht; row += 2)
    919             {
    920                 for(col = 0; col < wdx2; col += 16)
    921                 {
    922                     /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
    923                     src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
    924                     /* row = 1 */
    925                     src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
    926 
    927                     /* row = 0 */ /* Second 4 pixels */
    928                     src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4));
    929                     /* row = 1 */
    930                     src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4));
    931                     /* row = 0 */ /* Third 4 pixels */
    932                     src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 8));
    933                     /* row = 1 */
    934                     src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 8));
    935                     /* row = 0 */ /* Last 4 pixels */
    936                     src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 12));
    937                     /* row = 1 */
    938                     src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 12));
    939 
    940                     /* sign extend the lower 4 16-bit pixels to 32 bit */
    941                     src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
    942                     src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
    943                     /* (pi2_src[col] + lvl_shift)*/
    944                     src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
    945                     src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
    946                     /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
    947                     src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
    948                     src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
    949 
    950                     /* sign extend the lower 4 16-bit pixels to 32 bit */ /* Second 4 pixels */
    951                     src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
    952                     src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);
    953                     /* (pi2_src[col] + lvl_shift)*/
    954                     src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
    955                     src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);
    956                     /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
    957                     src_temp2_4x32b  = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
    958                     src_temp3_4x32b  = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
    959 
    960                     /* sign extend the lower 4 16-bit pixels to 32 bit */ /* Third 4 pixels */
    961                     src_temp4_4x32b  = _mm_cvtepi16_epi32(src_temp4_4x32b);
    962                     src_temp5_4x32b  = _mm_cvtepi16_epi32(src_temp5_4x32b);
    963                     /* (pi2_src[col] + lvl_shift)*/
    964                     src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift_4x32b);
    965                     src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift_4x32b);
    966                     /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
    967                     src_temp4_4x32b  = _mm_mullo_epi32(src_temp4_4x32b, wgt0_4x32b);
    968                     src_temp5_4x32b  = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);
    969 
    970                     /* sign extend the lower 4 16-bit pixels to 32 bit */ /* Last 4 pixels */
    971                     src_temp6_4x32b  = _mm_cvtepi16_epi32(src_temp6_4x32b);
    972                     src_temp7_4x32b  = _mm_cvtepi16_epi32(src_temp7_4x32b);
    973                     /* (pi2_src[col] + lvl_shift)*/
    974                     src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift_4x32b);
    975                     src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift_4x32b);
    976                     /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
    977                     src_temp6_4x32b  = _mm_mullo_epi32(src_temp6_4x32b, wgt0_4x32b);
    978                     src_temp7_4x32b  = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);
    979 
    980                     /* i4_tmp += 1 << (shift - 1) */
    981                     src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
    982                     src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
    983                     /* (i4_tmp >> shift) */
    984                     src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b,  shift);
    985                     src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
    986 
    987                     /* i4_tmp += 1 << (shift - 1) */ /* Second 4 pixels */
    988                     src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
    989                     src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
    990                     /* (i4_tmp >> shift) */
    991                     src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b,  shift);
    992                     src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);
    993 
    994                     /* i4_tmp += 1 << (shift - 1) */ /* Third 4 pixels */
    995                     src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, const_temp_4x32b);
    996                     src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
    997                     /* (i4_tmp >> shift) */
    998                     src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b,  shift);
    999                     src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b,  shift);
   1000 
   1001                     /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
   1002                     src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, const_temp_4x32b);
   1003                     src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);
   1004                     /* (i4_tmp >> shift) */
   1005                     src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b,  shift);
   1006                     src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b,  shift);
   1007 
   1008                     /*i4_tmp = (i4_tmp >> shift) + off0; */
   1009                     src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
   1010                     src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
   1011                     /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */
   1012                     src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
   1013                     src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);
   1014                     /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */
   1015                     src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, off0_4x32b);
   1016                     src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, off0_4x32b);
   1017                     /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
   1018                     src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, off0_4x32b);
   1019                     src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, off0_4x32b);
   1020 
   1021                     src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp2_4x32b);
   1022                     src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);
   1023                     src_temp4_4x32b = _mm_packs_epi32(src_temp4_4x32b, src_temp6_4x32b);
   1024                     src_temp5_4x32b = _mm_packs_epi32(src_temp5_4x32b, src_temp7_4x32b);
   1025                     /* pu1_dst[col] = CLIP_U8(i4_tmp); */
   1026                     src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp4_4x32b);
   1027                     src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp5_4x32b);
   1028 
   1029                     /* store 16 8-bit output values  */
   1030                     _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0*/
   1031                     _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1*/
   1032 
   1033                     pi2_src += 16;  /* Pointer update */
   1034                     pu1_dst += 16; /* Pointer update */
   1035 
   1036                 } /* inner loop ends here (16 columns x 2 rows per iteration) */
   1037                 pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
   1038                 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
   1039             }
   1040         }
   1041         else if(0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */
   1042         {
   1043             __m128i src_temp2_4x32b, src_temp3_4x32b;
   1044             /*  outer for loop starts from here */
   1045             for(row = 0; row < ht; row += 2)
   1046             {
   1047                 for(col = 0; col < wdx2; col += 8)
   1048                 {
   1049                     /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
   1050                     src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
   1051                     /* row = 1 */
   1052                     src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
   1053 
   1054                     /* row = 0 */ /* Last 4 pixels */
   1055                     src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4));
   1056                     /* row = 1 */
   1057                     src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4));
   1058 
   1059                     /* sign extend the lower 4 16-bit pixels to 32 bit */
   1060                     src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
   1061                     src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
   1062                     /* (pi2_src[col] + lvl_shift)*/
   1063                     src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
   1064                     src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
   1065                     /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
   1066                     src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
   1067                     src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
   1068 
   1069                     /* sign extend the lower 4 16-bit pixels to 32 bit */ /* Last 4 pixels */
   1070                     src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
   1071                     src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);
   1072                     /* (pi2_src[col] + lvl_shift)*/
   1073                     src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
   1074                     src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);
   1075                     /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
   1076                     src_temp2_4x32b  = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
   1077                     src_temp3_4x32b  = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
   1078 
   1079                     /* i4_tmp += 1 << (shift - 1) */
   1080                     src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
   1081                     src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
   1082                     /* (i4_tmp >> shift) */
   1083                     src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b,  shift);
   1084                     src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
   1085 
   1086                     /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
   1087                     src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
   1088                     src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
   1089                     /* (i4_tmp >> shift) */
   1090                     src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b,  shift);
   1091                     src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);
   1092 
   1093                     /*i4_tmp = (i4_tmp >> shift) + off0; */
   1094                     src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
   1095                     src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
   1096                     /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
   1097                     src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
   1098                     src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);
   1099 
   1100                     src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp2_4x32b);
   1101                     src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);
   1102 
   1103                     /* pu1_dst[col] = CLIP_U8(i4_tmp); */
   1104                     src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b);
   1105                     src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
   1106 
   1107                     /* store eight 8-bit output values */
   1108                     _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0*/
   1109                     _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1*/
   1110 
   1111                     pi2_src += 8;   /* Pointer update */
   1112                     pu1_dst += 8; /* Pointer update */
   1113 
   1114                 } /* inner loop ends here (8 columns x 2 rows per iteration) */
   1115                 pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
   1116                 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
   1117             }
   1118         }
   1119         else /* 2*wd multiple of 4 case */
   1120         {
   1121             WORD32 dst0, dst1;
   1122             /*  outer for loop starts from here */
   1123             for(row = 0; row < ht; row += 2)
   1124             {
   1125                 for(col = 0; col < wdx2; col += 4)
   1126                 {
   1127                     /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
   1128                     src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
   1129                     /* row = 1 */
   1130                     src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
   1131 
   1132                     /* sign extend the lower 4 16-bit pixels to 32 bit */
   1133                     src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
   1134                     src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
   1135 
   1136                     /* (pi2_src[col] + lvl_shift)*/
   1137                     src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
   1138                     src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
   1139 
   1140                     /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
   1141                     src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
   1142                     src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
   1143 
   1144                     /* i4_tmp += 1 << (shift - 1) */
   1145                     src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
   1146                     src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
   1147 
   1148                     /* (i4_tmp >> shift) */
   1149                     src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b,  shift);
   1150                     src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
   1151 
   1152                     /*i4_tmp = (i4_tmp >> shift) + off0; */
   1153                     src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
   1154                     src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
   1155 
   1156                     src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp1_4x32b);
   1157 
   1158                     /* pu1_dst[col] = CLIP_U8(i4_tmp); */
   1159                     src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b);
   1160 
   1161                     dst0 = _mm_cvtsi128_si32(src_temp0_4x32b);
   1162                     /* dst row = 1 */
   1163                     src_temp1_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 1);
   1164 
   1165                     /* store four 8-bit output values  */
   1166                     *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
   1167 
   1168                     dst1 = _mm_cvtsi128_si32(src_temp1_4x32b);
   1169                     /* row = 1 */
   1170                     *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
   1171 
   1172                     pi2_src += 4;   /* Pointer update */
   1173                     pu1_dst += 4; /* Pointer update */
   1174 
   1175                 } /* inner loop ends here (4 columns x 2 rows per iteration) */
   1176                 pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
   1177                 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
   1178             }
   1179         }
   1180     }
   1181 }
   1182 
   1183 /**
   1184 *******************************************************************************
   1185 *
   1186 * @brief
   1187 *  Does bi-weighted prediction on the arrays pointed by pi2_src1 and
   1188 * pi2_src2 and stores the result at the location pointed by pu1_dst
   1189 *
   1190 * @par Description:
   1191 *  dst = ( (src1 + lvl_shift1)*wgt0 +  (src2 + lvl_shift2)*wgt1 +  (off0 +
   1192 * off1 + 1) << (shift - 1) ) >> shift
   1193 *
   1194 * @param[in] pi2_src1
   1195 *  Pointer to source 1
   1196 *
   1197 * @param[in] pi2_src2
   1198 *  Pointer to source 2
   1199 *
   1200 * @param[out] pu1_dst
   1201 *  Pointer to destination
   1202 *
   1203 * @param[in] src_strd1
   1204 *  Source stride 1
   1205 *
   1206 * @param[in] src_strd2
   1207 *  Source stride 2
   1208 *
   1209 * @param[in] dst_strd
   1210 *  Destination stride
   1211 *
   1212 * @param[in] wgt0
   1213 *  weight to be multiplied to source 1
   1214 *
   1215 * @param[in] off0
   1216 *  offset 0
   1217 *
   1218 * @param[in] wgt1
   1219 *  weight to be multiplied to source 2
   1220 *
   1221 * @param[in] off1
   1222 *  offset 1
   1223 *
   1224 * @param[in] shift
   1225 *  (14 Bit depth) + log2_weight_denominator
   1226 *
   1227 * @param[in] lvl_shift1
   1228 *  added before shift and offset
   1229 *
   1230 * @param[in] lvl_shift2
   1231 *  added before shift and offset
   1232 *
   1233 * @param[in] ht
   1234 *  height of the source
   1235 *
   1236 * @param[in] wd
   1237 *  width of the source
   1238 *
   1239 * @returns
   1240 *
   1241 * @remarks
   1242 *  None
   1243 *
   1244 *******************************************************************************
   1245 */
   1246 
   1247 void ihevc_weighted_pred_bi_sse42(WORD16 *pi2_src1,
   1248                                   WORD16 *pi2_src2,
   1249                                   UWORD8 *pu1_dst,
   1250                                   WORD32 src_strd1,
   1251                                   WORD32 src_strd2,
   1252                                   WORD32 dst_strd,
   1253                                   WORD32 wgt0,
   1254                                   WORD32 off0,
   1255                                   WORD32 wgt1,
   1256                                   WORD32 off1,
   1257                                   WORD32 shift,
   1258                                   WORD32 lvl_shift1,
   1259                                   WORD32 lvl_shift2,
   1260                                   WORD32 ht,
   1261                                   WORD32 wd)
   1262 {
   1263     WORD32 row, col, temp;
   1264 
   1265     __m128i src_temp1_4x32b, src_temp2_4x32b, src_temp3_4x32b, src_temp4_4x32b;
   1266     __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_4x32b, wgt1_4x32b;
   1267 
   1268 
   1269     ASSERT(wd % 4 == 0); /* checking assumption*/
   1270     ASSERT(ht % 2 == 0); /* checking assumption*/
   1271 
   1272     temp = (off0 + off1 + 1) << (shift - 1);
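        /* off0, off1 and the rounding bias are folded into this single
         * constant, so the inner loop adds just one vector term before the
         * final arithmetic shift (see the formula in the header above). */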
   1273 
   1274     // setting values in registers
   1275     const_temp_4x32b = _mm_set1_epi32(temp);
   1276     lvl_shift1_4x32b = _mm_set1_epi32(lvl_shift1);
   1277     lvl_shift2_4x32b = _mm_set1_epi32(lvl_shift2);
   1278     wgt0_4x32b = _mm_set1_epi32(wgt0);
   1279     wgt1_4x32b = _mm_set1_epi32(wgt1);
   1280 
   1281     if(0 == (wd & 7)) /* wd multiple of 8 case */
   1282     {
   1283         __m128i src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b, src_temp8_4x32b;
   1284         /*  outer for loop starts from here */
   1285         for(row = 0; row < ht; row += 2)
   1286         {
   1287             for(col = 0; col < wd; col += 8)
   1288             {
   1289                 /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
   1290                 src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
   1291                 src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
   1292                 src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
   1293                 src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
   1294                 /* Next 4 pixels */
   1295                 src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 4)); /* row = 0 */
   1296                 src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 4)); /* row = 0 */
   1297                 src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1 + 4)); /* row = 1 */
   1298                 src_temp8_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2 + 4)); /* row = 1 */
   1299 
   1300                 /* sign extend the lower 4 16-bit pixels to 32 bit */
   1301                 src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
   1302                 src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
   1303                 /* (pi2_src1[col] + lvl_shift1) */
   1304                 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
   1305                 /* (pi2_src2[col] + lvl_shift2) */
   1306                 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
   1307                 /*i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
   1308                 src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
   1309                 /*(pi2_src2[col] + lvl_shift2) * wgt1 */
   1310                 src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);
   1311 
   1312                 src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
   1313                 src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
   1314                 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
   1315                 src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
   1316                 src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
   1317                 src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);
   1318 
   1319                 /* Next 4 Pixels */
   1320                 src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b);
   1321                 src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b);
   1322                 src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift1_4x32b);
   1323                 src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift2_4x32b);
   1324                 src_temp5_4x32b = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);
   1325                 src_temp6_4x32b = _mm_mullo_epi32(src_temp6_4x32b, wgt1_4x32b);
   1326                 src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b);
   1327                 src_temp8_4x32b = _mm_cvtepi16_epi32(src_temp8_4x32b);
   1328                 src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift1_4x32b);
   1329                 src_temp8_4x32b = _mm_add_epi32(src_temp8_4x32b, lvl_shift2_4x32b);
   1330                 src_temp7_4x32b = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);
   1331                 src_temp8_4x32b = _mm_mullo_epi32(src_temp8_4x32b, wgt1_4x32b);
   1332 
   1333                 /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
   1334                 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
   1335                 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);
   1336                 /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
   1337                 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
   1338                 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
   1339                 /* (i4_tmp >> shift) */
   1340                 src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
   1341                 src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);
   1342 
   1343                 /* Next 4 Pixels */
   1344                 src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, src_temp6_4x32b);
   1345                 src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, src_temp8_4x32b);
   1346                 src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
   1347                 src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);
   1348                 src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b,  shift);
   1349                 src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b,  shift);
   1350 
   1351                 src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp5_4x32b);
   1352                 src_temp3_4x32b = _mm_packs_epi32(src_temp3_4x32b, src_temp7_4x32b);
   1353 
   1354                 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
   1355                 src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
   1356                 src_temp3_4x32b = _mm_packus_epi16(src_temp3_4x32b, src_temp3_4x32b);
   1357 
   1358                 /* store eight 8-bit output values */
   1359                 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_4x32b); /* row = 0*/
   1360                 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_4x32b); /* row = 1*/
   1361 
   1362                 pi2_src1 += 8;  /* Pointer update */
   1363                 pi2_src2 += 8;  /* Pointer update */
   1364                 pu1_dst  += 8;  /* Pointer update */
   1365 
   1366             } /* inner loop ends here (8 columns x 2 rows per iteration) */
   1367 
   1368             pi2_src1 = pi2_src1 - wd + 2 * src_strd1;  /* Pointer update */
   1369             pi2_src2 = pi2_src2 - wd + 2 * src_strd2;  /* Pointer update */
   1370             pu1_dst  = pu1_dst  - wd + 2 * dst_strd;   /* Pointer update */
   1371 
   1372         } /* outer loop ends */
   1373     }
   1374     else /* wd multiple of 4 case */
   1375     {
   1376         WORD32 dst0, dst1;
   1377         /*  outer for loop starts from here */
   1378         for(row = 0; row < ht; row += 2)
   1379         {
   1380             for(col = 0; col < wd; col += 4)
   1381             {
   1382                 /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
   1383                 src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
   1384                 src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
   1385                 src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
   1386                 src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
   1387 
   1388                 /* sign extend the lower 4 16-bit pixels to 32 bit */
   1389                 src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
   1390                 src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
   1391                 /* (pi2_src1[col] + lvl_shift1) */
   1392                 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
   1393                 /* (pi2_src2[col] + lvl_shift2) */
   1394                 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
   1395                 /*i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
   1396                 src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
   1397                 /*(pi2_src2[col] + lvl_shift2) * wgt1 */
   1398                 src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);
   1399 
   1400                 src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
   1401                 src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
   1402                 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
   1403                 src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
   1404                 src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
   1405                 src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);
   1406 
   1407                 /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
   1408                 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
   1409                 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);
   1410 
   1411                 /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
   1412                 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
   1413                 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
   1414 
   1415                 /* (i4_tmp >> shift) */
   1416                 src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
   1417                 src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);
   1418 
   1419                 src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);
   1420 
   1421                 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
   1422                 src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
   1423 
   1424                 dst0 = _mm_cvtsi128_si32(src_temp1_4x32b);
   1425 
   1426                 /* dst row = 1 */
   1427                 src_temp2_4x32b = _mm_shuffle_epi32(src_temp1_4x32b, 1);
   1428 
   1429                 /* store four 8-bit output values  */
   1430                 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
   1431 
   1432                 dst1 = _mm_cvtsi128_si32(src_temp2_4x32b);
   1433 
   1434                 /* row = 1 */
   1435                 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
   1436 
   1437                 pi2_src1 += 4;  /* Pointer update */
   1438                 pi2_src2 += 4;  /* Pointer update */
   1439                 pu1_dst  += 4;  /* Pointer update */
   1440 
   1441             } /* inner loop ends here (4 columns x 2 rows per iteration) */
   1442 
   1443             pi2_src1 = pi2_src1 - wd + 2 * src_strd1;  /* Pointer update */
   1444             pi2_src2 = pi2_src2 - wd + 2 * src_strd2;  /* Pointer update */
   1445             pu1_dst  = pu1_dst  - wd + 2 * dst_strd;   /* Pointer update */
   1446 
   1447         } /* outer loop ends */
   1448     }
   1449 
   1450 }
   1451 
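        /*
         * For reference: a plain-C sketch of the per-pixel operation that the
         * SSE4.2 kernel above vectorizes, following the formula in its header
         * and reusing the library's CLIP_U8 macro. Illustrative only; this
         * helper and its name are ours, not part of the library.
         */
        static void weighted_pred_bi_scalar_sketch(WORD16 *pi2_src1,
                                                   WORD16 *pi2_src2,
                                                   UWORD8 *pu1_dst,
                                                   WORD32 src_strd1,
                                                   WORD32 src_strd2,
                                                   WORD32 dst_strd,
                                                   WORD32 wgt0,
                                                   WORD32 off0,
                                                   WORD32 wgt1,
                                                   WORD32 off1,
                                                   WORD32 shift,
                                                   WORD32 lvl_shift1,
                                                   WORD32 lvl_shift2,
                                                   WORD32 ht,
                                                   WORD32 wd)
        {
            WORD32 row, col;
            for(row = 0; row < ht; row++)
            {
                for(col = 0; col < wd; col++)
                {
                    /* weight both sources, add both offsets and the rounding
                     * bias in one term, then shift and clip to 8 bit */
                    WORD32 i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0
                                  + (pi2_src2[col] + lvl_shift2) * wgt1
                                  + ((off0 + off1 + 1) << (shift - 1));
                    pu1_dst[col] = CLIP_U8(i4_tmp >> shift);
                }
                pi2_src1 += src_strd1;
                pi2_src2 += src_strd2;
                pu1_dst  += dst_strd;
            }
        }
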
   1452 /**
   1453 *******************************************************************************
   1454 *
   1455 * @brief
   1456 * Does chroma bi-weighted prediction on the arrays pointed by pi2_src1 and
   1457 * pi2_src2 and stores the result at the location pointed by pu1_dst
   1458 *
   1459 * @par Description:
   1460 *  dst = ( (src1 + lvl_shift1)*wgt0 +  (src2 + lvl_shift2)*wgt1 +  (off0 +
   1461 * off1 + 1) << (shift - 1) ) >> shift
   1462 *
   1463 * @param[in] pi2_src1
   1464 *  Pointer to source 1
   1465 *
   1466 * @param[in] pi2_src2
   1467 *  Pointer to source 2
   1468 *
   1469 * @param[out] pu1_dst
   1470 *  Pointer to destination
   1471 *
   1472 * @param[in] src_strd1
   1473 *  Source stride 1
   1474 *
   1475 * @param[in] src_strd2
   1476 *  Source stride 2
   1477 *
   1478 * @param[in] dst_strd
   1479 *  Destination stride
   1480 *
   1481 * @param[in] wgt0
   1482 *  weight to be multiplied to source 1
   1483 *
   1484 * @param[in] off0
   1485 *  offset 0
   1486 *
   1487 * @param[in] wgt1
   1488 *  weight to be multiplied to source 2
   1489 *
   1490 * @param[in] off1
   1491 *  offset 1
   1492 *
   1493 * @param[in] shift
   1494 *  (14 Bit depth) + log2_weight_denominator
   1495 *
   1496 * @param[in] lvl_shift1
   1497 *  added before shift and offset
   1498 *
   1499 * @param[in] lvl_shift2
   1500 *  added before shift and offset
   1501 *
   1502 * @param[in] ht
   1503 *  height of the source
   1504 *
   1505 * @param[in] wd
   1506 *  width of the source (each colour component)
   1507 *
   1508 * @returns
   1509 *
   1510 * @remarks
   1511 *  None
   1512 *
   1513 *******************************************************************************
   1514 */
   1515 
   1516 void ihevc_weighted_pred_chroma_bi_sse42(WORD16 *pi2_src1,
   1517                                          WORD16 *pi2_src2,
   1518                                          UWORD8 *pu1_dst,
   1519                                          WORD32 src_strd1,
   1520                                          WORD32 src_strd2,
   1521                                          WORD32 dst_strd,
   1522                                          WORD32 wgt0_cb,
   1523                                          WORD32 wgt0_cr,
   1524                                          WORD32 off0_cb,
   1525                                          WORD32 off0_cr,
   1526                                          WORD32 wgt1_cb,
   1527                                          WORD32 wgt1_cr,
   1528                                          WORD32 off1_cb,
   1529                                          WORD32 off1_cr,
   1530                                          WORD32 shift,
   1531                                          WORD32 lvl_shift1,
   1532                                          WORD32 lvl_shift2,
   1533                                          WORD32 ht,
   1534                                          WORD32 wd)
   1535 {
   1536     WORD32 row, col, temp1, temp2;
   1537     WORD32 wdx2;
   1538 
   1539     __m128i src_temp1_4x32b, src_temp2_4x32b, src_temp3_4x32b, src_temp4_4x32b;
   1540     __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_4x32b, wgt1_4x32b;
   1541 
   1542 
   1543     ASSERT(wd % 2 == 0); /* checking assumption*/
   1544     ASSERT(ht % 2 == 0); /* checking assumption*/
   1545 
   1546     temp1 = (off0_cb + off1_cb + 1) << (shift - 1);
   1547     temp2 = (off0_cr + off1_cr + 1) << (shift - 1);
   1548 
   1549     // setting values in registers
   1550     const_temp_4x32b = _mm_set_epi32(temp2, temp1, temp2, temp1);
   1551     lvl_shift1_4x32b = _mm_set1_epi32(lvl_shift1);
   1552     lvl_shift2_4x32b = _mm_set1_epi32(lvl_shift2);
   1553     wgt0_4x32b = _mm_set_epi32(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
   1554     wgt1_4x32b = _mm_set_epi32(wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb);
   1555 
   1556     wdx2 = wd * 2;
   1557 
   1558     if(0 == (wdx2 & 7)) /* wdx2 multiple of 8 case */
   1559     {
   1560         __m128i src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b, src_temp8_4x32b;
   1561         /*  outer for loop starts from here */
   1562         for(row = 0; row < ht; row += 2)
   1563         {
   1564             for(col = 0; col < wdx2; col += 8)
   1565             {
   1566                 /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
   1567                 src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
   1568                 src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
   1569                 src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
   1570                 src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
   1571                 /* Next 4 pixels */
   1572                 src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 4)); /* row = 0 */
   1573                 src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 4)); /* row = 0 */
   1574                 src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1 + 4)); /* row = 1 */
   1575                 src_temp8_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2 + 4)); /* row = 1 */
   1576 
   1577                 /* sign extend the lower 4 16-bit pixels to 32 bit */
   1578                 src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
   1579                 src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
   1580                 /* (pi2_src1[col] + lvl_shift1) */
   1581                 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
   1582                 /* (pi2_src2[col] + lvl_shift2) */
   1583                 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
   1584                 /*i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
   1585                 src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
   1586                 /*(pi2_src2[col] + lvl_shift2) * wgt1 */
   1587                 src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);
   1588 
   1589                 src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
   1590                 src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
   1591                 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
   1592                 src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
   1593                 src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
   1594                 src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);
   1595 
   1596                 /* Next 4 Pixels */
   1597                 src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b);
   1598                 src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b);
   1599                 src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift1_4x32b);
   1600                 src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift2_4x32b);
   1601                 src_temp5_4x32b = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);
   1602                 src_temp6_4x32b = _mm_mullo_epi32(src_temp6_4x32b, wgt1_4x32b);
   1603                 src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b);
   1604                 src_temp8_4x32b = _mm_cvtepi16_epi32(src_temp8_4x32b);
   1605                 src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift1_4x32b);
   1606                 src_temp8_4x32b = _mm_add_epi32(src_temp8_4x32b, lvl_shift2_4x32b);
   1607                 src_temp7_4x32b = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);
   1608                 src_temp8_4x32b = _mm_mullo_epi32(src_temp8_4x32b, wgt1_4x32b);
   1609 
   1610                 /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
   1611                 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
   1612                 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);
   1613                 /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
   1614                 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
   1615                 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
   1616                 /* (i4_tmp >> shift) */
   1617                 src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
   1618                 src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);
   1619 
   1620                 /* Next 4 Pixels */
   1621                 src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, src_temp6_4x32b);
   1622                 src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, src_temp8_4x32b);
   1623                 src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
   1624                 src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);
   1625                 src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b,  shift);
   1626                 src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b,  shift);
   1627 
   1628                 src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp5_4x32b);
   1629                 src_temp3_4x32b = _mm_packs_epi32(src_temp3_4x32b, src_temp7_4x32b);
   1630 
   1631                 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
   1632                 src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
   1633                 src_temp3_4x32b = _mm_packus_epi16(src_temp3_4x32b, src_temp3_4x32b);
   1634 
   1635                 /* store eight 8-bit output values */
   1636                 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_4x32b); /* row = 0*/
   1637                 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_4x32b); /* row = 1*/
   1638 
   1639                 pi2_src1 += 8;  /* Pointer update */
   1640                 pi2_src2 += 8;  /* Pointer update */
   1641                 pu1_dst  += 8;  /* Pointer update */
   1642 
   1643             } /* inner loop ends here (8 columns x 2 rows per iteration) */
   1644 
   1645             pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;    /* Pointer update */
   1646             pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;    /* Pointer update */
   1647             pu1_dst  = pu1_dst  - wdx2 + 2 * dst_strd;   /* Pointer update */
   1648 
   1649         } /* outer loop ends */
   1650     }
   1651     else /* wdx2 multiple of 4 case */
   1652     {
   1653         WORD32 dst0, dst1;
   1654         /*  outer for loop starts from here */
   1655         for(row = 0; row < ht; row += 2)
   1656         {
   1657             for(col = 0; col < wdx2; col += 4)
   1658             {
   1659                 /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
   1660                 src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
   1661                 src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
   1662                 src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
   1663                 src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
   1664 
   1665                 /* sign extend the lower 4 16-bit pixels to 32 bit */
   1666                 src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
   1667                 src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
   1668                 /* (pi2_src1[col] + lvl_shift1) */
   1669                 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
   1670                 /* (pi2_src2[col] + lvl_shift2) */
   1671                 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
   1672                 /*i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
   1673                 src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
   1674                 /*(pi2_src2[col] + lvl_shift2) * wgt1 */
   1675                 src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);
   1676 
   1677                 src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
   1678                 src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
   1679                 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
   1680                 src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
   1681                 src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
   1682                 src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);
   1683 
   1684                 /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
   1685                 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
   1686                 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);
   1687 
   1688                 /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
   1689                 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
   1690                 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
   1691 
   1692                 /* (i4_tmp >> shift) */
   1693                 src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
   1694                 src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);
   1695 
   1696                 src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);
   1697 
   1698                 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
   1699                 src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
   1700 
   1701                 dst0 = _mm_cvtsi128_si32(src_temp1_4x32b);
   1702 
   1703                 /* dst row = 1 */
   1704                 src_temp2_4x32b = _mm_shuffle_epi32(src_temp1_4x32b, 1);
   1705 
   1706                 /* store four 8-bit output values  */
   1707                 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
   1708 
   1709                 dst1 = _mm_cvtsi128_si32(src_temp2_4x32b);
   1710 
   1711                 /* row = 1 */
   1712                 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
   1713 
   1714                 pi2_src1 += 4;  /* Pointer update */
   1715                 pi2_src2 += 4;  /* Pointer update */
   1716                 pu1_dst  += 4;  /* Pointer update */
   1717 
   1718             } /* inner loop ends here (4 columns x 2 rows per iteration) */
   1719 
   1720             pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;    /* Pointer update */
   1721             pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;    /* Pointer update */
   1722             pu1_dst  = pu1_dst  - wdx2 + 2 * dst_strd;   /* Pointer update */
   1723         }
   1724     }
   1725 
   1726 }
   1727 
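        /*
         * For reference: a plain-C sketch of the chroma kernel above,
         * illustrative only (the helper and its name are ours). Chroma samples
         * are interleaved Cb,Cr, so even columns take the Cb weights/offsets
         * and odd columns the Cr ones; this is exactly the alternating pattern
         * loaded into wgt0_4x32b, wgt1_4x32b and const_temp_4x32b above.
         */
        static void weighted_pred_chroma_bi_scalar_sketch(
                        WORD16 *pi2_src1, WORD16 *pi2_src2, UWORD8 *pu1_dst,
                        WORD32 src_strd1, WORD32 src_strd2, WORD32 dst_strd,
                        WORD32 wgt0_cb, WORD32 wgt0_cr,
                        WORD32 off0_cb, WORD32 off0_cr,
                        WORD32 wgt1_cb, WORD32 wgt1_cr,
                        WORD32 off1_cb, WORD32 off1_cr,
                        WORD32 shift, WORD32 lvl_shift1, WORD32 lvl_shift2,
                        WORD32 ht, WORD32 wd)
        {
            WORD32 row, col;
            for(row = 0; row < ht; row++)
            {
                for(col = 0; col < 2 * wd; col++)
                {
                    WORD32 is_cb = (0 == (col & 1)); /* even -> Cb, odd -> Cr */
                    WORD32 wgt0  = is_cb ? wgt0_cb : wgt0_cr;
                    WORD32 wgt1  = is_cb ? wgt1_cb : wgt1_cr;
                    WORD32 rnd   = is_cb
                                 ? ((off0_cb + off1_cb + 1) << (shift - 1))
                                 : ((off0_cr + off1_cr + 1) << (shift - 1));
                    WORD32 i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0
                                  + (pi2_src2[col] + lvl_shift2) * wgt1
                                  + rnd;
                    pu1_dst[col] = CLIP_U8(i4_tmp >> shift);
                }
                pi2_src1 += src_strd1;
                pi2_src2 += src_strd2;
                pu1_dst  += dst_strd;
            }
        }
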
   1728 /**
   1729 *******************************************************************************
   1730 *
   1731 * @brief
   1732 *  Does default bi-weighted prediction on the arrays pointed by pi2_src1 and
   1733 * pi2_src2 and stores the result at the location pointed by pu1_dst
   1734 *
   1735 * @par Description:
   1736 *  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
   1737 * >> shift, where shift = 15 - BitDepth
   1738 *
   1739 * @param[in] pi2_src1
   1740 *  Pointer to source 1
   1741 *
   1742 * @param[in] pi2_src2
   1743 *  Pointer to source 2
   1744 *
   1745 * @param[out] pu1_dst
   1746 *  Pointer to destination
   1747 *
   1748 * @param[in] src_strd1
   1749 *  Source stride 1
   1750 *
   1751 * @param[in] src_strd2
   1752 *  Source stride 2
   1753 *
   1754 * @param[in] dst_strd
   1755 *  Destination stride
   1756 *
   1757 * @param[in] lvl_shift1
   1758 *  added before shift and offset
   1759 *
   1760 * @param[in] lvl_shift2
   1761 *  added before shift and offset
   1762 *
   1763 * @param[in] ht
   1764 *  height of the source
   1765 *
   1766 * @param[in] wd
   1767 *  width of the source
   1768 *
   1769 * @returns
   1770 *
   1771 * @remarks
   1772 *  None
   1773 *
   1774 * Assumption : ht % 2 == 0, wd % 4 == 0 (see the ASSERTs below).
   1775 * shift == 7, (lvl_shift1 + lvl_shift2) can take {0, 8K, 16K}. In that case,
   1776 * the final result will match even if the intermediate precision is 16 bit.
   1777 *
   1778 *******************************************************************************
   1779 */
   1780 
   1781 void ihevc_weighted_pred_bi_default_sse42(WORD16 *pi2_src1,
   1782                                           WORD16 *pi2_src2,
   1783                                           UWORD8 *pu1_dst,
   1784                                           WORD32 src_strd1,
   1785                                           WORD32 src_strd2,
   1786                                           WORD32 dst_strd,
   1787                                           WORD32 lvl_shift1,
   1788                                           WORD32 lvl_shift2,
   1789                                           WORD32 ht,
   1790                                           WORD32 wd)
   1791 {
   1792     WORD32 row, col, temp;
   1793     WORD32 shift;
   1794 
   1795     __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
   1796     __m128i const_temp_8x16b, lvl_shift1_8x16b, lvl_shift2_8x16b;
   1797     __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
   1798 
   1799     ASSERT(wd % 4 == 0); /* checking assumption*/
   1800     ASSERT(ht % 2 == 0); /* checking assumption*/
   1801 
   1802     shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
   1803     temp = 1 << (shift - 1);
   1804 
   1805     // setting values in registers
   1806     lvl_shift1_8x16b = _mm_set1_epi16(lvl_shift1);
   1807     lvl_shift2_8x16b = _mm_set1_epi16(lvl_shift2);
   1808     const_temp_8x16b = _mm_set1_epi16(temp);
   1809 
   1810     lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, lvl_shift2_8x16b);
   1811     lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, const_temp_8x16b);
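            /* With 8-bit content, shift = SHIFT_14_MINUS_BIT_DEPTH + 1 = 7 and
             * temp = 1 << 6 = 64. lvl_shift1, lvl_shift2 and this rounding bias
             * now all live in lvl_shift1_8x16b, so each output needs a single
             * saturating 16-bit add after the src1 + src2 sum; per the
             * assumption in the header, the saturation does not change the
             * final clipped result. */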
   1812 
   1813     if(0 == (ht & 3)) /* ht multiple of 4*/
   1814     {
   1815         if(0 == (wd & 15)) /* wd multiple of 16 case */
   1816         {
   1817             __m128i src_temp9_8x16b,  src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
   1818             __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b;
   1819             /*  outer for loop starts from here */
   1820             for(row = 0; row < ht; row += 4)
   1821             {
   1822                 for(col = 0; col < wd; col += 16)
   1823                 {
   1824                     /*load 8 pixel values */ /* First 8 Values */
   1825                     src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
   1826                     src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
   1827                     /* row = 1 */
   1828                     src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
   1829                     src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
   1830                     /* row = 2 */
   1831                     src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
   1832                     src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
   1833                     /* row = 3 */
   1834                     src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
   1835                     src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
   1836 
   1837                     /*load 8 pixel values */ /* Second 8 Values */
   1838                     src_temp9_8x16b  = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
   1839                     src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
   1840                     /* row = 1 */
   1841                     src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
   1842                     src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
   1843                     /* row = 2 */
   1844                     src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8));
   1845                     src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8));
   1846 
   1847                     /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
   1848                     src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
   1849                     src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
   1850                     src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
   1851                     src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
   1852 
   1853                     /*load 8 pixel values */ /* Second 8 Values */
   1854                     /* row = 3 */
   1855                     src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8));
   1856                     src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8));
   1857 
   1858                     /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
   1859                     src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
   1860                     src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
   1861                     src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
   1862                     src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
   1863 
   1864                     /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
   1865                     src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b,  src_temp10_8x16b);
   1866                     src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
   1867                     src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b);
   1868                     src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b);
   1869 
   1870                     /* (i4_tmp >> shift) */ /* First 8 Values */
   1871                     src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
   1872                     src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
   1873                     src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
   1874                     src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);
   1875 
   1876                     /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
   1877                     src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
   1878                     src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
   1879                     src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b);
   1880                     src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b);
   1881 
   1882                     /* (i4_tmp >> shift) */ /* Second 8 Values */
   1883                     src_temp9_8x16b  = _mm_srai_epi16(src_temp9_8x16b,  shift);
   1884                     src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  shift);
   1885                     src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  shift);
   1886                     src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b,  shift);
   1887 
   1888                     /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* all 16 8-bit values */
   1889                     src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp9_8x16b);
   1890                     src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp11_8x16b);
   1891                     src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp13_8x16b);
   1892                     src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp15_8x16b);
   1893 
   1894                     /* store 16 8-bit output values per row */
   1895                     _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
   1896                     _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
   1897                     _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
   1898                     _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
   1899 
   1900                     /* To update pointer */
   1901                     pi2_src1 += 16;
   1902                     pi2_src2 += 16;
   1903                     pu1_dst  += 16;
   1904 
   1905                 } /* inner loop ends here (16 columns x 4 rows per iteration) */
   1906 
   1907                 pi2_src1 = pi2_src1 - wd + 4 * src_strd1;  /* Pointer update */
   1908                 pi2_src2 = pi2_src2 - wd + 4 * src_strd2;  /* Pointer update */
   1909                 pu1_dst  = pu1_dst - wd + 4 * dst_strd;   /* Pointer update */
   1910 
   1911             }
   1912         }
        else if(0 == (wd & 7)) /* multiple of 8 case */
        {
            /*  outer for loop starts from here */
            for(row = 0; row < ht; row += 4)
            {
                for(col = 0; col < wd; col += 8)
                {
                    /* load 8 pixel values */
                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
                    /* row = 2 */
                    src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
                    src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
                    /* row = 3 */
                    src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
                    src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
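                    /* _mm_loadu_si128 tolerates unaligned addresses; the stride-offset
                       WORD16 source rows may not be 16-byte aligned. */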

                    /* (pi2_src1[col] + pi2_src2[col]) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);

                    /* (i4_tmp >> shift) */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
                    src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
                    src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);

                    /* store eight 8-bit output values per row */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0 */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1 */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2 */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3 */

                    /* To update pointer */
                    pi2_src1 += 8;
                    pi2_src2 += 8;
                    pu1_dst  += 8;

                } /* inner loop ends here (8 output values per row in a single iteration) */

                pi2_src1 = pi2_src1 - wd + 4 * src_strd1;  /* Pointer update */
                pi2_src2 = pi2_src2 - wd + 4 * src_strd2;  /* Pointer update */
                pu1_dst  = pu1_dst  - wd + 4 * dst_strd;   /* Pointer update */

            }
        }
        else /* wd multiple of 4 case */
        {
            WORD32 dst0, dst1, dst2, dst3;

            /*  outer for loop starts from here */
            for(row = 0; row < ht; row += 4)
            {
                for(col = 0; col < wd; col += 4)
                {
                    /* load four 16-bit pixel values (8 bytes) from row 0 */
                    src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
                    src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));

                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
                    /* row = 2 */
                    src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1));
                    src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2));
                    /* row = 3 */
                    src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1));
                    src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2));

                    /* Pack two rows together */
                    src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
                    src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
                    src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b);
                    src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b);
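                    /* _mm_unpacklo_epi64 concatenates the low 8 bytes of its operands,
                       so rows 0/1 (and 2/3) now share one register and the add, shift
                       and pack below process two rows per instruction. */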

                    /* (pi2_src1[col] + pi2_src2[col]) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);

                    /* (i4_tmp >> shift) */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);

                    dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
                    /* move rows 1 and 3 into the low dword */
                    src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
                    src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1);
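                    /* _mm_shuffle_epi32(x, 1) places dword 1 of x in the low dword of
                       the result, so the four packed bytes of the odd row become
                       reachable with _mm_cvtsi128_si32 (which reads the low dword). */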

                    /* store four 8-bit output values */
                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                    dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
                    dst2 = _mm_cvtsi128_si32(src_temp5_8x16b);
                    dst3 = _mm_cvtsi128_si32(src_temp4_8x16b);

                    /* rows 1 to 3 */
                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
                    *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
                    *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;
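                    /* Storing through a WORD32 pointer writes exactly four bytes per
                       row, so pixels beyond the current 4-wide column are untouched
                       (a 64- or 128-bit store would overwrite neighbours). */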

                    /* To update pointer */
                    pi2_src1 += 4;
                    pi2_src2 += 4;
                    pu1_dst  += 4;

                } /* inner loop ends here (4 output values per row in a single iteration) */

                pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */
                pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */
                pu1_dst  = pu1_dst  - wd + 4 * dst_strd;  /* Pointer update */

            }
        }
    }
    else /* ht multiple of 2 and wd multiple of 4 case */
    {

        WORD32 dst0, dst1;

        /*  outer for loop starts from here */
        for(row = 0; row < ht; row += 2)
        {
            for(col = 0; col < wd; col += 4)
            {
                /* load four 16-bit pixel values (8 bytes) from row 0 */
                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));

                /* row = 1 */
                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
                src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));

                /* Pack two rows together */
                src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
                src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);

                /* (pi2_src1[col] + pi2_src2[col]) */
                src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);

                /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);

                /* (i4_tmp >> shift) */
                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);

                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);

                dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
                /* dst row = 1 */
                src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);

                /* store four 8-bit output values */
                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);

                /* row = 1 */
                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;

                /* To update pointer */
                pi2_src1 += 4;
                pi2_src2 += 4;
                pu1_dst  += 4;

            } /* inner loop ends here (4 output values per row in a single iteration) */

            pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */
            pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */
            pu1_dst  = pu1_dst  - wd + 2 * dst_strd;  /* Pointer update */

        }

    }

}
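
/* Usage sketch (an illustration, not part of this file): the bi-default path is
 * normally reached through the codec's function-selector tables rather than
 * called directly, but a direct call would look like the following, assuming
 * the prototype declared in ihevc_weighted_pred.h:
 *
 *     ihevc_weighted_pred_bi_default_sse42(pi2_src1, pi2_src2, pu1_dst,
 *                                          src_strd1, src_strd2, dst_strd,
 *                                          lvl_shift1, lvl_shift2, ht, wd);
 *
 * Per the branch comments above, ht must be a multiple of 2 and wd a multiple
 * of 4 for every case this function handles.
 */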