/* Home | History | Annotate | Download | only in x86 -- code-browser navigation residue, kept as a comment so the file parses */
      1 /******************************************************************************
      2 *
      3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 *
      5 * Licensed under the Apache License, Version 2.0 (the "License");
      6 * you may not use this file except in compliance with the License.
      7 * You may obtain a copy of the License at:
      8 *
      9 * http://www.apache.org/licenses/LICENSE-2.0
     10 *
     11 * Unless required by applicable law or agreed to in writing, software
     12 * distributed under the License is distributed on an "AS IS" BASIS,
     13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 * See the License for the specific language governing permissions and
     15 * limitations under the License.
     16 *
     17 ******************************************************************************/
     18 
     19 /**
     20 *******************************************************************************
     21 * @file
     22 *  ihevc_inter_pred_filters_atom_intr.c
     23 *
     24 * @brief
     25 *  Contains function definitions for inter prediction  interpolation filters
     26 *  coded in x86 intrinsics
     27 *
     28 *
     29 * @author
     30 *
     31 *
     32 * @par List of Functions:
     33 *  - ihevc_inter_pred_luma_copy_ssse3()
     34 *  - ihevc_inter_pred_luma_horz_ssse3()
     35 *  - ihevc_inter_pred_luma_vert_ssse3()
     36 *  - ihevc_inter_pred_luma_copy_w16out_ssse3()
     37 *  - ihevc_inter_pred_luma_horz_w16out_ssse3()
     38 *  - ihevc_inter_pred_luma_vert_w16out_ssse3()
     39 *  - ihevc_inter_pred_luma_vert_w16inp_ssse3()
     40 *  - ihevc_inter_pred_luma_vert_w16inp_w16out_ssse3()
     41 *  - ihevc_inter_pred_chroma_copy_ssse3()
     42 *  - ihevc_inter_pred_chroma_horz_ssse3()
     43 *  - ihevc_inter_pred_chroma_vert_ssse3()
     44 *  - ihevc_inter_pred_chroma_copy_w16out_ssse3()
     45 *  - ihevc_inter_pred_chroma_horz_w16out_ssse3()
     46 *  - ihevc_inter_pred_chroma_vert_w16out_ssse3()
     47 *  - ihevc_inter_pred_chroma_vert_w16inp_ssse3()
     48 *  - ihevc_inter_pred_chroma_vert_w16inp_w16out_ssse3()
     49 *
     50 * @remarks
     51 *  None
     52 *
     53 *******************************************************************************
     54 */
     55 
     56 
     57 /*****************************************************************************/
     58 /* File Includes                                                             */
     59 /*****************************************************************************/
#include <assert.h>
#include <string.h>

#include "ihevc_debug.h"
#include "ihevc_typedefs.h"
#include "ihevc_defs.h"
#include "ihevc_inter_pred.h"
#include "ihevc_platform_macros.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"

#include <immintrin.h>
     71 
     72 /*****************************************************************************/
     73 /* Function Definitions                                                      */
     74 /*****************************************************************************/
     75 
     76 /**
     77 *******************************************************************************
     78 *
     79 * @brief
     80 *       Interprediction luma function for copy
     81 *
     82 * @par Description:
     83 *    Copies the array of width 'wd' and height 'ht' from the  location pointed
     84 *    by 'src' to the location pointed by 'dst'
     85 *
     86 * @param[in] pu1_src
     87 *  UWORD8 pointer to the source
     88 *
     89 * @param[out] pu1_dst
     90 *  UWORD8 pointer to the destination
     91 *
     92 * @param[in] src_strd
     93 *  integer source stride
     94 *
     95 * @param[in] dst_strd
     96 *  integer destination stride
     97 *
     98 * @param[in] pi1_coeff
     99 *  WORD8 pointer to the filter coefficients
    100 *
    101 * @param[in] ht
    102 *  integer height of the array
    103 *
    104 * @param[in] wd
    105 *  integer width of the array
    106 *
    107 * @returns
    108 *
    109 * @remarks
    110 *  None
    111 *
    112 * Assumption : ht%4 == 0, wd%4 == 0
    113 *
    114 *******************************************************************************
    115 */
    116 
    117 
    118 void ihevc_inter_pred_luma_copy_ssse3(UWORD8 *pu1_src,
    119                                       UWORD8 *pu1_dst,
    120                                       WORD32 src_strd,
    121                                       WORD32 dst_strd,
    122                                       WORD8 *pi1_coeff,
    123                                       WORD32 ht,
    124                                       WORD32 wd)
    125 {
    126 
    127     WORD32 row, col;
    128     __m128i  src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;
    129     UNUSED(pi1_coeff);
    130     ASSERT(wd % 4 == 0); /* checking assumption*/
    131     ASSERT(ht % 4 == 0); /* checking assumption*/
    132 
    133 /*  outer for loop starts from here */
    134     if(0 == (wd & 15)) /* wd multiple of 16 case */
    135     {
    136         for(row = 0; row < ht; row += 4)
    137         {
    138             for(col = 0; col < wd; col += 16)
    139             {
    140                 /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
    141                 src0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));                /* row =0 */
    142                 src1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
    143                 src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
    144                 src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
    145 
    146                 /* storing 16 8-bit output values */
    147                 _mm_storeu_si128((__m128i *)(pu1_dst), src0_16x8b);                 /* row =0 */
    148                 _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
    149                 _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
    150                 _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */
    151 
    152                 pu1_src += 16; /* pointer update */
    153                 pu1_dst += 16; /* pointer update */
    154             } /* inner for loop ends here(16-output values in single iteration) */
    155 
    156             pu1_src += 4 * src_strd - wd; /* pointer update */
    157             pu1_dst += 4 * dst_strd - wd; /* pointer update */
    158         }
    159 
    160     }
    161     else if(0 == (wd & 7)) /* multiple of 8 case */
    162     {
    163         for(row = 0; row < ht; row += 4)
    164         {
    165             for(col = 0; col < wd; col += 8)
    166             {
    167                 /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
    168                 src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
    169                 src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
    170                 src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
    171                 src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
    172 
    173                 /* storing 16 8-bit output values */
    174                 _mm_storel_epi64((__m128i *)(pu1_dst), src0_16x8b);                 /* row =0 */
    175                 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b);  /* row =1 */
    176                 _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b);  /* row =2 */
    177                 _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b);  /* row =3 */
    178 
    179                 pu1_src += 8; /* pointer update */
    180                 pu1_dst += 8; /* pointer update */
    181             } /*  inner for loop ends here(8-output values in single iteration) */
    182 
    183             pu1_src += 4 * src_strd - wd; /* pointer update */
    184             pu1_dst += 4 * dst_strd - wd; /* pointer update */
    185         }
    186     }
    187     else /* wd = multiple of 4 case */
    188     {
    189         WORD32 dst0, dst1, dst2, dst3;
    190         for(row = 0; row < ht; row += 4)
    191         {
    192             for(col = 0; col < wd; col += 4)
    193             {
    194                 /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
    195                 src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
    196                 src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
    197                 src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
    198                 src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */
    199 
    200                 dst0 = _mm_cvtsi128_si32(src0_16x8b);
    201                 dst1 = _mm_cvtsi128_si32(src1_16x8b);
    202                 dst2 = _mm_cvtsi128_si32(src2_16x8b);
    203                 dst3 = _mm_cvtsi128_si32(src3_16x8b);
    204 
    205                 /* storing 4 8-bit output values */
    206                 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; /* row =0 */
    207                 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; /* row =1 */
    208                 *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2; /* row =2 */
    209                 *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3; /* row =3 */
    210 
    211                 pu1_src += 4; /* pointer update */
    212                 pu1_dst += 4; /* pointer update */
    213             } /*  inner for loop ends here(4- output values in single iteration) */
    214 
    215             pu1_src += 4 * src_strd - wd; /* pointer update */
    216             pu1_dst += 4 * dst_strd - wd; /* pointer update */
    217         }
    218     }
    219 }
    220 
    221 /* INTER_PRED_LUMA_COPY */
    222 
    223 /**
    224 *******************************************************************************
    225 *
    226 * @brief
    227 *     Interprediction luma filter for horizontal input
    228 *
    229 * @par Description:
    230 *    Applies a horizontal filter with coefficients pointed to  by 'pi1_coeff'
    231 *    to the elements pointed by 'pu1_src' and  writes to the location pointed
    232 *    by 'pu1_dst'  The output is downshifted by 6 and clipped to 8 bits
    233 *
    234 * @param[in] pu1_src
    235 *  UWORD8 pointer to the source
    236 *
    237 * @param[out] pu1_dst
    238 *  UWORD8 pointer to the destination
    239 *
    240 * @param[in] src_strd
    241 *  integer source stride
    242 *
    243 * @param[in] dst_strd
    244 *  integer destination stride
    245 *
    246 * @param[in] pi1_coeff
    247 *  WORD8 pointer to the filter coefficients
    248 *
    249 * @param[in] ht
    250 *  integer height of the array
    251 *
    252 * @param[in] wd
    253 *  integer width of the array
    254 *
    255 * @returns
    256 *
    257 * @remarks
    258 *  None
    259 *
    260 *******************************************************************************
    261 */
    262 void ihevc_inter_pred_luma_horz_ssse3(UWORD8 *pu1_src,
    263                                       UWORD8 *pu1_dst,
    264                                       WORD32 src_strd,
    265                                       WORD32 dst_strd,
    266                                       WORD8 *pi1_coeff,
    267                                       WORD32 ht,
    268                                       WORD32 wd)
    269 {
    270     WORD32 row, col;
    271 
    272     /* all 128 bit registers are named with a suffix mxnb, where m is the */
    273     /* number of n bits packed in the register                            */
    274     __m128i zero_8x16b, offset_8x16b, mask_low_32b, mask_high_96b;
    275     __m128i src_temp1_16x8b, src_temp2_16x8b, src_temp3_16x8b, src_temp4_16x8b, src_temp5_16x8b, src_temp6_16x8b;
    276     __m128i src_temp11_16x8b, src_temp12_16x8b, src_temp13_16x8b, src_temp14_16x8b, src_temp15_16x8b, src_temp16_16x8b;
    277     __m128i res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b, res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b, res_temp7_8x16b, res_temp8_8x16b;
    278     __m128i res_temp11_8x16b, res_temp12_8x16b, res_temp13_8x16b, res_temp14_8x16b, res_temp15_8x16b, res_temp16_8x16b, res_temp17_8x16b, res_temp18_8x16b;
    279     __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b;
    280     __m128i control_mask_1_8x16b, control_mask_2_8x16b, control_mask_3_8x16b, control_mask_4_8x16b;
    281 
    282     ASSERT(wd % 4 == 0); /* checking assumption*/
    283 
    284     PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
    285     PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
    286     PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
    287     PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
    288     PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
    289     PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
    290 
    291     /* load 8 8-bit coefficients and convert 8-bit into 16-bit  */
    292     src_temp1_16x8b = _mm_loadl_epi64((__m128i *)pi1_coeff);
    293     zero_8x16b = _mm_set1_epi32(0);
    294     offset_8x16b = _mm_set1_epi16(OFFSET_14_MINUS_BIT_DEPTH); /* for offset addition */
    295 
    296     mask_low_32b = _mm_cmpeq_epi16(zero_8x16b, zero_8x16b);
    297     mask_high_96b = _mm_srli_si128(mask_low_32b, 12);
    298     mask_low_32b = _mm_slli_si128(mask_low_32b, 4);
    299 
    300     control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */
    301     control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */
    302     control_mask_3_8x16b = _mm_set1_epi32(0x05040504); /* Control Mask register */
    303     control_mask_4_8x16b = _mm_set1_epi32(0x07060706); /* Control Mask register */
    304 
    305     coeff0_1_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_1_8x16b);  /* pi1_coeff[4] */
    306     coeff2_3_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_2_8x16b);  /* pi1_coeff[4] */
    307 
    308     coeff4_5_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_3_8x16b);  /* pi1_coeff[4] */
    309     coeff6_7_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_4_8x16b);  /* pi1_coeff[4] */
    310 
    311     if(0 == (ht & 1)) /* ht multiple of 2 case */
    312     {
    313 
    314         if(0 == (wd & 7)) /* wd = multiple of 8 case */
    315         {
    316             for(row = 0; row < ht; row += 2)
    317             {
    318 
    319                 int offset = 0;
    320 
    321                 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
    322                 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
    323 
    324 
    325                 for(col = 0; col < wd; col += 8)
    326                 {
    327                     /*load 16 pixel values from row 0*/
    328                     src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));         /* row = 0 */
    329 
    330                     /*load 16 pixel values from row 1*/
    331                     src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */
    332 
    333                     src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
    334                     /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
    335                     src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
    336                     res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
    337                                                                                            /* row = 0 */
    338                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
    339                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
    340                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
    341                     src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
    342                     res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
    343 
    344                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
    345                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
    346                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
    347                     src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
    348                     res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
    349 
    350                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
    351                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
    352                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
    353                     src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
    354                     res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
    355 
    356                     res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
    357                     res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
    358                     res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
    359 
    360                     res_temp6_8x16b = _mm_adds_epi16(res_temp5_8x16b, offset_8x16b);             /* row = 0 */
    361                     res_temp6_8x16b = _mm_srai_epi16(res_temp6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */
    362                     res_temp5_8x16b = _mm_packus_epi16(res_temp6_8x16b, res_temp6_8x16b);        /* row = 0 */
    363 
    364                     _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp5_8x16b);
    365 
    366                     src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1);                   /* row =1 */
    367                     /* pix. |5:-2|4:-3| to do two dot-products at same time*/                 /* row =1 */
    368                     src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
    369                     res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b);   /* row = 1 */
    370                                                                                               /* row = 1 */
    371                     src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
    372                     src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
    373                     /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row =1 */
    374                     src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
    375                     res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b);   /* row = 1 */
    376 
    377                     src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
    378                     src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
    379                     /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row =1 */
    380                     src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
    381                     res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b);   /* row = 1 */
    382 
    383                     src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
    384                     src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
    385                     /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row =1 */
    386                     src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
    387                     res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b);   /* row = 1 */
    388 
    389                     res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
    390                     res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b);
    391                     res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b);
    392 
    393                     res_temp16_8x16b = _mm_adds_epi16(res_temp15_8x16b, offset_8x16b);             /* row = 1 */
    394                     res_temp16_8x16b = _mm_srai_epi16(res_temp16_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 1 */
    395                     res_temp15_8x16b = _mm_packus_epi16(res_temp16_8x16b, res_temp16_8x16b);       /* row = 1 */
    396 
    397                     /* to store the 1st 4 pixels res. */
    398                     _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), res_temp15_8x16b);
    399 
    400                     offset += 8; /* To pointer updates*/
    401                 }
    402                 pu1_src += 2 * src_strd;  /* pointer updates*/
    403                 pu1_dst += 2 * dst_strd;  /* pointer updates*/
    404             }
    405         }
    406         else /* wd = multiple of 4 case */
    407         {
    408             for(row = 0; row < ht; row += 2)
    409             {
    410                 int offset = 0;
    411 
    412                 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
    413                 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
    414 
    415 
    416                 for(col = 0; col < wd; col += 4)
    417                 {
    418                     /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
    419                     src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));             /* row = 0 */
    420                     src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */
    421 
    422                     src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
    423                     /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
    424                     src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
    425                     res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
    426                                                                                            /* row = 0 */
    427                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
    428                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
    429                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
    430                     src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
    431                     res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
    432 
    433                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
    434                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
    435                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
    436                     src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
    437                     res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
    438 
    439                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
    440                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
    441                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
    442                     src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
    443                     res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
    444 
    445                     res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
    446                     res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
    447                     res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
    448 
    449                     res_temp6_8x16b = _mm_adds_epi16(res_temp5_8x16b, offset_8x16b);             /* row = 0 */
    450                     res_temp6_8x16b = _mm_srai_epi16(res_temp6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */
    451                     res_temp5_8x16b = _mm_packus_epi16(res_temp6_8x16b, res_temp6_8x16b);        /* row = 0 */
    452 
    453                     res_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + offset));
    454                     res_temp8_8x16b =  _mm_and_si128(res_temp7_8x16b, mask_low_32b);
    455                     res_temp7_8x16b =  _mm_and_si128(res_temp5_8x16b, mask_high_96b);
    456                     res_temp5_8x16b = _mm_or_si128(res_temp7_8x16b, res_temp8_8x16b);
    457 
    458                     _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp5_8x16b);
    459 
    460                     src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1);                   /* row = 1 */
    461                     /* pix. |5:-2|4:-3| to do two dot-products at same time*/                 /* row = 1 */
    462                     src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
    463                     res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b);   /* row = 1 */
    464                                                                                               /* row = 1 */
    465                     src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
    466                     src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
    467                     /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
    468                     src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
    469                     res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b);   /* row = 1 */
    470 
    471                     src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
    472                     src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
    473                     /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
    474                     src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
    475                     res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b);   /* row = 1 */
    476 
    477                     src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
    478                     src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
    479                     /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
    480                     src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
    481                     res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b);   /* row = 1 */
    482 
    483                     res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
    484                     res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b);
    485                     res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b);
    486 
    487                     res_temp16_8x16b = _mm_adds_epi16(res_temp15_8x16b, offset_8x16b);             /* row = 1 */
    488                     res_temp16_8x16b = _mm_srai_epi16(res_temp16_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 1 */
    489                     res_temp15_8x16b = _mm_packus_epi16(res_temp16_8x16b, res_temp16_8x16b);       /* row = 1 */
    490 
    491                     res_temp17_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd + offset));
    492                     res_temp18_8x16b =  _mm_and_si128(res_temp17_8x16b, mask_low_32b);
    493                     res_temp17_8x16b =  _mm_and_si128(res_temp15_8x16b, mask_high_96b);
    494                     res_temp15_8x16b = _mm_or_si128(res_temp17_8x16b, res_temp18_8x16b);
    495 
    496                     /* to store the 1st 4 pixels res. */
    497                     _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), res_temp15_8x16b);
    498 
    499                     offset += 4; /* To pointer updates*/
    500                 }
    501                 pu1_src += 2 * src_strd;  /* Pointer update */
    502                 pu1_dst += 2 * dst_strd;  /* Pointer update */
    503             }
    504         }
    505     }
    506     else /* odd ht */
    507     {
    508         if(0 == (wd & 7)) /* multiple of 8 case */
    509         {
    510             for(row = 0; row < ht; row++)
    511             {
    512                 int offset = 0;
    513 
    514 
    515                 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
    516                 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
    517 
    518 
    519                 for(col = 0; col < wd; col += 8)
    520                 {
    521                     /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
    522                     src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));  /* row = 0 */
    523 
    524                     src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
    525                     /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
    526                     src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
    527                     res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
    528                                                                                            /* row = 0 */
    529                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
    530                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
    531                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
    532                     src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
    533                     res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
    534 
    535                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
    536                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
    537                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
    538                     src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
    539                     res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
    540 
    541                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
    542                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
    543                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
    544                     src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
    545                     res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
    546 
    547                     res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
    548                     res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
    549                     res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
    550 
    551                     res_temp6_8x16b = _mm_adds_epi16(res_temp5_8x16b, offset_8x16b);             /* row = 0 */
    552                     res_temp6_8x16b = _mm_srai_epi16(res_temp6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */
    553                     res_temp5_8x16b = _mm_packus_epi16(res_temp6_8x16b, res_temp6_8x16b);        /* row = 0 */
    554 
    555                     /* to store the 1st 4 pixels res. */
    556                     _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp5_8x16b);
    557 
    558                     offset += 8; /* To pointer updates*/
    559                 }
    560                 pu1_src += src_strd;    /* pointer updates*/
    561                 pu1_dst += dst_strd;    /* pointer updates*/
    562             }
    563         }
    564         else  /* wd = multiple of 4 case */
    565         {
    566             for(row = 0; row < (ht - 1); row += 2)
    567             {
    568                 int offset = 0;
    569 
    570                 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
    571                 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
    572 
    573 
    574                 for(col = 0; col < wd; col += 4)
    575                 {
    576                     /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
    577                     src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));             /* row = 0 */
    578                     src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */
    579 
    580                     src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
    581                     /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
    582                     src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
    583                     res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
    584                                                                                            /* row = 0 */
    585                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
    586                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
    587                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
    588                     src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
    589                     res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
    590 
    591                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
    592                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
    593                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
    594                     src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
    595                     res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
    596 
    597                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
    598                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
    599                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
    600                     src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
    601                     res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
    602 
    603                     res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
    604                     res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
    605                     res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
    606 
    607                     res_temp6_8x16b = _mm_adds_epi16(res_temp5_8x16b, offset_8x16b);             /* row = 0 */
    608                     res_temp6_8x16b = _mm_srai_epi16(res_temp6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */
    609                     res_temp5_8x16b = _mm_packus_epi16(res_temp6_8x16b, res_temp6_8x16b);        /* row = 0 */
    610 
    611                     res_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + offset));
    612                     res_temp8_8x16b =  _mm_and_si128(res_temp7_8x16b, mask_low_32b);
    613                     res_temp7_8x16b =  _mm_and_si128(res_temp5_8x16b, mask_high_96b);
    614                     res_temp5_8x16b = _mm_or_si128(res_temp7_8x16b, res_temp8_8x16b);
    615 
    616                     _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp5_8x16b);
    617 
    618                     src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1);                   /* row = 1 */
    619                     /* pix. |5:-2|4:-3| to do two dot-products at same time*/                 /* row = 1 */
    620                     src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
    621                     res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b);   /* row = 1 */
    622                                                                                               /* row = 1 */
    623                     src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
    624                     src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
    625                     /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
    626                     src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
    627                     res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b);   /* row = 1 */
    628 
    629                     src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
    630                     src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
    631                     /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
    632                     src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
    633                     res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b);   /* row = 1 */
    634 
    635                     src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
    636                     src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
    637                     /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
    638                     src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
    639                     res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b);   /* row = 1 */
    640 
    641                     res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
    642                     res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b);
    643                     res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b);
    644 
    645                     res_temp16_8x16b = _mm_adds_epi16(res_temp15_8x16b, offset_8x16b);             /* row = 1 */
    646                     res_temp16_8x16b = _mm_srai_epi16(res_temp16_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 1 */
    647                     res_temp15_8x16b = _mm_packus_epi16(res_temp16_8x16b, res_temp16_8x16b);       /* row = 1 */
    648 
    649                     res_temp17_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd + offset));
    650                     res_temp18_8x16b =  _mm_and_si128(res_temp17_8x16b, mask_low_32b);
    651                     res_temp17_8x16b =  _mm_and_si128(res_temp15_8x16b, mask_high_96b);
    652                     res_temp15_8x16b = _mm_or_si128(res_temp17_8x16b, res_temp18_8x16b);
    653 
    654                     /* to store the 1st 4 pixels res. */
    655                     _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), res_temp15_8x16b);
    656 
    657                     offset += 4; /* To pointer updates*/
    658                 }
    659                 pu1_src += 2 * src_strd;  /* Pointer update */
    660                 pu1_dst += 2 * dst_strd;  /* Pointer update */
    661             }
    662             { /* last repeat at outside the loop */
    663                 int offset = 0;
    664                 for(col = 0; col < wd; col += 4)
    665                 {
    666                     /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
    667                     src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));  /* row = 0 */
    668 
    669                     src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
    670                     /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
    671                     src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
    672                     res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
    673                                                                                            /* row = 0 */
    674                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
    675                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
    676                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
    677                     src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
    678                     res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
    679 
    680                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
    681                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
    682                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
    683                     src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
    684                     res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
    685 
    686                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
    687                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
    688                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
    689                     src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
    690                     res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
    691 
    692                     res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
    693                     res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
    694                     res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
    695 
    696                     res_temp6_8x16b = _mm_adds_epi16(res_temp5_8x16b, offset_8x16b);             /* row = 0 */
    697                     res_temp6_8x16b = _mm_srai_epi16(res_temp6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */
    698                     res_temp5_8x16b = _mm_packus_epi16(res_temp6_8x16b, res_temp6_8x16b);        /* row = 0 */
    699 
    700                     res_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + offset));
    701                     res_temp8_8x16b =  _mm_and_si128(res_temp7_8x16b, mask_low_32b);
    702                     res_temp7_8x16b =  _mm_and_si128(res_temp5_8x16b, mask_high_96b);
    703                     res_temp5_8x16b = _mm_or_si128(res_temp7_8x16b, res_temp8_8x16b);
    704 
    705                     /* to store the 1st 4 pixels res. */
    706                     _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp5_8x16b);
    707 
    708                     offset += 4; /* To pointer updates*/
    709                 }
    710             }
    711         }
    712     }
    713 }
    714 
    715 
    716 /**
    717 *******************************************************************************
    718 *
    719 * @brief
     720 *    Inter prediction luma filter for vertical input
    721 *
    722 * @par Description:
     723 *   Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
     724 *   the elements pointed by 'pu1_src' and writes to the location pointed by
     725 *   'pu1_dst'. The output is downshifted by 6 and clipped to 8 bits.
    726 *
    727 * @param[in] pu1_src
    728 *  UWORD8 pointer to the source
    729 *
    730 * @param[out] pu1_dst
    731 *  UWORD8 pointer to the destination
    732 *
    733 * @param[in] src_strd
    734 *  integer source stride
    735 *
    736 * @param[in] dst_strd
    737 *  integer destination stride
    738 *
    739 * @param[in] pi1_coeff
    740 *  WORD8 pointer to the filter coefficients
    741 *
    742 * @param[in] ht
    743 *  integer height of the array
    744 *
    745 * @param[in] wd
    746 *  integer width of the array
    747 *
    748 * @returns
    749 *
    750 * @remarks
    751 *  None
    752 *
    753 *******************************************************************************
    754 */
    755 void ihevc_inter_pred_luma_vert_ssse3(UWORD8 *pu1_src,
    756                                       UWORD8 *pu1_dst,
    757                                       WORD32 src_strd,
    758                                       WORD32 dst_strd,
    759                                       WORD8 *pi1_coeff,
    760                                       WORD32 ht,
    761                                       WORD32 wd)
    762 {
    763     WORD32 row, col;
    764     UWORD8 *pu1_src_copy;
    765     UWORD8 *pu1_dst_copy;
    766     __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b;
    767     __m128i s0_8x16b, s1_8x16b, s2_8x16b, s3_8x16b, s4_8x16b, s5_8x16b, s6_8x16b, s7_8x16b, s8_8x16b, s9_8x16b;
    768     __m128i s2_0_16x8b, s2_1_16x8b, s2_2_16x8b, s2_3_16x8b, s2_4_16x8b, s2_5_16x8b, s2_6_16x8b, s2_7_16x8b, s2_8_16x8b, s2_9_16x8b, s2_10_16x8b;
    769     __m128i s3_0_16x8b, s3_1_16x8b, s3_2_16x8b, s3_3_16x8b, s3_4_16x8b;
    770     __m128i s4_0_16x8b, s4_1_16x8b, s4_2_16x8b, s4_3_16x8b, s4_4_16x8b;
    771     __m128i s10_8x16b, s11_8x16b, s12_8x16b, s13_8x16b, s14_8x16b, s15_8x16b, s16_8x16b, s17_8x16b, s18_8x16b, s19_8x16b;
    772     __m128i s20_8x16b, s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b, s26_8x16b, s27_8x16b, s28_8x16b, s29_8x16b;
    773     __m128i s30_8x16b, s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b, s36_8x16b, s37_8x16b, s38_8x16b, s39_8x16b;
    774 
    775     __m128i zero_8x16b, offset_8x16b, mask_low_32b, mask_high_96b;
    776     __m128i control_mask_1_8x16b, control_mask_2_8x16b, control_mask_3_8x16b, control_mask_4_8x16b;
    777 
    778     PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
    779     PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
    780     PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
    781     PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
    782     PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
    783     PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
    784     PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
    785     PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
    786 
    787 /* load 8 8-bit coefficients and convert 8-bit into 16-bit  */
    788     s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);
    789 
    790     control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */
    791     control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */
    792     control_mask_3_8x16b = _mm_set1_epi32(0x05040504); /* Control Mask register */
    793     control_mask_4_8x16b = _mm_set1_epi32(0x07060706); /* Control Mask register */
    794 
    795     coeff0_1_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_1_8x16b);  /* pi1_coeff[4] */
    796     coeff2_3_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_2_8x16b);  /* pi1_coeff[4] */
    797 
    798     coeff4_5_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_3_8x16b);  /* pi1_coeff[4] */
    799     coeff6_7_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_4_8x16b);  /* pi1_coeff[4] */
    800 
    801 /*  seting  values in register */
    802     zero_8x16b = _mm_setzero_si128(); /* for saturated clipping */
    803     offset_8x16b = _mm_set1_epi16(OFFSET_14_MINUS_BIT_DEPTH); /* for offset addition */
    804     mask_low_32b = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000);
    805     mask_high_96b = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF);
    806 
    807 /*  outer for loop starts from here */
    808     if(wd % 8 == 0)
    809     { /* wd = multiple of 8 case */
    810 
    811         pu1_src_copy = pu1_src;
    812         pu1_dst_copy = pu1_dst;
    813 
    814         for(col = 0; col < wd; col += 8)
    815         {
    816 
    817             pu1_src = pu1_src_copy + col;
    818             pu1_dst = pu1_dst_copy + col;
    819 
    820             PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0)
    821             PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0)
    822             PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0)
    823             PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0)
    824 
    825             /*load 8 pixel values.*/
    826             s2_0_16x8b  = _mm_loadl_epi64((__m128i *)(pu1_src + (-3 * src_strd)));
    827 
    828             /*load 8 pixel values*/
    829             s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-2 * src_strd)));
    830 
    831             s3_0_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
    832 
    833             s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
    834 
    835             /*load 8 pixel values*/
    836             s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
    837 
    838             /*load 8 pixel values*/
    839             s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
    840 
    841             s3_1_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
    842 
    843             s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
    844 
    845             /*load 8 pixel values*/
    846             s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
    847 
    848             /*load 8 pixel values*/
    849             s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
    850 
    851             s3_2_16x8b = _mm_unpacklo_epi8(s2_4_16x8b, s2_5_16x8b);
    852 
    853             s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
    854 
    855             /*load 8 pixel values*/
    856             s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
    857 
    858             /*load 8 pixel values*/
    859             s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (4 * src_strd)));
    860 
    861             s3_3_16x8b = _mm_unpacklo_epi8(s2_6_16x8b, s2_7_16x8b);
    862 
    863             s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
    864 
    865             s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
    866             s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
    867             s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
    868 
    869             s7_8x16b = _mm_add_epi16(s6_8x16b, offset_8x16b);
    870 
    871             /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
    872             s8_8x16b = _mm_srai_epi16(s7_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
    873 
    874             /* i2_tmp = CLIP_U8(i2_tmp);*/
    875             s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b);
    876 
    877             /* store 8 8-bit output values  */
    878             /* Store the output pixels of row 0*/
    879             _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b);
    880 
    881             /* ROW 2*/
    882             s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
    883             s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
    884             s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
    885 
    886             /*load 8 pixel values*/
    887             s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (5 * src_strd)));
    888 
    889             /*load 8 pixel values*/
    890             s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (6 * src_strd)));
    891 
    892             s3_4_16x8b = _mm_unpacklo_epi8(s2_8_16x8b, s2_9_16x8b);
    893 
    894             s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
    895 
    896             s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
    897             s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
    898             s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
    899 
    900             s27_8x16b = _mm_add_epi16(s26_8x16b, offset_8x16b);
    901 
    902             /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
    903             s28_8x16b = _mm_srai_epi16(s27_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
    904 
    905             /* i2_tmp = CLIP_U8(i2_tmp);*/
    906             s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b);
    907 
    908             /* store 8 8-bit output values  */
    909             /* Store the output pixels of row 2*/
    910             _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s29_8x16b);
    911 
    912 
    913             /*ROW 1*/
    914             s4_0_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
    915 
    916             s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
    917 
    918             s4_1_16x8b = _mm_unpacklo_epi8(s2_3_16x8b, s2_4_16x8b);
    919 
    920             s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
    921 
    922             s4_2_16x8b = _mm_unpacklo_epi8(s2_5_16x8b, s2_6_16x8b);
    923 
    924             s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
    925 
    926             s4_3_16x8b = _mm_unpacklo_epi8(s2_7_16x8b, s2_8_16x8b);
    927 
    928             s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
    929 
    930             s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
    931             s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
    932             s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
    933 
    934             s17_8x16b = _mm_add_epi16(s16_8x16b, offset_8x16b);
    935 
    936             /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
    937             s18_8x16b = _mm_srai_epi16(s17_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
    938 
    939             /* i2_tmp = CLIP_U8(i2_tmp);*/
    940             s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b);
    941 
    942             /* store 8 8-bit output values  */
    943             /* Store the output pixels of row 1*/
    944             _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd)), s19_8x16b);
    945 
    946 
    947             /* ROW 3*/
    948             s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
    949             s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
    950             s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
    951 
    952             /*load 8 pixel values*/
    953             s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (7 * src_strd)));
    954 
    955             s4_4_16x8b = _mm_unpacklo_epi8(s2_9_16x8b, s2_10_16x8b);
    956 
    957             s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
    958 
    959             s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
    960             s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
    961             s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
    962 
    963             s37_8x16b = _mm_add_epi16(s36_8x16b, offset_8x16b);
    964 
    965             /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
    966             s38_8x16b = _mm_srai_epi16(s37_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
    967 
    968             /* i2_tmp = CLIP_U8(i2_tmp);*/
    969             s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b);
    970 
    971             /* store 8 8-bit output values  */
     972             /* Store the output pixels of row 3*/
    973             _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s39_8x16b);
    974 
    975             pu1_src += (8 * src_strd);
    976             pu1_dst += (4 * dst_strd);
    977 
    978             for(row = 4; row < ht; row += 4)
    979             {
    980                 PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0)
    981                 PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0)
    982                 PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0)
    983                 PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0)
    984 
    985 
    986                 s3_0_16x8b = s3_2_16x8b;
    987                 s3_1_16x8b = s3_3_16x8b;
    988                 s3_2_16x8b = s3_4_16x8b;
    989 
    990                 s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
    991                 s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
    992                 s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
    993 
    994                 /*load 8 pixel values from (cur_row + 4)th row*/
    995                 s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
    996 
    997                 s3_3_16x8b = _mm_unpacklo_epi8(s2_10_16x8b, s2_0_16x8b);
    998                 s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
    999 
   1000                 s4_0_16x8b = s4_2_16x8b;
   1001                 s4_1_16x8b = s4_3_16x8b;
   1002                 s4_2_16x8b = s4_4_16x8b;
   1003 
   1004                 s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
   1005                 s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
   1006                 s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
   1007 
   1008                 s7_8x16b = _mm_add_epi16(s6_8x16b, offset_8x16b);
   1009 
   1010                 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   1011                 s8_8x16b = _mm_srai_epi16(s7_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   1012 
   1013                 /* i2_tmp = CLIP_U8(i2_tmp);*/
   1014                 s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b);
   1015 
   1016                 /* store 8 8-bit output values  */
   1017                 /* Store the output pixels of row 4*/
   1018                 _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b);
   1019 
   1020                 /* row + 2*/
   1021                 s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
   1022                 s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
   1023                 s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
   1024 
   1025                 /*load 8 pixel values from (cur_row + 5)th row*/
   1026                 s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
   1027 
   1028                 /*load 8 pixel values from (cur_row + 6)th row*/
   1029                 s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
   1030 
   1031                 /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/
   1032                 s3_4_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
   1033 
   1034                 s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
   1035 
   1036                 s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
   1037                 s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
   1038                 s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
   1039 
   1040                 s27_8x16b = _mm_add_epi16(s26_8x16b, offset_8x16b);
   1041 
   1042                 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   1043                 s28_8x16b = _mm_srai_epi16(s27_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   1044 
   1045                 /* i2_tmp = CLIP_U8(i2_tmp);*/
   1046                 s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b);
   1047 
   1048                 /* store 8 8-bit output values  */
   1049                 /* Store the output pixels of (cur_row+2)*/
   1050                 _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s29_8x16b);
   1051 
   1052 
   1053                 /*row + 1*/
   1054                 s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
   1055                 s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
   1056                 s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
   1057 
   1058                 /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/
   1059                 s4_3_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
   1060                 s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
   1061 
   1062                 s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
   1063                 s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
   1064                 s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
   1065 
   1066                 s17_8x16b = _mm_add_epi16(s16_8x16b, offset_8x16b);
   1067 
   1068                 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   1069                 s18_8x16b = _mm_srai_epi16(s17_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   1070 
   1071                 /* i2_tmp = CLIP_U8(i2_tmp);*/
   1072                 s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b);
   1073 
   1074                 /* store 8 8-bit output values  */
   1075                 /* Store the output pixels of (cur_row + 1)*/
   1076                 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s19_8x16b);
   1077 
   1078 
   1079                 /* row + 3*/
   1080                 s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
   1081                 s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
   1082                 s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
   1083 
   1084                 /*load 8 pixel values from (cur_row + 7)th row*/
   1085                 s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
   1086 
   1087                 /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/
   1088                 s4_4_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
   1089 
   1090                 s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
   1091 
   1092                 s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
   1093                 s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
   1094                 s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
   1095 
   1096                 s37_8x16b = _mm_add_epi16(s36_8x16b, offset_8x16b);
   1097 
   1098                 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   1099                 s38_8x16b = _mm_srai_epi16(s37_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   1100 
   1101                 /* i2_tmp = CLIP_U8(i2_tmp);*/
   1102                 s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b);
   1103 
   1104                 /* store 8 8-bit output values  */
   1105                 /* Store the output pixels of (cur_row+3)*/
   1106                 _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s39_8x16b);
   1107 
   1108                 s2_10_16x8b = s2_3_16x8b;
   1109 
   1110                 pu1_src += 4 * src_strd; /* pointer update */
   1111                 pu1_dst += 4 * dst_strd; /* pointer update */
   1112             }
   1113         }
   1114     }
   1115     else /* wd = multiple of 8 case */
   1116     {
   1117 
   1118         pu1_src_copy = pu1_src;
   1119         pu1_dst_copy = pu1_dst;
   1120 
   1121         for(col = 0; col < wd; col += 4)
   1122         {
   1123 
   1124             pu1_src = pu1_src_copy + col;
   1125             pu1_dst = pu1_dst_copy + col;
   1126 
   1127             PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0)
   1128             PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0)
   1129             PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0)
   1130             PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0)
   1131 
   1132 
   1133             /*load 8 pixel values */
   1134             s2_0_16x8b  = _mm_loadl_epi64((__m128i *)(pu1_src + (-3 * src_strd)));
   1135 
   1136             /*load 8 pixel values */
   1137             s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-2 * src_strd)));
   1138 
   1139             s3_0_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
   1140 
   1141             s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
   1142 
   1143             /*load 8 pixel values */
   1144             s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
   1145 
   1146             /*load 8 pixel values */
   1147             s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
   1148 
   1149             s3_1_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
   1150 
   1151             s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
   1152 
   1153             /*load 8 pixel values */
   1154             s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
   1155 
   1156             /*load 8 pixel values */
   1157             s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
   1158 
   1159             s3_2_16x8b = _mm_unpacklo_epi8(s2_4_16x8b, s2_5_16x8b);
   1160 
   1161             s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
   1162 
   1163             /*load 8 pixel values */
   1164             s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
   1165 
   1166             /*load 8 pixel values */
   1167             s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (4 * src_strd)));
   1168 
   1169             s3_3_16x8b = _mm_unpacklo_epi8(s2_6_16x8b, s2_7_16x8b);
   1170 
   1171             s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
   1172 
   1173             s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
   1174             s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
   1175             s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
   1176 
   1177             s7_8x16b = _mm_add_epi16(s6_8x16b, offset_8x16b);
   1178 
   1179             /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   1180             s8_8x16b = _mm_srai_epi16(s7_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   1181 
   1182             /* i2_tmp = CLIP_U8(i2_tmp);*/
   1183             s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b);
   1184             s5_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst));
   1185             s6_8x16b =  _mm_and_si128(s5_8x16b, mask_low_32b);
   1186             s7_8x16b =  _mm_and_si128(s9_8x16b, mask_high_96b);
   1187             s8_8x16b = _mm_or_si128(s6_8x16b, s7_8x16b);
   1188             /* store 8 8-bit output values  */
   1189             /* Store the output pixels of row 0*/
   1190             _mm_storel_epi64((__m128i *)(pu1_dst), s8_8x16b);
   1191 
   1192             /* ROW 2*/
   1193             s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
   1194             s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
   1195             s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
   1196 
   1197             /*load 8 pixel values */
   1198             s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (5 * src_strd)));
   1199 
   1200             /*load 8 pixel values */
   1201             s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (6 * src_strd)));
   1202 
   1203             s3_4_16x8b = _mm_unpacklo_epi8(s2_8_16x8b, s2_9_16x8b);
   1204 
   1205             s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
   1206 
   1207             s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
   1208             s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
   1209             s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
   1210 
   1211             s27_8x16b = _mm_add_epi16(s26_8x16b, offset_8x16b);
   1212 
   1213             /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   1214             s28_8x16b = _mm_srai_epi16(s27_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   1215 
   1216             /* i2_tmp = CLIP_U8(i2_tmp);*/
   1217             s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b);
   1218             s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (2 * dst_strd)));
   1219             s26_8x16b =  _mm_and_si128(s25_8x16b, mask_low_32b);
   1220             s27_8x16b =  _mm_and_si128(s29_8x16b, mask_high_96b);
   1221             s28_8x16b = _mm_or_si128(s26_8x16b, s27_8x16b);
   1222             /* store 8 8-bit output values  */
   1223             /* Store the output pixels of row 2*/
   1224             _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s28_8x16b);
   1225 
   1226 
   1227             /*ROW 1*/
   1228             s4_0_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
   1229 
   1230             s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
   1231 
   1232             s4_1_16x8b = _mm_unpacklo_epi8(s2_3_16x8b, s2_4_16x8b);
   1233 
   1234             s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
   1235 
   1236             s4_2_16x8b = _mm_unpacklo_epi8(s2_5_16x8b, s2_6_16x8b);
   1237 
   1238             s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
   1239 
   1240             s4_3_16x8b = _mm_unpacklo_epi8(s2_7_16x8b, s2_8_16x8b);
   1241 
   1242             s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
   1243 
   1244             s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
   1245             s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
   1246             s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
   1247 
   1248             s17_8x16b = _mm_add_epi16(s16_8x16b, offset_8x16b);
   1249 
   1250             /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   1251             s18_8x16b = _mm_srai_epi16(s17_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   1252 
   1253             /* i2_tmp = CLIP_U8(i2_tmp);*/
   1254             s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b);
   1255             s15_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
   1256             s16_8x16b =  _mm_and_si128(s15_8x16b, mask_low_32b);
   1257             s17_8x16b =  _mm_and_si128(s19_8x16b, mask_high_96b);
   1258             s18_8x16b = _mm_or_si128(s16_8x16b, s17_8x16b);
   1259             /* store 8 8-bit output values  */
   1260             /* Store the output pixels of row 1*/
   1261             _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd)), s18_8x16b);
   1262 
   1263 
   1264             /* ROW 3*/
   1265             s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
   1266             s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
   1267             s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
   1268 
   1269             /*load 8 pixel values */
   1270             s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (7 * src_strd)));
   1271 
   1272             s4_4_16x8b = _mm_unpacklo_epi8(s2_9_16x8b, s2_10_16x8b);
   1273 
   1274             s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
   1275 
   1276             s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
   1277             s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
   1278             s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
   1279 
   1280             s37_8x16b = _mm_add_epi16(s36_8x16b, offset_8x16b);
   1281 
   1282             /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   1283             s38_8x16b = _mm_srai_epi16(s37_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   1284 
   1285             /* i2_tmp = CLIP_U8(i2_tmp);*/
   1286             s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b);
   1287 
   1288             s35_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (3 * dst_strd)));
   1289             s36_8x16b =  _mm_and_si128(s35_8x16b, mask_low_32b);
   1290             s37_8x16b =  _mm_and_si128(s39_8x16b, mask_high_96b);
   1291             s38_8x16b = _mm_or_si128(s36_8x16b, s37_8x16b);
   1292 
   1293             /* store 8 8-bit output values  */
   1294             /* Store the output pixels of row 2*/
   1295             _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s38_8x16b);
   1296 
   1297             pu1_src += (8 * src_strd);
   1298             pu1_dst += (4 * dst_strd);
   1299 
   1300             for(row = 4; row < ht; row += 4)
   1301             {
   1302 
   1303                 PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0)
   1304                 PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0)
   1305                 PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0)
   1306                 PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0)
   1307 
   1308 
   1309                 s3_0_16x8b = s3_2_16x8b;
   1310                 s3_1_16x8b = s3_3_16x8b;
   1311                 s3_2_16x8b = s3_4_16x8b;
   1312 
   1313                 s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
   1314                 s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
   1315                 s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
   1316 
                /*load 8 pixel values from (cur_row + 4)th row*/
   1318                 s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
   1319 
   1320                 s3_3_16x8b = _mm_unpacklo_epi8(s2_10_16x8b, s2_0_16x8b);
   1321                 s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
   1322 
   1323                 s4_0_16x8b = s4_2_16x8b;
   1324                 s4_1_16x8b = s4_3_16x8b;
   1325                 s4_2_16x8b = s4_4_16x8b;
   1326 
   1327                 s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
   1328                 s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
   1329                 s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
   1330 
   1331                 s7_8x16b = _mm_add_epi16(s6_8x16b, offset_8x16b);
   1332 
   1333                 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   1334                 s8_8x16b = _mm_srai_epi16(s7_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   1335 
   1336                 /* i2_tmp = CLIP_U8(i2_tmp);*/
   1337                 s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b);
   1338 
   1339                 s5_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst));
   1340                 s6_8x16b =  _mm_and_si128(s5_8x16b, mask_low_32b);
   1341                 s7_8x16b =  _mm_and_si128(s9_8x16b, mask_high_96b);
   1342                 s8_8x16b = _mm_or_si128(s6_8x16b, s7_8x16b);
   1343 
   1344                 /* store 8 8-bit output values  */
   1345                 /* Store the output pixels of row 4*/
   1346                 _mm_storel_epi64((__m128i *)(pu1_dst), s8_8x16b);
   1347 
   1348                 /* row + 2*/
   1349                 s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
   1350                 s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
   1351                 s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
   1352 
                /*load 8 pixel values from (cur_row + 5)th row*/
   1354                 s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
   1355 
                /*load 8 pixel values from (cur_row + 6)th row*/
   1357                 s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
   1358 
   1359                 /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/
   1360                 s3_4_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
   1361 
   1362                 s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
   1363 
   1364                 s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
   1365                 s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
   1366                 s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
   1367 
   1368                 s27_8x16b = _mm_add_epi16(s26_8x16b, offset_8x16b);
   1369 
   1370                 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   1371                 s28_8x16b = _mm_srai_epi16(s27_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   1372 
   1373                 /* i2_tmp = CLIP_U8(i2_tmp);*/
   1374                 s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b);
   1375 
   1376                 s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (2 * dst_strd)));
   1377                 s26_8x16b =  _mm_and_si128(s25_8x16b, mask_low_32b);
   1378                 s27_8x16b =  _mm_and_si128(s29_8x16b, mask_high_96b);
   1379                 s28_8x16b = _mm_or_si128(s26_8x16b, s27_8x16b);
   1380 
   1381                 /* store 8 8-bit output values  */
   1382                 /* Store the output pixels of (cur_row+2)*/
   1383                 _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s28_8x16b);
   1384 
   1385 
   1386                 /*row + 1*/
   1387                 s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
   1388                 s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
   1389                 s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
   1390 
   1391                 /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/
   1392                 s4_3_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
   1393                 s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
   1394 
   1395                 s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
   1396                 s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
   1397                 s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
   1398 
   1399                 s17_8x16b = _mm_add_epi16(s16_8x16b, offset_8x16b);
   1400 
   1401                 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   1402                 s18_8x16b = _mm_srai_epi16(s17_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   1403 
   1404                 /* i2_tmp = CLIP_U8(i2_tmp);*/
   1405                 s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b);
   1406 
   1407                 s15_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
   1408                 s16_8x16b =  _mm_and_si128(s15_8x16b, mask_low_32b);
   1409                 s17_8x16b =  _mm_and_si128(s19_8x16b, mask_high_96b);
   1410                 s18_8x16b = _mm_or_si128(s16_8x16b, s17_8x16b);
   1411 
   1412                 /* store 8 8-bit output values  */
   1413                 /* Store the output pixels of (cur_row + 1)*/
   1414                 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s18_8x16b);
   1415 
   1416 
   1417                 /* row + 3*/
   1418                 s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
   1419                 s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
   1420                 s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
   1421 
                /*load 8 pixel values from (cur_row + 7)th row*/
   1423                 s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
   1424 
   1425                 /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/
   1426                 s4_4_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
   1427 
   1428                 s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
   1429 
   1430                 s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
   1431                 s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
   1432                 s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
   1433 
   1434                 s37_8x16b = _mm_add_epi16(s36_8x16b, offset_8x16b);
   1435 
   1436                 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   1437                 s38_8x16b = _mm_srai_epi16(s37_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   1438 
   1439                 /* i2_tmp = CLIP_U8(i2_tmp);*/
   1440                 s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b);
   1441 
   1442                 s35_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (3 * dst_strd)));
   1443                 s36_8x16b =  _mm_and_si128(s35_8x16b, mask_low_32b);
   1444                 s37_8x16b =  _mm_and_si128(s39_8x16b, mask_high_96b);
   1445                 s38_8x16b = _mm_or_si128(s36_8x16b, s37_8x16b);
   1446 
   1447                 /* store 8 8-bit output values  */
   1448                 /* Store the output pixels of (cur_row+3)*/
   1449                 _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s38_8x16b);
   1450 
   1451                 s2_10_16x8b = s2_3_16x8b;
   1452 
   1453                 pu1_src += 4 * src_strd; /* pointer update */
   1454                 pu1_dst += 4 * dst_strd; /* pointer update */
   1455             }
   1456         }
   1457     }
   1458 }
   1459 
   1460 
   1461 /**
   1462 *******************************************************************************
   1463 *
   1464 * @brief
   1465 *       Interprediction luma filter for copy 16bit output
   1466 *
   1467 * @par Description:
   1468 *    Copies the array of width 'wd' and height 'ht' from the  location pointed
   1469 *    by 'src' to the location pointed by 'dst' The output is upshifted by 6
   1470 *    bits and is used as input for vertical filtering or weighted prediction
   1471 *
   1472 * @param[in] pu1_src
   1473 *  UWORD8 pointer to the source
   1474 *
   1475 * @param[out] pi2_dst
   1476 *  WORD16 pointer to the destination
   1477 *
   1478 * @param[in] src_strd
   1479 *  integer source stride
   1480 *
   1481 * @param[in] dst_strd
   1482 *  integer destination stride
   1483 *
   1484 * @param[in] pi1_coeff
   1485 *  WORD8 pointer to the filter coefficients
   1486 *
   1487 * @param[in] ht
   1488 *  integer height of the array
   1489 *
   1490 * @param[in] wd
   1491 *  integer width of the array
   1492 *
   1493 * @returns
   1494 *
   1495 * @remarks
   1496 *  None
   1497 *
   1498 *******************************************************************************
   1499 */
   1500 
   1501 void ihevc_inter_pred_luma_copy_w16out_ssse3(UWORD8 *pu1_src,
   1502                                              WORD16 *pi2_dst,
   1503                                              WORD32 src_strd,
   1504                                              WORD32 dst_strd,
   1505                                              WORD8 *pi1_coeff,
   1506                                              WORD32 ht,
   1507                                              WORD32 wd)
   1508 {
   1509     WORD32 row, col;
   1510     __m128i  s3, zero_8x16b;
   1511 
   1512     ASSERT(wd % 2 == 0); /* checking assumption*/
   1513     ASSERT(ht % 2 == 0); /* checking assumption*/
   1514     UNUSED(pi1_coeff);
   1515     zero_8x16b = _mm_setzero_si128();
   1516 /*  outer for loop starts from here */
   1517     if(wd % 8 == 0) /* wd = multiple of 8 case */
   1518     {
   1519         for(row = 0; row < ht; row += 2)
   1520         {
   1521             int offset = 0;
   1522             for(col = 0; col < wd; col += 8)
   1523             {
   1524 /* row =0 */
   1525                 /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
   1526                 s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col] */
   1527                 s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
   1528 
   1529                 s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
   1530 
   1531                 /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */
   1532                 _mm_store_si128((__m128i *)(pi2_dst + offset), s3);
   1533 
   1534 /* row =1 */
   1535                 /*load 16 pixel values from 271:256 pos. relative to cur. pos.*/
   1536                 s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset)); /* pu1_src[col] */
   1537                 s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
   1538 
   1539                 s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
   1540 
   1541                 /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */
   1542                 _mm_store_si128((__m128i *)(pi2_dst + dst_strd + offset), s3);
   1543 
   1544                 offset += 8; /* To pointer update */
   1545             } /* inner for loop ends here(8-output values in single iteration) */
   1546 
   1547             pu1_src += 2 * src_strd; /* pointer update */
   1548             pi2_dst += 2 * dst_strd; /* pointer update */
   1549         }
   1550     }
   1551     else /* wd = multiple of 4 case */
   1552     {
   1553         for(row = 0; row < ht; row += 2)
   1554         {
   1555             int offset = 0;
   1556             for(col = 0; col < wd; col += 4)
   1557             {
   1558 /* row =0 */
   1559                 /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
   1560                 s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col] */
   1561                 s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
   1562 
   1563                 s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
   1564 
   1565                 /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */
   1566                 _mm_storel_epi64((__m128i *)(pi2_dst + offset), s3);
   1567 
   1568 /* row =1 */
   1569                 /*load 16 pixel values from 271:256 pos. relative to cur. pos.*/
   1570                 s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset)); /* pu1_src[col] */
   1571                 s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
   1572 
   1573                 s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */
   1574 
   1575                 /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */
   1576                 _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), s3);
   1577                 offset += 4; /* To pointer update */
   1578             } /* inner for loop ends here(4-output values in single iteration) */
   1579 
   1580             pu1_src += 2 * src_strd; /* pointer update */
   1581             pi2_dst += 2 * dst_strd; /* pointer update */
   1582         }
   1583     }
   1584 
   1585 }
   1586 
   1587 /**
   1588 *******************************************************************************
   1589 *
   1590 * @brief
   1591 *     Interprediction luma filter for horizontal 16bit output
   1592 *
   1593 * @par Description:
   1594 *    Applies a horizontal filter with coefficients pointed to  by 'pi1_coeff'
   1595 *    to the elements pointed by 'pu1_src' and  writes to the location pointed
   1596 *    by 'pu1_dst'  No downshifting or clipping is done and the output is  used
   1597 *    as an input for vertical filtering or weighted  prediction
   1598 *
   1599 * @param[in] pu1_src
   1600 *  UWORD8 pointer to the source
   1601 *
   1602 * @param[out] pi2_dst
   1603 *  WORD16 pointer to the destination
   1604 *
   1605 * @param[in] src_strd
   1606 *  integer source stride
   1607 *
   1608 * @param[in] dst_strd
   1609 *  integer destination stride
   1610 *
   1611 * @param[in] pi1_coeff
   1612 *  WORD8 pointer to the filter coefficients
   1613 *
   1614 * @param[in] ht
   1615 *  integer height of the array
   1616 *
   1617 * @param[in] wd
   1618 *  integer width of the array
   1619 *
   1620 * @returns
   1621 *
   1622 * @remarks
   1623 *  None
   1624 *
   1625 *******************************************************************************
   1626 */
   1627 void ihevc_inter_pred_luma_horz_w16out_ssse3(UWORD8 *pu1_src,
   1628                                              WORD16 *pi2_dst,
   1629                                              WORD32 src_strd,
   1630                                              WORD32 dst_strd,
   1631                                              WORD8 *pi1_coeff,
   1632                                              WORD32 ht,
   1633                                              WORD32 wd)
   1634 {
   1635     WORD32 row, col;
   1636 
   1637     /* all 128 bit registers are named with a suffix mxnb, where m is the */
   1638     /* number of n bits packed in the register                            */
   1639 
   1640     __m128i src_temp1_16x8b, src_temp2_16x8b, src_temp3_16x8b, src_temp4_16x8b, src_temp5_16x8b, src_temp6_16x8b;
   1641     __m128i src_temp11_16x8b, src_temp12_16x8b, src_temp13_16x8b, src_temp14_16x8b, src_temp15_16x8b, src_temp16_16x8b;
   1642     __m128i res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b, res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b;
   1643     __m128i res_temp11_8x16b, res_temp12_8x16b, res_temp13_8x16b, res_temp14_8x16b, res_temp15_8x16b, res_temp16_8x16b;
   1644     __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b;
   1645     __m128i control_mask_1_8x16b, control_mask_2_8x16b, control_mask_3_8x16b, control_mask_4_8x16b;
   1646 
   1647     ASSERT(wd % 4 == 0); /* checking assumption*/
   1648 
   1649     PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
   1650     PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
   1651     PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
   1652     PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
   1653     PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
   1654     PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
   1655 
   1656     /* load 8 8-bit coefficients and convert 8-bit into 16-bit  */
   1657     src_temp1_16x8b = _mm_loadl_epi64((__m128i *)pi1_coeff);
   1658 
   1659 
   1660     control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */
   1661     control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */
   1662     control_mask_3_8x16b = _mm_set1_epi32(0x05040504); /* Control Mask register */
   1663     control_mask_4_8x16b = _mm_set1_epi32(0x07060706); /* Control Mask register */
   1664 
   1665     coeff0_1_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_1_8x16b);  /* pi1_coeff[4] */
   1666     coeff2_3_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_2_8x16b);  /* pi1_coeff[4] */
   1667 
   1668     coeff4_5_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_3_8x16b);  /* pi1_coeff[4] */
   1669     coeff6_7_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_4_8x16b);  /* pi1_coeff[4] */
   1670 
   1671     if(0 == (ht & 1)) /* ht multiple of 2 case */
   1672     {
   1673 
   1674         if(0 == (wd & 7)) /* wd = multiple of 8 case */
   1675         {
   1676             for(row = 0; row < ht; row += 2)
   1677             {
   1678 
   1679                 int offset = 0;
   1680 
   1681                 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
   1682                 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
   1683 
   1684 
   1685 
   1686                 for(col = 0; col < wd; col += 8)
   1687                 {
   1688                     /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
   1689                     src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));             /* row = 0 */
   1690                     src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */
   1691 
   1692                     src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
   1693                     /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
   1694                     src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
   1695                     res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
   1696                     /* row = 0 */
   1697                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
   1698                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
   1699                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
   1700                     src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
   1701                     res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
   1702 
   1703                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
   1704                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
   1705                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
   1706                     src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
   1707                     res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
   1708 
   1709                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
   1710                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
   1711                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
   1712                     src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
   1713                     res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
   1714 
   1715                     res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
   1716                     res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
   1717                     res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
   1718 
   1719                     src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1);                   /* row = 1 */
   1720                     /* pix. |5:-2|4:-3| to do two dot-products at same time*/                 /* row = 1 */
   1721                     src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
   1722                     res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b);   /* row = 1 */
   1723                                                                                               /* row = 1 */
   1724                     src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
   1725                     src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
   1726                     /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
   1727                     src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
   1728                     res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b);   /* row = 1 */
   1729 
   1730                     src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
   1731                     src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
   1732                     /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
   1733                     src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
   1734                     res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b);   /* row = 1 */
   1735 
   1736                     src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
   1737                     src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
   1738                     /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
   1739                     src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
   1740                     res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b);   /* row = 1 */
   1741 
   1742                     res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
   1743                     res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b);
   1744                     res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b);
   1745 
   1746                     /* to store the 1st 4 pixels res. */
   1747                     _mm_store_si128((__m128i *)(pi2_dst + offset), res_temp5_8x16b);
   1748                     _mm_store_si128((__m128i *)(pi2_dst + dst_strd + offset), res_temp15_8x16b);
   1749 
   1750                     offset += 8; /* To pointer updates*/
   1751                 }
   1752                 pu1_src += 2 * src_strd;  /* pointer updates*/
   1753                 pi2_dst += 2 * dst_strd;  /* pointer updates*/
   1754             }
   1755         }
   1756         else /* wd = multiple of 4 case */
   1757         {
   1758             for(row = 0; row < ht; row += 2)
   1759             {
   1760                 int offset = 0;
   1761 
   1762                 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
   1763                 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
   1764 
   1765 
   1766                 for(col = 0; col < wd; col += 4)
   1767                 {
   1768                     /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
   1769                     src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));             /* row = 0 */
   1770                     src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */
   1771 
   1772                     src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
   1773                     /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
   1774                     src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
   1775                     res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
   1776                     /* row = 0 */
   1777                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
   1778                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
   1779                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
   1780                     src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
   1781                     res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
   1782 
   1783                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
   1784                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
   1785                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
   1786                     src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
   1787                     res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
   1788 
   1789                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
   1790                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
   1791                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
   1792                     src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
   1793                     res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
   1794 
   1795                     res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
   1796                     res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
   1797                     res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
   1798 
   1799                     src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1);                   /* row = 1 */
   1800                     /* pix. |5:-2|4:-3| to do two dot-products at same time*/                 /* row = 1 */
   1801                     src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
   1802                     res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b);   /* row = 1 */
   1803                                                                                               /* row = 1 */
   1804                     src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
   1805                     src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
   1806                     /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
   1807                     src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
   1808                     res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b);   /* row = 1 */
   1809 
   1810                     src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
   1811                     src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
   1812                     /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
   1813                     src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
   1814                     res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b);   /* row = 1 */
   1815 
   1816                     src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
   1817                     src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
   1818                     /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
   1819                     src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
   1820                     res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b);   /* row = 1 */
   1821 
   1822                     res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
   1823                     res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b);
   1824                     res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b);
   1825 
   1826                     /* to store the 1st 4 pixels res. */
   1827                     _mm_storel_epi64((__m128i *)(pi2_dst + offset), res_temp5_8x16b);
   1828                     _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), res_temp15_8x16b);
   1829 
   1830                     offset += 4; /* To pointer updates*/
   1831                 }
   1832                 pu1_src += 2 * src_strd;  /* Pointer update */
   1833                 pi2_dst += 2 * dst_strd;  /* Pointer update */
   1834             }
   1835         }
   1836     }
   1837     else /* odd ht */
   1838     {
   1839         if(0 == (wd & 7)) /* multiple of 8 case */
   1840         {
   1841             for(row = 0; row < ht; row++)
   1842             {
   1843                 int offset = 0;
   1844 
   1845                 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
   1846 
   1847 
   1848                 for(col = 0; col < wd; col += 8)
   1849                 {
   1850                     /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
   1851                     src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));  /* row = 0 */
   1852 
   1853                     src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
   1854                     /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
   1855                     src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
   1856                     res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
   1857                     /* row = 0 */
   1858                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
   1859                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
   1860                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
   1861                     src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
   1862                     res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
   1863 
   1864                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
   1865                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
   1866                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
   1867                     src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
   1868                     res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
   1869 
   1870                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
   1871                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
   1872                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
   1873                     src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
   1874                     res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
   1875 
   1876                     res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
   1877                     res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
   1878                     res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
   1879 
   1880                     /* to store the 1st 4 pixels res. */
   1881                     _mm_store_si128((__m128i *)(pi2_dst + offset), res_temp5_8x16b);
   1882 
   1883                     offset += 8; /* To pointer updates*/
   1884                 }
   1885                 pu1_src += src_strd;    /* pointer updates*/
   1886                 pi2_dst += dst_strd;    /* pointer updates*/
   1887             }
   1888         }
   1889         else  /* wd = multiple of 4 case */
   1890         {
   1891             for(row = 0; row < (ht - 1); row += 2)
   1892             {
   1893                 int offset = 0;
   1894 
   1895 
   1896                 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
   1897                 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
   1898 
   1899 
   1900                 for(col = 0; col < wd; col += 4)
   1901                 {
   1902                     /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
   1903                     src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));             /* row = 0 */
   1904                     src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */
   1905 
   1906                     src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
   1907                     /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
   1908                     src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
   1909                     res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
   1910                                                                                            /* row = 0 */
   1911                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
   1912                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
   1913                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
   1914                     src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
   1915                     res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
   1916 
   1917                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
   1918                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
   1919                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
   1920                     src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
   1921                     res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
   1922 
   1923                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
   1924                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
   1925                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
   1926                     src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
   1927                     res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
   1928 
   1929                     res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
   1930                     res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
   1931                     res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
   1932 
   1933                     src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1);                   /* row = 1 */
   1934                     /* pix. |5:-2|4:-3| to do two dot-products at same time*/                 /* row = 1 */
   1935                     src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
   1936                     res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b);   /* row = 1 */
   1937                                                                                               /* row = 1 */
   1938                     src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
   1939                     src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
   1940                     /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
   1941                     src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
   1942                     res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b);   /* row = 1 */
   1943 
   1944                     src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
   1945                     src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
   1946                     /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
   1947                     src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
   1948                     res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b);   /* row = 1 */
   1949 
   1950                     src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);                   /* row = 1 */
   1951                     src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2);                   /* row = 1 */
   1952                     /* pix. |7:0|6:-1| to do two dot-products at same time*/                  /* row = 1 */
   1953                     src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */
   1954                     res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b);   /* row = 1 */
   1955 
   1956                     res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);
   1957                     res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b);
   1958                     res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b);
   1959 
   1960                     /* to store the 1st 4 pixels res. */
   1961                     _mm_storel_epi64((__m128i *)(pi2_dst + offset), res_temp5_8x16b);
   1962                     _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), res_temp15_8x16b);
   1963 
   1964                     offset += 4; /* To pointer updates*/
   1965                 }
   1966                 pu1_src += 2 * src_strd;  /* Pointer update */
   1967                 pi2_dst += 2 * dst_strd;  /* Pointer update */
   1968             }
   1969             { /* last repeat at outside the loop */
   1970                 int offset = 0;
   1971                 for(col = 0; col < wd; col += 4)
   1972                 {
   1973                     /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/
   1974                     src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset));  /* row = 0 */
   1975 
   1976                     src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);                  /* row = 0 */
   1977                     /* pix. |5:-2|4:-3| to do two dot-products at same time*/              /* row = 0 */
   1978                     src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
   1979                     res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b);  /* row = 0 */
   1980                                                                                            /* row = 0 */
   1981                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
   1982                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
   1983                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
   1984                     src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
   1985                     res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b);  /* row = 0 */
   1986 
   1987                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
   1988                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
   1989                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
   1990                     src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
   1991                     res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b);  /* row = 0 */
   1992 
   1993                     src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);                  /* row = 0 */
   1994                     src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2);                  /* row = 0 */
   1995                     /* pix. |7:0|6:-1| to do two dot-products at same time*/               /* row = 0 */
   1996                     src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */
   1997                     res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b);  /* row = 0 */
   1998 
   1999                     res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);
   2000                     res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b);
   2001                     res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b);
   2002 
   2003                     /* to store the 1st 4 pixels res. */
   2004                     _mm_storel_epi64((__m128i *)(pi2_dst + offset), res_temp5_8x16b);
   2005 
   2006                     offset += 4; /* To pointer updates*/
   2007                 }
   2008             }
   2009         }
   2010     }
   2011 }
   2012 
   2013 /**
   2014 *******************************************************************************
   2015 *
   2016 * @brief
   2017 *      Interprediction luma filter for vertical 16bit output
   2018 *
   2019 * @par Description:
   2020 *    Applies a vertical filter with coefficients pointed to  by 'pi1_coeff' to
   2021 *    the elements pointed by 'pu1_src' and  writes to the location pointed by
   2022 *    'pu1_dst'  No downshifting or clipping is done and the output is  used as
   2023 *    an input for weighted prediction
   2024 *
   2025 * @param[in] pu1_src
   2026 *  UWORD8 pointer to the source
   2027 *
   2028 * @param[out] pi2_dst
   2029 *  WORD16 pointer to the destination
   2030 *
   2031 * @param[in] src_strd
   2032 *  integer source stride
   2033 *
   2034 * @param[in] dst_strd
   2035 *  integer destination stride
   2036 *
   2037 * @param[in] pi1_coeff
   2038 *  WORD8 pointer to the filter coefficients
   2039 *
   2040 * @param[in] ht
   2041 *  integer height of the array
   2042 *
   2043 * @param[in] wd
   2044 *  integer width of the array
   2045 *
   2046 * @returns
   2047 *
   2048 * @remarks
   2049 *  None
   2050 *
   2051 *******************************************************************************
   2052 */
   2053 void ihevc_inter_pred_luma_vert_w16out_ssse3(UWORD8 *pu1_src,
   2054                                              WORD16 *pi2_dst,
   2055                                              WORD32 src_strd,
   2056                                              WORD32 dst_strd,
   2057                                              WORD8 *pi1_coeff,
   2058                                              WORD32 ht,
   2059                                              WORD32 wd)
   2060 {
   2061     WORD32 row, col;
   2062     UWORD8 *pu1_src_copy;
   2063     WORD16 *pi2_dst_copy;
   2064     __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b;
   2065     __m128i s0_8x16b, s1_8x16b, s2_8x16b, s3_8x16b, s4_8x16b, s5_8x16b, s6_8x16b;
   2066     __m128i s2_0_16x8b, s2_1_16x8b, s2_2_16x8b, s2_3_16x8b, s2_4_16x8b, s2_5_16x8b, s2_6_16x8b, s2_7_16x8b, s2_8_16x8b, s2_9_16x8b, s2_10_16x8b;
   2067     __m128i s3_0_16x8b, s3_1_16x8b, s3_2_16x8b, s3_3_16x8b, s3_4_16x8b;
   2068     __m128i s4_0_16x8b, s4_1_16x8b, s4_2_16x8b, s4_3_16x8b, s4_4_16x8b;
   2069     __m128i s10_8x16b, s11_8x16b, s12_8x16b, s13_8x16b, s14_8x16b, s15_8x16b, s16_8x16b;
   2070     __m128i s20_8x16b, s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b, s26_8x16b;
   2071     __m128i s30_8x16b, s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b, s36_8x16b;
   2072 
   2073 
   2074     __m128i control_mask_1_8x16b, control_mask_2_8x16b, control_mask_3_8x16b, control_mask_4_8x16b;
   2075 
   2076 /* load 8 8-bit coefficients and convert 8-bit into 16-bit  */
   2077     s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);
   2078 
   2079     control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */
   2080     control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */
   2081     control_mask_3_8x16b = _mm_set1_epi32(0x05040504); /* Control Mask register */
   2082     control_mask_4_8x16b = _mm_set1_epi32(0x07060706); /* Control Mask register */
   2083 
   2084     coeff0_1_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_1_8x16b);  /* pi1_coeff[4] */
   2085     coeff2_3_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_2_8x16b);  /* pi1_coeff[4] */
   2086 
   2087     coeff4_5_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_3_8x16b);  /* pi1_coeff[4] */
   2088     coeff6_7_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_4_8x16b);  /* pi1_coeff[4] */
   2089 
   2090 
   2091 /*  outer for loop starts from here */
   2092     if((wd % 8) == 0)
   2093     { /* wd = multiple of 8 case */
   2094 
   2095         pu1_src_copy = pu1_src;
   2096         pi2_dst_copy = pi2_dst;
   2097 
   2098         for(col = 0; col < wd; col += 8)
   2099         {
   2100 
   2101             pu1_src = pu1_src_copy + col;
   2102             pi2_dst = pi2_dst_copy + col;
   2103 
   2104             PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0)
   2105             PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0)
   2106             PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0)
   2107             PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0)
   2108 
   2109             /*load 8 pixel values */
   2110             s2_0_16x8b  = _mm_loadl_epi64((__m128i *)(pu1_src + (-3 * src_strd)));
   2111 
   2112             /*load 8 pixel values */
   2113             s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-2 * src_strd)));
   2114 
   2115             s3_0_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
   2116 
   2117             s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
   2118 
   2119             /*load 8 pixel values */
   2120             s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
   2121 
   2122             /*load 8 pixel values */
   2123             s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
   2124 
   2125             s3_1_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
   2126 
   2127             s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
   2128 
   2129             /*load 8 pixel values */
   2130             s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
   2131 
   2132             /*load 8 pixel values */
   2133             s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
   2134 
   2135             s3_2_16x8b = _mm_unpacklo_epi8(s2_4_16x8b, s2_5_16x8b);
   2136 
   2137             s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
   2138 
   2139             /*load 8 pixel values */
   2140             s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
   2141 
   2142             /*load 8 pixel values */
   2143             s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (4 * src_strd)));
   2144 
   2145             s3_3_16x8b = _mm_unpacklo_epi8(s2_6_16x8b, s2_7_16x8b);
   2146 
   2147             s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
   2148 
   2149             s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
   2150             s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
   2151             s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
   2152 
   2153             /* store 8 8-bit output values  */
   2154             /* Store the output pixels of row 0*/
   2155             _mm_store_si128((__m128i *)(pi2_dst), s6_8x16b);
   2156 
   2157             /* ROW 2*/
   2158             s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
   2159             s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
   2160             s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
   2161 
   2162             /*load 8 pixel values */
   2163             s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (5 * src_strd)));
   2164 
   2165             /*load 8 pixel values */
   2166             s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (6 * src_strd)));
   2167 
   2168             s3_4_16x8b = _mm_unpacklo_epi8(s2_8_16x8b, s2_9_16x8b);
   2169 
   2170             s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
   2171 
   2172             s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
   2173             s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
   2174             s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
   2175 
    2176             /* store 8 16-bit output values  */
   2177             /* Store the output pixels of row 2*/
   2178             _mm_store_si128((__m128i *)(pi2_dst + (2 * dst_strd)), s26_8x16b);
   2179 
   2180 
   2181             /*ROW 1*/
   2182             s4_0_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
   2183 
   2184             s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
   2185 
   2186             s4_1_16x8b = _mm_unpacklo_epi8(s2_3_16x8b, s2_4_16x8b);
   2187 
   2188             s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
   2189 
   2190             s4_2_16x8b = _mm_unpacklo_epi8(s2_5_16x8b, s2_6_16x8b);
   2191 
   2192             s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
   2193 
   2194             s4_3_16x8b = _mm_unpacklo_epi8(s2_7_16x8b, s2_8_16x8b);
   2195 
   2196             s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
   2197 
   2198             s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
   2199             s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
   2200             s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
   2201 
   2202 
    2203             /* store 8 16-bit output values  */
   2204             /* Store the output pixels of row 1*/
   2205             _mm_store_si128((__m128i *)(pi2_dst + (dst_strd)), s16_8x16b);
   2206 
   2207 
   2208             /* ROW 3*/
   2209             s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
   2210             s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
   2211             s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
   2212 
   2213             /*load 8 pixel values */
   2214             s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (7 * src_strd)));
   2215 
   2216             s4_4_16x8b = _mm_unpacklo_epi8(s2_9_16x8b, s2_10_16x8b);
   2217 
   2218             s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
   2219 
   2220             s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
   2221             s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
   2222             s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
   2223 
   2224 
    2225             /* store 8 16-bit output values  */
    2226             /* Store the output pixels of row 3*/
   2227             _mm_store_si128((__m128i *)(pi2_dst + (3 * dst_strd)), s36_8x16b);
   2228 
   2229             pu1_src += (8 * src_strd);
   2230             pi2_dst += (4 * dst_strd);
   2231 
   2232             for(row = 4; row < ht; row += 4)
   2233             {
   2234 
   2235                 PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
   2236                 PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
   2237                 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
   2238                 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
   2239 
   2240                 s3_0_16x8b = s3_2_16x8b;
   2241                 s3_1_16x8b = s3_3_16x8b;
   2242                 s3_2_16x8b = s3_4_16x8b;
   2243 
   2244                 s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
   2245                 s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
   2246                 s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
   2247 
   2248                 /*load 8 pixel values from (cur_row + 4)th row*/
   2249                 s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
   2250 
   2251                 s3_3_16x8b = _mm_unpacklo_epi8(s2_10_16x8b, s2_0_16x8b);
   2252                 s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
   2253 
   2254                 s4_0_16x8b = s4_2_16x8b;
   2255                 s4_1_16x8b = s4_3_16x8b;
   2256                 s4_2_16x8b = s4_4_16x8b;
   2257 
   2258                 s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
   2259                 s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
   2260                 s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
   2261 
    2262                 /* store 8 16-bit output values  */
   2263                 /* Store the output pixels of row 4*/
   2264                 _mm_store_si128((__m128i *)(pi2_dst), s6_8x16b);
   2265 
   2266                 /* row + 2*/
   2267                 s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
   2268                 s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
   2269                 s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
   2270 
   2271                 /*load 8 pixel values from (cur_row + 5)th row*/
   2272                 s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
   2273 
   2274                 /*load 8 pixel values from (cur_row + 6)th row*/
   2275                 s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
   2276 
   2277                 /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/
   2278                 s3_4_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
   2279 
   2280                 s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
   2281 
   2282                 s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
   2283                 s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
   2284                 s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
   2285 
    2286                 /* store 8 16-bit output values  */
   2287                 /* Store the output pixels of (cur_row+2)*/
   2288                 _mm_store_si128((__m128i *)(pi2_dst + (2 * dst_strd)), s26_8x16b);
   2289 
   2290 
   2291                 /*row + 1*/
   2292                 s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
   2293                 s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
   2294                 s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
   2295 
   2296                 /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/
   2297                 s4_3_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
   2298                 s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
   2299 
   2300                 s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
   2301                 s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
   2302                 s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
   2303 
   2304 
    2305                 /* store 8 16-bit output values  */
   2306                 /* Store the output pixels of (cur_row + 1)*/
   2307                 _mm_store_si128((__m128i *)(pi2_dst + dst_strd), s16_8x16b);
   2308 
   2309 
   2310                 /* row + 3*/
   2311                 s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
   2312                 s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
   2313                 s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
   2314 
   2315                 /*load 8 pixel values from (cur_row + 7)th row*/
   2316                 s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
   2317 
   2318                 /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/
   2319                 s4_4_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
   2320 
   2321                 s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
   2322 
   2323                 s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
   2324                 s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
   2325                 s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
   2326 
    2327                 /* store 8 16-bit output values  */
   2328                 /* Store the output pixels of (cur_row+3)*/
   2329                 _mm_store_si128((__m128i *)(pi2_dst + (3 * dst_strd)), s36_8x16b);
   2330 
   2331                 s2_10_16x8b = s2_3_16x8b;
   2332 
   2333 
   2334                 pu1_src += 4 * src_strd; /* pointer update */
   2335                 pi2_dst += 4 * dst_strd; /* pointer update */
   2336             }
   2337         }
   2338     }
    2339     else /* wd = multiple of 4 case */
   2340     {
   2341 
   2342         pu1_src_copy = pu1_src;
   2343         pi2_dst_copy = pi2_dst;
   2344 
   2345         for(col = 0; col < wd; col += 4)
   2346         {
   2347 
   2348             pu1_src = pu1_src_copy + col;
   2349             pi2_dst = pi2_dst_copy + col;
   2350 
   2351             PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0)
   2352             PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0)
   2353             PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0)
   2354             PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0)
   2355 
   2356             /*load 8 pixel values */
   2357             s2_0_16x8b  = _mm_loadl_epi64((__m128i *)(pu1_src + (-3 * src_strd)));
   2358 
   2359             /*load 8 pixel values */
   2360             s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-2 * src_strd)));
   2361 
   2362             s3_0_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
   2363 
   2364             s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
   2365 
   2366             /*load 8 pixel values */
   2367             s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
   2368 
   2369             /*load 8 pixel values */
   2370             s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
   2371 
   2372             s3_1_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
   2373 
   2374             s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
   2375 
   2376             /*load 8 pixel values */
   2377             s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
   2378 
   2379             /*load 8 pixel values */
   2380             s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
   2381 
   2382             s3_2_16x8b = _mm_unpacklo_epi8(s2_4_16x8b, s2_5_16x8b);
   2383 
   2384             s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
   2385 
   2386             /*load 8 pixel values */
   2387             s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
   2388 
   2389             /*load 8 pixel values */
   2390             s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (4 * src_strd)));
   2391 
   2392             s3_3_16x8b = _mm_unpacklo_epi8(s2_6_16x8b, s2_7_16x8b);
   2393 
   2394             s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
   2395 
   2396             s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
   2397             s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
   2398             s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
   2399 
    2400             /* store 4 16-bit output values  */
   2401             /* Store the output pixels of row 0*/
   2402             _mm_storel_epi64((__m128i *)(pi2_dst), s6_8x16b);
   2403 
   2404             /* ROW 2*/
   2405             s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
   2406             s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
   2407             s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
   2408 
   2409             /*load 8 pixel values */
   2410             s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (5 * src_strd)));
   2411 
   2412             /*load 8 pixel values */
   2413             s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (6 * src_strd)));
   2414 
   2415             s3_4_16x8b = _mm_unpacklo_epi8(s2_8_16x8b, s2_9_16x8b);
   2416 
   2417             s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
   2418 
   2419             s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
   2420             s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
   2421             s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
   2422 
    2423             /* store 4 16-bit output values  */
   2424             /* Store the output pixels of row 2*/
   2425             _mm_storel_epi64((__m128i *)(pi2_dst + (2 * dst_strd)), s26_8x16b);
   2426 
   2427 
   2428             /*ROW 1*/
   2429             s4_0_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
   2430 
   2431             s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
   2432 
   2433             s4_1_16x8b = _mm_unpacklo_epi8(s2_3_16x8b, s2_4_16x8b);
   2434 
   2435             s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
   2436 
   2437             s4_2_16x8b = _mm_unpacklo_epi8(s2_5_16x8b, s2_6_16x8b);
   2438 
   2439             s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
   2440 
   2441             s4_3_16x8b = _mm_unpacklo_epi8(s2_7_16x8b, s2_8_16x8b);
   2442 
   2443             s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
   2444 
   2445             s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
   2446             s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
   2447             s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
   2448 
   2449 
    2450             /* store 4 16-bit output values  */
   2451             /* Store the output pixels of row 1*/
   2452             _mm_storel_epi64((__m128i *)(pi2_dst + (dst_strd)), s16_8x16b);
   2453 
   2454 
   2455             /* ROW 3*/
   2456             s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
   2457             s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
   2458             s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
   2459 
   2460             /*load 8 pixel values */
   2461             s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (7 * src_strd)));
   2462 
   2463             s4_4_16x8b = _mm_unpacklo_epi8(s2_9_16x8b, s2_10_16x8b);
   2464 
   2465             s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
   2466 
   2467             s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
   2468             s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
   2469             s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
   2470 
    2471             /* store 4 16-bit output values  */
    2472             /* Store the output pixels of row 3*/
   2473             _mm_storel_epi64((__m128i *)(pi2_dst + (3 * dst_strd)), s36_8x16b);
   2474 
   2475             pu1_src += (8 * src_strd);
   2476             pi2_dst += (4 * dst_strd);
   2477 
   2478             for(row = 4; row < ht; row += 4)
   2479             {
   2480 
   2481                 PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
   2482                 PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
   2483                 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
   2484                 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
   2485 
   2486                 s3_0_16x8b = s3_2_16x8b;
   2487                 s3_1_16x8b = s3_3_16x8b;
   2488                 s3_2_16x8b = s3_4_16x8b;
   2489 
   2490                 s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b);
   2491                 s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b);
   2492                 s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b);
   2493 
   2494                 /*load 8 pixel values from (cur_row + 4)th row*/
   2495                 s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
   2496 
   2497                 s3_3_16x8b = _mm_unpacklo_epi8(s2_10_16x8b, s2_0_16x8b);
   2498                 s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b);
   2499 
   2500                 s4_0_16x8b = s4_2_16x8b;
   2501                 s4_1_16x8b = s4_3_16x8b;
   2502                 s4_2_16x8b = s4_4_16x8b;
   2503 
   2504                 s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b);
   2505                 s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b);
   2506                 s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b);
   2507 
    2508                 /* store 4 16-bit output values  */
   2509                 /* Store the output pixels of row 4*/
   2510                 _mm_storel_epi64((__m128i *)(pi2_dst), s6_8x16b);
   2511 
   2512                 /* row + 2*/
   2513                 s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b);
   2514                 s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b);
   2515                 s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b);
   2516 
   2517                 /*load 8 pixel values from (cur_row + 5)th row*/
   2518                 s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
   2519 
   2520                 /*load 8 pixel values from (cur_row + 6)th row*/
   2521                 s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
   2522 
   2523                 /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/
   2524                 s3_4_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b);
   2525 
   2526                 s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b);
   2527 
   2528                 s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b);
   2529                 s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b);
   2530                 s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b);
   2531 
    2532                 /* store 4 16-bit output values  */
   2533                 /* Store the output pixels of (cur_row+2)*/
   2534                 _mm_storel_epi64((__m128i *)(pi2_dst + (2 * dst_strd)), s26_8x16b);
   2535 
   2536 
   2537                 /*row + 1*/
   2538                 s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b);
   2539                 s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b);
   2540                 s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b);
   2541 
   2542                 /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/
   2543                 s4_3_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b);
   2544                 s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b);
   2545 
   2546                 s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b);
   2547                 s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b);
   2548                 s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b);
   2549 
    2550                 /* store 4 16-bit output values  */
   2551                 /* Store the output pixels of (cur_row + 1)*/
   2552                 _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd), s16_8x16b);
   2553 
   2554 
   2555                 /* row + 3*/
   2556                 s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b);
   2557                 s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b);
   2558                 s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b);
   2559 
   2560                 /*load 8 pixel values from (cur_row + 7)th row*/
   2561                 s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
   2562 
   2563                 /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/
   2564                 s4_4_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b);
   2565 
   2566                 s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b);
   2567 
   2568                 s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b);
   2569                 s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b);
   2570                 s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b);
   2571 
    2572                 /* store 4 16-bit output values  */
   2573                 /* Store the output pixels of (cur_row+3)*/
   2574                 _mm_storel_epi64((__m128i *)(pi2_dst + (3 * dst_strd)), s36_8x16b);
   2575 
   2576                 s2_10_16x8b = s2_3_16x8b;
   2577 
   2578                 pu1_src += 4 * src_strd; /* pointer update */
   2579                 pi2_dst += 4 * dst_strd; /* pointer update */
   2580             }
   2581         }
   2582     }
   2583 }
   2584 
   2585 /**
   2586 *******************************************************************************
   2587 *
   2588 * @brief
   2589 *
   2590 *        Luma vertical filter for 16bit input.
   2591 *
   2592 * @par Description:
   2593 *   Applies a vertical filter with coefficients pointed to  by 'pi1_coeff' to
    2594 *   the elements pointed by 'pi2_src' and  writes to the location pointed by
   2595 *   'pu1_dst'  Input is 16 bits  The filter output is downshifted by 12 and
   2596 *   clipped to lie  between 0 and 255
   2597 *
   2598 * @param[in] pi2_src
   2599 *  WORD16 pointer to the source
   2600 *
   2601 * @param[out] pu1_dst
   2602 *  UWORD8 pointer to the destination
   2603 *
   2604 * @param[in] src_strd
   2605 *  integer source stride
   2606 *
   2607 * @param[in] dst_strd
   2608 *  integer destination stride
   2609 *
   2610 * @param[in] pi1_coeff
   2611 *  WORD8 pointer to the filter coefficients
   2612 *
   2613 * @param[in] ht
   2614 *  integer height of the array
   2615 *
   2616 * @param[in] wd
   2617 *  integer width of the array
   2618 *
   2619 * @returns
   2620 *
   2621 * @remarks
   2622 *  None
   2623 *
   2624 *******************************************************************************
   2625 */
   2626 void ihevc_inter_pred_luma_vert_w16inp_ssse3(WORD16 *pi2_src,
   2627                                              UWORD8 *pu1_dst,
   2628                                              WORD32 src_strd,
   2629                                              WORD32 dst_strd,
   2630                                              WORD8 *pi1_coeff,
   2631                                              WORD32 ht,
   2632                                              WORD32 wd)
   2633 {
   2634     WORD32 row, col;
   2635     WORD16 *pi2_src_copy;
   2636     UWORD8 *pu1_dst_copy;
   2637     __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b;
   2638     __m128i s0_8x16b, s1_8x16b, s2_8x16b, s3_8x16b, s4_8x16b, s5_8x16b, s6_8x16b, s8_8x16b, s9_8x16b;
   2639     __m128i s2_0_16x8b, s2_1_16x8b, s2_2_16x8b, s2_3_16x8b, s2_4_16x8b, s2_5_16x8b, s2_6_16x8b, s2_7_16x8b, s2_8_16x8b, s2_9_16x8b, s2_10_16x8b;
   2640     __m128i s3_0_16x8b, s3_1_16x8b, s3_2_16x8b, s3_3_16x8b, s3_4_16x8b;
   2641     __m128i s4_0_16x8b, s4_1_16x8b, s4_2_16x8b, s4_3_16x8b, s4_4_16x8b;
   2642     __m128i s10_8x16b, s11_8x16b, s12_8x16b, s13_8x16b, s14_8x16b, s15_8x16b, s16_8x16b, s18_8x16b, s19_8x16b;
   2643     __m128i s20_8x16b, s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b, s26_8x16b, s28_8x16b, s29_8x16b;
   2644     __m128i s30_8x16b, s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b, s36_8x16b, s38_8x16b, s39_8x16b;
   2645 
   2646     __m128i zero_8x16b, offset_8x16b, mask_low_32b, mask_high_96b, sign_reg;
   2647 
   2648 /* load 8 8-bit coefficients and convert 8-bit into 16-bit  */
   2649     s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);
   2650 
   2651     zero_8x16b = _mm_setzero_si128();
   2652     sign_reg =  _mm_cmpgt_epi8(zero_8x16b, s4_8x16b);
   2653     s5_8x16b  = _mm_unpacklo_epi8(s4_8x16b, sign_reg);
   2654 
   2655     coeff0_1_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(0, 0, 0, 0));  /* pi1_coeff[4] */
   2656     coeff2_3_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(1, 1, 1, 1));  /* pi1_coeff[4] */
   2657 
   2658     coeff4_5_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(2, 2, 2, 2));  /* pi1_coeff[4] */
   2659     coeff6_7_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(3, 3, 3, 3));  /* pi1_coeff[4] */
   2660 
   2661 
   2662 /* seting values in register */
   2663     offset_8x16b = _mm_set1_epi32(OFFSET_14_MINUS_BIT_DEPTH); /* for offset addition */
   2664     mask_low_32b = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000);
   2665     mask_high_96b = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF);
   2666 
   2667 
   2668     pi2_src_copy = pi2_src;
   2669     pu1_dst_copy = pu1_dst;
   2670 
   2671 /*  outer for loop starts from here */
   2672     for(col = 0; col < wd; col += 4)
   2673     {
   2674 
   2675         pi2_src = pi2_src_copy + col;
   2676         pu1_dst = pu1_dst_copy + col;
   2677 
   2678         /*load 4 pixel values */
   2679         s2_0_16x8b  = _mm_loadl_epi64((__m128i *)(pi2_src + (-3 * src_strd)));
   2680 
   2681         /*load 4 pixel values */
   2682         s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (-2 * src_strd)));
   2683 
   2684         s3_0_16x8b = _mm_unpacklo_epi16(s2_0_16x8b, s2_1_16x8b);
   2685 
   2686         s0_8x16b = _mm_madd_epi16(s3_0_16x8b, coeff0_1_8x16b);
   2687 
   2688         /*load 4 pixel values */
   2689         s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (-1 * src_strd)));
   2690 
   2691         /*load 4 pixel values */
   2692         s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (0 * src_strd)));
   2693 
   2694         s3_1_16x8b = _mm_unpacklo_epi16(s2_2_16x8b, s2_3_16x8b);
   2695 
   2696         s1_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff2_3_8x16b);
   2697 
   2698         /*load 4 pixel values */
   2699         s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (1 * src_strd)));
   2700 
   2701         /*load 4 pixel values */
   2702         s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd)));
   2703 
   2704         s3_2_16x8b = _mm_unpacklo_epi16(s2_4_16x8b, s2_5_16x8b);
   2705 
   2706         s2_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff4_5_8x16b);
   2707 
   2708         /*load 4 pixel values */
   2709         s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd)));
   2710 
   2711         /*load 4 pixel values */
   2712         s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (4 * src_strd)));
   2713 
   2714         s3_3_16x8b = _mm_unpacklo_epi16(s2_6_16x8b, s2_7_16x8b);
   2715 
   2716         s3_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff6_7_8x16b);
   2717 
   2718         s4_8x16b = _mm_add_epi32(s0_8x16b, s1_8x16b);
   2719         s5_8x16b = _mm_add_epi32(s2_8x16b, s3_8x16b);
   2720         s6_8x16b = _mm_add_epi32(s4_8x16b, s5_8x16b);
   2721 
   2722         /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   2723         s8_8x16b = _mm_srai_epi32(s6_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   2724 
   2725         /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
   2726         s9_8x16b = _mm_add_epi32(s8_8x16b, offset_8x16b);
   2727 
   2728         /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   2729         s8_8x16b = _mm_srai_epi32(s9_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   2730 
   2731         s8_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b);
   2732 
   2733 
   2734         /* i2_tmp = CLIP_U8(i2_tmp);*/
   2735         s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b);
   2736 
   2737         s4_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst));
   2738         s5_8x16b =  _mm_and_si128(s4_8x16b, mask_low_32b);
   2739         s6_8x16b =  _mm_and_si128(s9_8x16b, mask_high_96b);
   2740         s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b);
   2741 
   2742         /* store 8 8-bit output values  */
   2743         /* Store the output pixels of row 0*/
   2744         _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b);
   2745 
   2746         /* ROW 2*/
   2747         s20_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff0_1_8x16b);
   2748         s21_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff2_3_8x16b);
   2749         s22_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff4_5_8x16b);
   2750 
   2751         /*load 4 pixel values */
   2752         s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (5 * src_strd)));
   2753 
   2754         /*load 4 pixel values */
   2755         s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (6 * src_strd)));
   2756 
   2757         s3_4_16x8b = _mm_unpacklo_epi16(s2_8_16x8b, s2_9_16x8b);
   2758 
   2759         s23_8x16b = _mm_madd_epi16(s3_4_16x8b, coeff6_7_8x16b);
   2760 
   2761         s24_8x16b = _mm_add_epi32(s20_8x16b, s21_8x16b);
   2762         s25_8x16b = _mm_add_epi32(s22_8x16b, s23_8x16b);
   2763         s26_8x16b = _mm_add_epi32(s24_8x16b, s25_8x16b);
   2764 
   2765         /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   2766         s28_8x16b = _mm_srai_epi32(s26_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   2767 
   2768         /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
   2769         s29_8x16b = _mm_add_epi32(s28_8x16b, offset_8x16b);
   2770 
   2771         /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   2772         s28_8x16b = _mm_srai_epi32(s29_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   2773 
   2774         s28_8x16b = _mm_packs_epi32(s28_8x16b, zero_8x16b);
   2775 
   2776 
   2777         /* i2_tmp = CLIP_U8(i2_tmp);*/
   2778         s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b);
   2779 
   2780         s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (2 * dst_strd)));
   2781         s25_8x16b =  _mm_and_si128(s24_8x16b, mask_low_32b);
   2782         s26_8x16b =  _mm_and_si128(s29_8x16b, mask_high_96b);
   2783         s29_8x16b = _mm_or_si128(s25_8x16b, s26_8x16b);
   2784 
   2785         /* store 8 8-bit output values  */
   2786         /* Store the output pixels of row 2*/
   2787         _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s29_8x16b);
   2788 
   2789 
   2790         /*ROW 1*/
   2791         s4_0_16x8b = _mm_unpacklo_epi16(s2_1_16x8b, s2_2_16x8b);
   2792 
   2793         s10_8x16b = _mm_madd_epi16(s4_0_16x8b, coeff0_1_8x16b);
   2794 
   2795         s4_1_16x8b = _mm_unpacklo_epi16(s2_3_16x8b, s2_4_16x8b);
   2796 
   2797         s11_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff2_3_8x16b);
   2798 
   2799         s4_2_16x8b = _mm_unpacklo_epi16(s2_5_16x8b, s2_6_16x8b);
   2800 
   2801         s12_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff4_5_8x16b);
   2802 
   2803         s4_3_16x8b = _mm_unpacklo_epi16(s2_7_16x8b, s2_8_16x8b);
   2804 
   2805         s13_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff6_7_8x16b);
   2806 
   2807         s14_8x16b = _mm_add_epi32(s10_8x16b, s11_8x16b);
   2808         s15_8x16b = _mm_add_epi32(s12_8x16b, s13_8x16b);
   2809         s16_8x16b = _mm_add_epi32(s14_8x16b, s15_8x16b);
   2810 
   2811         /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   2812         s18_8x16b = _mm_srai_epi32(s16_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   2813 
   2814         /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
   2815         s19_8x16b = _mm_add_epi32(s18_8x16b, offset_8x16b);
   2816 
   2817         /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   2818         s18_8x16b = _mm_srai_epi32(s19_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   2819 
   2820         s18_8x16b = _mm_packs_epi32(s18_8x16b, zero_8x16b);
   2821 
   2822 
   2823         /* i2_tmp = CLIP_U8(i2_tmp);*/
   2824         s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b);
   2825 
   2826         s14_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (dst_strd)));
   2827         s15_8x16b =  _mm_and_si128(s14_8x16b, mask_low_32b);
   2828         s16_8x16b =  _mm_and_si128(s19_8x16b, mask_high_96b);
   2829         s19_8x16b = _mm_or_si128(s15_8x16b, s16_8x16b);
   2830 
   2831         /* store 8 8-bit output values  */
   2832         /* Store the output pixels of row 1*/
   2833         _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd)), s19_8x16b);
   2834 
   2835 
   2836         /* ROW 3*/
   2837         s30_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff0_1_8x16b);
   2838         s31_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff2_3_8x16b);
   2839         s32_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff4_5_8x16b);
   2840 
   2841         /*load 4 pixel values */
   2842         s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (7 * src_strd)));
   2843 
   2844         s4_4_16x8b = _mm_unpacklo_epi16(s2_9_16x8b, s2_10_16x8b);
   2845 
   2846         s33_8x16b = _mm_madd_epi16(s4_4_16x8b, coeff6_7_8x16b);
   2847 
   2848         s34_8x16b = _mm_add_epi32(s30_8x16b, s31_8x16b);
   2849         s35_8x16b = _mm_add_epi32(s32_8x16b, s33_8x16b);
   2850         s36_8x16b = _mm_add_epi32(s34_8x16b, s35_8x16b);
   2851 
   2852         /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   2853         s38_8x16b = _mm_srai_epi32(s36_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   2854 
   2855 
   2856         /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
   2857         s39_8x16b = _mm_add_epi32(s38_8x16b, offset_8x16b);
   2858 
   2859         /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   2860         s38_8x16b = _mm_srai_epi32(s39_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   2861 
   2862         s38_8x16b = _mm_packs_epi32(s38_8x16b, zero_8x16b);
   2863 
   2864 
   2865         /* i2_tmp = CLIP_U8(i2_tmp);*/
   2866         s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b);
   2867 
   2868         s34_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (3 * dst_strd)));
   2869         s35_8x16b =  _mm_and_si128(s34_8x16b, mask_low_32b);
   2870         s36_8x16b =  _mm_and_si128(s39_8x16b, mask_high_96b);
   2871         s39_8x16b = _mm_or_si128(s35_8x16b, s36_8x16b);
   2872 
   2873         /* store 8 8-bit output values  */
   2874         /* Store the output pixels of row 2*/
   2875         _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s39_8x16b);
   2876 
   2877         pi2_src += (8 * src_strd);
   2878         pu1_dst += (4 * dst_strd);
   2879 
   2880         for(row = 4; row < ht; row += 4)
   2881         {
   2882 
   2883             s3_0_16x8b = s3_2_16x8b;
   2884             s3_1_16x8b = s3_3_16x8b;
   2885             s3_2_16x8b = s3_4_16x8b;
   2886 
   2887             s0_8x16b = _mm_madd_epi16(s3_0_16x8b, coeff0_1_8x16b);
   2888             s1_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff2_3_8x16b);
   2889             s2_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff4_5_8x16b);
   2890 
   2891             /*load 4 pixel values from (cur_row + 4)th row*/
   2892             s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src));
   2893 
   2894             s3_3_16x8b = _mm_unpacklo_epi16(s2_10_16x8b, s2_0_16x8b);
   2895             s3_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff6_7_8x16b);
   2896 
   2897             s4_0_16x8b = s4_2_16x8b;
   2898             s4_1_16x8b = s4_3_16x8b;
   2899             s4_2_16x8b = s4_4_16x8b;
   2900 
   2901             s4_8x16b = _mm_add_epi32(s0_8x16b, s1_8x16b);
   2902             s5_8x16b = _mm_add_epi32(s2_8x16b, s3_8x16b);
   2903             s6_8x16b = _mm_add_epi32(s4_8x16b, s5_8x16b);
   2904 
   2905             /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   2906             s8_8x16b = _mm_srai_epi32(s6_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   2907 
   2908             /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
   2909             s9_8x16b = _mm_add_epi32(s8_8x16b, offset_8x16b);
   2910 
   2911             /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   2912             s8_8x16b = _mm_srai_epi32(s9_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   2913 
   2914             s8_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b);
   2915 
   2916 
   2917             /* i2_tmp = CLIP_U8(i2_tmp);*/
   2918             s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b);
   2919 
   2920             s4_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst));
   2921             s5_8x16b =  _mm_and_si128(s4_8x16b, mask_low_32b);
   2922             s6_8x16b =  _mm_and_si128(s9_8x16b, mask_high_96b);
   2923             s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b);
   2924 
   2925             /* store 8 8-bit output values  */
   2926             /* Store the output pixels of row 4*/
   2927             _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b);
   2928 
   2929 /* row + 2*/
   2930             s20_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff0_1_8x16b);
   2931             s21_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff2_3_8x16b);
   2932             s22_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff4_5_8x16b);
   2933 
   2934             /*load 4 pixel values from (cur_row + 5)th row*/
   2935             s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd));
   2936 
   2937             /*load 4 pixel values from (cur_row + 6)th row*/
   2938             s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd)));
   2939 
   2940             /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/
   2941             s3_4_16x8b = _mm_unpacklo_epi16(s2_1_16x8b, s2_2_16x8b);
   2942 
   2943             s23_8x16b = _mm_madd_epi16(s3_4_16x8b, coeff6_7_8x16b);
   2944 
   2945             s24_8x16b = _mm_add_epi32(s20_8x16b, s21_8x16b);
   2946             s25_8x16b = _mm_add_epi32(s22_8x16b, s23_8x16b);
   2947             s26_8x16b = _mm_add_epi32(s24_8x16b, s25_8x16b);
   2948 
   2949             /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   2950             s28_8x16b = _mm_srai_epi32(s26_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   2951 
   2952             /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
   2953             s29_8x16b = _mm_add_epi32(s28_8x16b, offset_8x16b);
   2954 
   2955             /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   2956             s28_8x16b = _mm_srai_epi32(s29_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   2957 
   2958             s28_8x16b = _mm_packs_epi32(s28_8x16b, zero_8x16b);
   2959 
   2960 
   2961             /* i2_tmp = CLIP_U8(i2_tmp);*/
   2962             s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b);
   2963 
   2964             s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (2 * dst_strd)));
   2965             s25_8x16b =  _mm_and_si128(s24_8x16b, mask_low_32b);
   2966             s26_8x16b =  _mm_and_si128(s29_8x16b, mask_high_96b);
   2967             s29_8x16b = _mm_or_si128(s25_8x16b, s26_8x16b);
   2968 
   2969             /* store 8 8-bit output values  */
   2970             /* Store the output pixels of (cur_row+2)*/
   2971             _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s29_8x16b);
   2972 
   2973 
   2974 /*row + 1*/
   2975             s10_8x16b = _mm_madd_epi16(s4_0_16x8b, coeff0_1_8x16b);
   2976             s11_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff2_3_8x16b);
   2977             s12_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff4_5_8x16b);
   2978 
   2979             /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/
   2980             s4_3_16x8b = _mm_unpacklo_epi16(s2_0_16x8b, s2_1_16x8b);
   2981             s13_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff6_7_8x16b);
   2982 
   2983             s14_8x16b = _mm_add_epi32(s10_8x16b, s11_8x16b);
   2984             s15_8x16b = _mm_add_epi32(s12_8x16b, s13_8x16b);
   2985             s16_8x16b = _mm_add_epi32(s14_8x16b, s15_8x16b);
   2986 
   2987             /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   2988             s18_8x16b = _mm_srai_epi32(s16_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   2989 
   2990             /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
   2991             s19_8x16b = _mm_add_epi32(s18_8x16b, offset_8x16b);
   2992 
   2993             /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   2994             s18_8x16b = _mm_srai_epi32(s19_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   2995 
   2996             s18_8x16b = _mm_packs_epi32(s18_8x16b, zero_8x16b);
   2997 
   2998             /* i2_tmp = CLIP_U8(i2_tmp);*/
   2999             s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b);
   3000 
   3001             s14_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
   3002             s15_8x16b =  _mm_and_si128(s14_8x16b, mask_low_32b);
   3003             s16_8x16b =  _mm_and_si128(s19_8x16b, mask_high_96b);
   3004             s19_8x16b = _mm_or_si128(s15_8x16b, s16_8x16b);
   3005 
   3006             /* store 8 8-bit output values  */
   3007             /* Store the output pixels of (cur_row + 1)*/
   3008             _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s19_8x16b);
   3009 
   3010 
   3011 /* row + 3*/
   3012             s30_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff0_1_8x16b);
   3013             s31_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff2_3_8x16b);
   3014             s32_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff4_5_8x16b);
   3015 
   3016             /*load 4 pixel values from (cur_row + 7)th row*/
   3017             s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd)));
   3018 
   3019             /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/
   3020             s4_4_16x8b = _mm_unpacklo_epi16(s2_2_16x8b, s2_3_16x8b);
   3021 
   3022             s33_8x16b = _mm_madd_epi16(s4_4_16x8b, coeff6_7_8x16b);
   3023 
   3024             s34_8x16b = _mm_add_epi32(s30_8x16b, s31_8x16b);
   3025             s35_8x16b = _mm_add_epi32(s32_8x16b, s33_8x16b);
   3026             s36_8x16b = _mm_add_epi32(s34_8x16b, s35_8x16b);
   3027 
   3028             /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   3029             s38_8x16b = _mm_srai_epi32(s36_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   3030 
   3031             /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */
   3032             s39_8x16b = _mm_add_epi32(s38_8x16b, offset_8x16b);
   3033 
   3034             /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   3035             s38_8x16b = _mm_srai_epi32(s39_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   3036 
   3037             s38_8x16b = _mm_packs_epi32(s38_8x16b, zero_8x16b);
   3038 
   3039 
   3040             /* i2_tmp = CLIP_U8(i2_tmp);*/
   3041             s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b);
   3042 
   3043             s34_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (3 * dst_strd)));
   3044             s35_8x16b =  _mm_and_si128(s34_8x16b, mask_low_32b);
   3045             s36_8x16b =  _mm_and_si128(s39_8x16b, mask_high_96b);
   3046             s39_8x16b = _mm_or_si128(s35_8x16b, s36_8x16b);
   3047 
   3048             /* store 8 8-bit output values  */
   3049             /* Store the output pixels of (cur_row+3)*/
   3050             _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s39_8x16b);
   3051 
   3052             s2_10_16x8b = s2_3_16x8b;
   3053 
   3054             pi2_src += 4 * src_strd; /* pointer update */
   3055             pu1_dst += 4 * dst_strd; /* pointer update */
   3056         }
   3057     }
   3058 
   3059 }
   3060 
   3061 
   3062 /**
   3063 *******************************************************************************
   3064 *
   3065 * @brief
   3066 *      Luma prediction filter for vertical 16bit input & output
   3067 *
   3068 * @par Description:
*    Applies a vertical 8-tap filter with coefficients pointed to by 'pi1_coeff'
*    to the elements pointed by 'pi2_src' and writes to the location pointed by
*    'pi2_dst'. Input is 16 bits. The filter output is downshifted by 6 and
*    8192 is subtracted to store it as a 16-bit number. The output is used as
*    an input to weighted prediction.
   3074 *
   3075 * @param[in] pi2_src
   3076 *  WORD16 pointer to the source
   3077 *
   3078 * @param[out] pi2_dst
   3079 *  WORD16 pointer to the destination
   3080 *
   3081 * @param[in] src_strd
   3082 *  integer source stride
   3083 *
   3084 * @param[in] dst_strd
   3085 *  integer destination stride
   3086 *
   3087 * @param[in] pi1_coeff
   3088 *  WORD8 pointer to the filter coefficients
   3089 *
   3090 * @param[in] ht
   3091 *  integer height of the array
   3092 *
   3093 * @param[in] wd
   3094 *  integer width of the array
   3095 *
   3096 * @returns
   3097 *
   3098 * @remarks
   3099 *  None
   3100 *
   3101 *******************************************************************************
   3102 */
void ihevc_inter_pred_luma_vert_w16inp_w16out_ssse3(WORD16 *pi2_src,
                                                    WORD16 *pi2_dst,
                                                    WORD32 src_strd,
                                                    WORD32 dst_strd,
                                                    WORD8 *pi1_coeff,
                                                    WORD32 ht,
                                                    WORD32 wd)
{
    WORD32 row, col;
    WORD16 *pi2_src_copy;
    WORD16 *pi2_dst_copy;
    /* 8 filter taps, sign-extended to 16 bit and replicated in adjacent
       pairs (c0,c1) (c2,c3) (c4,c5) (c6,c7) so that one _mm_madd_epi16 per
       pair multiplies-and-adds two vertically adjacent source rows. */
    __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b;
    __m128i s0_8x16b, s1_8x16b, s2_8x16b, s3_8x16b, s4_8x16b, s5_8x16b, s6_8x16b, s8_8x16b, s9_8x16b;
    /* s2_N: raw loads of 4 16-bit pixels from source row N-3.
       s3_N: interleaved row pairs feeding the EVEN output rows.
       s4_N: interleaved row pairs feeding the ODD output rows.
       NOTE(review): the _16x8b suffix is historical; the data is 16-bit. */
    __m128i s2_0_16x8b, s2_1_16x8b, s2_2_16x8b, s2_3_16x8b, s2_4_16x8b, s2_5_16x8b, s2_6_16x8b, s2_7_16x8b, s2_8_16x8b, s2_9_16x8b, s2_10_16x8b;
    __m128i s3_0_16x8b, s3_1_16x8b, s3_2_16x8b, s3_3_16x8b, s3_4_16x8b;
    __m128i s4_0_16x8b, s4_1_16x8b, s4_2_16x8b, s4_3_16x8b, s4_4_16x8b;
    __m128i s10_8x16b, s11_8x16b, s12_8x16b, s13_8x16b, s14_8x16b, s15_8x16b, s16_8x16b, s18_8x16b, s19_8x16b;
    __m128i s20_8x16b, s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b, s26_8x16b, s28_8x16b, s29_8x16b;
    __m128i s30_8x16b, s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b, s36_8x16b, s38_8x16b, s39_8x16b;

    __m128i zero_8x16b, offset_8x16b, sign_reg;

/* load 8 8-bit coefficients and convert 8-bit into 16-bit  */
    s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);

    zero_8x16b = _mm_setzero_si128();
    /* sign-extend the 8-bit taps: compare-greater-than-zero yields the sign bytes */
    sign_reg =  _mm_cmpgt_epi8(zero_8x16b, s4_8x16b);
    s5_8x16b  = _mm_unpacklo_epi8(s4_8x16b, sign_reg);

    coeff0_1_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(0, 0, 0, 0));  /* (c0, c1) pair replicated */
    coeff2_3_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(1, 1, 1, 1));  /* (c2, c3) pair replicated */

    coeff4_5_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(2, 2, 2, 2));  /* (c4, c5) pair replicated */
    coeff6_7_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(3, 3, 3, 3));  /* (c6, c7) pair replicated */


/* setting values in register */
    offset_8x16b = _mm_set1_epi32(OFFSET14); /* subtracted after the >>6 so the result fits in 16 bits */

    pi2_src_copy = pi2_src;
    pi2_dst_copy = pi2_dst;

/*  outer for loop: one 4-pixel-wide column strip per iteration */
    for(col = 0; col < wd; col += 4)
    {

        pi2_src = pi2_src_copy + col;
        pi2_dst = pi2_dst_copy + col;

        /* Prologue: compute output rows 0..3 from source rows -3..7,
           priming the s3_x (even-row) and s4_x (odd-row) windows. */

        /*load 4 16-bit pixel values, source row -3 */
        s2_0_16x8b  = _mm_loadl_epi64((__m128i *)(pi2_src + (-3 * src_strd)));

        /*load 4 16-bit pixel values, source row -2 */
        s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (-2 * src_strd)));

        s3_0_16x8b = _mm_unpacklo_epi16(s2_0_16x8b, s2_1_16x8b);

        s0_8x16b = _mm_madd_epi16(s3_0_16x8b, coeff0_1_8x16b);

        /*load 4 16-bit pixel values, source row -1 */
        s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (-1 * src_strd)));

        /*load 4 16-bit pixel values, source row 0 */
        s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (0 * src_strd)));

        s3_1_16x8b = _mm_unpacklo_epi16(s2_2_16x8b, s2_3_16x8b);

        s1_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff2_3_8x16b);

        /*load 4 16-bit pixel values, source row 1 */
        s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (1 * src_strd)));

        /*load 4 16-bit pixel values, source row 2 */
        s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd)));

        s3_2_16x8b = _mm_unpacklo_epi16(s2_4_16x8b, s2_5_16x8b);

        s2_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff4_5_8x16b);

        /*load 4 16-bit pixel values, source row 3 */
        s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd)));

        /*load 4 16-bit pixel values, source row 4 */
        s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (4 * src_strd)));

        s3_3_16x8b = _mm_unpacklo_epi16(s2_6_16x8b, s2_7_16x8b);

        s3_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff6_7_8x16b);

        s4_8x16b = _mm_add_epi32(s0_8x16b, s1_8x16b);
        s5_8x16b = _mm_add_epi32(s2_8x16b, s3_8x16b);
        s6_8x16b = _mm_add_epi32(s4_8x16b, s5_8x16b);

        /* i4_tmp = filter sum >> SHIFT_14_MINUS_BIT_DEPTH (i.e. >> 6) */
        s8_8x16b = _mm_srai_epi32(s6_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

        /* i4_tmp -= OFFSET14 (8192), per the w16out contract */
        s9_8x16b = _mm_sub_epi32(s8_8x16b, offset_8x16b);

        /* saturating pack of the four 32-bit results down to 16 bit */
        s8_8x16b = _mm_packs_epi32(s9_8x16b, zero_8x16b);

        /* store 4 16-bit output values  */
        /* Store the output pixels of row 0*/
        _mm_storel_epi64((__m128i *)(pi2_dst), s8_8x16b);

        /* ROW 2: reuses the same even-phase row pairs, shifted down by one pair */
        s20_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff0_1_8x16b);
        s21_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff2_3_8x16b);
        s22_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff4_5_8x16b);

        /*load 4 16-bit pixel values, source row 5 */
        s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (5 * src_strd)));

        /*load 4 16-bit pixel values, source row 6 */
        s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (6 * src_strd)));

        s3_4_16x8b = _mm_unpacklo_epi16(s2_8_16x8b, s2_9_16x8b);

        s23_8x16b = _mm_madd_epi16(s3_4_16x8b, coeff6_7_8x16b);

        s24_8x16b = _mm_add_epi32(s20_8x16b, s21_8x16b);
        s25_8x16b = _mm_add_epi32(s22_8x16b, s23_8x16b);
        s26_8x16b = _mm_add_epi32(s24_8x16b, s25_8x16b);

        /* i4_tmp = filter sum >> SHIFT_14_MINUS_BIT_DEPTH (i.e. >> 6) */
        s28_8x16b = _mm_srai_epi32(s26_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

        /* i4_tmp -= OFFSET14 (8192) */
        s29_8x16b = _mm_sub_epi32(s28_8x16b, offset_8x16b);

        s28_8x16b = _mm_packs_epi32(s29_8x16b, zero_8x16b);

        /* store 4 16-bit output values  */
        /* Store the output pixels of row 2*/
        _mm_storel_epi64((__m128i *)(pi2_dst + (2 * dst_strd)), s28_8x16b);


        /*ROW 1: odd-phase interleavings of the same source rows */
        s4_0_16x8b = _mm_unpacklo_epi16(s2_1_16x8b, s2_2_16x8b);

        s10_8x16b = _mm_madd_epi16(s4_0_16x8b, coeff0_1_8x16b);

        s4_1_16x8b = _mm_unpacklo_epi16(s2_3_16x8b, s2_4_16x8b);

        s11_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff2_3_8x16b);

        s4_2_16x8b = _mm_unpacklo_epi16(s2_5_16x8b, s2_6_16x8b);

        s12_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff4_5_8x16b);

        s4_3_16x8b = _mm_unpacklo_epi16(s2_7_16x8b, s2_8_16x8b);

        s13_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff6_7_8x16b);

        s14_8x16b = _mm_add_epi32(s10_8x16b, s11_8x16b);
        s15_8x16b = _mm_add_epi32(s12_8x16b, s13_8x16b);
        s16_8x16b = _mm_add_epi32(s14_8x16b, s15_8x16b);

        /* i4_tmp = filter sum >> SHIFT_14_MINUS_BIT_DEPTH (i.e. >> 6) */
        s18_8x16b = _mm_srai_epi32(s16_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

        /* i4_tmp -= OFFSET14 (8192) */
        s19_8x16b = _mm_sub_epi32(s18_8x16b, offset_8x16b);

        s18_8x16b = _mm_packs_epi32(s19_8x16b, zero_8x16b);

        /* store 4 16-bit output values  */
        /* Store the output pixels of row 1*/
        _mm_storel_epi64((__m128i *)(pi2_dst + (dst_strd)), s18_8x16b);


        /* ROW 3: odd-phase pairs shifted down by one pair */
        s30_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff0_1_8x16b);
        s31_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff2_3_8x16b);
        s32_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff4_5_8x16b);

        /*load 4 16-bit pixel values, source row 7 */
        s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (7 * src_strd)));

        s4_4_16x8b = _mm_unpacklo_epi16(s2_9_16x8b, s2_10_16x8b);

        s33_8x16b = _mm_madd_epi16(s4_4_16x8b, coeff6_7_8x16b);

        s34_8x16b = _mm_add_epi32(s30_8x16b, s31_8x16b);
        s35_8x16b = _mm_add_epi32(s32_8x16b, s33_8x16b);
        s36_8x16b = _mm_add_epi32(s34_8x16b, s35_8x16b);

        /* i4_tmp = filter sum >> SHIFT_14_MINUS_BIT_DEPTH (i.e. >> 6) */
        s38_8x16b = _mm_srai_epi32(s36_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);


        /* i4_tmp -= OFFSET14 (8192) */
        s39_8x16b = _mm_sub_epi32(s38_8x16b, offset_8x16b);

        s38_8x16b = _mm_packs_epi32(s39_8x16b, zero_8x16b);

        /* store 4 16-bit output values  */
        /* Store the output pixels of row 3*/
        _mm_storel_epi64((__m128i *)(pi2_dst + (3 * dst_strd)), s38_8x16b);

        pi2_src += (8 * src_strd);
        pi2_dst += (4 * dst_strd);

        /* Steady state: 4 output rows per iteration; only 4 new source rows
           are loaded, the other interleaved pairs are carried in s3_x/s4_x. */
        for(row = 4; row < ht; row += 4)
        {

            /* slide the even-phase window down by two row pairs */
            s3_0_16x8b = s3_2_16x8b;
            s3_1_16x8b = s3_3_16x8b;
            s3_2_16x8b = s3_4_16x8b;

            s0_8x16b = _mm_madd_epi16(s3_0_16x8b, coeff0_1_8x16b);
            s1_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff2_3_8x16b);
            s2_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff4_5_8x16b);

            /*load 4 pixel values from (cur_row + 4)th row*/
            s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src));

            /* s2_10_16x8b carries the last row loaded by the previous iteration */
            s3_3_16x8b = _mm_unpacklo_epi16(s2_10_16x8b, s2_0_16x8b);
            s3_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff6_7_8x16b);

            /* slide the odd-phase window down by two row pairs */
            s4_0_16x8b = s4_2_16x8b;
            s4_1_16x8b = s4_3_16x8b;
            s4_2_16x8b = s4_4_16x8b;

            s4_8x16b = _mm_add_epi32(s0_8x16b, s1_8x16b);
            s5_8x16b = _mm_add_epi32(s2_8x16b, s3_8x16b);
            s6_8x16b = _mm_add_epi32(s4_8x16b, s5_8x16b);

            /* i4_tmp = filter sum >> SHIFT_14_MINUS_BIT_DEPTH (i.e. >> 6) */
            s8_8x16b = _mm_srai_epi32(s6_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

            /* i4_tmp -= OFFSET14 (8192) */
            s9_8x16b = _mm_sub_epi32(s8_8x16b, offset_8x16b);

            s8_8x16b = _mm_packs_epi32(s9_8x16b, zero_8x16b);

            /* store 4 16-bit output values  */
            /* Store the output pixels of cur_row */
            _mm_storel_epi64((__m128i *)(pi2_dst), s8_8x16b);

/* row + 2*/
            s20_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff0_1_8x16b);
            s21_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff2_3_8x16b);
            s22_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff4_5_8x16b);

            /*load 4 pixel values from (cur_row + 5)th row*/
            s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd));

            /*load 4 pixel values from (cur_row + 6)th row*/
            s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd)));

            /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/
            s3_4_16x8b = _mm_unpacklo_epi16(s2_1_16x8b, s2_2_16x8b);

            s23_8x16b = _mm_madd_epi16(s3_4_16x8b, coeff6_7_8x16b);

            s24_8x16b = _mm_add_epi32(s20_8x16b, s21_8x16b);
            s25_8x16b = _mm_add_epi32(s22_8x16b, s23_8x16b);
            s26_8x16b = _mm_add_epi32(s24_8x16b, s25_8x16b);

            /* i4_tmp = filter sum >> SHIFT_14_MINUS_BIT_DEPTH (i.e. >> 6) */
            s28_8x16b = _mm_srai_epi32(s26_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

            /* i4_tmp -= OFFSET14 (8192) */
            s29_8x16b = _mm_sub_epi32(s28_8x16b, offset_8x16b);

            s28_8x16b = _mm_packs_epi32(s29_8x16b, zero_8x16b);

            /* store 4 16-bit output values  */
            /* Store the output pixels of (cur_row+2)*/
            _mm_storel_epi64((__m128i *)(pi2_dst + (2 * dst_strd)), s28_8x16b);


/*row + 1*/
            s10_8x16b = _mm_madd_epi16(s4_0_16x8b, coeff0_1_8x16b);
            s11_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff2_3_8x16b);
            s12_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff4_5_8x16b);

            /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/
            s4_3_16x8b = _mm_unpacklo_epi16(s2_0_16x8b, s2_1_16x8b);
            s13_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff6_7_8x16b);

            s14_8x16b = _mm_add_epi32(s10_8x16b, s11_8x16b);
            s15_8x16b = _mm_add_epi32(s12_8x16b, s13_8x16b);
            s16_8x16b = _mm_add_epi32(s14_8x16b, s15_8x16b);

            /* i4_tmp = filter sum >> SHIFT_14_MINUS_BIT_DEPTH (i.e. >> 6) */
            s18_8x16b = _mm_srai_epi32(s16_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

            /* i4_tmp -= OFFSET14 (8192) */
            s19_8x16b = _mm_sub_epi32(s18_8x16b, offset_8x16b);

            s18_8x16b = _mm_packs_epi32(s19_8x16b, zero_8x16b);

            /* store 4 16-bit output values  */
            /* Store the output pixels of (cur_row + 1)*/
            _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd), s18_8x16b);


/* row + 3*/
            s30_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff0_1_8x16b);
            s31_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff2_3_8x16b);
            s32_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff4_5_8x16b);

            /*load 4 pixel values from (cur_row + 7)th row*/
            s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd)));

            /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/
            s4_4_16x8b = _mm_unpacklo_epi16(s2_2_16x8b, s2_3_16x8b);

            s33_8x16b = _mm_madd_epi16(s4_4_16x8b, coeff6_7_8x16b);

            s34_8x16b = _mm_add_epi32(s30_8x16b, s31_8x16b);
            s35_8x16b = _mm_add_epi32(s32_8x16b, s33_8x16b);
            s36_8x16b = _mm_add_epi32(s34_8x16b, s35_8x16b);

            /* i4_tmp = filter sum >> SHIFT_14_MINUS_BIT_DEPTH (i.e. >> 6) */
            s38_8x16b = _mm_srai_epi32(s36_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

            /* i4_tmp -= OFFSET14 (8192) */
            s39_8x16b = _mm_sub_epi32(s38_8x16b, offset_8x16b);

            s38_8x16b = _mm_packs_epi32(s39_8x16b, zero_8x16b);

            /* store 4 16-bit output values  */
            /* Store the output pixels of (cur_row+3)*/
            _mm_storel_epi64((__m128i *)(pi2_dst + (3 * dst_strd)), s38_8x16b);

            /* carry the last loaded row into the next iteration's tap-7 pair */
            s2_10_16x8b = s2_3_16x8b;

            pi2_src += 4 * src_strd; /* pointer update */
            pi2_dst += 4 * dst_strd; /* pointer update */
        }
    }

}
   3439 
   3440 /**
   3441 *******************************************************************************
   3442 *
   3443 * @brief
   3444 *      Chroma interprediction filter for copy
   3445 *
   3446 * @par Description:
*    Copies the array of width 'wd' and height 'ht' from the location pointed
*    by 'pu1_src' to the location pointed by 'pu1_dst'
   3449 *
   3450 * @param[in] pu1_src
   3451 *  UWORD8 pointer to the source
   3452 *
   3453 * @param[out] pu1_dst
   3454 *  UWORD8 pointer to the destination
   3455 *
   3456 * @param[in] src_strd
   3457 *  integer source stride
   3458 *
   3459 * @param[in] dst_strd
   3460 *  integer destination stride
   3461 *
   3462 * @param[in] pi1_coeff
   3463 *  WORD8 pointer to the filter coefficients
   3464 *
   3465 * @param[in] ht
   3466 *  integer height of the array
   3467 *
   3468 * @param[in] wd
   3469 *  integer width of the array
   3470 *
   3471 * @returns
   3472 *
   3473 * @remarks
   3474 *  None
   3475 *
   3476 *******************************************************************************
   3477 */
   3478 
   3479 void ihevc_inter_pred_chroma_copy_ssse3(UWORD8 *pu1_src,
   3480                                         UWORD8 *pu1_dst,
   3481                                         WORD32 src_strd,
   3482                                         WORD32 dst_strd,
   3483                                         WORD8 *pi1_coeff,
   3484                                         WORD32 ht,
   3485                                         WORD32 wd)
   3486 {
   3487     WORD32 row, col;
   3488     __m128i  s3, mask_4x32b;
   3489     UNUSED(pi1_coeff);
   3490     ASSERT(wd % 2 == 0); /* checking assumption*/
   3491     ASSERT(ht % 2 == 0); /* checking assumption*/
   3492 
   3493     mask_4x32b = _mm_set_epi32(0, 0, 0, 0x80808080); /* Mask register */
   3494 
   3495 /*  for loop starts from here */
   3496     if(wd % 8 == 0)
   3497     {
   3498         for(row = 0; row < ht; row += 2)
   3499         {
   3500             int offset = 0;
   3501             for(col = 0; col < 2 * wd; col += 16)
   3502             {
   3503 /* row =0 */
   3504 
   3505                 /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
   3506                 s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col]; */
   3507                 /* storing 16 8-bit output values */
   3508                 _mm_storeu_si128((__m128i *)(pu1_dst + offset), s3); /* pu1_dst[col] = pu1_src[col]; */
   3509 
   3510 /* row =1 */
   3511                 /*load 16 pixel values from 271:256 pos. relative to cur. pos.*/
   3512                 s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset)); /* pu1_src[col]; */
   3513                 /* storing 8 8-bit output values */
   3514                 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd + offset), s3); /* pu1_dst[col] = pu1_src[col]*/
   3515 
   3516                 offset += 16; /*To pointer update */
   3517             } /*  inner for loop ends here(16-output values in single iteration) */
   3518 
   3519             pu1_src += 2 * src_strd; /* pointer update */
   3520             pu1_dst += 2 * dst_strd; /* pointer update */
   3521         }
   3522     }
   3523     else if(wd % 4 == 0)
   3524     {
   3525         for(row = 0; row < ht; row += 2)
   3526         {
   3527             int offset = 0;
   3528             for(col = 0; col < 2 * wd; col += 8)
   3529             {
   3530 /* row =0  */
   3531                 /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
   3532                 s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col]; */
   3533                 /* storing 8 8-bit output values */
   3534                 _mm_storel_epi64((__m128i *)(pu1_dst + offset), s3); /* pu1_dst[col] = pu1_src[col]; */
   3535 /* row =1 */
   3536                 /*load 16 pixel values from 271:256 pos. relative to cur. pos.*/
   3537                 s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset)); /* pu1_src[col]; */
   3538                 /* storing 8 8-bit output values */
   3539                 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), s3); /* pu1_dst[col] = pu1_src[col]; */
   3540 
   3541                 offset += 8; /* To pointer update */
   3542             } /* inner for loop ends here(8-output values in single iteration) */
   3543 
   3544             pu1_src += 2 * src_strd;  /* pointer update */
   3545             pu1_dst += 2 * dst_strd;  /* pointer update */
   3546         }
   3547     }
   3548     else
   3549     {
   3550         for(row = 0; row < ht; row += 2)
   3551         {
   3552             int offset = 0;
   3553             for(col = 0; col < 2 * wd; col += 4)
   3554             {
   3555 /* row =0 */
   3556                 s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col] */
   3557                 /* storing four 8-bit output values */
   3558                 _mm_maskmoveu_si128(s3, mask_4x32b, (char *)(pu1_dst + offset)); /* pu1_dst[col] = pu1_src[col]; */
   3559 /* row =1 */
   3560                 /* pu1_src[col] */
   3561                 s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset));
   3562 
   3563                 /* storing four 8-bit output values */
   3564                 _mm_maskmoveu_si128(s3, mask_4x32b, (char *)(pu1_dst + dst_strd + offset)); /* pu1_dst[col] = pu1_src[col]; */
   3565 
   3566                 offset += 4; /* To pointer update */
   3567             } /*  inner for loop ends here(4-output values in single iteration) */
   3568 
   3569             pu1_src += 2 * src_strd; /* pointer increment */
   3570             pu1_dst += 2 * dst_strd; /* pointer increment */
   3571         }
   3572     }
   3573 }
   3574 
   3575 /**
   3576 *******************************************************************************
   3577 *
   3578 * @brief
   3579 *     Chroma interprediction filter for horizontal input
   3580 *
   3581 * @par Description:
   3582 *    Applies a horizontal filter with coefficients pointed to  by 'pi1_coeff'
   3583 *    to the elements pointed by 'pu1_src' and  writes to the location pointed
   3584 *    by 'pu1_dst'  The output is downshifted by 6 and clipped to 8 bits
   3585 *
   3586 * @param[in] pu1_src
   3587 *  UWORD8 pointer to the source
   3588 *
   3589 * @param[out] pu1_dst
   3590 *  UWORD8 pointer to the destination
   3591 *
   3592 * @param[in] src_strd
   3593 *  integer source stride
   3594 *
   3595 * @param[in] dst_strd
   3596 *  integer destination stride
   3597 *
   3598 * @param[in] pi1_coeff
   3599 *  WORD8 pointer to the filter coefficients
   3600 *
   3601 * @param[in] ht
   3602 *  integer height of the array
   3603 *
   3604 * @param[in] wd
   3605 *  integer width of the array
   3606 *
   3607 * @returns
   3608 *
   3609 * @remarks
   3610 *  None
   3611 *
   3612 *******************************************************************************
   3613 */
void ihevc_inter_pred_chroma_horz_ssse3(UWORD8 *pu1_src,
                                        UWORD8 *pu1_dst,
                                        WORD32 src_strd,
                                        WORD32 dst_strd,
                                        WORD8 *pi1_coeff,
                                        WORD32 ht,
                                        WORD32 wd)
{
    /*
     * Horizontal 4-tap chroma interpolation.  Each output byte is the sum of
     * the four same-component source bytes at byte offsets -2, 0, +2, +4
     * (UV interleaved, so neighbours of one component are 2 bytes apart)
     * weighted by pi1_coeff[0..3]; the sum is rounded with
     * OFFSET_14_MINUS_BIT_DEPTH, shifted right by SHIFT_14_MINUS_BIT_DEPTH
     * and saturated to 8 bits.  Two rows are produced per outer iteration.
     */
    WORD32 row, col;

    __m128i coeff0_1_8x16b, coeff2_3_8x16b, control_mask_1_8x16b, control_mask_2_8x16b, offset_8x16b, mask_low_32b, mask_high_96b;
    __m128i src_temp1_16x8b, src_temp2_16x8b, src_temp3_16x8b, src_temp4_16x8b, src_temp5_16x8b, src_temp6_16x8b;
    __m128i src_temp11_16x8b, src_temp12_16x8b, src_temp13_16x8b, src_temp14_16x8b, src_temp15_16x8b, src_temp16_16x8b;
    __m128i res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b, res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b, res_temp7_8x16b;
    __m128i res_temp11_8x16b, res_temp12_8x16b, res_temp13_8x16b, res_temp14_8x16b, res_temp15_8x16b, res_temp16_8x16b, res_temp17_8x16b;

    /* warm the cache with the first six source rows */
    PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
    PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
    PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
    PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
    PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
    PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)

    ASSERT(wd % 2 == 0); /* checking assumption*/

/* load the filter taps (loadl reads 8 bytes; only taps 0..3 are used) */
    src_temp1_16x8b = _mm_loadl_epi64((__m128i *)pi1_coeff);

    offset_8x16b = _mm_set1_epi16(OFFSET_14_MINUS_BIT_DEPTH); /* rounding offset */
    /* build two byte-select masks from an all-ones register.  NOTE: despite
       the names, mask_high_96b keeps the LOW 4 bytes of a register and
       mask_low_32b keeps the upper 12 — together they merge 4 new output
       bytes into the existing destination in the (wd % 4 != 0) path below */
    mask_low_32b = _mm_cmpeq_epi16(offset_8x16b, offset_8x16b);
    mask_high_96b = _mm_srli_si128(mask_low_32b, 12);
    mask_low_32b = _mm_slli_si128(mask_low_32b, 4);

    control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* shuffle control: broadcast coeff bytes 0,1 */
    control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* shuffle control: broadcast coeff bytes 2,3 */

    coeff0_1_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_1_8x16b);  /* (c0,c1) pair in every 16-bit lane */
    coeff2_3_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_2_8x16b);  /* (c2,c3) pair in every 16-bit lane */

/*  outer for loop starts from here */
    if(wd % 2 == 0 && wd % 4 != 0)
    {
        /* wd == 2 (mod 4): only 4 interleaved output bytes per row per
           iteration; results must be merged into the existing dst bytes */
        for(row = 0; row < ht; row += 2)
        {
            int offset = 0;

            PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
            PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)


            for(col = 0; col < 2 * wd; col += 4)
            {


                /*load 16 pixel values from row 0, 2 bytes left of cur. pos.*/
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/

                /*load 16 pixel values from row 1*/
                src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + src_strd + offset)); /* pu1_src[col + (i-1) * 2]*/

                /*interleave the tap-0/tap-1 source bytes of row 0*/
                src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);

                src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b);

                /*tap-2 source bytes (offset +2)*/
                src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4);

                /*tap-3 source bytes (offset +4)*/
                src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6);

                src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b);

                /*same interleaving for row 1*/
                src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);

                src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b);

                /*tap-2 source bytes of row 1*/
                src_temp13_16x8b = _mm_srli_si128(src_temp11_16x8b, 4);
                /*tap-3 source bytes of row 1*/
                src_temp14_16x8b = _mm_srli_si128(src_temp11_16x8b, 6);

                src_temp16_16x8b = _mm_unpacklo_epi8(src_temp13_16x8b, src_temp14_16x8b);

                /*pack row 0 (low qword) with row 1 (high qword) so a single
                  maddubs pass filters both rows at once*/
                res_temp1_8x16b = _mm_unpacklo_epi64(src_temp5_16x8b, src_temp15_16x8b);
                res_temp2_8x16b = _mm_unpacklo_epi64(src_temp6_16x8b, src_temp16_16x8b);
                /*unsigned src bytes * signed taps, pairwise-summed to 16 bit*/
                res_temp11_8x16b = _mm_maddubs_epi16(res_temp1_8x16b, coeff0_1_8x16b);
                res_temp12_8x16b = _mm_maddubs_epi16(res_temp2_8x16b, coeff2_3_8x16b);

                /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */
                res_temp13_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);

                res_temp14_8x16b = _mm_adds_epi16(res_temp13_8x16b, offset_8x16b);             /* round */
                res_temp15_8x16b = _mm_srai_epi16(res_temp14_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* shift */
                res_temp13_8x16b = _mm_packus_epi16(res_temp15_8x16b, res_temp15_8x16b);       /* clip: row0 in bytes 0-3, row1 in bytes 4-7 */

                /* bring row 1's four result bytes down to the low dword */
                res_temp3_8x16b = _mm_srli_si128(res_temp13_8x16b, 4);

                /* merge row 0's 4 new bytes with the existing dst bytes */
                res_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + offset));
                res_temp5_8x16b =  _mm_and_si128(res_temp4_8x16b, mask_low_32b);
                res_temp6_8x16b =  _mm_and_si128(res_temp13_8x16b, mask_high_96b);
                res_temp7_8x16b = _mm_or_si128(res_temp5_8x16b, res_temp6_8x16b);

                /* store 8 bytes: 4 new output pixels, upper 4 dst bytes kept */
                _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp7_8x16b); /* pu1_dst[col] = i2_tmp_u  */

                /* same merge-and-store for row 1 */
                res_temp14_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd + offset));
                res_temp15_8x16b =  _mm_and_si128(res_temp14_8x16b, mask_low_32b);
                res_temp16_8x16b =  _mm_and_si128(res_temp3_8x16b, mask_high_96b);
                res_temp17_8x16b = _mm_or_si128(res_temp15_8x16b, res_temp16_8x16b);

                /* store 8 bytes: 4 new output pixels, upper 4 dst bytes kept */
                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), res_temp17_8x16b); /* pu1_dst[col] = i2_tmp_u  */


                offset += 4; /* To pointer update*/

            } /* inner loop ends here(4 output values per row in single iteration)*/

            pu1_src += 2 * src_strd; /*pointer update*/
            pu1_dst += 2 * dst_strd; /*pointer update*/
        }
    }
    else
    {
        /* wd is a multiple of 4: 8 interleaved output bytes per row per
           iteration, no destination merging needed */
        for(row = 0; row < ht; row += 2)
        {
            int offset = 0;

            PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
            PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)


            for(col = 0; col < 2 * wd; col += 8)
            {

                /*load 16 pixel values from row 0, 2 bytes left of cur. pos.*/
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/

                /*load 16 pixel values from row 1*/
                src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + src_strd + offset)); /* pu1_src[col + (i-1) * 2]*/

                /*interleave the tap-0/tap-1 source bytes of row 0*/
                src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);

                src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b);

                /*tap-2 source bytes (offset +2)*/
                src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4);

                /*tap-3 source bytes (offset +4)*/
                src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6);

                src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b);

                /*unsigned src bytes * signed taps, pairwise-summed to 16 bit*/
                res_temp1_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff0_1_8x16b);
                res_temp2_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff2_3_8x16b);

                /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */
                res_temp3_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);

                res_temp4_8x16b = _mm_adds_epi16(res_temp3_8x16b, offset_8x16b);             /* row = 0: round */
                res_temp5_8x16b = _mm_srai_epi16(res_temp4_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0: shift */
                res_temp6_8x16b = _mm_packus_epi16(res_temp5_8x16b, res_temp5_8x16b);        /* row = 0: clip to 8 bit */

                /* store 8 8-bit output values */
                _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp6_8x16b); /* pi2_dst[col] = i2_tmp_u  */

                /*interleave the tap-0/tap-1 source bytes of row 1*/
                src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);

                src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b);

                /*tap-2 source bytes of row 1*/
                src_temp13_16x8b = _mm_srli_si128(src_temp11_16x8b, 4);

                /*tap-3 source bytes of row 1*/
                src_temp14_16x8b = _mm_srli_si128(src_temp11_16x8b, 6);

                src_temp16_16x8b = _mm_unpacklo_epi8(src_temp13_16x8b, src_temp14_16x8b);

                res_temp11_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff0_1_8x16b);
                res_temp12_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff2_3_8x16b);

                /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */
                res_temp13_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);

                res_temp14_8x16b = _mm_adds_epi16(res_temp13_8x16b, offset_8x16b);             /* row = 1: round */
                res_temp15_8x16b = _mm_srai_epi16(res_temp14_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 1: shift */
                res_temp16_8x16b = _mm_packus_epi16(res_temp15_8x16b, res_temp15_8x16b);       /* row = 1: clip to 8 bit */

                /* store 8 8-bit output values */
                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), res_temp16_8x16b); /* pu1_dst[col] = i2_tmp_u  */


                offset += 8; /* To pointer update*/

            } /* inner loop ends here(8- output values in single iteration)*/

            pu1_src += 2 * src_strd; /*pointer update*/
            pu1_dst += 2 * dst_strd; /*pointer update*/
        }
    }
}
   3821 
   3822 /**
   3823 *******************************************************************************
   3824 *
   3825 * @brief
   3826 *     Chroma interprediction filter for vertical input
   3827 *
   3828 * @par Description:
*    Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
   3830 *    the elements pointed by 'pu1_src' and  writes to the location pointed by
   3831 *    'pu1_dst'  The output is downshifted by 6 and clipped to 8 bits
   3832 *
   3833 *
   3834 * @param[in] pu1_src
   3835 *  UWORD8 pointer to the source
   3836 *
   3837 * @param[out] pu1_dst
   3838 *  UWORD8 pointer to the destination
   3839 *
   3840 * @param[in] src_strd
   3841 *  integer source stride
   3842 *
   3843 * @param[in] dst_strd
   3844 *  integer destination stride
   3845 *
   3846 * @param[in] pi1_coeff
   3847 *  WORD8 pointer to the filter coefficients
   3848 *
   3849 * @param[in] ht
   3850 *  integer height of the array
   3851 *
   3852 * @param[in] wd
   3853 *  integer width of the array
   3854 *
   3855 * @returns
   3856 *
   3857 * @remarks
   3858 *  None
   3859 *
   3860 *******************************************************************************
   3861 */
   3862 void ihevc_inter_pred_chroma_vert_ssse3(UWORD8 *pu1_src,
   3863                                         UWORD8 *pu1_dst,
   3864                                         WORD32 src_strd,
   3865                                         WORD32 dst_strd,
   3866                                         WORD8 *pi1_coeff,
   3867                                         WORD32 ht,
   3868                                         WORD32 wd)
   3869 {
   3870     WORD32 row, col;
   3871     UWORD8 *pu1_src_copy;
   3872     UWORD8 *pu1_dst_copy;
   3873     __m128i coeff0_1_8x16b, coeff2_3_8x16b;
   3874     __m128i s4_8x16b, s5_8x16b, s6_8x16b, s7_8x16b, s8_8x16b, s9_8x16b;
   3875     __m128i control_mask_1_8x16b, control_mask_2_8x16b;
   3876     __m128i s11_8x16b, s12_8x16b, s15_8x16b, s16_8x16b;
   3877     __m128i zero_8x16b, offset_8x16b, mask_low_32b, mask_high_96b;
   3878     __m128i s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b;
   3879     __m128i s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b;
   3880 
   3881     PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
   3882     PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
   3883     PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
   3884     PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
   3885     PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
   3886     PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
   3887 
   3888 /* load 8 8-bit coefficients and convert 8-bit into 16-bit  */
   3889     s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);
   3890 
   3891     control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */
   3892     control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */
   3893 
   3894     coeff0_1_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_1_8x16b);  /* pi1_coeff[4] */
   3895     coeff2_3_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_2_8x16b);  /* pi1_coeff[4] */
   3896 
   3897 
   3898 /*  seting  values in register */
   3899     zero_8x16b = _mm_setzero_si128(); /* for saturated clipping */
   3900     offset_8x16b = _mm_set1_epi16(OFFSET_14_MINUS_BIT_DEPTH); /* for offset addition */
   3901     mask_low_32b = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000);
   3902     mask_high_96b = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF);
   3903 
   3904 /*  outer for loop starts from here */
   3905     if(wd % 8 == 0)
   3906     { /* wd = multiple of 8 case */
   3907 
   3908         pu1_src_copy = pu1_src;
   3909         pu1_dst_copy = pu1_dst;
   3910 
   3911         for(col = 0; col < 2 * wd; col += 16)
   3912         {
   3913 
   3914             pu1_src = pu1_src_copy + col;
   3915             pu1_dst = pu1_dst_copy + col;
   3916 
   3917 
   3918             for(row = 0; row < ht; row += 2)
   3919             {
   3920 
   3921                 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
   3922                 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
   3923 
   3924 
   3925                 /*load 8 pixel values from -751:-768 pos. relative to cur. pos.*/
   3926                 s21_8x16b  = _mm_loadu_si128((__m128i *)(pu1_src + (-1 * src_strd)));
   3927 
   3928                 /*load 8 pixel values from -495:-512 pos. relative to cur. pos.*/
   3929                 s22_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (0 * src_strd)));
   3930 
   3931 
   3932                 /*load 8 pixel values from -239:-256 pos. relative to cur. pos.*/
   3933                 s23_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (1 * src_strd)));
   3934 
   3935                 /*load 8 pixel values from 15:0 pos. relative to cur. pos.*/
   3936                 s24_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (2 * src_strd)));
   3937 
   3938                 s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b);
   3939 
   3940                 s31_8x16b = _mm_unpackhi_epi8(s21_8x16b, s22_8x16b);
   3941 
   3942                 s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b);
   3943 
   3944                 s33_8x16b = _mm_unpackhi_epi8(s23_8x16b, s24_8x16b);
   3945 
   3946                 s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
   3947 
   3948                 s32_8x16b = _mm_maddubs_epi16(s31_8x16b, coeff0_1_8x16b);
   3949 
   3950                 s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
   3951 
   3952                 s34_8x16b = _mm_maddubs_epi16(s33_8x16b, coeff2_3_8x16b);
   3953 
   3954                 s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
   3955 
   3956                 s35_8x16b = _mm_add_epi16(s32_8x16b, s34_8x16b);
   3957 
   3958                 s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b);
   3959 
   3960                 s31_8x16b = _mm_add_epi16(s35_8x16b, offset_8x16b);
   3961 
   3962                 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   3963                 s6_8x16b = _mm_srai_epi16(s5_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   3964 
   3965                 s32_8x16b = _mm_srai_epi16(s31_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   3966 
   3967                 /* i2_tmp = CLIP_U8(i2_tmp);*/
   3968                 s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b);
   3969 
   3970                 s33_8x16b =  _mm_packus_epi16(s32_8x16b, zero_8x16b);
   3971 
   3972                 s7_8x16b = _mm_unpacklo_epi64(s7_8x16b, s33_8x16b);
   3973 /* store 8 8-bit output values  */
   3974                 /* pu1_dst[col] = (UWORD8)i2_tmp; */
   3975                 _mm_storeu_si128((__m128i *)(pu1_dst), s7_8x16b);
   3976 
   3977 
   3978                 s25_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (3 * src_strd)));
   3979 
   3980                 s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b);
   3981 
   3982                 s31_8x16b = _mm_unpackhi_epi8(s22_8x16b, s23_8x16b);
   3983 
   3984                 s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
   3985 
   3986                 s32_8x16b = _mm_maddubs_epi16(s31_8x16b, coeff0_1_8x16b);
   3987 
   3988                 s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b);
   3989 
   3990                 s33_8x16b = _mm_unpackhi_epi8(s24_8x16b, s25_8x16b);
   3991 
   3992                 s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
   3993 
   3994                 s34_8x16b = _mm_maddubs_epi16(s33_8x16b, coeff2_3_8x16b);
   3995 
   3996                 s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
   3997 
   3998                 s35_8x16b = _mm_add_epi16(s32_8x16b, s34_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
   3999 
   4000                 s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b);
   4001 
   4002                 s31_8x16b = _mm_add_epi16(s35_8x16b, offset_8x16b);
   4003 
   4004                 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   4005                 s6_8x16b = _mm_srai_epi16(s5_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   4006 
   4007                 s32_8x16b = _mm_srai_epi16(s31_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   4008 
   4009                 /* i2_tmp = CLIP_U8(i2_tmp);*/
   4010                 s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b);
   4011 
   4012                 s33_8x16b =  _mm_packus_epi16(s32_8x16b, zero_8x16b);
   4013 
   4014                 s7_8x16b = _mm_unpacklo_epi64(s7_8x16b, s33_8x16b);
   4015 /* store 8 8-bit output values  */
   4016                 /* pu1_dst[col] = (UWORD8)i2_tmp; */
   4017                 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), s7_8x16b);
   4018 
   4019                 pu1_src += 2 * src_strd;
   4020                 pu1_dst += 2 * dst_strd;
   4021 
   4022 
   4023             } /* inner for loop ends here(8-output values in single iteration) */
   4024 
   4025         }
   4026     }
   4027     else if(wd % 4 == 0)
   4028     { /* wd = multiple of 8 case */
   4029 
   4030         for(row = 0; row < ht; row += 2)
   4031         {
   4032             pu1_src_copy = pu1_src;
   4033             pu1_dst_copy = pu1_dst;
   4034             for(col = 0; col < 2 * wd; col += 8)
   4035             {
   4036 
   4037                 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
   4038                 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
   4039 
   4040 
   4041                 /*load 8 pixel values from -751:-768 pos. relative to cur. pos.*/
   4042                 s21_8x16b  = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
   4043 
   4044                 /*load 8 pixel values from -495:-512 pos. relative to cur. pos.*/
   4045                 s22_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
   4046 
   4047                 s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b);
   4048 
   4049                 s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
   4050 
   4051                 /*load 8 pixel values from -239:-256 pos. relative to cur. pos.*/
   4052                 s23_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
   4053 
   4054                 /*load 8 pixel values from 15:0 pos. relative to cur. pos.*/
   4055                 s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
   4056 
   4057                 s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b);
   4058 
   4059                 s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
   4060 
   4061                 s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
   4062 
   4063                 s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b);
   4064 
   4065                 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   4066                 s6_8x16b = _mm_srai_epi16(s5_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   4067 
   4068                 /* i2_tmp = CLIP_U8(i2_tmp);*/
   4069                 s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b);
   4070 
   4071 /* store 8 8-bit output values  */
   4072                 /* pu1_dst[col] = (UWORD8)i2_tmp; */
   4073                 _mm_storel_epi64((__m128i *)(pu1_dst), s7_8x16b);
   4074 
   4075                 s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
   4076 
   4077                 s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b);
   4078                 s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
   4079 
   4080                 s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b);
   4081                 s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
   4082 
   4083                 s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
   4084 
   4085                 s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b);
   4086 
   4087                 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   4088                 s6_8x16b = _mm_srai_epi16(s5_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   4089 
   4090                 /* i2_tmp = CLIP_U8(i2_tmp);*/
   4091                 s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b);
   4092 
   4093 /* store 8 8-bit output values  */
   4094                 /* pu1_dst[col] = (UWORD8)i2_tmp; */
   4095                 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s7_8x16b);
   4096 
   4097                 pu1_src += 8;    /* To pointer update */
   4098                 pu1_dst += 8;
   4099 
   4100             } /* inner for loop ends here(8-output values in single iteration) */
   4101 
   4102             pu1_src = pu1_src_copy + 2 * src_strd; /* pointer update */
   4103             pu1_dst = pu1_dst_copy + 2 * dst_strd; /* pointer update */
   4104         }
   4105     }
   4106 
   4107     else
   4108     { /* wd = multiple of 4 case */
   4109 
   4110         for(row = 0; row < ht; row += 2)
   4111         {
   4112             pu1_src_copy = pu1_src;
   4113             pu1_dst_copy = pu1_dst;
   4114             for(col = 0; col < 2 * wd; col += 4)
   4115             {
   4116 
   4117                 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
   4118                 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
   4119 
   4120 
   4121                 /*load 8 pixel values from -751:-768 pos. relative to cur. pos.*/
   4122                 s21_8x16b  = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
   4123 
   4124                 /*load 8 pixel values from -495:-512 pos. relative to cur. pos.*/
   4125                 s22_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
   4126 
   4127                 s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b);
   4128 
   4129                 s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
   4130 
   4131                 /*load 8 pixel values from -239:-256 pos. relative to cur. pos.*/
   4132                 s23_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
   4133 
   4134                 /*load 8 pixel values from 15:0 pos. relative to cur. pos.*/
   4135                 s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
   4136 
   4137                 s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b);
   4138 
   4139                 s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
   4140 
   4141                 s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
   4142 
   4143                 s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b);
   4144 
   4145                 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   4146                 s6_8x16b = _mm_srai_epi16(s5_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   4147 
   4148                 /* i2_tmp = CLIP_U8(i2_tmp);*/
   4149                 s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b);
   4150 
   4151                 s9_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst));
   4152                 s5_8x16b =  _mm_and_si128(s9_8x16b, mask_low_32b);
   4153                 s6_8x16b =  _mm_and_si128(s7_8x16b, mask_high_96b);
   4154                 s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b);
   4155 
   4156 /* store 8 8-bit output values  */
   4157                 /* pu1_dst[col] = (UWORD8)i2_tmp; */
   4158                 _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b);
   4159 
   4160                 s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
   4161 
   4162                 s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b);
   4163                 s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
   4164 
   4165                 s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b);
   4166                 s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
   4167 
   4168                 s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
   4169 
   4170                 s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b);
   4171 
   4172                 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */
   4173                 s6_8x16b = _mm_srai_epi16(s5_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);
   4174 
   4175                 /* i2_tmp = CLIP_U8(i2_tmp);*/
   4176                 s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b);
   4177 
   4178                 s9_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
   4179                 s5_8x16b =  _mm_and_si128(s9_8x16b, mask_low_32b);
   4180                 s6_8x16b =  _mm_and_si128(s7_8x16b, mask_high_96b);
   4181                 s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b);
   4182 
   4183 /* store 8 8-bit output values  */
   4184                 /* pu1_dst[col] = (UWORD8)i2_tmp; */
   4185                 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s9_8x16b);
   4186 
   4187                 pu1_src += 4;   /* To pointer update */
   4188                 pu1_dst += 4;
   4189             } /* inner for loop ends here(8-output values in single iteration) */
   4190 
   4191             pu1_src = pu1_src_copy + 2 * src_strd; /* pointer update */
   4192             pu1_dst = pu1_dst_copy + 2 * dst_strd; /* pointer update */
   4193         }
   4194     }
   4195 }
   4196 
   4197 /**
   4198 *******************************************************************************
   4199 *
   4200 * @brief
   4201 *       chroma interprediction filter for copying 16bit output
   4202 *
   4203 * @par Description:
   4204 *    Copies the array of width 'wd' and height 'ht' from the  location pointed
   4205 *    by 'src' to the location pointed by 'dst' The output is upshifted by 6
   4206 *    bits and is used as input for vertical filtering or weighted prediction
   4207 *
   4208 * @param[in] pu1_src
   4209 *  UWORD8 pointer to the source
   4210 *
   4211 * @param[out] pi2_dst
   4212 *  WORD16 pointer to the destination
   4213 *
   4214 * @param[in] src_strd
   4215 *  integer source stride
   4216 *
   4217 * @param[in] dst_strd
   4218 *  integer destination stride
   4219 *
   4220 * @param[in] pi1_coeff
   4221 *  WORD8 pointer to the filter coefficients
   4222 *
   4223 * @param[in] ht
   4224 *  integer height of the array
   4225 *
   4226 * @param[in] wd
   4227 *  integer width of the array
   4228 *
   4229 * @returns
   4230 *
   4231 * @remarks
   4232 *  None
   4233 *
   4234 *******************************************************************************
   4235 */
   4236 
   4237 void ihevc_inter_pred_chroma_copy_w16out_ssse3(UWORD8 *pu1_src,
   4238                                                WORD16 *pi2_dst,
   4239                                                WORD32 src_strd,
   4240                                                WORD32 dst_strd,
   4241                                                WORD8 *pi1_coeff,
   4242                                                WORD32 ht,
   4243                                                WORD32 wd)
   4244 {
   4245     WORD32 row, col;
   4246     __m128i  s3, zero_8x16b;
   4247 
   4248     ASSERT(wd % 2 == 0); /* checking assumption*/
   4249     ASSERT(ht % 2 == 0); /* checking assumption*/
   4250 
   4251     UNUSED(pi1_coeff);
   4252     zero_8x16b = _mm_setzero_si128();
   4253 /*  outer for loop starts from here */
   4254     if(wd == 2) /* for wd =2 */
   4255     {
   4256         for(row = 0; row < ht; row += 2)
   4257         {
   4258             int offset = 0;
   4259             for(col = 0; col < 2 * wd; col += 4)
   4260             {
   4261 /* row =0 */
   4262                 /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
   4263                 s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col] */
   4264                 s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
   4265 
   4266                 s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
   4267 
   4268                 /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
   4269                 _mm_storel_epi64((__m128i *)(pi2_dst + offset), s3);
   4270 
   4271 /* row =1 */
   4272                 /*load 16 pixel values from 271:256 pos. relative to cur. pos.*/
   4273                 s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset));
   4274                 s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
   4275 
   4276                 s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
   4277 
   4278                 _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), s3);
   4279                 offset += 4; /* To pointer update */
   4280             } /* inner for loop ends here */
   4281 
   4282             pu1_src += 2 * src_strd; /* pointer update */
   4283             pi2_dst += 2 * dst_strd; /* pointer update */
   4284         }
   4285     }
   4286     else if(wd % 2 == 0 && wd % 4 != 0)
   4287     {
   4288         for(row = 0; row < ht / 2; row++)
   4289         {
   4290             int offset = 0;
   4291             int count = (2 * wd) / 8;
   4292             for(col = 0; col < count; col++)
   4293             {
   4294 /* row =0 */
   4295                 /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
   4296                 s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col]*/
   4297                 s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
   4298 
   4299                 /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
   4300                 s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH);
   4301 
   4302                 /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */
   4303                 _mm_storeu_si128((__m128i *)(pi2_dst + offset), s3);
   4304 
   4305                 /*row=1*/       /*load 16 pixel values from 271:256 pos. relative to cur. pos.*/
   4306                 s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset));
   4307                 s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
   4308 
   4309                 s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
   4310                 _mm_storeu_si128((__m128i *)(pi2_dst + dst_strd + offset), s3);
   4311 
   4312                 offset += 8; /* To pointer update*/
   4313             } /*  inner for loop ends here(8-output values in single iteration) */
   4314 
   4315 /* finding last four values */
   4316             s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col] */
   4317             s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
   4318 
   4319             s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
   4320 
   4321             /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
   4322             _mm_storel_epi64((__m128i *)(pi2_dst + offset), s3);
   4323 
   4324             /*load 16 pixel values from 271:256 pos. relative to cur. pos.*/
   4325             s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset));
   4326             s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
   4327 
   4328             s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
   4329             _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), s3);
   4330 
   4331             pu1_src += 2 * src_strd; /* pointer update */
   4332             pi2_dst += 2 * dst_strd;
   4333         }
   4334     }
   4335     else
   4336     {
   4337         for(row = 0; row < ht / 2; row++)
   4338         {
   4339             int offset = 0;
   4340             for(col = 0; col < 2 * wd / 8; col++)
   4341             {
   4342 /* row =0 */
   4343                 /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/
   4344                 s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col]*/
   4345                 s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
   4346 
   4347                 /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
   4348                 s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH);
   4349 
   4350                 /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */
   4351                 _mm_storeu_si128((__m128i *)(pi2_dst + offset), s3);
   4352 
   4353                 /*row=1*/       /*load 16 pixel values from 271:256 pos. relative to cur. pos.*/
   4354                 s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset));
   4355                 s3 = _mm_unpacklo_epi8(s3, zero_8x16b);
   4356 
   4357                 s3 = _mm_slli_epi16(s3,  SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
   4358                 _mm_store_si128((__m128i *)(pi2_dst + dst_strd + offset), s3);
   4359 
   4360                 offset += 8; /* To pointer update*/
   4361             } /*  inner for loop ends here(8-output values in single iteration) */
   4362 
   4363             pu1_src += 2 * src_strd; /* pointer update */
   4364             pi2_dst += 2 * dst_strd;
   4365         }
   4366     }
   4367 }
   4368 
   4369 /**
   4370 *******************************************************************************
   4371 *
   4372 * @brief
*       chroma interprediction filter to store horizontal 16bit output
   4374 *
   4375 * @par Description:
   4376 *    Applies a horizontal filter with coefficients pointed to  by 'pi1_coeff'
   4377 *    to the elements pointed by 'pu1_src' and  writes to the location pointed
   4378 *    by 'pu1_dst'  No downshifting or clipping is done and the output is  used
   4379 *    as an input for vertical filtering or weighted  prediction
   4380 *
   4381 * @param[in] pu1_src
   4382 *  UWORD8 pointer to the source
   4383 *
   4384 * @param[out] pi2_dst
   4385 *  WORD16 pointer to the destination
   4386 *
   4387 * @param[in] src_strd
   4388 *  integer source stride
   4389 *
   4390 * @param[in] dst_strd
   4391 *  integer destination stride
   4392 *
   4393 * @param[in] pi1_coeff
   4394 *  WORD8 pointer to the filter coefficients
   4395 *
   4396 * @param[in] ht
   4397 *  integer height of the array
   4398 *
   4399 * @param[in] wd
   4400 *  integer width of the array
   4401 *
   4402 * @returns
   4403 *
   4404 * @remarks
   4405 *  None
   4406 *
   4407 *******************************************************************************
   4408 */
void ihevc_inter_pred_chroma_horz_w16out_ssse3(UWORD8 *pu1_src,
                                               WORD16 *pi2_dst,
                                               WORD32 src_strd,
                                               WORD32 dst_strd,
                                               WORD8 *pi1_coeff,
                                               WORD32 ht,
                                               WORD32 wd)
{
    WORD32 row, col;

    __m128i coeff0_1_8x16b, coeff2_3_8x16b, control_mask_1_8x16b, control_mask_2_8x16b, all_zero;
    __m128i src_temp1_16x8b, src_temp2_16x8b, src_temp3_16x8b, src_temp4_16x8b, src_temp5_16x8b, src_temp6_16x8b;
    __m128i src_temp11_16x8b, src_temp12_16x8b, src_temp13_16x8b, src_temp14_16x8b, src_temp15_16x8b, src_temp16_16x8b;
    __m128i res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b;
    __m128i res_temp11_8x16b, res_temp12_8x16b, res_temp13_8x16b;

    /* warm the cache with the first six source rows */
    PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
    PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
    PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
    PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
    PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
    PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)

    ASSERT(wd % 2 == 0); /* checking assumption*/

/* load the four 8-bit filter taps pi1_coeff[0..3] (8 bytes read, low 4 used) */
    src_temp1_16x8b = _mm_loadl_epi64((__m128i *)pi1_coeff);

    all_zero = _mm_setzero_si128();

    control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* shuffle mask: replicate coeff bytes 0,1 across the register */
    control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* shuffle mask: replicate coeff bytes 2,3 across the register */

    coeff0_1_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_1_8x16b);  /* {c0,c1} pair in every 16-bit lane */
    coeff2_3_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_2_8x16b);  /* {c2,c3} pair in every 16-bit lane */

/*  outer for loop starts from here */
    if(wd % 2 == 0 && wd % 4 != 0)
    { /* wd even but not a multiple of 4: 4 outputs per row per iteration;
         rows 0 and 1 are packed into one register (low/high 64 bits) so a
         single pair of pmaddubsw ops covers both rows */
        int offset = 0;
        for(row = ht; row >= 2; row -= 2)
        {
            offset = 0;
            PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
            PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)


            for(col = 0; col < 2 * wd; col += 4)
            {

                /*load 16 pixel values of row 0; -2 starts the 4-tap window
                  one chroma pair before the current sample*/
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/

                /*load 16 pixel values of row 1*/
                src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + src_strd + offset)); /* pu1_src[col + (i-1) * 2]*/

                /*Derive the source pixels for processing the 2nd pixel of row 0*/
                src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);

                /* interleave so each 16-bit lane holds the byte pair fed to taps c0,c1 */
                src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b);

                /*Derive the source pixels for processing the 3rd pixel of row 0*/
                src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4);

                /*Derive the source pixels for processing the 4th pixel of row 0*/
                src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6);

                /* byte pairs fed to taps c2,c3 */
                src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b);

                /*Derive the source pixels for processing the 2nd pixel of row 1*/
                src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);

                src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b);

                /*Derive the source pixels for processing the 3rd pixel of row 1*/
                src_temp13_16x8b = _mm_srli_si128(src_temp11_16x8b, 4);

                /*Derive the source pixels for processing the 4th pixel of row 1*/
                src_temp14_16x8b = _mm_srli_si128(src_temp11_16x8b, 6);

                src_temp16_16x8b = _mm_unpacklo_epi8(src_temp13_16x8b, src_temp14_16x8b);

                /* pack row 0 into the low 64 bits, row 1 into the high 64 bits */
                res_temp1_8x16b = _mm_unpacklo_epi64(src_temp5_16x8b, src_temp15_16x8b);
                res_temp2_8x16b = _mm_unpacklo_epi64(src_temp6_16x8b, src_temp16_16x8b);
                res_temp11_8x16b = _mm_maddubs_epi16(res_temp1_8x16b, coeff0_1_8x16b);
                res_temp12_8x16b = _mm_maddubs_epi16(res_temp2_8x16b, coeff2_3_8x16b);

                /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */
                res_temp13_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);

                /* move row 1's results down to the low 64 bits for storing */
                res_temp3_8x16b = _mm_srli_si128(res_temp13_8x16b, 8);

                /* store 4 16-bit values (row 0) */
                _mm_storel_epi64((__m128i *)(pi2_dst + offset), res_temp13_8x16b); /* pi2_dst[col] = i2_tmp_u  */



                /* store 4 16-bit values (row 1) */
                _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), res_temp3_8x16b); /* pi2_dst[col] = i2_tmp_u  */


                offset += 4; /* To pointer update*/

            } /* inner loop ends here(8- output values in single iteration)*/

            pu1_src += 2 * src_strd; /*pointer update*/
            pi2_dst += 2 * dst_strd; /*pointer update*/
        }

        /*Epilogue to handle ht= odd case: one remaining row, processed alone
          with the unused high half zeroed*/
        if(row)
        {
            offset = 0;
            for(col = 0; col < 2 * wd; col += 4)
            {

                /*load 16 pixel values of row 0*/
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/

                /*Derive the source pixels for processing the 2nd pixel of row 0*/
                src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);

                src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b);

                /*Derive the source pixels for processing the 3rd pixel of row 0*/
                src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4);

                /*Derive the source pixels for processing the 4th pixel of row 0*/
                src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6);

                src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b);

                /* high half zeroed: only one row remains */
                res_temp1_8x16b = _mm_unpacklo_epi64(src_temp5_16x8b, all_zero);
                res_temp2_8x16b = _mm_unpacklo_epi64(src_temp6_16x8b, all_zero);
                res_temp11_8x16b = _mm_maddubs_epi16(res_temp1_8x16b, coeff0_1_8x16b);
                res_temp12_8x16b = _mm_maddubs_epi16(res_temp2_8x16b, coeff2_3_8x16b);

                /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */
                res_temp13_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);

                //res_temp3_8x16b = _mm_srli_si128 (res_temp13_8x16b, 8);

                /* store 4 16-bit values */
                _mm_storel_epi64((__m128i *)(pi2_dst + offset), res_temp13_8x16b); /* pi2_dst[col] = i2_tmp_u  */

                offset += 4; /* To pointer update*/

            }
        }

    }
    else
    { /* wd multiple of 4: 8 outputs per row per iteration, each row
         processed in its own register */
        int offset = 0;

        for(row = ht; row >= 2; row -= 2)
        {
            offset = 0;
            PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
            PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)


            for(col = 0; col < 2 * wd; col += 8)
            {

                /*load 16 pixel values of row 0*/
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/

                /*load 16 pixel values of row 1*/
                src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + src_strd + offset)); /* pu1_src[col + (i-1) * 2]*/

                /*Derive the source pixels for processing the 2nd pixel of row 0*/
                src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);

                src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b);

                /*Derive the source pixels for processing the 3rd pixel of row 0*/
                src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4);

                /*Derive the source pixels for processing the 4th pixel of row 0*/
                src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6);

                src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b);

                res_temp1_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff0_1_8x16b);
                res_temp2_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff2_3_8x16b);

                /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */
                res_temp3_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);

                /* store 8 16-bit values */
                _mm_storeu_si128((__m128i *)(pi2_dst + offset), res_temp3_8x16b); /* pi2_dst[col] = i2_tmp_u  */

                /*Derive the source pixels for processing the 2nd pixel of row 1*/
                src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 2);

                src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b);

                /*Derive the source pixels for processing the 3rd pixel of row 1*/
                src_temp13_16x8b = _mm_srli_si128(src_temp11_16x8b, 4);

                /*Derive the source pixels for processing the 4th pixel of row 1*/
                src_temp14_16x8b = _mm_srli_si128(src_temp11_16x8b, 6);

                src_temp16_16x8b = _mm_unpacklo_epi8(src_temp13_16x8b, src_temp14_16x8b);

                res_temp11_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff0_1_8x16b);
                res_temp12_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff2_3_8x16b);

                /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */
                res_temp13_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b);

                /* store 8 16-bit values */
                _mm_storeu_si128((__m128i *)(pi2_dst + dst_strd + offset), res_temp13_8x16b); /* pi2_dst[col] = i2_tmp_u  */


                offset += 8; /* To pointer update*/

            } /* inner loop ends here(8- output values in single iteration)*/

            pu1_src += 2 * src_strd; /*pointer update*/
            pi2_dst += 2 * dst_strd; /*pointer update*/
        }

        /*Epilogue to take care of odd ht: one remaining row*/
        if(row)
        {
            offset = 0;
            for(col = 0; col < 2 * wd; col += 8)
            {

                /*load 16 pixel values of row 0*/
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/

                /*Derive the source pixels for processing the 2nd pixel of row 0*/
                src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);

                src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b);

                /*Derive the source pixels for processing the 3rd pixel of row 0*/
                src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4);

                /*Derive the source pixels for processing the 4th pixel of row 0*/
                src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6);

                src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b);

                res_temp1_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff0_1_8x16b);
                res_temp2_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff2_3_8x16b);

                /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */
                res_temp3_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b);

                /* store 8 16-bit values */
                _mm_storeu_si128((__m128i *)(pi2_dst + offset), res_temp3_8x16b); /* pi2_dst[col] = i2_tmp_u  */

                offset += 8; /* To pointer update*/

            }
        }

    }
}
   4672 
   4673 /**
   4674 *******************************************************************************
   4675 *
   4676 * @brief
*     Interprediction chroma filter to store vertical 16bit output
   4678 *
   4679 * @par Description:
   4680 *    Applies a vertical filter with coefficients pointed to  by 'pi1_coeff' to
   4681 *    the elements pointed by 'pu1_src' and  writes to the location pointed by
   4682 *    'pu1_dst'  No downshifting or clipping is done and the output is  used as
   4683 *    an input for weighted prediction
   4684 *
   4685 * @param[in] pu1_src
   4686 *  UWORD8 pointer to the source
   4687 *
   4688 * @param[out] pi2_dst
   4689 *  WORD16 pointer to the destination
   4690 *
   4691 * @param[in] src_strd
   4692 *  integer source stride
   4693 *
   4694 * @param[in] dst_strd
   4695 *  integer destination stride
   4696 *
   4697 * @param[in] pi1_coeff
   4698 *  WORD8 pointer to the filter coefficients
   4699 *
   4700 * @param[in] ht
   4701 *  integer height of the array
   4702 *
   4703 * @param[in] wd
   4704 *  integer width of the array
   4705 *
   4706 * @returns
   4707 *
   4708 * @remarks
   4709 *  None
   4710 *
   4711 *******************************************************************************
   4712 */
   4713 void ihevc_inter_pred_chroma_vert_w16out_ssse3(UWORD8 *pu1_src,
   4714                                                WORD16 *pi2_dst,
   4715                                                WORD32 src_strd,
   4716                                                WORD32 dst_strd,
   4717                                                WORD8 *pi1_coeff,
   4718                                                WORD32 ht,
   4719                                                WORD32 wd)
   4720 {
   4721     WORD32 row, col;
   4722     UWORD8 *pu1_src_copy;
   4723     WORD16 *pi2_dst_copy;
   4724     __m128i coeff0_1_8x16b, coeff2_3_8x16b;
   4725     __m128i s4_8x16b, s5_8x16b, s6_8x16b, s8_8x16b;
   4726     __m128i control_mask_1_8x16b, control_mask_2_8x16b;
   4727     __m128i s11_8x16b, s12_8x16b, s15_8x16b, s16_8x16b;
   4728     __m128i s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b;
   4729     __m128i s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b;
   4730 
   4731 
   4732     PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0)
   4733     PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0)
   4734     PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0)
   4735     PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0)
   4736     PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0)
   4737     PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0)
   4738 
   4739 /* load 8 8-bit coefficients and convert 8-bit into 16-bit  */
   4740     s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);
   4741 
   4742     control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */
   4743     control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */
   4744 
   4745     coeff0_1_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_1_8x16b);  /* pi1_coeff[4] */
   4746     coeff2_3_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_2_8x16b);  /* pi1_coeff[4] */
   4747 
   4748 
   4749 
   4750 /*  outer for loop starts from here */
   4751     if(wd % 8 == 0)
   4752     { /* wd = multiple of 8 case */
   4753 
   4754         pu1_src_copy = pu1_src;
   4755         pi2_dst_copy = pi2_dst;
   4756 
   4757         for(col = 0; col < 2 * wd; col += 16)
   4758         {
   4759 
   4760             pu1_src = pu1_src_copy + col;
   4761             pi2_dst = pi2_dst_copy + col;
   4762 
   4763 
   4764             for(row = 0; row < ht; row += 2)
   4765             {
   4766 
   4767                 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
   4768                 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
   4769 
   4770 
   4771                 /*load 16 pixel values */
   4772                 s21_8x16b  = _mm_loadu_si128((__m128i *)(pu1_src + (-1 * src_strd)));
   4773 
   4774                 /*load 16 pixel values */
   4775                 s22_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (0 * src_strd)));
   4776 
   4777 
   4778                 /*load 16 pixel values */
   4779                 s23_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (1 * src_strd)));
   4780 
   4781                 /*load 16 pixel values */
   4782                 s24_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (2 * src_strd)));
   4783 
   4784                 s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b);
   4785 
   4786                 s31_8x16b = _mm_unpackhi_epi8(s21_8x16b, s22_8x16b);
   4787 
   4788                 s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b);
   4789 
   4790                 s33_8x16b = _mm_unpackhi_epi8(s23_8x16b, s24_8x16b);
   4791 
   4792                 s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
   4793 
   4794                 s32_8x16b = _mm_maddubs_epi16(s31_8x16b, coeff0_1_8x16b);
   4795 
   4796                 s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
   4797 
   4798                 s34_8x16b = _mm_maddubs_epi16(s33_8x16b, coeff2_3_8x16b);
   4799 
   4800                 s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
   4801 
   4802                 s35_8x16b = _mm_add_epi16(s32_8x16b, s34_8x16b);
   4803 
   4804 /* store 8 8-bit output values  */
   4805                 /* pi2_dst[col] = (UWORD8)i2_tmp; */
   4806                 _mm_storeu_si128((__m128i *)(pi2_dst), s8_8x16b);
   4807 
   4808                 _mm_storeu_si128((__m128i *)(pi2_dst + 8), s35_8x16b);
   4809 
   4810 
   4811                 s25_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (3 * src_strd)));
   4812 
   4813                 s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b);
   4814 
   4815                 s31_8x16b = _mm_unpackhi_epi8(s22_8x16b, s23_8x16b);
   4816 
   4817                 s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
   4818 
   4819                 s32_8x16b = _mm_maddubs_epi16(s31_8x16b, coeff0_1_8x16b);
   4820 
   4821                 s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b);
   4822 
   4823                 s33_8x16b = _mm_unpackhi_epi8(s24_8x16b, s25_8x16b);
   4824 
   4825                 s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
   4826 
   4827                 s34_8x16b = _mm_maddubs_epi16(s33_8x16b, coeff2_3_8x16b);
   4828 
   4829                 s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
   4830 
   4831                 s35_8x16b = _mm_add_epi16(s32_8x16b, s34_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
   4832 
   4833 /* store 8 8-bit output values  */
   4834                 /* pi2_dst[col] = (UWORD8)i2_tmp; */
   4835                 _mm_store_si128((__m128i *)(pi2_dst + dst_strd), s8_8x16b);
   4836 
   4837                 _mm_store_si128((__m128i *)(pi2_dst + dst_strd + 8), s35_8x16b);
   4838 
   4839 
   4840                 pu1_src += 2 * src_strd;
   4841                 pi2_dst += 2 * dst_strd;
   4842 
   4843 
   4844             } /* inner for loop ends here(8-output values in single iteration) */
   4845 
   4846         }
   4847     }
   4848 
   4849     else if(wd % 4 == 0)
   4850     { /* wd = multiple of 8 case */
   4851 
   4852         for(row = 0; row < ht; row += 2)
   4853         {
   4854 
   4855             pu1_src_copy = pu1_src;
   4856             pi2_dst_copy = pi2_dst;
   4857 
   4858             for(col = 0; col < 2 * wd; col += 8)
   4859             {
   4860 
   4861                 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
   4862                 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
   4863 
   4864 
   4865                 /*load 8 pixel values */
   4866                 s21_8x16b  = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
   4867 
   4868                 /*load 8 pixel values */
   4869                 s22_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
   4870 
   4871                 s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b);
   4872 
   4873                 s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
   4874 
   4875                 /*load 8 pixel values */
   4876                 s23_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
   4877 
   4878                 /*load 8 pixel values */
   4879                 s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
   4880 
   4881                 s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b);
   4882 
   4883                 s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
   4884 
   4885                 s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
   4886 
   4887                 _mm_storeu_si128((__m128i *)(pi2_dst), s8_8x16b);
   4888 
   4889                 s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
   4890 
   4891                 s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b);
   4892                 s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
   4893 
   4894                 s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b);
   4895                 s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
   4896 
   4897                 s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
   4898 
   4899                 _mm_store_si128((__m128i *)(pi2_dst + dst_strd), s8_8x16b);
   4900 
   4901                 pu1_src += 8;    /* To pointer update */
   4902                 pi2_dst += 8;
   4903 
   4904             } /* inner for loop ends here(8-output values in single iteration) */
   4905 
   4906             pu1_src = pu1_src_copy + 2 * src_strd; /* pointer update */
   4907             pi2_dst = pi2_dst_copy + 2 * dst_strd; /* pointer update */
   4908         }
   4909     }
   4910 
   4911     else
   4912     { /* wd = multiple of 4 case */
   4913 
   4914         for(row = 0; row < ht; row += 2)
   4915         {
   4916             pu1_src_copy = pu1_src;
   4917             pi2_dst_copy = pi2_dst;
   4918             for(col = 0; col < 2 * wd; col += 4)
   4919             {
   4920 
   4921                 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0)
   4922                 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0)
   4923 
   4924 
   4925                 /*load 8 pixel values */
   4926                 s21_8x16b  = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd)));
   4927 
   4928                 /*load 8 pixel values */
   4929                 s22_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd)));
   4930 
   4931                 s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b);
   4932 
   4933                 s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
   4934 
   4935                 /*load 8 pixel values */
   4936                 s23_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd)));
   4937 
   4938                 /*load 8 pixel values */
   4939                 s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd)));
   4940 
   4941                 s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b);
   4942 
   4943                 s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
   4944 
   4945                 s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
   4946 
   4947 
   4948 /* store 8 8-bit output values  */
   4949                 /* pi2_dst[col] = (UWORD8)i2_tmp; */
   4950                 _mm_storel_epi64((__m128i *)(pi2_dst), s8_8x16b);
   4951 
   4952                 s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd)));
   4953 
   4954                 s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b);
   4955                 s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b);
   4956 
   4957                 s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b);
   4958                 s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b);
   4959 
   4960                 s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */
   4961 
   4962 
   4963 /* store 8 8-bit output values  */
   4964                 /* pi2_dst[col] = (UWORD8)i2_tmp; */
   4965                 _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd), s8_8x16b);
   4966 
   4967                 pu1_src += 4;   /* To pointer update */
   4968                 pi2_dst += 4;
   4969             } /* inner for loop ends here(8-output values in single iteration) */
   4970 
   4971             pu1_src = pu1_src_copy + 2 * src_strd; /* pointer update */
   4972             pi2_dst = pi2_dst_copy + 2 * dst_strd; /* pointer update */
   4973         }
   4974     }
   4975 }
   4976 
   4977 /**
   4978 *******************************************************************************
   4979 *
   4980 * @brief
*     Chroma inter-prediction filter for vertical 16-bit input
   4982 *
   4983 * @par Description:
*    Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
*    the elements pointed by 'pi2_src' and writes to the location pointed by
*    'pu1_dst'. Input is 16 bits. The filter output is downshifted by 12 and
*    clipped to lie between 0 and 255.
   4988 *
   4989 * @param[in] pi2_src
   4990 *  WORD16 pointer to the source
   4991 *
   4992 * @param[out] pu1_dst
   4993 *  UWORD8 pointer to the destination
   4994 *
   4995 * @param[in] src_strd
   4996 *  integer source stride
   4997 *
   4998 * @param[in] dst_strd
   4999 *  integer destination stride
   5000 *
   5001 * @param[in] pi1_coeff
   5002 *  WORD8 pointer to the filter coefficients
   5003 *
   5004 * @param[in] ht
   5005 *  integer height of the array
   5006 *
   5007 * @param[in] wd
   5008 *  integer width of the array
   5009 *
   5010 * @returns
   5011 *
   5012 * @remarks
   5013 *  None
   5014 *
   5015 *******************************************************************************
   5016 */
void ihevc_inter_pred_chroma_vert_w16inp_ssse3(WORD16 *pi2_src,
                                               UWORD8 *pu1_dst,
                                               WORD32 src_strd,
                                               WORD32 dst_strd,
                                               WORD8 *pi1_coeff,
                                               WORD32 ht,
                                               WORD32 wd)
{
    WORD32 row, col;
    WORD16 *pi2_src_copy;
    UWORD8 *pu1_dst_copy;
    __m128i coeff0_1_8x16b, coeff2_3_8x16b;
    __m128i s4_8x16b, s5_8x16b, s6_8x16b, s7_8x16b, s8_8x16b, s9_8x16b;
    __m128i s11_8x16b, s12_8x16b, s15_8x16b, s16_8x16b;
    __m128i zero_8x16b, offset_8x16b, mask_low_32b, mask_high_96b, sign_reg;
    __m128i s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b;
    __m128i s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b;


/* load the 8-bit filter taps and sign-extend them to 16 bits */
    s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);

    zero_8x16b = _mm_setzero_si128();
    sign_reg =  _mm_cmpgt_epi8(zero_8x16b, s4_8x16b);
    s5_8x16b  = _mm_unpacklo_epi8(s4_8x16b, sign_reg);

    /* broadcast the 16-bit tap pairs so each 32-bit lane holds the same pair */
    coeff0_1_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(0, 0, 0, 0));  /* taps 0,1 */
    coeff2_3_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(1, 1, 1, 1));  /* taps 2,3 */

/*  setting constant values in registers */
    offset_8x16b = _mm_set1_epi32(OFFSET_14_MINUS_BIT_DEPTH); /* rounding offset */
    mask_low_32b = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000);
    mask_high_96b = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF);

/*  outer for loop starts from here */
    if(wd % 4 == 0)
    { /* wd multiple of 4: interleaved CbCr width (2*wd) is a multiple of 8 */

        pi2_src_copy = pi2_src;
        pu1_dst_copy = pu1_dst;

        /* column-major tiling: walk 8-sample columns, then all rows in each */
        for(col = 0; col < 2 * wd; col += 8)
        {

            pi2_src = pi2_src_copy + col;
            pu1_dst = pu1_dst_copy + col;


            /* two output rows per iteration; rows 0..2 are reused for row+1 */
            for(row = 0; row < ht; row += 2)
            {

                /* load 8 16-bit samples of row (row - 1) */
                /* NOTE(review): aligned loads assume the 16-bit intermediate
                   buffer rows are 16-byte aligned -- confirm with callers */
                s21_8x16b  = _mm_load_si128((__m128i *)(pi2_src + (-1 * src_strd)));

                /* load 8 16-bit samples of row (row) */
                s22_8x16b = _mm_load_si128((__m128i *)(pi2_src + (0 * src_strd)));


                /* load 8 16-bit samples of row (row + 1) */
                s23_8x16b = _mm_load_si128((__m128i *)(pi2_src + (1 * src_strd)));

                /* load 8 16-bit samples of row (row + 2) */
                s24_8x16b = _mm_load_si128((__m128i *)(pi2_src + (2 * src_strd)));

                /* interleave row pairs so madd computes c0*r[-1] + c1*r[0] etc. */
                s5_8x16b = _mm_unpacklo_epi16(s21_8x16b, s22_8x16b);

                s31_8x16b = _mm_unpackhi_epi16(s21_8x16b, s22_8x16b);

                s6_8x16b = _mm_unpacklo_epi16(s23_8x16b, s24_8x16b);

                s33_8x16b = _mm_unpackhi_epi16(s23_8x16b, s24_8x16b);

                s11_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);

                s32_8x16b = _mm_madd_epi16(s31_8x16b, coeff0_1_8x16b);

                s12_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);

                s34_8x16b = _mm_madd_epi16(s33_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi32(s11_8x16b, s12_8x16b); /* 32-bit filter sum, low 4 lanes */

                s35_8x16b = _mm_add_epi32(s32_8x16b, s34_8x16b); /* 32-bit filter sum, high 4 lanes */

                /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH (first downshift) */
                s6_8x16b = _mm_srai_epi32(s8_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s32_8x16b = _mm_srai_epi32(s35_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);


                /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH */
                s7_8x16b = _mm_add_epi32(s6_8x16b, offset_8x16b);

                /* second, rounding downshift: total shift is 12 for 8-bit depth */
                s8_8x16b = _mm_srai_epi32(s7_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s9_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b);

                /* same offset + shift for the high 4 lanes */
                s33_8x16b = _mm_add_epi32(s32_8x16b, offset_8x16b);

                /* i4_tmp = ((i4_tmp >> SHIFT) + OFFSET) >> SHIFT */
                s34_8x16b = _mm_srai_epi32(s33_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s35_8x16b = _mm_packs_epi32(s34_8x16b, zero_8x16b);


                /* i2_tmp = CLIP_U8(i2_tmp); saturate to unsigned 8 bits */
                s7_8x16b = _mm_packus_epi16(s9_8x16b, zero_8x16b);

                s33_8x16b =  _mm_packus_epi16(s35_8x16b, zero_8x16b);

                s7_8x16b = _mm_unpacklo_epi32(s7_8x16b, s33_8x16b);
/* store 8 8-bit output values for output row (row) */
                /* pu1_dst[col] = (UWORD8)i2_tmp; */
                _mm_storel_epi64((__m128i *)(pu1_dst), s7_8x16b);


                /* second output row: reuse rows 0..2, load row (row + 3) */
                s25_8x16b = _mm_load_si128((__m128i *)(pi2_src + (3 * src_strd)));

                s5_8x16b = _mm_unpacklo_epi16(s22_8x16b, s23_8x16b);

                s31_8x16b = _mm_unpackhi_epi16(s22_8x16b, s23_8x16b);

                s15_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);

                s32_8x16b = _mm_madd_epi16(s31_8x16b, coeff0_1_8x16b);

                s6_8x16b = _mm_unpacklo_epi16(s24_8x16b, s25_8x16b);

                s33_8x16b = _mm_unpackhi_epi16(s24_8x16b, s25_8x16b);

                s16_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);

                s34_8x16b = _mm_madd_epi16(s33_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi32(s15_8x16b, s16_8x16b); /* 32-bit filter sum, low 4 lanes */

                s35_8x16b = _mm_add_epi32(s32_8x16b, s34_8x16b); /* 32-bit filter sum, high 4 lanes */

                /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH (first downshift) */
                s6_8x16b = _mm_srai_epi32(s8_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s32_8x16b = _mm_srai_epi32(s35_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);


                /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH */
                s7_8x16b = _mm_add_epi32(s6_8x16b, offset_8x16b);

                /* second, rounding downshift */
                s8_8x16b = _mm_srai_epi32(s7_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s9_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b);

                /* same offset + shift for the high 4 lanes */
                s33_8x16b = _mm_add_epi32(s32_8x16b, offset_8x16b);

                /* i4_tmp = ((i4_tmp >> SHIFT) + OFFSET) >> SHIFT */
                s34_8x16b = _mm_srai_epi32(s33_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s35_8x16b = _mm_packs_epi32(s34_8x16b, zero_8x16b);


                /* i2_tmp = CLIP_U8(i2_tmp); saturate to unsigned 8 bits */
                s7_8x16b = _mm_packus_epi16(s9_8x16b, zero_8x16b);

                s33_8x16b =  _mm_packus_epi16(s35_8x16b, zero_8x16b);

                s7_8x16b = _mm_unpacklo_epi32(s7_8x16b, s33_8x16b);
/* store 8 8-bit output values for output row (row + 1) */
                /* pu1_dst[col] = (UWORD8)i2_tmp; */
                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s7_8x16b);

                pi2_src += 2 * src_strd;
                pu1_dst += 2 * dst_strd;


            } /* inner for loop ends here(8-output values in single iteration) */

        }
    }
    else
    { /* remaining widths: interleaved CbCr width (2*wd) is a multiple of 4 */

        for(row = 0; row < ht; row += 2)
        {
            pi2_src_copy = pi2_src;
            pu1_dst_copy = pu1_dst;
            for(col = 0; col < 2 * wd; col += 4)
            {

                /* load 4 16-bit samples of row (row - 1) */
                s21_8x16b  = _mm_loadl_epi64((__m128i *)(pi2_src + (-1 * src_strd)));

                /* load 4 16-bit samples of row (row) */
                s22_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (0 * src_strd)));

                s5_8x16b = _mm_unpacklo_epi16(s21_8x16b, s22_8x16b);

                s11_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);

                /* load 4 16-bit samples of row (row + 1) */
                s23_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (1 * src_strd)));

                /* load 4 16-bit samples of row (row + 2) */
                s24_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd)));

                s6_8x16b = _mm_unpacklo_epi16(s23_8x16b, s24_8x16b);

                s12_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi32(s11_8x16b, s12_8x16b); /* 32-bit filter sum */


                /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH (first downshift) */
                s6_8x16b = _mm_srai_epi32(s8_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);


                /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH */
                s7_8x16b = _mm_add_epi32(s6_8x16b, offset_8x16b);

                /* second, rounding downshift */
                s8_8x16b = _mm_srai_epi32(s7_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s9_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b);


                /* i2_tmp = CLIP_U8(i2_tmp); saturate to unsigned 8 bits */
                s7_8x16b = _mm_packus_epi16(s9_8x16b, zero_8x16b);

                /* read-modify-write: keep bytes 4..7 of the destination, replace
                   only the 4 newly computed output bytes */
                s9_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst));
                s5_8x16b =  _mm_and_si128(s9_8x16b, mask_low_32b);
                s6_8x16b =  _mm_and_si128(s7_8x16b, mask_high_96b);
                s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b);

/* store 4 8-bit output values (merged with existing bytes) for row (row) */
                /* pu1_dst[col] = (UWORD8)i2_tmp; */
                _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b);

                /* second output row: reuse rows 0..2, load row (row + 3) */
                s25_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd)));

                s5_8x16b = _mm_unpacklo_epi16(s22_8x16b, s23_8x16b);
                s15_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);

                s6_8x16b = _mm_unpacklo_epi16(s24_8x16b, s25_8x16b);
                s16_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi32(s15_8x16b, s16_8x16b); /* 32-bit filter sum */

                /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH (first downshift) */
                s6_8x16b = _mm_srai_epi32(s8_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH */
                s7_8x16b = _mm_add_epi32(s6_8x16b, offset_8x16b);

                /* second, rounding downshift */
                s8_8x16b = _mm_srai_epi32(s7_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s9_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b);

                /* i2_tmp = CLIP_U8(i2_tmp); saturate to unsigned 8 bits */
                s7_8x16b = _mm_packus_epi16(s9_8x16b, zero_8x16b);

                /* read-modify-write merge for row (row + 1) */
                s9_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd));
                s5_8x16b =  _mm_and_si128(s9_8x16b, mask_low_32b);
                s6_8x16b =  _mm_and_si128(s7_8x16b, mask_high_96b);
                s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b);

/* store 4 8-bit output values (merged with existing bytes) for row (row + 1) */
                /* pu1_dst[col] = (UWORD8)i2_tmp; */
                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s9_8x16b);

                pi2_src += 4;   /* To pointer update */
                pu1_dst += 4;
            } /* inner for loop ends here(8-output values in single iteration) */

            pi2_src = pi2_src_copy + 2 * src_strd; /* pointer update */
            pu1_dst = pu1_dst_copy + 2 * dst_strd; /* pointer update */
        }
    }

}
   5299 
   5300 /**
   5301 *******************************************************************************
   5302 *
   5303 * @brief
   5304 *
*      Chroma inter-prediction filter for 16-bit vertical input and output.
   5306 *
   5307 * @par Description:
*       Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to
*       the elements pointed by 'pi2_src' and writes to the location pointed by
*       'pi2_dst'. Input is 16 bits. The filter output is downshifted by 6 and
*       8192 is subtracted to store it as a 16-bit number. The output is used as
*       an input to weighted prediction.
   5313 *
   5314 * @param[in] pi2_src
   5315 *  WORD16 pointer to the source
   5316 *
   5317 * @param[out] pi2_dst
   5318 *  WORD16 pointer to the destination
   5319 *
   5320 * @param[in] src_strd
   5321 *  integer source stride
   5322 *
   5323 * @param[in] dst_strd
   5324 *  integer destination stride
   5325 *
   5326 * @param[in] pi1_coeff
   5327 *  WORD8 pointer to the filter coefficients
   5328 *
   5329 * @param[in] ht
   5330 *  integer height of the array
   5331 *
   5332 * @param[in] wd
   5333 *  integer width of the array
   5334 *
   5335 * @returns
   5336 *
   5337 * @remarks
   5338 *  None
   5339 *
   5340 *******************************************************************************
   5341 */
void ihevc_inter_pred_chroma_vert_w16inp_w16out_ssse3(WORD16 *pi2_src,
                                                      WORD16 *pi2_dst,
                                                      WORD32 src_strd,
                                                      WORD32 dst_strd,
                                                      WORD8 *pi1_coeff,
                                                      WORD32 ht,
                                                      WORD32 wd)
{
    WORD32 row, col;
    WORD16 *pi2_src_copy;
    WORD16 *pi2_dst_copy;
    __m128i coeff0_1_8x16b, coeff2_3_8x16b;
    __m128i s4_8x16b, s5_8x16b, s6_8x16b, s7_8x16b, s8_8x16b, s9_8x16b;
    __m128i s11_8x16b, s12_8x16b, s15_8x16b, s16_8x16b;
    __m128i zero_8x16b, sign_reg;
    __m128i s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b;
    __m128i s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b;


/* load the 8-bit filter taps and sign-extend them to 16 bits */
    s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff);

    zero_8x16b = _mm_setzero_si128();
    sign_reg =  _mm_cmpgt_epi8(zero_8x16b, s4_8x16b);
    s5_8x16b  = _mm_unpacklo_epi8(s4_8x16b, sign_reg);

    /* broadcast the 16-bit tap pairs so each 32-bit lane holds the same pair */
    coeff0_1_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(0, 0, 0, 0));  /* taps 0,1 */
    coeff2_3_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(1, 1, 1, 1));  /* taps 2,3 */


/*  outer for loop starts from here */
    if(wd % 4 == 0)
    { /* wd multiple of 4: interleaved CbCr width (2*wd) is a multiple of 8 */

        pi2_src_copy = pi2_src;
        pi2_dst_copy = pi2_dst;

        /* column-major tiling: walk 8-sample columns, then all rows in each */
        for(col = 0; col < 2 * wd; col += 8)
        {

            pi2_src = pi2_src_copy + col;
            pi2_dst = pi2_dst_copy + col;


            /* two output rows per iteration; rows 0..2 are reused for row+1 */
            for(row = 0; row < ht; row += 2)
            {

                /* load 8 16-bit samples of row (row - 1) */
                /* NOTE(review): aligned loads assume the 16-bit intermediate
                   buffer rows are 16-byte aligned -- confirm with callers */
                s21_8x16b  = _mm_load_si128((__m128i *)(pi2_src + (-1 * src_strd)));

                /* load 8 16-bit samples of row (row) */
                s22_8x16b = _mm_load_si128((__m128i *)(pi2_src + (0 * src_strd)));


                /* load 8 16-bit samples of row (row + 1) */
                s23_8x16b = _mm_load_si128((__m128i *)(pi2_src + (1 * src_strd)));

                /* load 8 16-bit samples of row (row + 2) */
                s24_8x16b = _mm_load_si128((__m128i *)(pi2_src + (2 * src_strd)));

                /* interleave row pairs so madd computes c0*r[-1] + c1*r[0] etc. */
                s5_8x16b = _mm_unpacklo_epi16(s21_8x16b, s22_8x16b);

                s31_8x16b = _mm_unpackhi_epi16(s21_8x16b, s22_8x16b);

                s6_8x16b = _mm_unpacklo_epi16(s23_8x16b, s24_8x16b);

                s33_8x16b = _mm_unpackhi_epi16(s23_8x16b, s24_8x16b);

                s11_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);

                s32_8x16b = _mm_madd_epi16(s31_8x16b, coeff0_1_8x16b);

                s12_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);

                s34_8x16b = _mm_madd_epi16(s33_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi32(s11_8x16b, s12_8x16b); /* 32-bit filter sum, low 4 lanes */

                s35_8x16b = _mm_add_epi32(s32_8x16b, s34_8x16b); /* 32-bit filter sum, high 4 lanes */

                /* single downshift only: result stays a 16-bit intermediate */
                s6_8x16b = _mm_srai_epi32(s8_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s32_8x16b = _mm_srai_epi32(s35_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s9_8x16b = _mm_packs_epi32(s6_8x16b, zero_8x16b);

                s35_8x16b = _mm_packs_epi32(s32_8x16b, zero_8x16b);

                s7_8x16b = _mm_unpacklo_epi64(s9_8x16b, s35_8x16b);
/* store 8 16-bit output values for output row (row) */
                /* pi2_dst[col] = (WORD16)i2_tmp; */
                _mm_store_si128((__m128i *)(pi2_dst), s7_8x16b);


                /* second output row: reuse rows 0..2, load row (row + 3) */
                s25_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + (3 * src_strd)));

                s5_8x16b = _mm_unpacklo_epi16(s22_8x16b, s23_8x16b);

                s31_8x16b = _mm_unpackhi_epi16(s22_8x16b, s23_8x16b);

                s15_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);

                s32_8x16b = _mm_madd_epi16(s31_8x16b, coeff0_1_8x16b);

                s6_8x16b = _mm_unpacklo_epi16(s24_8x16b, s25_8x16b);

                s33_8x16b = _mm_unpackhi_epi16(s24_8x16b, s25_8x16b);

                s16_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);

                s34_8x16b = _mm_madd_epi16(s33_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi32(s15_8x16b, s16_8x16b); /* 32-bit filter sum, low 4 lanes */

                s35_8x16b = _mm_add_epi32(s32_8x16b, s34_8x16b); /* 32-bit filter sum, high 4 lanes */

                /* single downshift only: result stays a 16-bit intermediate */
                s6_8x16b = _mm_srai_epi32(s8_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s32_8x16b = _mm_srai_epi32(s35_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s9_8x16b = _mm_packs_epi32(s6_8x16b, zero_8x16b);

                s35_8x16b = _mm_packs_epi32(s32_8x16b, zero_8x16b);

                s7_8x16b = _mm_unpacklo_epi64(s9_8x16b, s35_8x16b);
/* store 8 16-bit output values for output row (row + 1) */
                /* pi2_dst[col] = (WORD16)i2_tmp; */
                _mm_store_si128((__m128i *)(pi2_dst + dst_strd), s7_8x16b);

                pi2_src += 2 * src_strd;
                pi2_dst += 2 * dst_strd;


            } /* inner for loop ends here(8-output values in single iteration) */

        }
    }
    else
    { /* remaining widths: interleaved CbCr width (2*wd) is a multiple of 4 */

        for(row = 0; row < ht; row += 2)
        {
            pi2_src_copy = pi2_src;
            pi2_dst_copy = pi2_dst;
            for(col = 0; col < 2 * wd; col += 4)
            {

                /* load 4 16-bit samples of row (row - 1) */
                s21_8x16b  = _mm_loadl_epi64((__m128i *)(pi2_src + (-1 * src_strd)));

                /* load 4 16-bit samples of row (row) */
                s22_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (0 * src_strd)));

                s5_8x16b = _mm_unpacklo_epi16(s21_8x16b, s22_8x16b);

                s11_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);

                /* load 4 16-bit samples of row (row + 1) */
                s23_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (1 * src_strd)));

                /* load 4 16-bit samples of row (row + 2) */
                s24_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd)));

                s6_8x16b = _mm_unpacklo_epi16(s23_8x16b, s24_8x16b);

                s12_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi32(s11_8x16b, s12_8x16b); /* 32-bit filter sum */

                /* single downshift only: result stays a 16-bit intermediate */
                s6_8x16b = _mm_srai_epi32(s8_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s9_8x16b = _mm_packs_epi32(s6_8x16b, zero_8x16b);

/* store 4 16-bit output values for output row (row) */
                /* pi2_dst[col] = (WORD16)i2_tmp; */
                _mm_storel_epi64((__m128i *)(pi2_dst), s9_8x16b);

                /* second output row: reuse rows 0..2, load row (row + 3) */
                s25_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd)));

                s5_8x16b = _mm_unpacklo_epi16(s22_8x16b, s23_8x16b);
                s15_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);

                s6_8x16b = _mm_unpacklo_epi16(s24_8x16b, s25_8x16b);
                s16_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);

                s8_8x16b = _mm_add_epi32(s15_8x16b, s16_8x16b); /* 32-bit filter sum */

                /* single downshift only: result stays a 16-bit intermediate */
                s6_8x16b = _mm_srai_epi32(s8_8x16b,  SHIFT_14_MINUS_BIT_DEPTH);

                s9_8x16b = _mm_packs_epi32(s6_8x16b, zero_8x16b);

/* store 4 16-bit output values for output row (row + 1) */
                /* pi2_dst[col] = (WORD16)i2_tmp; */
                _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd), s9_8x16b);

                pi2_src += 4;   /* To pointer update */
                pi2_dst += 4;
            } /* inner for loop ends here(8-output values in single iteration) */

            pi2_src = pi2_src_copy + 2 * src_strd; /* pointer update */
            pi2_dst = pi2_dst_copy + 2 * dst_strd; /* pointer update */
        }
    }

}
   5551