/* Home | History | Annotate | Download | only in x86 */
      1 /******************************************************************************
      2  *
      3  * Copyright (C) 2015 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at:
      8  *
      9  * http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  *****************************************************************************
     18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 */
     20 /*****************************************************************************/
     21 /*                                                                           */
     22 /*  File Name         : ih264_inter_pred_filters_intr_ssse3.c                */
     23 /*                                                                           */
      24 /*  Description       : Contains function definitions for inter              */
      25 /*                      prediction filter functions in x86 ssse3 intrinsics  */
     26 /*                                                                           */
     27 /*  List of Functions : ih264_inter_pred_luma_copy_ssse3()                   */
     28 /*                      ih264_inter_pred_luma_horz_ssse3()                   */
     29 /*                      ih264_inter_pred_luma_vert_ssse3()                   */
     30 /*                      ih264_inter_pred_luma_horz_hpel_vert_hpel_ssse3()    */
     31 /*                      ih264_inter_pred_luma_horz_qpel_ssse3()              */
     32 /*                      ih264_inter_pred_luma_vert_qpel_ssse3()              */
     33 /*                      ih264_inter_pred_luma_horz_qpel_vert_qpel_ssse3()    */
     34 /*                      ih264_inter_pred_luma_horz_hpel_vert_qpel_ssse3()    */
     35 /*                      ih264_inter_pred_luma_horz_qpel_vert_hpel_ssse3()    */
     36 /*                      ih264_inter_pred_chroma_ssse3()                      */
     37 /*                                                                           */
     38 /*  Issues / Problems : None                                                 */
     39 /*                                                                           */
     40 /*  Revision History  :                                                      */
     41 /*                                                                           */
     42 /*         DD MM YYYY   Author(s)       Changes                              */
     43 /*         13 02 2015   Kaushik         Initial version                      */
     44 /*                      Senthoor                                             */
     45 /*                                                                           */
     46 /*****************************************************************************/
     47 /*****************************************************************************/
     48 /* File Includes                                                             */
     49 /*****************************************************************************/
     50 
#include <immintrin.h>
#include <string.h>

#include "ih264_typedefs.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_inter_pred_filters.h"
     56 
     57 /*****************************************************************************/
     58 /* Constant Data variables                                                   */
     59 /*****************************************************************************/
     60 
     61 /* coefficients for 6 tap filtering*/
     62 //const WORD32 ih264_g_six_tap[3] ={1,-5,20};
     63 /*****************************************************************************/
     64 /*  Function definitions .                                                   */
     65 /*****************************************************************************/
     66 /*****************************************************************************/
     67 /*                                                                           */
     68 /*  Function Name : ih264_inter_pred_luma_copy_ssse3                         */
     69 /*                                                                           */
     70 /*  Description   : This function copies the contents of ht x wd block from  */
     71 /*                  source to destination. (ht,wd) can be (4,4), (8,4),      */
     72 /*                  (4,8), (8,8), (16,8), (8,16) or (16,16).                 */
     73 /*                                                                           */
     74 /*  Inputs        : puc_src  - pointer to source                             */
     75 /*                  puc_dst  - pointer to destination                        */
     76 /*                  src_strd - stride for source                             */
     77 /*                  dst_strd - stride for destination                        */
     78 /*                  ht       - height of the block                           */
     79 /*                  wd       - width of the block                            */
     80 /*                                                                           */
     81 /*  Issues        : None                                                     */
     82 /*                                                                           */
     83 /*  Revision History:                                                        */
     84 /*                                                                           */
     85 /*         DD MM YYYY   Author(s)       Changes                              */
     86 /*         13 02 2015   Kaushik         Initial Version                      */
     87 /*                      Senthoor                                             */
     88 /*                                                                           */
     89 /*****************************************************************************/
     90 void ih264_inter_pred_luma_copy_ssse3(UWORD8 *pu1_src,
     91                                       UWORD8 *pu1_dst,
     92                                       WORD32 src_strd,
     93                                       WORD32 dst_strd,
     94                                       WORD32 ht,
     95                                       WORD32 wd,
     96                                       UWORD8* pu1_tmp,
     97                                       WORD32 dydx)
     98 {
     99     __m128i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b;
    100 
    101     WORD32 src_strd2, src_strd3, src_strd4, dst_strd2, dst_strd3, dst_strd4;
    102     UNUSED(pu1_tmp);
    103     UNUSED(dydx);
    104 
    105     src_strd2 = src_strd << 1;
    106     dst_strd2 = dst_strd << 1;
    107     src_strd4 = src_strd << 2;
    108     dst_strd4 = dst_strd << 2;
    109     src_strd3 = src_strd2 + src_strd;
    110     dst_strd3 = dst_strd2 + dst_strd;
    111 
    112     if(wd == 4)
    113     {
    114         do
    115         {
    116             *((WORD32 *)(pu1_dst)) =  *((WORD32 *)(pu1_src));
    117             *((WORD32 *)(pu1_dst + dst_strd)) = *((WORD32 *)(pu1_src + src_strd));
    118             *((WORD32 *)(pu1_dst + dst_strd2)) = *((WORD32 *)(pu1_src + src_strd2));
    119             *((WORD32 *)(pu1_dst + dst_strd3)) = *((WORD32 *)(pu1_src + src_strd3));
    120 
    121             ht -= 4;
    122             pu1_src += src_strd4;
    123             pu1_dst += dst_strd4;
    124         }
    125         while(ht > 0);
    126     }
    127     else if(wd == 8)
    128     {
    129         do
    130         {
    131             y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
    132             y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
    133             y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd2));
    134             y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd3));
    135 
    136             _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
    137             _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
    138             _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd2), y_2_16x8b);
    139             _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd3), y_3_16x8b);
    140 
    141             ht -= 4;
    142             pu1_src += src_strd4;
    143             pu1_dst += dst_strd4;
    144         }
    145         while(ht > 0);
    146     }
    147     else // wd == 16
    148     {
    149         WORD32 src_strd5, src_strd6, src_strd7, src_strd8;
    150         WORD32 dst_strd5, dst_strd6, dst_strd7, dst_strd8;
    151 
    152         __m128i y_4_16x8b, y_5_16x8b, y_6_16x8b, y_7_16x8b;
    153 
    154         src_strd5 = src_strd2 + src_strd3;
    155         dst_strd5 = dst_strd2 + dst_strd3;
    156         src_strd6 = src_strd3 << 1;
    157         dst_strd6 = dst_strd3 << 1;
    158         src_strd7 = src_strd3 + src_strd4;
    159         dst_strd7 = dst_strd3 + dst_strd4;
    160         src_strd8 = src_strd << 3;
    161         dst_strd8 = dst_strd << 3;
    162 
    163         do
    164         {
    165             y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
    166             y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
    167             y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd2));
    168             y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd3));
    169             y_4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd4));
    170             y_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd5));
    171             y_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd6));
    172             y_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd7));
    173 
    174             _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b);
    175             _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
    176             _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), y_2_16x8b);
    177             _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), y_3_16x8b);
    178             _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd4), y_4_16x8b);
    179             _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd5), y_5_16x8b);
    180             _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd6), y_6_16x8b);
    181             _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd7), y_7_16x8b);
    182 
    183             ht -= 8;
    184             pu1_src += src_strd8;
    185             pu1_dst += dst_strd8;
    186         }
    187         while(ht > 0);
    188     }
    189 }
    190 
    191 /*****************************************************************************/
    192 /*                                                                           */
    193 /*  Function Name : ih264_inter_pred_luma_horz_ssse3                         */
    194 /*                                                                           */
    195 /*  Description   : This function applies a horizontal 6-tap filter on       */
    196 /*                  ht x wd block as mentioned in sec. 8.4.2.2.1 titled      */
    197 /*                  "Luma sample interpolation process". (ht,wd) can be      */
    198 /*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
    199 /*                                                                           */
    200 /*  Inputs        : puc_src  - pointer to source                             */
    201 /*                  puc_dst  - pointer to destination                        */
    202 /*                  src_strd - stride for source                             */
    203 /*                  dst_strd - stride for destination                        */
    204 /*                  ht       - height of the block                           */
    205 /*                  wd       - width of the block                            */
    206 /*                                                                           */
    207 /*  Issues        : None                                                     */
    208 /*                                                                           */
    209 /*  Revision History:                                                        */
    210 /*                                                                           */
    211 /*         DD MM YYYY   Author(s)       Changes                              */
    212 /*         13 02 2015   Kaushik         Initial Version                      */
    213 /*                      Senthoor                                             */
    214 /*                                                                           */
    215 /*****************************************************************************/
void ih264_inter_pred_luma_horz_ssse3(UWORD8 *pu1_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 ht,
                                      WORD32 wd,
                                      UWORD8* pu1_tmp,
                                      WORD32 dydx)
{
    /* Horizontal half-pel 6-tap filter (H.264 sec. 8.4.2.2.1):
       out[x] = clip255((in[x-2] - 5*in[x-1] + 20*in[x] + 20*in[x+1]
                         - 5*in[x+2] + in[x+3] + 16) >> 5)
       Each output is built from three PMADDUBSW multiplies (unsigned source
       bytes x signed coefficient bytes), one per coefficient pair.
       pu1_tmp and dydx belong to the common prototype and are unused. */
    __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
    __m128i const_val16_8x16b;

    UNUSED(pu1_tmp);
    UNUSED(dydx);

    pu1_src -= 2; // the filter input starts from x[-2] (till x[3])

    /* Coefficient pairs replicated across the register; 0x01 = 1, 0xFB = -5,
       0x14 = 20 (signed bytes, as required by _mm_maddubs_epi16). */
    coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
    coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
    coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
                                                 //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
    const_val16_8x16b = _mm_set1_epi16(16);      // rounding offset added before >> 5

    if(wd == 4)
    {
        /* Two rows per iteration; the 4 outputs of each row share one
           half of a 128-bit register (row0 in low 64 bits, row1 in high). */
        __m128i src_r0_16x8b, src_r1_16x8b, src_r0r1_16x8b;
        __m128i src_r0_sht_16x8b, src_r1_sht_16x8b;

        __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
        __m128i res_r0r1_16x8b;

        //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
        //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....

        do
        {
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                     //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));        //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                     //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                     //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

            // Interleave row with its 1-shifted copy to form adjacent pixel pairs.
            src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);       //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
            src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);       //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);        //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4
            res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);  //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                    //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                         //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8  0  0  0  0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                         //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8  0  0  0  0

            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);        //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6
            res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff2_3_16x8b);  //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                    //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                         //a4 a5 a5 a6 a6 a7 a7 a8  0  0  0  0  0  0  0  0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                         //b4 b5 b5 b6 b6 b7 b7 b8  0  0  0  0  0  0  0  0

            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);        //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8
            res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff4_5_16x8b);  //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
                                                                                    //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5

            // Sum the three partial products and the rounding constant:
            // lane k of the low half  = a(k)*c0 + a(k+1)*c1 + ... + a(k+5)*c5 + 16,
            // lane k of the high half = b(k)*c0 + b(k+1)*c1 + ... + b(k+5)*c5 + 16.
            res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b);
            res_r0r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0r1_t3_8x16b);
            res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t3_8x16b);

            res_r0r1_t1_8x16b = _mm_srai_epi16(res_r0r1_t1_8x16b, 5);                //shifting right by 5 bits.

            // packus clips to [0, 255] as the standard requires.
            res_r0r1_16x8b = _mm_packus_epi16(res_r0r1_t1_8x16b, res_r0r1_t1_8x16b);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_r0r1_16x8b);
            res_r0r1_16x8b = _mm_srli_si128(res_r0r1_16x8b, 4);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_r0r1_16x8b);

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        /* Two rows per iteration, 8 outputs per row in a full register each. */
        __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
        __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;

        __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
        __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;

        //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
        //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....

        do
        {
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                   //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));      //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                   //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                   //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

            res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                  //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
            res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
                                                                                  //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                       //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                       //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);               //a3 a4 a5 a6 a7 a8 a9....a15 0  0  0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);               //b3 b4 b5 b6 b7 b8 b9....b15 0  0  0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10

            res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                  //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
            res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
                                                                                  //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                       //a4 a5 a6 a7 a8 a9....a15 0  0  0  0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                       //b4 b5 b6 b7 b8 b9....b15 0  0  0  0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);               //a5 a6 a7 a8 a9....a15 0  0  0  0  0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);               //b5 b6 b7 b8 b9....b15 0  0  0  0  0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12

            res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
                                                                                  //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
            res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
                                                                                  //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
            // Combine partial sums with rounding, then >> 5 and clip via packus.
            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
            res_r0_t3_8x16b = _mm_add_epi16(res_r0_t3_8x16b, const_val16_8x16b);
            res_r1_t3_8x16b = _mm_add_epi16(res_r1_t3_8x16b, const_val16_8x16b);
            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);

            res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);                 //shifting right by 5 bits.
            res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5);

            src_r0_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r0_t1_8x16b);
            src_r1_16x8b = _mm_packus_epi16(res_r1_t1_8x16b, res_r1_t1_8x16b);

            _mm_storel_epi64((__m128i *)pu1_dst, src_r0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), src_r1_16x8b);

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        /* One row of 16 outputs per iteration, processed as two 8-wide
           halves of the same row (the second load starts at column 8). */
        __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
        __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;

        __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
        __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;

        //Row0        : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
        //Row0, col 8 :                         b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
        //b0 is same as a8. Similarly other bn pixels are same as a(n+8) pixels.

        do
        {
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                  //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));            //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                   //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                   //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

            res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                  //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
            res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
                                                                                  //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                       //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                       //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);               //a3 a4 a5 a6 a7 a8 a9....a15 0  0  0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);               //b3 b4 b5 b6 b7 b8 b9....b15 0  0  0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10

            res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                  //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
            res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
                                                                                  //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                       //a4 a5 a6 a7 a8 a9....a15 0  0  0  0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                       //b4 b5 b6 b7 b8 b9....b15 0  0  0  0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);               //a5 a6 a7 a8 a9....a15 0  0  0  0  0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);               //b5 b6 b7 b8 b9....b15 0  0  0  0  0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);  //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);  //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12

            res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
                                                                                  //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
            res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
                                                                                  //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
            // Combine partial sums with rounding, then >> 5 and clip via packus.
            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
            res_r0_t3_8x16b = _mm_add_epi16(res_r0_t3_8x16b, const_val16_8x16b);
            res_r1_t3_8x16b = _mm_add_epi16(res_r1_t3_8x16b, const_val16_8x16b);
            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);

            res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);                 //shifting right by 5 bits.
            res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5);

            // Pack both 8-wide halves into one 16-byte row and store.
            src_r0_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r1_t1_8x16b);
            _mm_storeu_si128((__m128i *)pu1_dst, src_r0_16x8b);

            ht--;
            pu1_src += src_strd;
            pu1_dst += dst_strd;
        }
        while(ht > 0);
    }
}
    455 
/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_inter_pred_luma_vert_ssse3                         */
/*                                                                           */
/*  Description   : This function applies a vertical 6-tap filter on         */
/*                  ht x wd block as mentioned in sec. 8.4.2.2.1 titled      */
/*                  "Luma sample interpolation process". (ht,wd) can be      */
/*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to source                             */
/*                  pu1_dst  - pointer to destination                        */
/*                  src_strd - stride for source                             */
/*                  dst_strd - stride for destination                        */
/*                  ht       - height of the block                           */
/*                  wd       - width of the block                            */
/*                  pu1_tmp  - unused; kept for a uniform filter signature   */
/*                  dydx     - unused; kept for a uniform filter signature   */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         13 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
void ih264_inter_pred_luma_vert_ssse3(UWORD8 *pu1_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 ht,
                                      WORD32 wd,
                                      UWORD8* pu1_tmp,
                                      WORD32 dydx)
{
    // Vertical half-pel 6-tap filter (taps 1,-5,20,20,-5,1). Seven source
    // rows are live at any time: the filter needs rows [-2..+3] around each
    // output row, and every loop iteration emits two output rows.
    __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b;
    __m128i src_r5_16x8b, src_r6_16x8b;
    __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;

    __m128i res_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;

    __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
    __m128i const_val16_8x16b;

    UNUSED(pu1_tmp); // only the two-stage (horz+vert) variants need scratch
    UNUSED(dydx);    // quarter-pel offset is irrelevant for the pure vertical filter

    // Tap pairs packed as signed bytes so one pmaddubsw applies two taps.
    coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
    coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
    coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
                                                 //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
    const_val16_8x16b = _mm_set1_epi16(16);      // rounding offset applied before >> 5

    pu1_src -= src_strd << 1; // the filter input starts from x[-2] (till x[3])

    if(wd == 4)
    {
        //Prologue: Load all the pred rows except sixth and seventh row
        //          for the first and second row processing.
        src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r2_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r3_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r4_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;

        // Pair each 4-byte row with its successor (rowN | rowN+1 in one
        // register) so a later byte interleave lines samples up with the
        // (c[k], c[k+1]) coefficient pairs.
        src_r0_16x8b = _mm_unpacklo_epi32(src_r0_16x8b, src_r1_16x8b);
        src_r1_16x8b = _mm_unpacklo_epi32(src_r1_16x8b, src_r2_16x8b);
        src_r2_16x8b = _mm_unpacklo_epi32(src_r2_16x8b, src_r3_16x8b);
        src_r3_16x8b = _mm_unpacklo_epi32(src_r3_16x8b, src_r4_16x8b);

        do
        {
            // Fetch the two new rows needed for this pair of output rows.
            src_r5_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

            src_r4_16x8b = _mm_unpacklo_epi32(src_r4_16x8b, src_r5_16x8b);
            src_r5_16x8b = _mm_unpacklo_epi32(src_r5_16x8b, src_r6_16x8b);

            // Byte-interleave the paired rows: samples from row n and row
            // n+1 alternate, matching the alternating tap pairs above.
            src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
            src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
            src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);

            // Multiply-accumulate two taps per pmaddubsw, then sum the
            // three partial results and the rounding constant.
            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);

            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
            res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);

            // Low 8 bytes now hold two 4-pixel output rows; store 4 bytes
            // to each destination row.
            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_16x8b);
            res_16x8b = _mm_srli_si128(res_16x8b, 4);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_16x8b);

            // Slide the seven-row window down by two rows.
            src_r0_16x8b = src_r2_16x8b;
            src_r1_16x8b = src_r3_16x8b;
            src_r2_16x8b = src_r4_16x8b;
            src_r3_16x8b = src_r5_16x8b;
            src_r4_16x8b = src_r6_16x8b;

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }

    else if(wd == 8)
    {
        //Prologue: Load all the pred rows except sixth and seventh row
        //          for the first and second row processing.
        src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r2_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r3_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r4_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
        pu1_src += src_strd;

        // 8-byte rows: pair rowN with rowN+1 in the two 64-bit halves.
        src_r0_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);
        src_r1_16x8b = _mm_unpacklo_epi64(src_r1_16x8b, src_r2_16x8b);
        src_r2_16x8b = _mm_unpacklo_epi64(src_r2_16x8b, src_r3_16x8b);
        src_r3_16x8b = _mm_unpacklo_epi64(src_r3_16x8b, src_r4_16x8b);

        do
        {
            src_r5_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

            src_r4_16x8b = _mm_unpacklo_epi64(src_r4_16x8b, src_r5_16x8b);
            src_r5_16x8b = _mm_unpacklo_epi64(src_r5_16x8b, src_r6_16x8b);

            // First output row: interleave the low halves (row n samples
            // with row n+1 samples) and filter.
            src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
            src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
            src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);

            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
            res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);

            _mm_storel_epi64((__m128i *)pu1_dst, res_16x8b);

            // Second output row: same filter on the high halves (rows
            // shifted down by one).
            src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
            src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
            src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);

            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
            res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);

            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b);

            // Slide the seven-row window down by two rows.
            src_r0_16x8b = src_r2_16x8b;
            src_r1_16x8b = src_r3_16x8b;
            src_r2_16x8b = src_r4_16x8b;
            src_r3_16x8b = src_r5_16x8b;
            src_r4_16x8b = src_r6_16x8b;

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        __m128i res_t0_8x16b;

        //Prologue: Load all the pred rows except sixth and seventh row
        //          for the first and second row processing.
        src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r1_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r2_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r3_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;
        src_r4_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
        pu1_src += src_strd;

        do
        {
            src_r5_16x8b  = _mm_loadu_si128((__m128i *)pu1_src);
            src_r6_16x8b  = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));

            // First output row, left 8 pixels: interleave rows 0..5
            // pairwise and filter.
            src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
            src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
            src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
            res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.

            // First output row, right 8 pixels.
            src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
            src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
            src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.

            res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);

            // Second output row, left 8 pixels: same filter over rows 1..6.
            src_r0r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r2_16x8b);
            src_r2r3_16x8b = _mm_unpacklo_epi8(src_r3_16x8b, src_r4_16x8b);
            src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
            res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.

            // Second output row, right 8 pixels.
            src_r0r1_16x8b = _mm_unpackhi_epi8(src_r1_16x8b, src_r2_16x8b);
            src_r2r3_16x8b = _mm_unpackhi_epi8(src_r3_16x8b, src_r4_16x8b);
            src_r4r5_16x8b = _mm_unpackhi_epi8(src_r5_16x8b, src_r6_16x8b);

            res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
            res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
            res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);

            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
            res_t3_8x16b = _mm_add_epi16(res_t3_8x16b, const_val16_8x16b);
            res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
            res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.

            res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);

            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res_16x8b);

            // Slide the seven-row window down by two rows.
            src_r0_16x8b = src_r2_16x8b;
            src_r1_16x8b = src_r3_16x8b;
            src_r2_16x8b = src_r4_16x8b;
            src_r3_16x8b = src_r5_16x8b;
            src_r4_16x8b = src_r6_16x8b;

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
}
    739 
/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_inter_pred_luma_horz_hpel_vert_hpel_ssse3          */
/*                                                                           */
/*  Description   : This function implements a two stage cascaded six tap    */
/*                  filter, horizontally and then vertically on ht x wd      */
/*                  block as mentioned in sec. 8.4.2.2.1 titled "Luma sample */
/*                  interpolation process". (ht,wd) can be (4,4), (8,4),     */
/*                  (4,8), (8,8), (16,8), (8,16) or (16,16).                 */
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to source                             */
/*                  pu1_dst  - pointer to destination                        */
/*                  src_strd - stride for source                             */
/*                  dst_strd - stride for destination                        */
/*                  ht       - height of the block                           */
/*                  wd       - width of the block                            */
/*                  pu1_tmp  - pointer to temporary buffer for the           */
/*                             intermediate horizontal filter output         */
/*                  dydx     - unused; kept for a uniform filter signature   */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes                              */
/*         13 02 2015   Kaushik         Initial Version                      */
/*                      Senthoor                                             */
/*                                                                           */
/*****************************************************************************/
    767 void ih264_inter_pred_luma_horz_hpel_vert_hpel_ssse3(UWORD8 *pu1_src,
    768                                                      UWORD8 *pu1_dst,
    769                                                      WORD32 src_strd,
    770                                                      WORD32 dst_strd,
    771                                                      WORD32 ht,
    772                                                      WORD32 wd,
    773                                                      UWORD8* pu1_tmp,
    774                                                      WORD32 dydx)
    775 {
    776     UNUSED(dydx);
    777 
    778     if(wd == 4)
    779     {
    780         WORD16 *pi2_temp;
    781 
    782         pu1_tmp += 4;
    783         pu1_src -= src_strd << 1;
    784         pi2_temp = (WORD16 *)pu1_tmp;
    785         pu1_src -= 2; // the filter input starts from x[-2] (till x[3])
    786 
    787         // Horizontal 6-tap filtering
    788         {
    789             WORD32 ht_tmp = ht + 4;
    790 
    791             __m128i src_r0_16x8b, src_r1_16x8b;
    792             __m128i src_r0_sht_16x8b, src_r1_sht_16x8b;
    793             __m128i src_r0r1_t1_16x8b;
    794             __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
    795             __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
    796 
    797             coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
    798             coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
    799             coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5
    800                                                           //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
    801             //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
    802             //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
    803 
    804             do
    805             {
    806                 src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                       //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
    807                 src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));          //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
    808 
    809                 src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                       //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
    810                 src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                       //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
    811 
    812                 src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);         //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
    813                 src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);         //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
    814 
    815                 src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);       //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4
    816                 res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
    817                                                                                           //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
    818 
    819                 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                           //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8  0  0  0  0
    820                 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                           //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8  0  0  0  0
    821 
    822                 src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);       //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6
    823                 res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
    824                                                                                           //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
    825 
    826                 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                           //a4 a5 a5 a6 a6 a7 a7 a8  0  0  0  0  0  0  0  0
    827                 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                           //b4 b5 b5 b6 b6 b7 b7 b8  0  0  0  0  0  0  0  0
    828 
    829                 src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);       //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8
    830                 res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
    831                                                                                           //b4*c4+b5*c5 b5*c4+b6*c5 b4*c6+b7*c5 b7*c4+b8*c5
    832                 res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b);
    833                 res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t3_8x16b, res_r0r1_t1_8x16b);
    834 
    835                 _mm_storeu_si128((__m128i *)pi2_temp, res_r0r1_t1_8x16b);
    836 
    837                 ht_tmp -= 2;
    838                 pu1_src += src_strd << 1;
    839                 pi2_temp += 8;
    840             }
    841             while(ht_tmp > 0);
    842 
    843             src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                           //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
    844             src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                           //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
    845 
    846             src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);             //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
    847             res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0_16x8b, coeff0_1_16x8b);          //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
    848 
    849             src_r0_16x8b = _mm_srli_si128(src_r0_16x8b,4);                                //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8  0  0  0  0
    850             res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0_16x8b, coeff2_3_16x8b);          //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
    851 
    852             src_r0_16x8b = _mm_srli_si128(src_r0_16x8b,4);                                //a4 a5 a5 a6 a6 a7 a7 a8  0  0  0  0  0  0  0  0
    853             res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0_16x8b, coeff4_5_16x8b);          //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
    854 
    855             res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b);
    856             res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t3_8x16b, res_r0r1_t1_8x16b);
    857 
    858             _mm_storel_epi64((__m128i *)pi2_temp, res_r0r1_t1_8x16b);
    859         }
    860 
    861         pi2_temp = (WORD16 *)pu1_tmp;
    862 
    863         // Vertical 6-tap filtering
    864         {
    865             __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b,
    866                             src_r4_8x16b;
    867             __m128i src_r5_8x16b, src_r6_8x16b;
    868             __m128i src_t1_8x16b, src_t2_8x16b;
    869 
    870             __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
    871             __m128i res_8x16b, res_16x8b;
    872 
    873             __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
    874             __m128i const_val512_4x32b;
    875 
    876             coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
    877             coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
    878             coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);
    879 
    880             const_val512_4x32b = _mm_set1_epi32(512);
    881 
    882             src_r0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp));
    883             src_r1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 4));
    884             src_r2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 8));
    885             src_r3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 12));
    886             src_r4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 16));
    887             pi2_temp += 20;
    888 
    889             do
    890             {
    891                 src_r5_8x16b = _mm_loadl_epi64((__m128i *)pi2_temp);
    892                 src_r6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp + 4));
    893 
    894                 src_r0_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
    895                 src_t1_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
    896                 src_t2_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
    897 
    898                 res_t1_4x32b = _mm_madd_epi16(src_r0_8x16b, coeff0_1_8x16b);
    899                 res_t2_4x32b = _mm_madd_epi16(src_t1_8x16b, coeff2_3_8x16b);
    900                 res_t3_4x32b = _mm_madd_epi16(src_t2_8x16b, coeff4_5_8x16b);
    901 
    902                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
    903                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
    904                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
    905 
    906                 res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
    907 
    908                 src_r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
    909                 src_t1_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
    910                 src_t2_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);
    911 
    912                 res_t1_4x32b = _mm_madd_epi16(src_r1_8x16b, coeff0_1_8x16b);
    913                 res_t2_4x32b = _mm_madd_epi16(src_t1_8x16b, coeff2_3_8x16b);
    914                 res_t3_4x32b = _mm_madd_epi16(src_t2_8x16b, coeff4_5_8x16b);
    915 
    916                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
    917                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
    918                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
    919 
    920                 res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
    921 
    922                 res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
    923                 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
    924 
    925                 *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_16x8b);
    926                 res_16x8b = _mm_srli_si128(res_16x8b, 4);
    927                 *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_16x8b);
    928 
    929                 src_r0_8x16b = src_r2_8x16b;
    930                 src_r1_8x16b = src_r3_8x16b;
    931                 src_r2_8x16b = src_r4_8x16b;
    932                 src_r3_8x16b = src_r5_8x16b;
    933                 src_r4_8x16b = src_r6_8x16b;
    934 
    935                 ht -= 2;
    936                 pi2_temp += 8;
    937                 pu1_dst += dst_strd << 1;
    938             }
    939             while(ht > 0);
    940         }
    941     }
    942     else if(wd == 8)
    943     {
    944         WORD16 *pi2_temp;
    945 
    946         pu1_tmp += 4;
    947         pu1_src -= src_strd << 1;
    948         pi2_temp = (WORD16 *)pu1_tmp;
    949         pu1_src -= 2; // the filter input starts from x[-2] (till x[3])
    950 
    951         // Horizontal 6-tap filtering
    952         {
    953             WORD32 ht_tmp = ht + 4;
    954 
    955             __m128i src_r0_16x8b, src_r1_16x8b;
    956             __m128i src_r0_sht_16x8b, src_r1_sht_16x8b;
    957             __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
    958             __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
    959             __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
    960             __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
    961 
    962             coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
    963             coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
    964             coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5
    965                                                           //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
    966             //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
    967             //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
    968 
    969             do
    970             {
    971                 src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                      //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15
    972                 src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));         //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15
    973 
    974                 src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                      //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
    975                 src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                      //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
    976 
    977                 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
    978                 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
    979 
    980                 res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b);    //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
    981                                                                                          //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
    982                 res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b);    //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
    983                                                                                          //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
    984 
    985                 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                          //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
    986                 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                          //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
    987 
    988                 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                  //a3 a4 a5 a6 a7 a8 a9....a15 0  0  0
    989                 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                  //b3 b4 b5 b6 b7 b8 b9....b15 0  0  0
    990 
    991                 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
    992                 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10
    993 
    994                 res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b);    //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
    995                                                                                          //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
    996                 res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b);    //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
    997                                                                                          //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
    998 
    999                 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                          //a4 a5 a6 a7 a8 a9....a15 0  0  0  0
   1000                 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                          //b4 b5 b6 b7 b8 b9....b15 0  0  0  0
   1001 
   1002                 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                  //a5 a6 a7 a8 a9....a15 0  0  0  0  0
   1003                 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                  //b5 b6 b7 b8 b9....b15 0  0  0  0  0
   1004 
   1005                 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
   1006                 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
   1007 
   1008                 res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b);    //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
   1009                                                                                          //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
   1010                 res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b);    //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
   1011                                                                                          //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
   1012                 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
   1013                 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
   1014 
   1015                 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
   1016                 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);
   1017 
   1018                 _mm_storeu_si128((__m128i *)pi2_temp, res_r0_t1_8x16b);
   1019                 _mm_storeu_si128((__m128i *)(pi2_temp + 8), res_r1_t1_8x16b);
   1020 
   1021                 ht_tmp -= 2;
   1022                 pu1_src += src_strd << 1;
   1023                 pi2_temp += 16;
   1024             }
   1025             while(ht_tmp > 0);
   1026 
   1027             src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                          //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15
   1028             src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                          //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
   1029 
   1030             src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b,src_r0_sht_16x8b);          //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
   1031             res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b,coeff0_1_16x8b);         //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
   1032                                                                                          //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
   1033 
   1034             src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                              //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
   1035             src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                      //a3 a4 a5 a6 a7 a8 a9....a15 0  0  0
   1036 
   1037             src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);         //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
   1038             res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b);        //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
   1039                                                                                          //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
   1040 
   1041             src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                              //a4 a5 a6 a7 a8 a9....a15 0  0  0  0
   1042             src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                      //a5 a6 a7 a8 a9....a15 0  0  0  0  0
   1043 
   1044             src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);         //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
   1045             res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b);        //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
   1046                                                                                          //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
   1047             res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
   1048             res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
   1049 
   1050             _mm_storeu_si128((__m128i *)pi2_temp, res_r0_t1_8x16b);
   1051         }
   1052 
   1053         pi2_temp = (WORD16 *)pu1_tmp;
   1054 
   1055         // Vertical 6-tap filtering
   1056         {
   1057             __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b,
   1058                             src_r4_8x16b;
   1059             __m128i src_r5_8x16b, src_r6_8x16b;
   1060             __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b;
   1061 
   1062             __m128i res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
   1063             __m128i res_c0_4x32b, res_c1_4x32b;
   1064             __m128i res_8x16b, res_16x8b;
   1065 
   1066             __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
   1067             __m128i const_val512_4x32b;
   1068 
   1069             coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
   1070             coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
   1071             coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);
   1072 
   1073             const_val512_4x32b = _mm_set1_epi32(512);
   1074 
   1075             src_r0_8x16b = _mm_loadu_si128((__m128i *)pi2_temp);
   1076             src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 8));
   1077             src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 16));
   1078             src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 24));
   1079             src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 32));
   1080             pi2_temp += 40;
   1081 
   1082             do
   1083             {
   1084                 src_r5_8x16b = _mm_loadu_si128((__m128i *)pi2_temp);
   1085                 src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 8));
   1086 
   1087                 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
   1088                 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
   1089                 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
   1090 
   1091                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   1092                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   1093                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   1094 
   1095                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   1096                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   1097                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   1098                 res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   1099 
   1100                 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
   1101                 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
   1102                 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);
   1103 
   1104                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   1105                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   1106                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   1107 
   1108                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   1109                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   1110                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   1111                 res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   1112 
   1113                 res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b);
   1114                 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
   1115 
   1116                 _mm_storel_epi64((__m128i *)pu1_dst, res_16x8b);
   1117 
   1118                 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
   1119                 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
   1120                 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);
   1121 
   1122                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   1123                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   1124                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   1125 
   1126                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   1127                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   1128                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   1129                 res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   1130 
   1131                 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b);
   1132                 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b);
   1133                 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b);
   1134 
   1135                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   1136                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   1137                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   1138 
   1139                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   1140                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   1141                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   1142                 res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   1143 
   1144                 res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b);
   1145                 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
   1146 
   1147                 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b);
   1148 
   1149                 src_r0_8x16b = src_r2_8x16b;
   1150                 src_r1_8x16b = src_r3_8x16b;
   1151                 src_r2_8x16b = src_r4_8x16b;
   1152                 src_r3_8x16b = src_r5_8x16b;
   1153                 src_r4_8x16b = src_r6_8x16b;
   1154 
   1155                 ht -= 2;
   1156                 pi2_temp += 16;
   1157                 pu1_dst += dst_strd << 1;
   1158             }
   1159             while(ht > 0);
   1160         }
   1161     }
   1162     else // wd == 16
   1163     {
   1164         WORD16 *pi2_temp;
   1165         WORD32 ht_tmp;
   1166 
   1167         pu1_tmp += 4;
   1168         pu1_src -= src_strd << 1;
   1169         pi2_temp = (WORD16 *)pu1_tmp;
   1170         pu1_src -= 2; // the filter input starts from x[-2] (till x[3])
   1171 
   1172         // Horizontal 6-tap filtering
   1173         {
   1174             __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
   1175             __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
   1176 
   1177             __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
   1178             __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
   1179 
   1180             __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
   1181 
   1182             ht_tmp = ht + 5;
   1183 
   1184             coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
   1185             coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
   1186             coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
   1187                                                           //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
   1188             //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
   1189             //Row0 :                         b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
   1190             //b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels.
   1191 
   1192             do
   1193             {
   1194                 src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                      //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
   1195                 src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));                //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
   1196 
   1197                 src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                      //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
   1198                 src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                      //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
   1199 
   1200                 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
   1201                 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
   1202 
   1203                 res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b);    //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
   1204                                                                                          //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
   1205                 res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b);    //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
   1206                                                                                          //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
   1207 
   1208                 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                          //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
   1209                 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                          //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
   1210 
   1211                 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                  //a3 a4 a5 a6 a7 a8 a9....a15 0  0  0
   1212                 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                  //b3 b4 b5 b6 b7 b8 b9....b15 0  0  0
   1213 
   1214                 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
   1215                 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10
   1216 
   1217                 res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b);    //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
   1218                                                                                          //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
   1219                 res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b);    //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
   1220                                                                                          //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
   1221 
   1222                 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                          //a4 a5 a6 a7 a8 a9....a15 0  0  0  0
   1223                 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                          //b4 b5 b6 b7 b8 b9....b15 0  0  0  0
   1224 
   1225                 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                  //a5 a6 a7 a8 a9....a15 0  0  0  0  0
   1226                 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                  //b5 b6 b7 b8 b9....b15 0  0  0  0  0
   1227 
   1228                 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
   1229                 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
   1230 
   1231                 res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b);    //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
   1232                                                                                          //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
   1233                 res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b);    //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
   1234                                                                                          //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
   1235                 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
   1236                 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
   1237 
   1238                 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
   1239                 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);
   1240 
   1241                 _mm_storeu_si128((__m128i *)pi2_temp, res_r0_t1_8x16b);
   1242                 _mm_storeu_si128((__m128i *)(pi2_temp + 8), res_r1_t1_8x16b);
   1243 
   1244                 ht_tmp--;
   1245                 pu1_src += src_strd;
   1246                 pi2_temp += 16;
   1247             }
   1248             while(ht_tmp > 0);
   1249         }
   1250 
   1251         pi2_temp = (WORD16 *)pu1_tmp;
   1252 
   1253         // Vertical 6-tap filtering
   1254         {
   1255             WORD16 *pi2_temp2;
   1256             UWORD8 *pu1_dst2;
   1257             WORD32 ht_tmp;
   1258 
   1259             __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b, src_r4_8x16b;
   1260             __m128i src_r5_8x16b, src_r6_8x16b;
   1261             __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b;
   1262 
   1263             __m128i res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
   1264             __m128i res_c0_4x32b, res_c1_4x32b;
   1265             __m128i res_8x16b, res_16x8b;
   1266 
   1267             __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
   1268             __m128i const_val512_4x32b;
   1269 
   1270             coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
   1271             coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
   1272             coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);
   1273 
   1274             const_val512_4x32b = _mm_set1_epi32(512);
   1275 
   1276             pi2_temp2 = pi2_temp + 8;
   1277             pu1_dst2 = pu1_dst + 8;
   1278             ht_tmp = ht;
   1279 
   1280             /**********************************************************/
   1281             /*     Do first height x 8 block                          */
   1282             /**********************************************************/
   1283             src_r0_8x16b = _mm_loadu_si128((__m128i *)pi2_temp);
   1284             src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 16));
   1285             src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 32));
   1286             src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 48));
   1287             src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 64));
   1288             pi2_temp += 80;
   1289 
   1290             do
   1291             {
   1292                 src_r5_8x16b = _mm_loadu_si128((__m128i *)pi2_temp);
   1293                 src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp + 16));
   1294 
   1295                 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
   1296                 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
   1297                 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
   1298 
   1299                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   1300                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   1301                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   1302 
   1303                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   1304                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   1305                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   1306                 res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   1307 
   1308                 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
   1309                 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
   1310                 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);
   1311 
   1312                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   1313                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   1314                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   1315 
   1316                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   1317                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   1318                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   1319                 res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   1320 
   1321                 res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b);
   1322                 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
   1323 
   1324                 _mm_storel_epi64((__m128i *)pu1_dst, res_16x8b);
   1325 
   1326                 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
   1327                 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
   1328                 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);
   1329 
   1330                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   1331                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   1332                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   1333 
   1334                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   1335                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   1336                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   1337                 res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   1338 
   1339                 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b);
   1340                 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b);
   1341                 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b);
   1342 
   1343                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   1344                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   1345                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   1346 
   1347                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   1348                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   1349                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   1350                 res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   1351 
   1352                 res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b);
   1353                 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
   1354 
   1355                 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b);
   1356 
   1357                 src_r0_8x16b = src_r2_8x16b;
   1358                 src_r1_8x16b = src_r3_8x16b;
   1359                 src_r2_8x16b = src_r4_8x16b;
   1360                 src_r3_8x16b = src_r5_8x16b;
   1361                 src_r4_8x16b = src_r6_8x16b;
   1362 
   1363                 ht_tmp -= 2;
   1364                 pi2_temp += 32;
   1365                 pu1_dst += dst_strd << 1;
   1366             }
   1367             while(ht_tmp > 0);
   1368 
   1369             /**********************************************************/
   1370             /*     Do second ht x 8 block                          */
   1371             /**********************************************************/
   1372             src_r0_8x16b = _mm_loadu_si128((__m128i *)pi2_temp2);
   1373             src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 16));
   1374             src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 32));
   1375             src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 48));
   1376             src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 64));
   1377             pi2_temp2 += 80;
   1378 
   1379             do
   1380             {
   1381                 src_r5_8x16b = _mm_loadu_si128((__m128i *)pi2_temp2);
   1382                 src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 16));
   1383 
   1384                 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
   1385                 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
   1386                 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
   1387 
   1388                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   1389                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   1390                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   1391 
   1392                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   1393                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   1394                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   1395                 res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   1396 
   1397                 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
   1398                 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
   1399                 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);
   1400 
   1401                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   1402                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   1403                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   1404 
   1405                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   1406                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   1407                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   1408                 res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   1409 
   1410                 res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b);
   1411                 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
   1412 
   1413                 _mm_storel_epi64((__m128i *)pu1_dst2, res_16x8b);
   1414 
   1415                 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
   1416                 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
   1417                 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);
   1418 
   1419                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   1420                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   1421                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   1422 
   1423                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   1424                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   1425                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   1426                 res_c0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   1427 
   1428                 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b);
   1429                 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b);
   1430                 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b);
   1431 
   1432                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   1433                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   1434                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   1435 
   1436                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   1437                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   1438                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   1439                 res_c1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   1440 
   1441                 res_8x16b = _mm_packs_epi32(res_c0_4x32b, res_c1_4x32b);
   1442                 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
   1443 
   1444                 _mm_storel_epi64((__m128i *)(pu1_dst2 + dst_strd), res_16x8b);
   1445 
   1446                 src_r0_8x16b = src_r2_8x16b;
   1447                 src_r1_8x16b = src_r3_8x16b;
   1448                 src_r2_8x16b = src_r4_8x16b;
   1449                 src_r3_8x16b = src_r5_8x16b;
   1450                 src_r4_8x16b = src_r6_8x16b;
   1451 
   1452                 ht -= 2;
   1453                 pi2_temp2 += 32;
   1454                 pu1_dst2 += dst_strd << 1;
   1455             }
   1456             while(ht > 0);
   1457         }
   1458     }
   1459 }
   1460 
   1461 /*****************************************************************************/
   1462 /*                                                                           */
   1463 /*  Function Name : ih264_inter_pred_luma_horz_qpel_ssse3                    */
   1464 /*                                                                           */
   1465 /*  Description   : This function implements a six-tap filter horizontally   */
   1466 /*                  on ht x wd block and averages the values with the source */
   1467 /*                  pixels to calculate horizontal quarter-pel as mentioned  */
   1468 /*                  in sec. 8.4.2.2.1 titled "Luma sample interpolation      */
   1469 /*                  process". (ht,wd) can be (4,4), (8,4), (4,8), (8,8),     */
   1470 /*                  (16,8), (8,16) or (16,16).                               */
   1471 /*                                                                           */
/*  Inputs        : pu1_src  - pointer to source                             */
/*                  pu1_dst  - pointer to destination                        */
   1474 /*                  src_strd - stride for source                             */
   1475 /*                  dst_strd - stride for destination                        */
   1476 /*                  ht       - height of the block                           */
   1477 /*                  wd       - width of the block                            */
   1478 /*                  pu1_tmp  - pointer to temporary buffer                   */
   1479 /*                  dydx     - x and y reference offset for q-pel            */
   1480 /*                             calculations                                  */
   1481 /*                                                                           */
   1482 /*  Issues        : None                                                     */
   1483 /*                                                                           */
   1484 /*  Revision History:                                                        */
   1485 /*                                                                           */
   1486 /*         DD MM YYYY   Author(s)       Changes                              */
   1487 /*         13 02 2015   Kaushik         Initial Version                      */
   1488 /*                      Senthoor                                             */
   1489 /*                                                                           */
   1490 /*****************************************************************************/
void ih264_inter_pred_luma_horz_qpel_ssse3(UWORD8 *pu1_src,
                                           UWORD8 *pu1_dst,
                                           WORD32 src_strd,
                                           WORD32 dst_strd,
                                           WORD32 ht,
                                           WORD32 wd,
                                           UWORD8* pu1_tmp,
                                           WORD32 dydx)
{
    WORD32 x_offset;
    UWORD8 *pu1_pred1;

    __m128i src_r0_16x8b, src_r1_16x8b;
    __m128i src_r0_sht_16x8b, src_r1_sht_16x8b;
    __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
    __m128i const_val16_8x16b;

    UNUSED(pu1_tmp);

    // Only the horizontal fraction (low 2 bits of dydx) matters here.
    x_offset = dydx & 3;

    // 6-tap filter coefficients (1, -5, 20, 20, -5, 1), packed in signed
    // bytes for _mm_maddubs_epi16 (0xFB == -5 as a signed byte).
    coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
    coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
    coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
                                                 //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
    // Full-pel neighbour to average with the half-pel filter output:
    // pu1_src for x_offset == 1, pu1_src + 1 for x_offset == 3.
    pu1_pred1 = pu1_src + (x_offset >> 1);

    // Rounding constant for the (val + 16) >> 5 normalization.
    const_val16_8x16b = _mm_set1_epi16(16);

    pu1_src -= 2; // the filter input starts from x[-2] (till x[3])

    if(wd == 4)
    {
        __m128i src_r0r1_16x8b;

        __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
        __m128i res_r0r1_16x8b;

        //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
        //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
        // Two 4-pixel rows are processed per iteration, packed side by side
        // in one register; ht is assumed to be even.

        do
        {
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                         //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));            //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                         //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                         //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

            src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);           //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
            src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);           //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);            //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4
            res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);      //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                        //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                             //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8  0  0  0  0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                             //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8  0  0  0  0

            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);            //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6
            res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff2_3_16x8b);      //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                        //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                             //a4 a5 a5 a6 a6 a7 a7 a8  0  0  0  0  0  0  0  0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                             //b4 b5 b5 b6 b6 b7 b7 b8  0  0  0  0  0  0  0  0

            src_r0r1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);            //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8
            res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff4_5_16x8b);      //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
                                                                                        //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
            // Full-pel reference pixels for the q-pel average.
            src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_pred1);
            src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred1 + src_strd));

            res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b);
            res_r0r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0r1_t3_8x16b);
            res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t3_8x16b);    //a0*c0+a1*c1+a2*c2+a3*c3+a4*c4+a5*c5 + 16;
                                                                                        //a1*c0+a2*c1+a3*c2+a4*c3+a5*c4+a6*c5 + 16;
                                                                                        //a2*c0+a3*c1+a4*c2+a5*c3+a6*c4+a7*c5 + 16;
                                                                                        //a3*c0+a4*c1+a5*c2+a6*c3+a7*c4+a8*c5 + 16;
                                                                                        //b0*c0+b1*c1+b2*c2+b3*c3+b4*c4+b5*c5 + 16;
                                                                                        //b1*c0+b2*c1+b3*c2+b4*c3+b5*c4+b6*c5 + 16;
                                                                                        //b2*c0+b3*c1+b4*c2+b5*c3+b6*c4+b7*c5 + 16;
                                                                                        //b3*c0+b4*c1+b5*c2+b6*c3+b7*c4+b8*c5 + 16;
            src_r0r1_16x8b = _mm_unpacklo_epi32(src_r0_16x8b,src_r1_16x8b);

            res_r0r1_t1_8x16b = _mm_srai_epi16(res_r0r1_t1_8x16b, 5);                   //shifting right by 5 bits.

            res_r0r1_16x8b = _mm_packus_epi16(res_r0r1_t1_8x16b, res_r0r1_t1_8x16b);
            res_r0r1_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_r0r1_16x8b);              //computing q-pel

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_r0r1_16x8b);
            res_r0r1_16x8b = _mm_srli_si128(res_r0r1_16x8b, 4);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_r0r1_16x8b);

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_pred1 += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;

        __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
        __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
        __m128i res_r0_16x8b, res_r1_16x8b;

        //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
        //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
        // Two 8-pixel rows are processed per iteration, each in its own
        // register; ht is assumed to be even.

        do
        {
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                      //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));         //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                      //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                      //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

            res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b);    //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                     //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
            res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b);    //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
                                                                                     //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                          //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                          //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                  //a3 a4 a5 a6 a7 a8 a9....a15 0  0  0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                  //b3 b4 b5 b6 b7 b8 b9....b15 0  0  0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10

            res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b);    //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                     //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
            res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b);    //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
                                                                                     //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                          //a4 a5 a6 a7 a8 a9....a15 0  0  0  0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                          //b4 b5 b6 b7 b8 b9....b15 0  0  0  0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                  //a5 a6 a7 a8 a9....a15 0  0  0  0  0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                  //b5 b6 b7 b8 b9....b15 0  0  0  0  0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12

            res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b);    //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
                                                                                     //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
            res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b);    //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
                                                                                     //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
            // Full-pel reference pixels for the q-pel average.
            src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_pred1);
            src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred1 + src_strd));

            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
            res_r0_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0_t3_8x16b);
            res_r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r1_t3_8x16b);
            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);

            res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);
            res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5);                    //shifting right by 5 bits.

            res_r0_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r0_t1_8x16b);
            res_r1_16x8b = _mm_packus_epi16(res_r1_t1_8x16b, res_r1_t1_8x16b);

            res_r0_16x8b = _mm_avg_epu8(src_r0_16x8b, res_r0_16x8b);
            res_r1_16x8b = _mm_avg_epu8(src_r1_16x8b, res_r1_16x8b);                 //computing q-pel

            _mm_storel_epi64((__m128i *)pu1_dst, res_r0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_r1_16x8b);

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_pred1 += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;

        __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
        __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
        __m128i res_16x8b;

        //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
        //Row0 :                         b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
        //b0 is same as a8. Similarly other bn pixels are same as a(n+8) pixels.
        // One 16-pixel row per iteration: the left and right 8-pixel halves
        // of the same row are filtered in separate registers.

        do
        {
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                      //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
            src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));                //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                      //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                      //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8

            res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b);    //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
                                                                                     //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
            res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b);    //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
                                                                                     //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                          //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                          //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                  //a3 a4 a5 a6 a7 a8 a9....a15 0  0  0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                  //b3 b4 b5 b6 b7 b8 b9....b15 0  0  0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10

            res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b);    //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
                                                                                     //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
            res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b);    //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
                                                                                     //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3

            src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                          //a4 a5 a6 a7 a8 a9....a15 0  0  0  0
            src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                          //b4 b5 b6 b7 b8 b9....b15 0  0  0  0

            src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                  //a5 a6 a7 a8 a9....a15 0  0  0  0  0
            src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                  //b5 b6 b7 b8 b9....b15 0  0  0  0  0

            src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
            src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12

            res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b);    //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
                                                                                     //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
            res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b);    //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
                                                                                     //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
            // Full-pel reference row (all 16 pixels) for the q-pel average.
            src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_pred1);

            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
            res_r0_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0_t3_8x16b);
            res_r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r1_t3_8x16b);
            res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
            res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);

            res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);
            res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5);                    //shifting right by 5 bits

            res_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r1_t1_8x16b);
            res_16x8b = _mm_avg_epu8(src_r0_16x8b, res_16x8b);                       //computing q-pel

            _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);

            ht--;
            pu1_src += src_strd;
            pu1_pred1 += src_strd;
            pu1_dst += dst_strd;
        }
        while(ht > 0);
    }
}
   1754 
   1755 /*****************************************************************************/
   1756 /*                                                                           */
   1757 /*  Function Name : ih264_inter_pred_luma_vert_qpel_ssse3                    */
   1758 /*                                                                           */
   1759 /*  Description   : This function implements a six-tap filter vertically on  */
   1760 /*                  ht x wd block and averages the values with the source    */
   1761 /*                  pixels to calculate vertical quarter-pel as mentioned in */
   1762 /*                  sec. 8.4.2.2.1 titled "Luma sample interpolation         */
   1763 /*                  process". (ht,wd) can be (4,4), (8,4), (4,8), (8,8),     */
   1764 /*                  (16,8), (8,16) or (16,16).                               */
   1765 /*                                                                           */
/*  Inputs        : pu1_src  - pointer to source                             */
/*                  pu1_dst  - pointer to destination                        */
   1768 /*                  src_strd - stride for source                             */
   1769 /*                  dst_strd - stride for destination                        */
   1770 /*                  ht       - height of the block                           */
   1771 /*                  wd       - width of the block                            */
   1772 /*                  pu1_tmp  - pointer to temporary buffer                   */
   1773 /*                  dydx     - x and y reference offset for q-pel            */
   1774 /*                             calculations                                  */
   1775 /*                                                                           */
   1776 /*  Issues        : None                                                     */
   1777 /*                                                                           */
   1778 /*  Revision History:                                                        */
   1779 /*                                                                           */
   1780 /*         DD MM YYYY   Author(s)       Changes                              */
   1781 /*         13 02 2015   Kaushik         Initial Version                      */
   1782 /*                      Senthoor                                             */
   1783 /*                                                                           */
   1784 /*****************************************************************************/
   1785 void ih264_inter_pred_luma_vert_qpel_ssse3(UWORD8 *pu1_src,
   1786                                            UWORD8 *pu1_dst,
   1787                                            WORD32 src_strd,
   1788                                            WORD32 dst_strd,
   1789                                            WORD32 ht,
   1790                                            WORD32 wd,
   1791                                            UWORD8* pu1_tmp,
   1792                                            WORD32 dydx)
   1793 {
   1794     WORD32 y_offset;
   1795     UWORD8 *pu1_pred1;
   1796 
   1797 
   1798     __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b;
   1799     __m128i src_r5_16x8b, src_r6_16x8b;
   1800     __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
   1801     __m128i res_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
   1802 
   1803     __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
   1804     __m128i const_val16_8x16b;
   1805 
   1806     UNUSED(pu1_tmp);
   1807     y_offset = dydx & 0xf;
   1808 
   1809     coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
   1810     coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
   1811     coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
   1812                                                  //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
   1813 
   1814     pu1_pred1 = pu1_src + (y_offset >> 3) * src_strd;
   1815 
   1816     const_val16_8x16b = _mm_set1_epi16(16);
   1817 
   1818     pu1_src -= src_strd << 1; // the filter input starts from x[-2] (till x[3])
   1819 
   1820     if(wd == 4)
   1821     {
   1822         //Epilogue: Load all the pred rows except sixth and seventh row
   1823         //          for the first and second row processing.
   1824         src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
   1825         pu1_src += src_strd;
   1826         src_r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
   1827         pu1_src += src_strd;
   1828         src_r2_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
   1829         pu1_src += src_strd;
   1830         src_r3_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
   1831         pu1_src += src_strd;
   1832         src_r4_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
   1833         pu1_src += src_strd;
   1834 
   1835         src_r0_16x8b = _mm_unpacklo_epi32(src_r0_16x8b, src_r1_16x8b);
   1836         src_r1_16x8b = _mm_unpacklo_epi32(src_r1_16x8b, src_r2_16x8b);
   1837         src_r2_16x8b = _mm_unpacklo_epi32(src_r2_16x8b, src_r3_16x8b);
   1838         src_r3_16x8b = _mm_unpacklo_epi32(src_r3_16x8b, src_r4_16x8b);
   1839 
   1840         do
   1841         {
   1842             src_r5_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
   1843             src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
   1844 
   1845             src_r4_16x8b = _mm_unpacklo_epi32(src_r4_16x8b, src_r5_16x8b);
   1846             src_r5_16x8b = _mm_unpacklo_epi32(src_r5_16x8b, src_r6_16x8b);
   1847 
   1848             src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
   1849             src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
   1850             src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
   1851 
   1852             res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
   1853             res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
   1854             res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
   1855 
   1856             src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_pred1);
   1857             src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred1 + src_strd));
   1858 
   1859             res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
   1860             res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
   1861             res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
   1862 
   1863             src_r0r1_16x8b = _mm_unpacklo_epi32(src_r0_16x8b,src_r1_16x8b);
   1864 
   1865             res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
   1866 
   1867             res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
   1868 
   1869             res_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_16x8b); //computing q-pel
   1870 
   1871             *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_16x8b);
   1872             res_16x8b = _mm_srli_si128(res_16x8b, 4);
   1873             *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_16x8b);
   1874 
   1875             src_r0_16x8b = src_r2_16x8b;
   1876             src_r1_16x8b = src_r3_16x8b;
   1877             src_r2_16x8b = src_r4_16x8b;
   1878             src_r3_16x8b = src_r5_16x8b;
   1879             src_r4_16x8b = src_r6_16x8b;
   1880 
   1881             ht -= 2;
   1882             pu1_src += src_strd << 1;
   1883             pu1_pred1 += src_strd << 1;
   1884             pu1_dst += dst_strd << 1;
   1885         }
   1886         while(ht > 0);
   1887     }
   1888 
   1889     else if(wd == 8)
   1890     {
   1891         //Epilogue: Load all the pred rows except sixth and seventh row
   1892         //          for the first and second row processing.
   1893         src_r0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
   1894         pu1_src += src_strd;
   1895         src_r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
   1896         pu1_src += src_strd;
   1897         src_r2_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
   1898         pu1_src += src_strd;
   1899         src_r3_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
   1900         pu1_src += src_strd;
   1901         src_r4_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
   1902         pu1_src += src_strd;
   1903 
   1904         src_r0_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);
   1905         src_r1_16x8b = _mm_unpacklo_epi64(src_r1_16x8b, src_r2_16x8b);
   1906         src_r2_16x8b = _mm_unpacklo_epi64(src_r2_16x8b, src_r3_16x8b);
   1907         src_r3_16x8b = _mm_unpacklo_epi64(src_r3_16x8b, src_r4_16x8b);
   1908 
   1909         do
   1910         {
   1911             src_r5_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
   1912             src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
   1913 
   1914             src_r4_16x8b = _mm_unpacklo_epi64(src_r4_16x8b, src_r5_16x8b);
   1915             src_r5_16x8b = _mm_unpacklo_epi64(src_r5_16x8b, src_r6_16x8b);
   1916 
   1917             src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
   1918             src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
   1919             src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
   1920 
   1921             res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
   1922             res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
   1923             res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
   1924 
   1925             src_r0r1_16x8b = _mm_loadl_epi64((__m128i *)pu1_pred1);
   1926 
   1927             res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
   1928             res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
   1929             res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
   1930 
   1931             res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
   1932 
   1933             res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
   1934             res_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_16x8b); //computing q-pel
   1935 
   1936             _mm_storel_epi64((__m128i *)pu1_dst, res_16x8b);
   1937 
   1938             src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
   1939             src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
   1940             src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);
   1941 
   1942             res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
   1943             res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
   1944             res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
   1945 
   1946             src_r0r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred1 + src_strd));
   1947 
   1948             res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
   1949             res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
   1950             res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
   1951 
   1952             res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
   1953 
   1954             res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
   1955             res_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_16x8b); //computing q-pel
   1956 
   1957             _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b);
   1958 
   1959             src_r0_16x8b = src_r2_16x8b;
   1960             src_r1_16x8b = src_r3_16x8b;
   1961             src_r2_16x8b = src_r4_16x8b;
   1962             src_r3_16x8b = src_r5_16x8b;
   1963             src_r4_16x8b = src_r6_16x8b;
   1964 
   1965             ht -= 2;
   1966             pu1_src += src_strd << 1;
   1967             pu1_pred1 += src_strd << 1;
   1968             pu1_dst += dst_strd << 1;
   1969         }
   1970         while(ht > 0);
   1971     }
   1972     else // wd == 16
   1973     {
   1974         __m128i res_t0_8x16b;
   1975 
   1976         //Epilogue: Load all the pred rows except sixth and seventh row
   1977         //          for the first and second row processing.
   1978         src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
   1979         pu1_src += src_strd;
   1980         src_r1_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
   1981         pu1_src += src_strd;
   1982         src_r2_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
   1983         pu1_src += src_strd;
   1984         src_r3_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
   1985         pu1_src += src_strd;
   1986         src_r4_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
   1987         pu1_src += src_strd;
   1988 
   1989         do
   1990         {
   1991             src_r5_16x8b  = _mm_loadu_si128((__m128i *)pu1_src);
   1992             src_r6_16x8b  = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
   1993 
   1994             src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
   1995             src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
   1996             src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
   1997 
   1998             res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
   1999             res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
   2000             res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
   2001 
   2002             res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
   2003             res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
   2004             res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
   2005 
   2006             res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
   2007 
   2008             src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
   2009             src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
   2010             src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);
   2011 
   2012             res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
   2013             res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
   2014             res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
   2015 
   2016             src_r0r1_16x8b = _mm_loadu_si128((__m128i *)pu1_pred1);
   2017 
   2018             res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
   2019             res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
   2020             res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
   2021 
   2022             res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
   2023 
   2024             res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);
   2025             res_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_16x8b); //computing q-pel
   2026 
   2027             _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);
   2028 
   2029             src_r0r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r2_16x8b);
   2030             src_r2r3_16x8b = _mm_unpacklo_epi8(src_r3_16x8b, src_r4_16x8b);
   2031             src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b);
   2032 
   2033             res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
   2034             res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
   2035             res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
   2036 
   2037             res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
   2038             res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
   2039             res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
   2040 
   2041             res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
   2042 
   2043             src_r0r1_16x8b = _mm_unpackhi_epi8(src_r1_16x8b, src_r2_16x8b);
   2044             src_r2r3_16x8b = _mm_unpackhi_epi8(src_r3_16x8b, src_r4_16x8b);
   2045             src_r4r5_16x8b = _mm_unpackhi_epi8(src_r5_16x8b, src_r6_16x8b);
   2046 
   2047             res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
   2048             res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
   2049             res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
   2050 
   2051             src_r0r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred1 + src_strd));
   2052 
   2053             res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
   2054             res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
   2055             res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
   2056 
   2057             res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
   2058 
   2059             res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);
   2060             res_16x8b = _mm_avg_epu8(src_r0r1_16x8b, res_16x8b); //computing q-pel
   2061 
   2062             _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res_16x8b);
   2063 
   2064             src_r0_16x8b = src_r2_16x8b;
   2065             src_r1_16x8b = src_r3_16x8b;
   2066             src_r2_16x8b = src_r4_16x8b;
   2067             src_r3_16x8b = src_r5_16x8b;
   2068             src_r4_16x8b = src_r6_16x8b;
   2069 
   2070             ht -= 2;
   2071             pu1_src += src_strd << 1;
   2072             pu1_pred1 += src_strd << 1;
   2073             pu1_dst += dst_strd << 1;
   2074         }
   2075         while(ht > 0);
   2076     }
   2077 }
   2078 
   2079 /*****************************************************************************/
   2080 /*                                                                           */
   2081 /*  Function Name : ih264_inter_pred_luma_horz_qpel_vert_qpel_ssse3          */
   2082 /*                                                                           */
   2083 /*  Description   : This function implements a six-tap filter vertically and */
   2084 /*                  horizontally on ht x wd block separately and averages    */
   2085 /*                  the two sets of values to calculate values at (1/4,1/4), */
   2086 /*                  (1/4, 3/4), (3/4, 1/4) or (3/4, 3/4) as mentioned in     */
   2087 /*                  sec. 8.4.2.2.1 titled "Luma sample interpolation         */
   2088 /*                  process". (ht,wd) can be (4,4), (8,4), (4,8), (8,8),     */
   2089 /*                  (16,8), (8,16) or (16,16).                               */
   2090 /*                                                                           */
/*  Inputs        : pu1_src  - pointer to source                             */
/*                  pu1_dst  - pointer to destination                        */
   2093 /*                  src_strd - stride for source                             */
   2094 /*                  dst_strd - stride for destination                        */
   2095 /*                  ht       - height of the block                           */
   2096 /*                  wd       - width of the block                            */
   2097 /*                  pu1_tmp  - pointer to temporary buffer                   */
   2098 /*                  dydx     - x and y reference offset for q-pel            */
   2099 /*                             calculations                                  */
   2100 /*                                                                           */
   2101 /*  Issues        : None                                                     */
   2102 /*                                                                           */
   2103 /*  Revision History:                                                        */
   2104 /*                                                                           */
   2105 /*         DD MM YYYY   Author(s)       Changes                              */
   2106 /*         13 02 2015   Kaushik         Initial Version                      */
   2107 /*                      Senthoor                                             */
   2108 /*                                                                           */
   2109 /*****************************************************************************/
   2110 void ih264_inter_pred_luma_horz_qpel_vert_qpel_ssse3(UWORD8 *pu1_src,
   2111                                                      UWORD8 *pu1_dst,
   2112                                                      WORD32 src_strd,
   2113                                                      WORD32 dst_strd,
   2114                                                      WORD32 ht,
   2115                                                      WORD32 wd,
   2116                                                      UWORD8* pu1_tmp,
   2117                                                      WORD32 dydx)
   2118 {
   2119     WORD32 ht_temp;
   2120     UWORD8 *pu1_pred_vert,*pu1_pred_horiz;
   2121     UWORD8 *pu1_tmp1, *pu1_tmp2;
   2122     WORD32 x_offset, y_offset;
   2123 
   2124     __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
   2125     __m128i const_val16_8x16b;
   2126 
   2127     pu1_tmp1 = pu1_tmp;
   2128 
   2129     dydx &= 0xf;
   2130     ht_temp = ht;
   2131     x_offset = dydx & 0x3;
   2132     y_offset = dydx >> 2;
   2133     pu1_tmp2 = pu1_tmp1;
   2134 
   2135     pu1_pred_vert  = pu1_src + (x_offset >> 1) - 2*src_strd;
   2136     pu1_pred_horiz = pu1_src + (y_offset >> 1) * src_strd - 2;
   2137     //the filter input starts from x[-2] (till x[3])
   2138 
   2139     coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
   2140     coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
   2141     coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5
   2142                                                   //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
   2143     const_val16_8x16b = _mm_set1_epi16(16);
   2144 
   2145     if(wd == 4)
   2146     {
   2147         //vertical q-pel filter
   2148         {
   2149             __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b;
   2150             __m128i src_r5_16x8b, src_r6_16x8b;
   2151             __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
   2152 
   2153             __m128i res_r0r1_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
   2154 
   2155             //epilogue: Load all the pred rows except sixth  and seventh row for the
   2156             //first and second row processing.
   2157             src_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
   2158             pu1_pred_vert = pu1_pred_vert + src_strd;
   2159 
   2160             src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
   2161             pu1_pred_vert = pu1_pred_vert + src_strd;
   2162             src_r0_16x8b = _mm_unpacklo_epi32(src_r0_16x8b, src_r1_16x8b);
   2163 
   2164             src_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
   2165             pu1_pred_vert = pu1_pred_vert + src_strd;
   2166             src_r1_16x8b = _mm_unpacklo_epi32(src_r1_16x8b, src_r2_16x8b);
   2167 
   2168             src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
   2169             pu1_pred_vert = pu1_pred_vert + src_strd;
   2170             src_r2_16x8b = _mm_unpacklo_epi32(src_r2_16x8b, src_r3_16x8b);
   2171 
   2172             src_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
   2173             pu1_pred_vert = pu1_pred_vert + src_strd;
   2174             src_r3_16x8b = _mm_unpacklo_epi32(src_r3_16x8b, src_r4_16x8b);
   2175 
   2176             //Core Loop: Process all the rows.
   2177             do
   2178             {
   2179                 src_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
   2180                 src_r4_16x8b = _mm_unpacklo_epi32(src_r4_16x8b, src_r5_16x8b);
   2181 
   2182                 src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert + src_strd));
   2183                 src_r5_16x8b = _mm_unpacklo_epi32(src_r5_16x8b, src_r6_16x8b);
   2184 
   2185                 src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
   2186                 src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
   2187                 src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
   2188 
   2189                 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
   2190                 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
   2191                 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
   2192 
   2193                 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
   2194                 res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
   2195                 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
   2196 
   2197                 res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
   2198                 res_r0r1_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
   2199 
   2200                 _mm_storel_epi64((__m128i *)pu1_tmp1, res_r0r1_16x8b);
   2201 
   2202                 src_r0_16x8b = src_r2_16x8b;
   2203                 src_r1_16x8b = src_r3_16x8b;
   2204                 src_r2_16x8b = src_r4_16x8b;
   2205                 src_r3_16x8b = src_r5_16x8b;
   2206                 src_r4_16x8b = src_r6_16x8b;
   2207 
   2208                 ht_temp -= 2;
   2209                 pu1_pred_vert += src_strd << 1;
   2210                 pu1_tmp1 += 8;
   2211             }
   2212             while(ht_temp > 0);
   2213         }
   2214 
   2215         //horizontal q-pel filter
   2216         {
   2217             __m128i src_r0_16x8b, src_r1_16x8b;
   2218             __m128i src_r0_sht_16x8b, src_r1_sht_16x8b;
   2219             __m128i src_r0r1_vpel_16x8b, src_r0r1_t1_16x8b;
   2220 
   2221             __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
   2222             __m128i res_r0r1_16x8b;
   2223 
   2224             //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
   2225             //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
   2226 
   2227             do
   2228             {
   2229                 src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_pred_horiz);                  //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
   2230                 src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz + src_strd));     //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
   2231 
   2232                 src_r0r1_vpel_16x8b = _mm_loadl_epi64((__m128i *)pu1_tmp2);
   2233 
   2234                 src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                          //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
   2235                 src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                          //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
   2236 
   2237                 src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);            //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
   2238                 src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);            //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
   2239 
   2240                 src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);          //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4
   2241                 res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff0_1_16x8b);    //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
   2242                                                                                              //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
   2243 
   2244                 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                              //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8  0  0  0  0
   2245                 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                              //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8  0  0  0  0
   2246 
   2247                 src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);          //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6
   2248                 res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff2_3_16x8b);    //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
   2249                                                                                              //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
   2250 
   2251                 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                              //a4 a5 a5 a6 a6 a7 a7 a8  0  0  0  0  0  0  0  0
   2252                 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                              //b4 b5 b5 b6 b6 b7 b7 b8  0  0  0  0  0  0  0  0
   2253 
   2254                 src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);          //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8
   2255                 res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff4_5_16x8b);    //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
   2256                                                                                              //b4*c4+b5*c5 b5*c4+b6*c5 b4*c6+b7*c5 b7*c4+b8*c5
   2257 
   2258                 res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b);
   2259                 res_r0r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0r1_t3_8x16b);
   2260                 res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t3_8x16b);     //a0*c0+a1*c1+a2*c2+a3*c3+a4*a4+a5*c5 + 15;
   2261                                                                                              //a1*c0+a2*c1+a2*c2+a3*c3+a5*a4+a6*c5 + 15;
   2262                                                                                              //a2*c0+a3*c1+a4*c2+a5*c3+a6*a4+a7*c5 + 15;
   2263                                                                                              //a3*c0+a4*c1+a5*c2+a6*c3+a6*a4+a8*c5 + 15;
   2264                                                                                              //b0*c0+b1*c1+b2*c2+b3*c3+b4*b4+b5*c5 + 15;
   2265                                                                                              //b1*c0+b2*c1+b2*c2+b3*c3+b5*b4+b6*c5 + 15;
   2266                                                                                              //b2*c0+b3*c1+b4*c2+b5*c3+b6*b4+b7*c5 + 15;
   2267                                                                                              //b3*c0+b4*c1+b5*c2+b6*c3+b6*b4+b8*c5 + 15;
   2268 
   2269                 res_r0r1_t1_8x16b = _mm_srai_epi16(res_r0r1_t1_8x16b, 5);                    //shifting right by 5 bits.
   2270 
   2271                 res_r0r1_16x8b = _mm_packus_epi16(res_r0r1_t1_8x16b,res_r0r1_t1_8x16b);
   2272 
   2273                 res_r0r1_16x8b = _mm_avg_epu8(res_r0r1_16x8b,src_r0r1_vpel_16x8b);
   2274 
   2275                 *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_r0r1_16x8b);
   2276                 res_r0r1_16x8b = _mm_srli_si128(res_r0r1_16x8b, 4);
   2277                 *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_r0r1_16x8b);
   2278 
   2279                 ht -= 2;
   2280                 pu1_pred_horiz += src_strd << 1;
   2281                 pu1_tmp2 += 8;
   2282                 pu1_dst += dst_strd << 1;
   2283             }
   2284             while(ht > 0);
   2285         }
   2286     }
   2287     else if(wd == 8)
   2288     {
   2289         //vertical q-pel filter
   2290         {
   2291             __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b;
   2292             __m128i src_r4_16x8b, src_r5_16x8b, src_r6_16x8b;
   2293             __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
   2294 
   2295             __m128i res_16x8b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
   2296 
   2297             //epilogue: Load all the pred rows except sixth  and seventh row for the
   2298             //first and second row processing.
   2299             src_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
   2300             pu1_pred_vert = pu1_pred_vert + src_strd;
   2301 
   2302             src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
   2303             pu1_pred_vert = pu1_pred_vert + src_strd;
   2304             src_r0_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);
   2305 
   2306             src_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
   2307             pu1_pred_vert = pu1_pred_vert + src_strd;
   2308             src_r1_16x8b = _mm_unpacklo_epi64(src_r1_16x8b, src_r2_16x8b);
   2309 
   2310             src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
   2311             pu1_pred_vert = pu1_pred_vert + src_strd;
   2312             src_r2_16x8b = _mm_unpacklo_epi64(src_r2_16x8b, src_r3_16x8b);
   2313 
   2314             src_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
   2315             pu1_pred_vert = pu1_pred_vert + src_strd;
   2316             src_r3_16x8b = _mm_unpacklo_epi64(src_r3_16x8b, src_r4_16x8b);
   2317 
   2318             //Core Loop: Process all the rows.
   2319             do
   2320             {
   2321                 src_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert));
   2322                 src_r4_16x8b = _mm_unpacklo_epi64(src_r4_16x8b, src_r5_16x8b);
   2323 
   2324                 src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred_vert + src_strd));
   2325                 src_r5_16x8b = _mm_unpacklo_epi64(src_r5_16x8b, src_r6_16x8b);
   2326 
   2327                 src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
   2328                 src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
   2329                 src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
   2330 
   2331                 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
   2332                 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
   2333                 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
   2334 
   2335                 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
   2336                 res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
   2337                 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
   2338 
   2339                 res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
   2340                 res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
   2341 
   2342                 _mm_storel_epi64((__m128i *)(pu1_tmp1), res_16x8b);
   2343 
   2344                 src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
   2345                 src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
   2346                 src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);
   2347 
   2348                 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
   2349                 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
   2350                 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
   2351 
   2352                 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
   2353                 res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
   2354                 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
   2355 
   2356                 res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
   2357                 res_16x8b = _mm_packus_epi16(res_t1_8x16b, res_t1_8x16b);
   2358 
   2359                 _mm_storel_epi64((__m128i *)(pu1_tmp1 + 8), res_16x8b);
   2360 
   2361                 src_r0_16x8b = src_r2_16x8b;
   2362                 src_r1_16x8b = src_r3_16x8b;
   2363                 src_r2_16x8b = src_r4_16x8b;
   2364                 src_r3_16x8b = src_r5_16x8b;
   2365                 src_r4_16x8b = src_r6_16x8b;
   2366 
   2367                 ht_temp -= 2;
   2368                 pu1_pred_vert += src_strd << 1;
   2369                 pu1_tmp1 += 16;
   2370             }
   2371             while(ht_temp > 0);
   2372         }
   2373 
   2374         //horizontal q-pel filter
   2375         {
   2376             __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
   2377             __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
   2378             __m128i src_r0_vpel_16x8b, src_r1_vpel_16x8b;
   2379 
   2380             __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
   2381             __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b, res_16x8b;
   2382 
   2383             //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
   2384             //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
   2385 
   2386             do
   2387             {
   2388                 src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz));               //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
   2389                 src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz + src_strd));    //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
   2390 
   2391                 src_r0_vpel_16x8b = _mm_loadl_epi64((__m128i *)(pu1_tmp2));                //a2 a3 a4 a5 a6 a7 a8....a15 0 or
   2392                                                                                            //a3 a4 a5 a6 a7 a8 a9....a15 0
   2393                 src_r1_vpel_16x8b = _mm_loadl_epi64((__m128i *)(pu1_tmp2 + 8));
   2394                                                                                            //b2 b3 b4 b5 b6 b7 b8....b15 0 or
   2395                                                                                            //b3 b4 b5 b6 b7 b8 b9....b15 0
   2396 
   2397                 src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                        //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
   2398                 src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                        //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
   2399 
   2400                 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);       //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
   2401                 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);       //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
   2402 
   2403                 res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b);      //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
   2404                                                                                            //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
   2405                 res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b);      //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
   2406                                                                                            //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
   2407 
   2408                 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                            //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
   2409                 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                            //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
   2410 
   2411                 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                    //a3 a4 a5 a6 a7 a8 a9....a15 0  0  0
   2412                 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                    //b3 b4 b5 b6 b7 b8 b9....b15 0  0  0
   2413 
   2414                 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);       //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
   2415                 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);       //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10
   2416 
   2417                 res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b);      //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
   2418                                                                                            //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
   2419                 res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b);      //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3
   2420                                                                                            //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
   2421 
   2422                 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                            //a4 a5 a6 a7 a8 a9....a15 0  0  0  0
   2423                 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                            //b4 b5 b6 b7 b8 b9....b15 0  0  0  0
   2424 
   2425                 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                    //a5 a6 a7 a8 a9....a15 0  0  0  0  0
   2426                 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                    //b5 b6 b7 b8 b9....b15 0  0  0  0  0
   2427 
   2428                 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);       //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
   2429                 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);       //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
   2430 
   2431                 res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b);      //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
   2432                                                                                            //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
   2433                 res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b);      //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
   2434                                                                                            //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
   2435                 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
   2436                 res_r0_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0_t3_8x16b);
   2437                 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
   2438                 res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);                      //shifting right by 5 bits.
   2439 
   2440                 res_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r0_t1_8x16b);
   2441                 res_16x8b = _mm_avg_epu8(res_16x8b, src_r0_vpel_16x8b);
   2442 
   2443                 _mm_storel_epi64((__m128i *)(pu1_dst), res_16x8b);
   2444 
   2445                 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
   2446                 res_r1_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r1_t3_8x16b);
   2447                 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);
   2448                 res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5);                      //shifting right by 5 bits.
   2449 
   2450                 res_16x8b = _mm_packus_epi16(res_r1_t1_8x16b, res_r1_t1_8x16b);
   2451                 res_16x8b = _mm_avg_epu8(res_16x8b,src_r1_vpel_16x8b);
   2452 
   2453                 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b);
   2454 
   2455                 ht -= 2;
   2456                 pu1_pred_horiz += src_strd << 1;
   2457                 pu1_dst += dst_strd << 1;
   2458                 pu1_tmp2 += 16;
   2459             }
   2460             while(ht > 0);
   2461         }
   2462     }
   2463     else // wd == 16
   2464     {
   2465         //vertical q-pel filter
   2466         {
   2467             __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b;
   2468             __m128i src_r4_16x8b, src_r5_16x8b, src_r6_16x8b;
   2469             __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
   2470 
   2471             __m128i res_t0_8x16b, res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
   2472             __m128i res_16x8b;
   2473 
   2474             //prologue: Load all the pred rows except sixth and seventh row for the
   2475             //first and second row processing.
   2476             src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert));
   2477             pu1_pred_vert =  pu1_pred_vert + src_strd;
   2478             src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert));
   2479             pu1_pred_vert =  pu1_pred_vert + src_strd;
   2480             src_r2_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert));
   2481             pu1_pred_vert =  pu1_pred_vert + src_strd;
   2482             src_r3_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert));
   2483             pu1_pred_vert =  pu1_pred_vert + src_strd;
   2484             src_r4_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert));
   2485             pu1_pred_vert =  pu1_pred_vert + src_strd;
   2486 
   2487             //Core Loop: Process all the rows.
   2488             do
   2489             {
   2490                 src_r5_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert));
   2491                 src_r6_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_vert + src_strd));
   2492 
   2493                 src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
   2494                 src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
   2495                 src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
   2496 
   2497                 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
   2498                 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
   2499                 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
   2500 
   2501                 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
   2502                 res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
   2503                 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
   2504                 res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
   2505 
   2506                 src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
   2507                 src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
   2508                 src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);
   2509 
   2510                 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
   2511                 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
   2512                 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
   2513 
   2514                 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
   2515                 res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
   2516                 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
   2517                 res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
   2518 
   2519                 res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);
   2520 
   2521                 _mm_storeu_si128((__m128i *)(pu1_tmp1), res_16x8b);
   2522 
   2523                 src_r0r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r2_16x8b);
   2524                 src_r2r3_16x8b = _mm_unpacklo_epi8(src_r3_16x8b, src_r4_16x8b);
   2525                 src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b);
   2526 
   2527                 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
   2528                 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
   2529                 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
   2530 
   2531                 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
   2532                 res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
   2533                 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
   2534                 res_t0_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
   2535 
   2536                 src_r0r1_16x8b = _mm_unpackhi_epi8(src_r1_16x8b, src_r2_16x8b);
   2537                 src_r2r3_16x8b = _mm_unpackhi_epi8(src_r3_16x8b, src_r4_16x8b);
   2538                 src_r4r5_16x8b = _mm_unpackhi_epi8(src_r5_16x8b, src_r6_16x8b);
   2539 
   2540                 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
   2541                 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
   2542                 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
   2543 
   2544                 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
   2545                 res_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_t3_8x16b);
   2546                 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t3_8x16b);
   2547                 res_t1_8x16b = _mm_srai_epi16(res_t1_8x16b, 5); //shifting right by 5 bits.
   2548 
   2549                 res_16x8b = _mm_packus_epi16(res_t0_8x16b, res_t1_8x16b);
   2550 
   2551                 _mm_storeu_si128((__m128i *)(pu1_tmp1 + 16), res_16x8b);
   2552 
   2553                 src_r0_16x8b = src_r2_16x8b;
   2554                 src_r1_16x8b = src_r3_16x8b;
   2555                 src_r2_16x8b = src_r4_16x8b;
   2556                 src_r3_16x8b = src_r5_16x8b;
   2557                 src_r4_16x8b = src_r6_16x8b;
   2558 
   2559                 ht_temp -= 2;
   2560                 pu1_pred_vert += src_strd << 1;
   2561                 pu1_tmp1 += 32;
   2562             }
   2563             while(ht_temp > 0);
   2564         }
   2565         //horizontal q-pel filter
   2566         {
   2567             __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
   2568             __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
   2569             __m128i src_vpel_16x8b;
   2570 
   2571             __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
   2572             __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
   2573             __m128i res_16x8b;
   2574 
   2575             //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
   2576             //Row0 :                         b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
   2577             //b0 is the same as a8. Similarly other bn pixels are the same as a(n+8) pixels.
   2578 
   2579             do
   2580             {
   2581                 src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz));             //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
   2582                 src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred_horiz + 8));         //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
   2583                 src_vpel_16x8b = _mm_loadu_si128((__m128i *)(pu1_tmp2));
   2584 
   2585                 src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                      //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
   2586                 src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                      //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
   2587 
   2588                 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
   2589                 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
   2590 
   2591                 res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b);    //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
   2592                                                                                          //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
   2593                 res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b);    //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
   2594                                                                                          //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
   2595 
   2596                 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                          //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
   2597                 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                          //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
   2598 
   2599                 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                  //a3 a4 a5 a6 a7 a8 a9....a15 0  0  0
   2600                 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                  //b3 b4 b5 b6 b7 b8 b9....b15 0  0  0
   2601 
   2602                 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
   2603                 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10
   2604 
   2605                 res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b);    //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
   2606                                                                                          //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
   2607                 res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b);    //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
   2608                                                                                          //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
   2609 
   2610                 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                          //a4 a5 a6 a7 a8 a9....a15 0  0  0  0
   2611                 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                          //b4 b5 b6 b7 b8 b9....b15 0  0  0  0
   2612 
   2613                 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                  //a5 a6 a7 a8 a9....a15 0  0  0  0  0
   2614                 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                  //b5 b6 b7 b8 b9....b15 0  0  0  0  0
   2615 
   2616                 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
   2617                 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
   2618 
   2619                 res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b);    //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
   2620                                                                                          //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
   2621                 res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b);    //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
   2622                                                                                          //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
   2623                 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
   2624                 res_r0_t3_8x16b = _mm_add_epi16(const_val16_8x16b, res_r0_t3_8x16b);
   2625                 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
   2626                 res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);                    //shifting right by 5 bits.
   2627 
   2628                 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
   2629                 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);
   2630                 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, const_val16_8x16b);
   2631                 res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5);                    //shifting right by 5 bits.
   2632 
   2633                 res_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r1_t1_8x16b);
   2634 
   2635                 res_16x8b = _mm_avg_epu8(res_16x8b, src_vpel_16x8b);
   2636                 _mm_storeu_si128((__m128i *)(pu1_dst), res_16x8b);
   2637 
   2638                 ht --;
   2639                 pu1_pred_horiz  += src_strd;
   2640                 pu1_dst += dst_strd;
   2641                 pu1_tmp2 += 16;
   2642             }
   2643             while(ht > 0);
   2644         }
   2645     }
   2646 }
   2647 
   2648 /*****************************************************************************/
   2649 /*                                                                           */
   2650 /*  Function Name : ih264_inter_pred_luma_horz_qpel_vert_hpel_ssse3          */
   2651 /*                                                                           */
   2652 /*  Description   : This function implements a six-tap filter vertically and */
   2653 /*                  horizontally on ht x wd block separately and averages    */
   2654 /*                  the two sets of values to calculate values at (1/4,1/2), */
   2655 /*                  or (3/4, 1/2) as mentioned in sec. 8.4.2.2.1 titled      */
   2656 /*                  "Luma sample interpolation process". (ht,wd) can be      */
   2657 /*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
   2658 /*                                                                           */
   2659 /*  Inputs        : pu1_src  - pointer to source                             */
   2660 /*                  pu1_dst  - pointer to destination                        */
   2661 /*                  src_strd - stride for source                             */
   2662 /*                  dst_strd - stride for destination                        */
   2663 /*                  ht       - height of the block                           */
   2664 /*                  wd       - width of the block                            */
   2665 /*                  pu1_tmp  - pointer to temporary buffer                   */
   2666 /*                  dydx     - x and y reference offset for q-pel            */
   2667 /*                             calculations                                  */
   2668 /*                                                                           */
   2669 /*  Issues        : None                                                     */
   2670 /*                                                                           */
   2671 /*  Revision History:                                                        */
   2672 /*                                                                           */
   2673 /*         DD MM YYYY   Author(s)       Changes                              */
   2674 /*         13 02 2015   Kaushik         Initial Version                      */
   2675 /*                      Senthoor                                             */
   2676 /*                                                                           */
   2677 /*****************************************************************************/
   2678 void ih264_inter_pred_luma_horz_qpel_vert_hpel_ssse3(UWORD8 *pu1_src,
   2679                                                      UWORD8 *pu1_dst,
   2680                                                      WORD32 src_strd,
   2681                                                      WORD32 dst_strd,
   2682                                                      WORD32 ht,
   2683                                                      WORD32 wd,
   2684                                                      UWORD8* pu1_tmp,
   2685                                                      WORD32 dydx)
   2686 {
   2687     WORD32 ht_temp;
   2688     WORD32 x_offset;
   2689     WORD32 off0,off1, off2, off3, off4, off5;
   2690     WORD16 *pi2_temp1,*pi2_temp2,*pi2_temp3;
   2691 
   2692     ht_temp = ht;
   2693     x_offset = dydx & 0x3;
   2694     pi2_temp1 = (WORD16 *)pu1_tmp;
   2695     pi2_temp2 = pi2_temp1;
   2696     pi2_temp3 = pi2_temp1 + (x_offset >> 1);
   2697 
   2698     pu1_src -= 2 * src_strd;
   2699     pu1_src -= 2;
   2700     pi2_temp3 += 2;
   2701     //the filter input starts from x[-2] (till x[3])
   2702 
   2703     if(wd == 4)
   2704     {
   2705         //vertical half-pel
   2706         {
   2707             __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b;
   2708             __m128i src_r5_16x8b, src_r6_16x8b;
   2709             __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
   2710 
   2711             __m128i res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
   2712 
   2713             __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
   2714 
   2715             coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
   2716             coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
   2717             coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
   2718                                                           //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
   2719             off0 = -((src_strd << 2) + src_strd) + 8;
   2720             off1 = -(src_strd << 2) + 8;
   2721             off2 = -((src_strd << 1) + src_strd) + 8;
   2722             off3 = -(src_strd << 1) + 8;
   2723             off4 = -src_strd + 8;
   2724             off5 = 8;
   2725 
   2726             //prologue: Load all the pred rows except sixth and seventh row for the
   2727             //first and second row processing.
   2728             src_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
   2729             pu1_src =  pu1_src + src_strd;
   2730 
   2731             src_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
   2732             pu1_src =  pu1_src + src_strd;
   2733 
   2734             src_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
   2735             pu1_src =  pu1_src + src_strd;
   2736 
   2737             src_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
   2738             pu1_src =  pu1_src + src_strd;
   2739 
   2740             src_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
   2741             pu1_src =  pu1_src + src_strd;
   2742 
   2743             //Core Loop: Process all the rows.
   2744             do
   2745             {
   2746                 src_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
   2747 
   2748                 src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
   2749                 src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
   2750                 src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
   2751 
   2752                 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
   2753                 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
   2754                 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
   2755 
   2756                 res_t1_8x16b = _mm_add_epi16(res_t2_8x16b, res_t1_8x16b);
   2757                 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
   2758 
   2759                 _mm_storeu_si128((__m128i *)(pi2_temp1), res_t1_8x16b);
   2760 
   2761                 pi2_temp1[8] = pu1_src[off0] + pu1_src[off5]
   2762                                    - (pu1_src[off1] + pu1_src[off4])
   2763                                    + ((pu1_src[off2] + pu1_src[off3] - pu1_src[off1] - pu1_src[off4]) << 2)
   2764                                    + ((pu1_src[off2] + pu1_src[off3]) << 4);
   2765 
   2766                 pu1_src = pu1_src + src_strd;
   2767                 pi2_temp1 = pi2_temp1 + 9;
   2768 
   2769                 src_r6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
   2770 
   2771                 src_r0r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r2_16x8b);
   2772                 src_r2r3_16x8b = _mm_unpacklo_epi8(src_r3_16x8b, src_r4_16x8b);
   2773                 src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b);
   2774 
   2775                 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
   2776                 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
   2777                 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
   2778 
   2779                 res_t1_8x16b = _mm_add_epi16(res_t2_8x16b, res_t1_8x16b);
   2780                 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
   2781 
   2782                 _mm_storeu_si128((__m128i *)(pi2_temp1), res_t1_8x16b);
   2783 
   2784                 pi2_temp1[8] = pu1_src[off0] + pu1_src[off5]
   2785                                    - (pu1_src[off1] + pu1_src[off4])
   2786                                    + ((pu1_src[off2] + pu1_src[off3] - pu1_src[off1] - pu1_src[off4]) << 2)
   2787                                    + ((pu1_src[off2] + pu1_src[off3]) << 4);
   2788 
   2789                 ht_temp -= 2;
   2790                 pu1_src = pu1_src + src_strd;
   2791                 pi2_temp1 = pi2_temp1 + 9;
   2792 
   2793                 src_r0_16x8b = src_r2_16x8b;
   2794                 src_r1_16x8b = src_r3_16x8b;
   2795                 src_r2_16x8b = src_r4_16x8b;
   2796                 src_r3_16x8b = src_r5_16x8b;
   2797                 src_r4_16x8b = src_r6_16x8b;
   2798             }
   2799             while(ht_temp > 0);
   2800         }
   2801 
   2802         //horizontal q-pel
   2803         {
   2804             __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b;
   2805             __m128i src_r3_8x16b, src_r4_8x16b, src_r5_8x16b;
   2806             __m128i src_r0r1_c0_8x16b, src_r2r3_c0_8x16b, src_r4r5_c0_8x16b;
   2807             __m128i src_hpel_16x8b, src_hpel_8x16b;
   2808 
   2809             __m128i res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
   2810             __m128i res_8x16b, res_16x8b;
   2811 
   2812             __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
   2813             __m128i const_val512_4x32b, const_val16_8x16b;
   2814 
   2815             coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
   2816             coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
   2817             coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);
   2818 
   2819             const_val512_4x32b = _mm_set1_epi32(512);
   2820             const_val16_8x16b = _mm_set1_epi16(16);
   2821 
   2822             do
   2823             {
   2824                 src_r0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2));
   2825                 src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 1));
   2826                 src_r2_8x16b = _mm_srli_si128(src_r1_8x16b, 2);
   2827                 src_r3_8x16b = _mm_srli_si128(src_r1_8x16b, 4);
   2828                 src_r4_8x16b = _mm_srli_si128(src_r1_8x16b, 6);
   2829                 src_r5_8x16b = _mm_srli_si128(src_r1_8x16b, 8);
   2830 
   2831                 src_r0r1_c0_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
   2832                 src_r2r3_c0_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
   2833                 src_r4r5_c0_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
   2834 
   2835                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_c0_8x16b, coeff0_1_8x16b);
   2836                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_c0_8x16b, coeff2_3_8x16b);
   2837                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_c0_8x16b, coeff4_5_8x16b);
   2838 
   2839                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   2840                 res_t3_4x32b = _mm_add_epi32(const_val512_4x32b, res_t3_4x32b);
   2841                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   2842                 res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   2843 
   2844                 res_8x16b = _mm_packs_epi32(res_t1_4x32b, res_t1_4x32b);
   2845                 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
   2846 
   2847                 src_hpel_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp3));
   2848                 src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b);
   2849                 src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits.
   2850                 src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);
   2851 
   2852                 res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
   2853 
   2854                 *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_16x8b);
   2855 
   2856                 ht--;
   2857                 pi2_temp2 = pi2_temp2 + 4 + 5;
   2858                 pi2_temp3 = pi2_temp3 + 4 + 5;
   2859                 pu1_dst = pu1_dst + dst_strd;
   2860             }
   2861             while(ht > 0);
   2862         }
   2863     }
   2864     else if(wd == 8)
   2865     {
   2866         // vertical half-pel
   2867         {
   2868             __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b, src_r4_16x8b;
   2869             __m128i src_r5_16x8b, src_r6_16x8b;
   2870             __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
   2871 
   2872             __m128i res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
   2873 
   2874             __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
   2875 
   2876             coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
   2877             coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
   2878             coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
   2879 
   2880             //prologue: Load all the pred rows except sixth and seventh row for the
   2881             //first and second row processing.
   2882             src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
   2883             pu1_src =  pu1_src + src_strd;
   2884 
   2885             src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
   2886             pu1_src =  pu1_src + src_strd;
   2887 
   2888             src_r2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
   2889             pu1_src =  pu1_src + src_strd;
   2890 
   2891             src_r3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
   2892             pu1_src =  pu1_src + src_strd;
   2893 
   2894             src_r4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
   2895             pu1_src =  pu1_src + src_strd;
   2896 
   2897             //Core Loop: Process all the rows.
   2898             do
   2899             {
   2900                 src_r5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
   2901                 src_r6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
   2902 
   2903                 src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
   2904                 src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
   2905                 src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
   2906 
   2907                 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
   2908                 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
   2909                 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
   2910 
   2911                 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
   2912                 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
   2913 
   2914                 _mm_storeu_si128((__m128i *)(pi2_temp1), res_t1_8x16b);
   2915 
   2916                 src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
   2917                 src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
   2918                 src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);
   2919 
   2920                 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
   2921                 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
   2922                 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
   2923 
   2924                 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
   2925                 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
   2926 
   2927                 _mm_storeu_si128((__m128i *)(pi2_temp1 + 8), res_t1_8x16b);
   2928 
   2929                 src_r0r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r2_16x8b);
   2930                 src_r2r3_16x8b = _mm_unpacklo_epi8(src_r3_16x8b, src_r4_16x8b);
   2931                 src_r4r5_16x8b = _mm_unpacklo_epi8(src_r5_16x8b, src_r6_16x8b);
   2932 
   2933                 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
   2934                 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
   2935                 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
   2936 
   2937                 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
   2938                 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
   2939 
   2940                 _mm_storeu_si128((__m128i *)(pi2_temp1 + 8 + 5), res_t1_8x16b);
   2941 
   2942                 src_r0r1_16x8b = _mm_unpackhi_epi8(src_r1_16x8b, src_r2_16x8b);
   2943                 src_r2r3_16x8b = _mm_unpackhi_epi8(src_r3_16x8b, src_r4_16x8b);
   2944                 src_r4r5_16x8b = _mm_unpackhi_epi8(src_r5_16x8b, src_r6_16x8b);
   2945 
   2946                 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
   2947                 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
   2948                 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
   2949 
   2950                 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
   2951                 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
   2952 
   2953                 _mm_storeu_si128((__m128i *)(pi2_temp1 + 8 + 5 + 8), res_t1_8x16b);
   2954 
   2955                 src_r0_16x8b = src_r2_16x8b;
   2956                 src_r1_16x8b = src_r3_16x8b;
   2957                 src_r2_16x8b = src_r4_16x8b;
   2958                 src_r3_16x8b = src_r5_16x8b;
   2959                 src_r4_16x8b = src_r6_16x8b;
   2960 
   2961                 ht_temp -= 2;
   2962                 pu1_src =  pu1_src + (src_strd << 1);
   2963                 pi2_temp1 = pi2_temp1 + (13 << 1);
   2964             }
   2965             while(ht_temp > 0);
   2966         }
   2967         // horizontal q-pel
   2968         {
   2969             __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b;
   2970             __m128i src_r4_8x16b, src_r5_8x16b;
   2971             __m128i src_r0r1_c0_8x16b, src_r2r3_c0_8x16b, src_r4r5_c0_8x16b;
   2972             __m128i src_r0r1_c1_8x16b, src_r2r3_c1_8x16b, src_r4r5_c1_8x16b;
   2973             __m128i src_hpel_8x16b, src_hpel_16x8b;
   2974 
   2975             __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
   2976             __m128i res_8x16b, res_16x8b;
   2977 
   2978             __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
   2979             __m128i const_val512_4x32b, const_val16_8x16b;
   2980 
   2981             coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
   2982             coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
   2983             coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);
   2984 
   2985             const_val512_4x32b = _mm_set1_epi32(512);
   2986             const_val16_8x16b = _mm_set1_epi16(16);
   2987 
   2988             do
   2989             {
   2990                 src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2));
   2991                 src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 1));
   2992                 src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 2));
   2993                 src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 3));
   2994                 src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 4));
   2995                 src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 5));
   2996 
   2997                 src_r0r1_c0_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
   2998                 src_r2r3_c0_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
   2999                 src_r4r5_c0_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
   3000 
   3001                 src_r0r1_c1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
   3002                 src_r2r3_c1_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
   3003                 src_r4r5_c1_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);
   3004 
   3005                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_c0_8x16b, coeff0_1_8x16b);
   3006                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_c0_8x16b, coeff2_3_8x16b);
   3007                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_c0_8x16b, coeff4_5_8x16b);
   3008 
   3009                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   3010                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   3011                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   3012 
   3013                 res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   3014 
   3015                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_c1_8x16b, coeff0_1_8x16b);
   3016                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_c1_8x16b, coeff2_3_8x16b);
   3017                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_c1_8x16b, coeff4_5_8x16b);
   3018 
   3019                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   3020                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   3021                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   3022 
   3023                 res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   3024 
   3025                 res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
   3026                 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
   3027 
   3028                 src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3));
   3029                 src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b);
   3030                 src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits.
   3031                 src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);
   3032 
   3033                 res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
   3034 
   3035                 _mm_storel_epi64((__m128i *)(pu1_dst), res_16x8b);
   3036 
   3037                 ht--;
   3038                 pi2_temp2 = pi2_temp2 + 8 + 5;
   3039                 pi2_temp3 = pi2_temp3 + 8 + 5;
   3040                 pu1_dst = pu1_dst + dst_strd;
   3041             }
   3042             while(ht > 0);
   3043         }
   3044     }
   3045     else // wd == 16
   3046     {
   3047         // vertical half-pel
   3048         {
   3049             __m128i src_r0_16x8b, src_r1_16x8b, src_r2_16x8b, src_r3_16x8b;
   3050             __m128i src_r4_16x8b, src_r5_16x8b;
   3051             __m128i src_r0_c2_16x8b, src_r1_c2_16x8b, src_r2_c2_16x8b, src_r3_c2_16x8b;
   3052             __m128i src_r4_c2_16x8b, src_r5_c2_16x8b;
   3053             __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
   3054 
   3055             __m128i res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
   3056 
   3057             __m128i coeff0_1_16x8b,coeff2_3_16x8b,coeff4_5_16x8b;
   3058 
   3059             coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
   3060             coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
   3061             coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5
   3062 
   3063             src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
   3064             src_r0_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16));
   3065             pu1_src =  pu1_src + src_strd;
   3066             src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
   3067             src_r1_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16));
   3068             pu1_src =  pu1_src + src_strd;
   3069             src_r2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
   3070             src_r2_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16));
   3071             pu1_src =  pu1_src + src_strd;
   3072             src_r3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
   3073             src_r3_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16));
   3074             pu1_src =  pu1_src + src_strd;
   3075             src_r4_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));
   3076             src_r4_c2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 16));
   3077             pu1_src =  pu1_src + src_strd;
   3078 
   3079             //Core Loop: Process all the rows.
   3080             do
   3081             {
   3082                 src_r5_16x8b  = _mm_loadu_si128((__m128i *)(pu1_src));
   3083                 src_r5_c2_16x8b  = _mm_loadu_si128((__m128i *)(pu1_src + 16));
   3084 
   3085                 src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r1_16x8b);
   3086                 src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_16x8b, src_r3_16x8b);
   3087                 src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_16x8b, src_r5_16x8b);
   3088 
   3089                 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
   3090                 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
   3091                 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
   3092 
   3093                 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
   3094                 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
   3095 
   3096                 _mm_storeu_si128((__m128i *)(pi2_temp1), res_t1_8x16b);
   3097 
   3098                 src_r0r1_16x8b = _mm_unpackhi_epi8(src_r0_16x8b, src_r1_16x8b);
   3099                 src_r2r3_16x8b = _mm_unpackhi_epi8(src_r2_16x8b, src_r3_16x8b);
   3100                 src_r4r5_16x8b = _mm_unpackhi_epi8(src_r4_16x8b, src_r5_16x8b);
   3101 
   3102                 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
   3103                 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
   3104                 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
   3105 
   3106                 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
   3107                 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
   3108 
   3109                 _mm_storeu_si128((__m128i *)(pi2_temp1 + 8), res_t1_8x16b);
   3110 
   3111                 src_r0r1_16x8b = _mm_unpacklo_epi8(src_r0_c2_16x8b, src_r1_c2_16x8b);
   3112                 src_r2r3_16x8b = _mm_unpacklo_epi8(src_r2_c2_16x8b, src_r3_c2_16x8b);
   3113                 src_r4r5_16x8b = _mm_unpacklo_epi8(src_r4_c2_16x8b, src_r5_c2_16x8b);
   3114 
   3115                 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
   3116                 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
   3117                 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
   3118 
   3119                 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
   3120                 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
   3121 
   3122                 _mm_storeu_si128((__m128i *)(pi2_temp1 + 16), res_t1_8x16b);
   3123 
   3124                 src_r0_16x8b = src_r1_16x8b;
   3125                 src_r1_16x8b = src_r2_16x8b;
   3126                 src_r2_16x8b = src_r3_16x8b;
   3127                 src_r3_16x8b = src_r4_16x8b;
   3128                 src_r4_16x8b = src_r5_16x8b;
   3129 
   3130                 src_r0_c2_16x8b = src_r1_c2_16x8b;
   3131                 src_r1_c2_16x8b = src_r2_c2_16x8b;
   3132                 src_r2_c2_16x8b = src_r3_c2_16x8b;
   3133                 src_r3_c2_16x8b = src_r4_c2_16x8b;
   3134                 src_r4_c2_16x8b = src_r5_c2_16x8b;
   3135 
   3136                 ht_temp--;
   3137                 pu1_src =  pu1_src + src_strd;
   3138                 pi2_temp1 =  pi2_temp1 + 16 + 5;
   3139             }
   3140             while(ht_temp > 0);
   3141         }
   3142         // horizontal q-pel
   3143         {
   3144             __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b;
   3145             __m128i src_r4_8x16b, src_r5_8x16b;
   3146             __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b;
   3147             __m128i src_hpel1_8x16b, src_hpel2_8x16b, src_hpel_16x8b;
   3148 
   3149             __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
   3150             __m128i res_c0_8x16b, res_c1_8x16b, res_16x8b;
   3151 
   3152             __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
   3153             __m128i const_val512_4x32b, const_val16_8x16b;
   3154 
   3155             coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
   3156             coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
   3157             coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);
   3158 
   3159             const_val512_4x32b = _mm_set1_epi32(512);
   3160             const_val16_8x16b = _mm_set1_epi16(16);
   3161 
   3162             do
   3163             {
   3164                 src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2));
   3165                 src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 1));
   3166                 src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 2));
   3167                 src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 3));
   3168                 src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 4));
   3169                 src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 5));
   3170 
   3171                 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
   3172                 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
   3173                 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
   3174 
   3175                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   3176                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   3177                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   3178 
   3179                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   3180                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   3181                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   3182                 res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   3183 
   3184                 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
   3185                 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
   3186                 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);
   3187 
   3188                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   3189                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   3190                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   3191 
   3192                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   3193                 res_t3_4x32b = _mm_add_epi32(const_val512_4x32b, res_t3_4x32b);
   3194                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   3195                 res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   3196 
   3197                 res_c0_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
   3198 
   3199                 src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8));
   3200                 src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8 + 1));
   3201                 src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8 + 2));
   3202                 src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8 + 3));
   3203                 src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8 + 4));
   3204                 src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8 + 5));
   3205 
   3206                 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
   3207                 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
   3208                 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
   3209 
   3210                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   3211                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   3212                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   3213 
   3214                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   3215                 res_t3_4x32b = _mm_add_epi32(const_val512_4x32b, res_t3_4x32b);
   3216                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   3217                 res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b ,10);
   3218 
   3219                 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
   3220                 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
   3221                 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);
   3222 
   3223                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   3224                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   3225                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   3226 
   3227                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   3228                 res_t3_4x32b = _mm_add_epi32(const_val512_4x32b, res_t3_4x32b);
   3229                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   3230                 res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   3231 
   3232                 res_c1_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
   3233                 res_16x8b = _mm_packus_epi16(res_c0_8x16b, res_c1_8x16b);
   3234 
   3235                 src_hpel1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3));
   3236                 src_hpel1_8x16b = _mm_add_epi16(src_hpel1_8x16b, const_val16_8x16b);
   3237                 src_hpel1_8x16b = _mm_srai_epi16(src_hpel1_8x16b, 5); //shifting right by 5 bits.
   3238 
   3239                 src_hpel2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3 + 8));
   3240                 src_hpel2_8x16b = _mm_add_epi16(src_hpel2_8x16b, const_val16_8x16b);
   3241                 src_hpel2_8x16b = _mm_srai_epi16(src_hpel2_8x16b, 5); //shifting right by 5 bits.
   3242 
   3243                 src_hpel_16x8b = _mm_packus_epi16(src_hpel1_8x16b, src_hpel2_8x16b);
   3244                 res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
   3245 
   3246                 _mm_storeu_si128((__m128i *)(pu1_dst), res_16x8b);
   3247 
   3248                 ht--;
   3249                 pi2_temp2 = pi2_temp2 + 16 + 5;
   3250                 pi2_temp3 = pi2_temp3 + 16 + 5;
   3251                 pu1_dst = pu1_dst + dst_strd;
   3252             }
   3253             while(ht > 0);
   3254         }
   3255     }
   3256 }
   3257 
   3258 /*****************************************************************************/
   3259 /*                                                                           */
   3260 /*  Function Name : ih264_inter_pred_luma_horz_hpel_vert_qpel_ssse3          */
   3261 /*                                                                           */
   3262 /*  Description   : This function implements a six-tap filter vertically and */
   3263 /*                  horizontally on ht x wd block separately and averages    */
   3264 /*                  the two sets of values to calculate values at (1/2,1/4), */
   3265 /*                  or (1/2, 3/4) as mentioned in sec. 8.4.2.2.1 titled      */
   3266 /*                  "Luma sample interpolation process". (ht,wd) can be      */
   3267 /*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
   3268 /*                                                                           */
    3269 /*  Inputs        : pu1_src  - pointer to source                             */
    3270 /*                  pu1_dst  - pointer to destination                        */
   3271 /*                  src_strd - stride for source                             */
   3272 /*                  dst_strd - stride for destination                        */
   3273 /*                  ht       - height of the block                           */
   3274 /*                  wd       - width of the block                            */
   3275 /*                  pu1_tmp  - pointer to temporary buffer                   */
   3276 /*                  dydx     - x and y reference offset for q-pel            */
   3277 /*                             calculations                                  */
   3278 /*                                                                           */
   3279 /*  Issues        : None                                                     */
   3280 /*                                                                           */
   3281 /*  Revision History:                                                        */
   3282 /*                                                                           */
   3283 /*         DD MM YYYY   Author(s)       Changes                              */
   3284 /*         13 02 2015   Kaushik         Initial Version                      */
   3285 /*                      Senthoor                                             */
   3286 /*                                                                           */
   3287 /*****************************************************************************/
   3288 void ih264_inter_pred_luma_horz_hpel_vert_qpel_ssse3(UWORD8 *pu1_src,
   3289                                                      UWORD8 *pu1_dst,
   3290                                                      WORD32 src_strd,
   3291                                                      WORD32 dst_strd,
   3292                                                      WORD32 ht,
   3293                                                      WORD32 wd,
   3294                                                      UWORD8* pu1_tmp,
   3295                                                      WORD32 dydx)
   3296 {
   3297     WORD32 ht_temp;
   3298     WORD32 y_offset;
   3299     WORD16 *pi2_temp1,*pi2_temp2,*pi2_temp3;
   3300 
   3301     y_offset = (dydx & 0xf) >> 2;
   3302     pi2_temp1 = (WORD16 *)pu1_tmp;
   3303     pi2_temp2 = pi2_temp1;
   3304     pi2_temp3 = pi2_temp1 + (y_offset >> 1) * wd;
   3305 
   3306     ht_temp = ht + 5;
   3307     pu1_src -= src_strd << 1;
   3308     pu1_src -= 2;
   3309     pi2_temp3 += wd << 1;
   3310     //the filter input starts from x[-2] (till x[3])
   3311 
   3312     if(wd == 4)
   3313     {
   3314         // horizontal half-pel
   3315         {
   3316             __m128i src_r0_16x8b, src_r1_16x8b, src_r0r1_t1_16x8b;
   3317             __m128i src_r0_sht_16x8b, src_r1_sht_16x8b;
   3318             __m128i res_r0r1_t1_8x16b, res_r0r1_t2_8x16b, res_r0r1_t3_8x16b;
   3319             __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
   3320 
   3321             coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
   3322             coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
   3323             coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5
   3324 
   3325             //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
   3326             //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
   3327 
   3328             do
   3329             {
   3330                 src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                         //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
   3331                 src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));            //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
   3332 
   3333                 src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                         //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
   3334                 src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                         //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
   3335 
   3336                 src_r0_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);           //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
   3337                 src_r1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);           //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
   3338 
   3339                 src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);         //a0 a1 a1 a2 a2 a3 a3 a4 b0 b1 b1 b2 b2 b3 b3 b4
   3340                 res_r0r1_t1_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff0_1_16x8b);   //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
   3341                                                                                             //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
   3342 
   3343                 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                             //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8  0  0  0  0
   3344                 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                             //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8  0  0  0  0
   3345 
   3346                 src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);         //a2 a3 a3 a4 a4 a5 a5 a6 b2 b3 b3 b4 b4 b5 b5 b6
   3347                 res_r0r1_t2_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff2_3_16x8b);   //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
   3348                                                                                             //b2*c2+b3*c3 b3*c2+b4*c3 b4*c2+b5*c3 b5*c2+b6*c3
   3349 
   3350                 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 4);                             //a4 a5 a5 a6 a6 a7 a7 a8  0  0  0  0  0  0  0  0
   3351                 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 4);                             //b4 b5 b5 b6 b6 b7 b7 b8  0  0  0  0  0  0  0  0
   3352 
   3353                 src_r0r1_t1_16x8b = _mm_unpacklo_epi64(src_r0_16x8b, src_r1_16x8b);         //a4 a5 a5 a6 a6 a7 a7 a8 b4 b5 b5 b6 b6 b7 b7 b8
   3354                 res_r0r1_t3_8x16b = _mm_maddubs_epi16(src_r0r1_t1_16x8b, coeff4_5_16x8b);   //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
    3355                                                                                             //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
   3356 
   3357                 res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t2_8x16b);
   3358                 res_r0r1_t1_8x16b = _mm_add_epi16(res_r0r1_t1_8x16b, res_r0r1_t3_8x16b);
   3359 
   3360 
   3361                 _mm_storeu_si128((__m128i *)(pi2_temp1), res_r0r1_t1_8x16b);
   3362 
   3363                 ht_temp -= 2;
   3364                 pu1_src =  pu1_src + (src_strd << 1);
   3365                 pi2_temp1 =  pi2_temp1 + (4 << 1);
   3366             }
   3367             while(ht_temp > 0);
   3368         }
   3369         // vertical q-pel
   3370         {
   3371             __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b;
   3372             __m128i src_r4_8x16b, src_r5_8x16b, src_r6_8x16b;
   3373             __m128i src_r0r1_c0_8x16b, src_r2r3_c0_8x16b, src_r4r5_c0_8x16b;
   3374             __m128i src_hpel_16x8b, src_hpel_8x16b;
   3375 
   3376             __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
   3377             __m128i res_8x16b, res_16x8b;
   3378 
   3379             __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
   3380             __m128i const_val512_4x32b, const_val16_8x16b;
   3381 
   3382             const_val512_4x32b = _mm_set1_epi32(512);
   3383             const_val16_8x16b = _mm_set1_epi16(16);
   3384 
   3385             coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
   3386             coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
   3387             coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);
   3388 
   3389             src_r0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2));
   3390             src_r1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2 + 4));
   3391             src_r2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2 + 8));
   3392             src_r3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2 + 12));
   3393             src_r4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2 + 16));
   3394             pi2_temp2 += 20;
   3395 
   3396             do
   3397             {
   3398                 src_r5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2));
   3399                 src_r6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_temp2 + 4));
   3400 
   3401                 src_r0r1_c0_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
   3402                 src_r2r3_c0_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
   3403                 src_r4r5_c0_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
   3404 
   3405                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_c0_8x16b, coeff0_1_8x16b);
   3406                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_c0_8x16b, coeff2_3_8x16b);
   3407                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_c0_8x16b, coeff4_5_8x16b);
   3408 
   3409                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   3410                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   3411                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   3412                 res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   3413 
   3414                 src_r0r1_c0_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
   3415                 src_r2r3_c0_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
   3416                 src_r4r5_c0_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);
   3417 
   3418                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_c0_8x16b, coeff0_1_8x16b);
   3419                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_c0_8x16b, coeff2_3_8x16b);
   3420                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_c0_8x16b, coeff4_5_8x16b);
   3421 
   3422                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   3423                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   3424                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   3425                 res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   3426 
   3427                 res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
   3428                 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
   3429 
   3430                 src_hpel_8x16b = _mm_loadu_si128((__m128i *)pi2_temp3);
   3431                 src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b);
   3432                 src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits.
   3433                 src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);
   3434 
   3435                 res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
   3436 
   3437                 *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(res_16x8b);
   3438                 res_16x8b = _mm_srli_si128(res_16x8b, 4);
   3439                 *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(res_16x8b);
   3440 
   3441                 src_r0_8x16b = src_r2_8x16b;
   3442                 src_r1_8x16b = src_r3_8x16b;
   3443                 src_r2_8x16b = src_r4_8x16b;
   3444                 src_r3_8x16b = src_r5_8x16b;
   3445                 src_r4_8x16b = src_r6_8x16b;
   3446 
   3447                 ht -= 2;
   3448                 pi2_temp2 =  pi2_temp2 + (4 << 1);
   3449                 pi2_temp3 =  pi2_temp3 + (4 << 1);
   3450                 pu1_dst = pu1_dst + (dst_strd << 1);
   3451             }
   3452             while(ht > 0);
   3453         }
   3454     }
   3455     else if(wd == 8)
   3456     {
   3457         // horizontal half-pel
   3458         {
   3459             __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
   3460             __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
   3461 
   3462             __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
   3463             __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
   3464 
   3465             __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
   3466 
   3467             coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
   3468             coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
   3469             coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5
   3470 
   3471             //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
   3472             //Row1 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
   3473 
   3474             do
   3475             {
   3476                 src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));                   //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
   3477                 src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));        //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
   3478 
   3479                 src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                     //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
   3480                 src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                     //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
   3481 
   3482                 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);    //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
   3483                 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);    //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
   3484 
   3485                 res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b);   //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
   3486                                                                                         //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
   3487                 res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b);   //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
   3488                                                                                         //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
   3489 
   3490                 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                         //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
   3491                 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                         //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
   3492 
   3493                 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                 //a3 a4 a5 a6 a7 a8 a9....a15 0  0  0
   3494                 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                 //b3 b4 b5 b6 b7 b8 b9....b15 0  0  0
   3495 
   3496                 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);    //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
   3497                 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);    //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10
   3498 
   3499                 res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b);   //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
   3500                                                                                         //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
   3501                 res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b);   //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3
   3502                                                                                         //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
   3503 
   3504                 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                         //a4 a5 a6 a7 a8 a9....a15 0  0  0  0
   3505                 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                         //b4 b5 b6 b7 b8 b9....b15 0  0  0  0
   3506 
   3507                 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                 //a5 a6 a7 a8 a9....a15 0  0  0  0  0
   3508                 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                 //b5 b6 b7 b8 b9....b15 0  0  0  0  0
   3509 
   3510                 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);    //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
   3511                 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);    //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
   3512 
   3513                 res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b);   //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
   3514                                                                                         //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
   3515                 res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b);   //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
   3516                                                                                         //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
   3517                 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
   3518                 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
   3519 
   3520                 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
   3521                 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);
   3522 
   3523                 _mm_storeu_si128((__m128i *)(pi2_temp1), res_r0_t1_8x16b);
   3524                 _mm_storeu_si128((__m128i *)(pi2_temp1 + 8), res_r1_t1_8x16b);
   3525 
   3526                 ht_temp -= 2;
   3527                 pu1_src =  pu1_src + (src_strd << 1);
   3528                 pi2_temp1 =  pi2_temp1 + (8 << 1);
   3529             }
   3530             while(ht_temp > 0);
   3531         }
   3532         // vertical q-pel
   3533         {
   3534             __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b;
   3535             __m128i src_r4_8x16b, src_r5_8x16b, src_r6_8x16b;
   3536             __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b;
   3537             __m128i src_hpel_8x16b, src_hpel_16x8b;
   3538 
   3539             __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
   3540             __m128i res_8x16b, res_16x8b;
   3541 
   3542             __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
   3543             __m128i const_val512_4x32b, const_val16_8x16b;
   3544 
   3545             coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
   3546             coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
   3547             coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);
   3548 
   3549             const_val512_4x32b = _mm_set1_epi32(512);
   3550             const_val16_8x16b = _mm_set1_epi16(16);
   3551 
   3552             src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2));
   3553             src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8));
   3554             src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 16));
   3555             src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 24));
   3556             src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 32));
   3557             pi2_temp2 += 40;
   3558 
   3559             do
   3560             {
   3561                 src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2));
   3562                 src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 8));
   3563 
   3564                 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
   3565                 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
   3566                 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
   3567 
   3568                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   3569                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   3570                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   3571 
   3572                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   3573                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   3574                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   3575                 res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   3576 
   3577                 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
   3578                 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
   3579                 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);
   3580 
   3581                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   3582                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   3583                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   3584 
   3585                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   3586                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   3587                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   3588                 res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   3589 
   3590                 res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
   3591                 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
   3592 
   3593                 src_hpel_8x16b = _mm_loadu_si128((__m128i *)pi2_temp3);
   3594                 src_hpel_8x16b = _mm_add_epi16(const_val16_8x16b, src_hpel_8x16b);
   3595                 src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits.
   3596                 src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);
   3597 
   3598                 res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
   3599 
   3600                 _mm_storel_epi64((__m128i *)(pu1_dst), res_16x8b);
   3601 
   3602                 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
   3603                 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
   3604                 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);
   3605 
   3606                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   3607                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   3608                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   3609 
   3610                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   3611                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   3612                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   3613                 res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   3614 
   3615                 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b);
   3616                 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b);
   3617                 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b);
   3618 
   3619                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   3620                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   3621                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   3622 
   3623                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   3624                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   3625                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   3626                 res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   3627 
   3628                 res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
   3629                 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
   3630 
   3631                 src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3 + 8));
   3632                 src_hpel_8x16b = _mm_add_epi16(const_val16_8x16b, src_hpel_8x16b);
   3633                 src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits.
   3634                 src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);
   3635 
   3636                 res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
   3637 
   3638                 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b);
   3639 
   3640                 src_r0_8x16b = src_r2_8x16b;
   3641                 src_r1_8x16b = src_r3_8x16b;
   3642                 src_r2_8x16b = src_r4_8x16b;
   3643                 src_r3_8x16b = src_r5_8x16b;
   3644                 src_r4_8x16b = src_r6_8x16b;
   3645 
   3646                 ht -= 2;
   3647                 pi2_temp2 = pi2_temp2 + (8 << 1);
   3648                 pi2_temp3 = pi2_temp3 + (8 << 1);
   3649                 pu1_dst = pu1_dst + (dst_strd << 1);
   3650             }
   3651             while(ht > 0);
   3652         }
   3653     }
   3654     else // wd == 16
   3655     {
   3656         UWORD8 *pu1_dst1;
   3657         WORD16 *pi2_temp4,*pi2_temp5;
   3658 
   3659         pu1_dst1 = pu1_dst + 8;
   3660         pi2_temp4 = pi2_temp2 + 8;
   3661         pi2_temp5 = pi2_temp3 + 8;
   3662 
   3663         // horizontal half-pel
   3664         {
   3665             __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
   3666             __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
   3667 
   3668             __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
   3669             __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
   3670 
   3671             __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
   3672 
   3673             coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01);  //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
   3674             coeff2_3_16x8b = _mm_set1_epi32(0x14141414);  //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
   3675             coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB);  //c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5 c4 c5 c5 c5
   3676 
   3677             //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
   3678             //Row0 :                         b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
   3679             //b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels.
   3680 
   3681             do
   3682             {
   3683                 src_r0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src));                  //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
   3684                 src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));              //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
   3685 
   3686                 src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                    //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
   3687                 src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                    //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
   3688 
   3689                 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);   //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
   3690                 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);   //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
   3691 
   3692                 res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b);   //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
   3693                                                                                         //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
   3694                 res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b);   //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
   3695                                                                                         //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
   3696 
   3697                 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                         //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
   3698                 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                         //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
   3699 
   3700                 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                 //a3 a4 a5 a6 a7 a8 a9....a15 0  0  0
   3701                 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                 //b3 b4 b5 b6 b7 b8 b9....b15 0  0  0
   3702 
   3703                 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);    //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
   3704                 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);    //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10
   3705 
   3706                 res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b);   //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
   3707                                                                                         //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
   3708                 res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b);   //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3
   3709                                                                                         //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
   3710 
   3711                 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                         //a4 a5 a6 a7 a8 a9....a15 0  0  0  0
   3712                 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                         //b4 b5 b6 b7 b8 b9....b15 0  0  0  0
   3713 
   3714                 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                 //a5 a6 a7 a8 a9....a15 0  0  0  0  0
   3715                 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                 //b5 b6 b7 b8 b9....b15 0  0  0  0  0
   3716 
   3717                 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);    //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
   3718                 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);    //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
   3719 
   3720                 res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b);   //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
   3721                                                                                         //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
   3722                 res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b);   //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
   3723                                                                                         //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
   3724                 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
   3725                 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
   3726 
   3727                 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
   3728                 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);
   3729 
   3730                 _mm_storeu_si128((__m128i *)(pi2_temp1), res_r0_t1_8x16b);
   3731                 _mm_storeu_si128((__m128i *)(pi2_temp1 + 8), res_r1_t1_8x16b);
   3732 
   3733                 ht_temp--;
   3734                 pu1_src =  pu1_src + src_strd;
   3735                 pi2_temp1 =  pi2_temp1 + 16;
   3736             }
   3737             while(ht_temp > 0);
   3738         }
   3739         // vertical q-pel
   3740         {
   3741             __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b, src_r4_8x16b;
   3742             __m128i src_r5_8x16b, src_r6_8x16b;
   3743             __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b;
   3744             __m128i src_hpel_8x16b, src_hpel_16x8b;
   3745 
   3746             __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
   3747             __m128i res_8x16b, res_16x8b;
   3748 
   3749             __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
   3750             __m128i const_val512_4x32b, const_val16_8x16b;
   3751 
   3752             coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001);
   3753             coeff2_3_8x16b = _mm_set1_epi32(0x00140014);
   3754             coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB);
   3755 
   3756             const_val512_4x32b = _mm_set1_epi32(512);
   3757             const_val16_8x16b = _mm_set1_epi16(16);
   3758 
   3759             /**********************************************************/
   3760             /*     Do first height x 8 block                          */
   3761             /**********************************************************/
   3762             src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2));
   3763             src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 16));
   3764             src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 32));
   3765             src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 48));
   3766             src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 64));
   3767             pi2_temp2 += 80;
   3768 
   3769             ht_temp = ht;
   3770             do
   3771             {
   3772                 src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2));
   3773                 src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp2 + 16));
   3774 
   3775                 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
   3776                 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
   3777                 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
   3778 
   3779                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   3780                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   3781                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   3782 
   3783                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   3784                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   3785                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   3786                 res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   3787 
   3788                 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
   3789                 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
   3790                 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);
   3791 
   3792                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   3793                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   3794                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   3795 
   3796                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   3797                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   3798                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   3799                 res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   3800 
   3801                 res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
   3802                 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
   3803 
   3804                 src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3));
   3805                 src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b);
   3806                 src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits.
   3807                 src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);
   3808 
   3809                 res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
   3810                 _mm_storel_epi64((__m128i *)(pu1_dst), res_16x8b);
   3811 
   3812                 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
   3813                 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
   3814                 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);
   3815 
   3816                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   3817                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   3818                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   3819 
   3820                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   3821                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   3822                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   3823                 res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   3824 
   3825                 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b);
   3826                 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b);
   3827                 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b);
   3828 
   3829                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   3830                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   3831                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   3832 
   3833                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   3834                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   3835                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   3836                 res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   3837 
   3838                 res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
   3839                 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
   3840 
   3841                 src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp3 + 16));
   3842                 src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b);
   3843                 src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits.
   3844                 src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);
   3845 
   3846                 res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
   3847                 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res_16x8b);
   3848 
   3849                 src_r0_8x16b = src_r2_8x16b;
   3850                 src_r1_8x16b = src_r3_8x16b;
   3851                 src_r2_8x16b = src_r4_8x16b;
   3852                 src_r3_8x16b = src_r5_8x16b;
   3853                 src_r4_8x16b = src_r6_8x16b;
   3854 
   3855                 ht_temp -= 2;
   3856                 pi2_temp3 = pi2_temp3 + (16 << 1);
   3857                 pi2_temp2 = pi2_temp2 + (16 << 1);
   3858                 pu1_dst = pu1_dst + (dst_strd << 1);
   3859             }
   3860             while(ht_temp > 0);
   3861 
   3862             /**********************************************************/
   3863             /*     Do second height * 8 block                         */
   3864             /**********************************************************/
   3865             src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4));
   3866             src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4 + 16));
   3867             src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4 + 32));
   3868             src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4 + 48));
   3869             src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4 + 64));
   3870             pi2_temp4 += 80;
   3871 
   3872             do
   3873             {
   3874                 src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4));
   3875                 src_r6_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp4 + 16));
   3876 
   3877                 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
   3878                 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
   3879                 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
   3880 
   3881                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   3882                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   3883                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   3884 
   3885                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   3886                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   3887                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   3888                 res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   3889 
   3890                 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
   3891                 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
   3892                 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);
   3893 
   3894                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   3895                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   3896                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   3897 
   3898                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   3899                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   3900                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   3901                 res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   3902 
   3903                 res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
   3904                 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
   3905 
   3906                 src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp5));
   3907                 src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b);
   3908                 src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits.
   3909                 src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);
   3910 
   3911                 res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
   3912                 _mm_storel_epi64((__m128i *)(pu1_dst1), res_16x8b);
   3913 
   3914                 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r1_8x16b, src_r2_8x16b);
   3915                 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r3_8x16b, src_r4_8x16b);
   3916                 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r5_8x16b, src_r6_8x16b);
   3917 
   3918                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   3919                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   3920                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   3921 
   3922                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   3923                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   3924                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   3925                 res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   3926 
   3927                 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r1_8x16b, src_r2_8x16b);
   3928                 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r3_8x16b, src_r4_8x16b);
   3929                 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r5_8x16b, src_r6_8x16b);
   3930 
   3931                 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
   3932                 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
   3933                 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
   3934 
   3935                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
   3936                 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
   3937                 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
   3938                 res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
   3939 
   3940                 res_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
   3941                 res_16x8b = _mm_packus_epi16(res_8x16b, res_8x16b);
   3942 
   3943                 src_hpel_8x16b = _mm_loadu_si128((__m128i *)(pi2_temp5 + 16));
   3944                 src_hpel_8x16b = _mm_add_epi16(src_hpel_8x16b, const_val16_8x16b);
   3945                 src_hpel_8x16b = _mm_srai_epi16(src_hpel_8x16b, 5); //shifting right by 5 bits.
   3946                 src_hpel_16x8b = _mm_packus_epi16(src_hpel_8x16b, src_hpel_8x16b);
   3947 
   3948                 res_16x8b = _mm_avg_epu8(res_16x8b, src_hpel_16x8b);
   3949                 _mm_storel_epi64((__m128i *)(pu1_dst1 + dst_strd), res_16x8b);
   3950 
   3951                 src_r0_8x16b = src_r2_8x16b;
   3952                 src_r1_8x16b = src_r3_8x16b;
   3953                 src_r2_8x16b = src_r4_8x16b;
   3954                 src_r3_8x16b = src_r5_8x16b;
   3955                 src_r4_8x16b = src_r6_8x16b;
   3956 
   3957                 ht -= 2;
   3958                 pi2_temp5 = pi2_temp5 + (16 << 1);
   3959                 pi2_temp4 = pi2_temp4 + (16 << 1);
   3960                 pu1_dst1 = pu1_dst1 + (dst_strd << 1);
   3961             }
   3962             while(ht > 0);
   3963         }
   3964     }
   3965 }
   3966 
   3967 /*****************************************************************************/
   3968 /*                                                                           */
   3969 /*  Function Name : ih264_inter_pred_chroma_ssse3                            */
   3970 /*                                                                           */
   3971 /*  Description   : This function implements a four-tap 2D filter as         */
   3972 /*                  mentioned in sec. 8.4.2.2.2 titled "Chroma sample        */
    3973 /*                  interpolation process". (ht,wd) can be (2,2), (4,2),     */
   3974 /*                  (2,4), (4,4), (8,4), (4,8) or (8,8).                     */
   3975 /*                                                                           */
   3976 /*  Inputs        : puc_src  - pointer to source                             */
   3977 /*                  puc_dst  - pointer to destination                        */
   3978 /*                  src_strd - stride for source                             */
   3979 /*                  dst_strd - stride for destination                        */
   3980 /*                  dx       - x position of destination value               */
   3981 /*                  dy       - y position of destination value               */
   3982 /*                  ht       - height of the block                           */
   3983 /*                  wd       - width of the block                            */
   3984 /*                                                                           */
   3985 /*  Issues        : None                                                     */
   3986 /*                                                                           */
   3987 /*  Revision History:                                                        */
   3988 /*                                                                           */
   3989 /*         DD MM YYYY   Author(s)       Changes                              */
   3990 /*         13 02 2015   Kaushik         Initial Version                      */
   3991 /*                      Senthoor                                             */
   3992 /*                                                                           */
   3993 /*****************************************************************************/
    3994 void ih264_inter_pred_chroma_ssse3(UWORD8 *pu1_src,
    3995                                    UWORD8 *pu1_dst,
    3996                                    WORD32 src_strd,
    3997                                    WORD32 dst_strd,
    3998                                    WORD32 dx,
    3999                                    WORD32 dy,
    4000                                    WORD32 ht,
    4001                                    WORD32 wd)
    4002 {
    4003     WORD32 i, j, A, B, C, D;
    4004 
         //Bilinear weights from the 1/8-pel fractional offsets (dx, dy):
         //A = (8-dx)*(8-dy), B = dx*(8-dy), C = (8-dx)*dy, D = dx*dy.
         //They sum to 64, hence the (value + 32) >> 6 rounding used below.
    4005     i = 8 - dx;
    4006     j = 8 - dy;
    4007 
    4008     A = i * j;
    4009     B = dx * j;
    4010     C = i * dy;
    4011     D = dx * dy;
    4012 
    4013     if(wd == 2)
    4014     {
         //Scalar path: two rows per iteration, two U and two V samples per
         //row. Source/destination hold interleaved UV samples, so the
         //horizontal neighbour of a chroma sample is 2 bytes away.
    4015         WORD32 tmp1, tmp2, tmp3, tmp4;
    4016 
    4017         do
    4018         {
    4019             //U
    4020             tmp1 = A * pu1_src[0] + B * pu1_src[2] + C * pu1_src[src_strd] + D * pu1_src[src_strd + 2];
    4021             tmp2 = A * pu1_src[2] + B * pu1_src[4] + C * pu1_src[src_strd + 2] + D * pu1_src[src_strd + 4];
    4022             //V
    4023             tmp3 = A * pu1_src[1] + B * pu1_src[3] + C * pu1_src[src_strd + 1] + D * pu1_src[src_strd + 3];
    4024             tmp4 = A * pu1_src[3] + B * pu1_src[5] + C * pu1_src[src_strd + 3] + D * pu1_src[src_strd + 5];
    4025 
             //round to nearest and normalize (weights sum to 64)
    4026             tmp1 = (tmp1 + 32) >> 6;
    4027             tmp2 = (tmp2 + 32) >> 6;
    4028             tmp3 = (tmp3 + 32) >> 6;
    4029             tmp4 = (tmp4 + 32) >> 6;
    4030 
    4031             pu1_dst[0] = CLIP_U8(tmp1);
    4032             pu1_dst[2] = CLIP_U8(tmp2);
    4033             pu1_dst[1] = CLIP_U8(tmp3);
    4034             pu1_dst[3] = CLIP_U8(tmp4);
    4035 
    4036             pu1_src += src_strd;
    4037             pu1_dst += dst_strd;
    4038 
             //second row of the pair
    4039             tmp1 = A * pu1_src[0] + B * pu1_src[2] + C * pu1_src[src_strd] + D * pu1_src[src_strd + 2];
    4040             tmp2 = A * pu1_src[2] + B * pu1_src[4] + C * pu1_src[src_strd + 2] + D * pu1_src[src_strd + 4];
    4041             tmp3 = A * pu1_src[1] + B * pu1_src[3] + C * pu1_src[src_strd + 1] + D * pu1_src[src_strd + 3];
    4042             tmp4 = A * pu1_src[3] + B * pu1_src[5] + C * pu1_src[src_strd + 3] + D * pu1_src[src_strd + 5];
    4043 
    4044             tmp1 = (tmp1 + 32) >> 6;
    4045             tmp2 = (tmp2 + 32) >> 6;
    4046             tmp3 = (tmp3 + 32) >> 6;
    4047             tmp4 = (tmp4 + 32) >> 6;
    4048 
    4049             pu1_dst[0] = CLIP_U8(tmp1);
    4050             pu1_dst[2] = CLIP_U8(tmp2);
    4051             pu1_dst[1] = CLIP_U8(tmp3);
    4052             pu1_dst[3] = CLIP_U8(tmp4);
    4053 
    4054             ht -= 2;
    4055             pu1_src += src_strd;
    4056             pu1_dst += dst_strd;
    4057         }
    4058         while(ht > 0);
    4059 
    4060     }
    4061     else if(wd == 4)
    4062     {
    4063         WORD32 AB, CD;
    4064 
    4065         __m128i src_r1_16x8b, src_r2_16x8b, src_r3_16x8b;
    4066         __m128i res1_AB_8x16b, res1_CD_8x16b, res1_8x16b, res1_16x8b;
    4067         __m128i res2_AB_8x16b, res2_CD_8x16b, res2_8x16b, res2_16x8b;
    4068 
    4069         __m128i coeffAB_16x8b, coeffCD_16x8b, round_add32_8x16b;
    4070         __m128i const_shuff_16x8b;
    4071 
         //Pack the 8-bit weights pairwise ((B<<8)|A and (D<<8)|C) so a
         //single _mm_maddubs_epi16 produces A*p + B*p_right (resp.
         //C*p + D*p_right) in each 16-bit lane.
    4072         AB = (B << 8) + A;
    4073         CD = (D << 8) + C;
    4074 
    4075         coeffAB_16x8b = _mm_set1_epi16(AB);
    4076         coeffCD_16x8b = _mm_set1_epi16(CD);
    4077 
    4078         round_add32_8x16b = _mm_set1_epi16(32);
    4079 
         //Shuffle mask pairs each byte with the byte two positions ahead:
         //(0,2),(1,3),(2,4),... i.e. every chroma sample is placed next to
         //its horizontal neighbour of the same component (UV interleaved).
    4080         const_shuff_16x8b = _mm_setr_epi32(0x03010200, 0x05030402, 0x07050604, 0x09070806);
    4081 
         //NOTE(review): the unaligned 16-byte loads read past the 10 bytes
         //actually used - assumes the source buffer has sufficient padding;
         //confirm with callers.
    4082         src_r1_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
    4083         src_r1_16x8b = _mm_shuffle_epi8(src_r1_16x8b, const_shuff_16x8b);
    4084         pu1_src += src_strd;
    4085 
         //Two rows of output per iteration; the bottom source row's
         //shuffled pixels are carried over in src_r1_16x8b as the top row
         //of the next iteration.
    4086         do
    4087         {
    4088             src_r2_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
    4089             src_r3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
    4090 
    4091             src_r2_16x8b = _mm_shuffle_epi8(src_r2_16x8b, const_shuff_16x8b);
    4092             src_r3_16x8b = _mm_shuffle_epi8(src_r3_16x8b, const_shuff_16x8b);
    4093 
    4094             res1_AB_8x16b = _mm_maddubs_epi16(src_r1_16x8b, coeffAB_16x8b);
    4095             res1_CD_8x16b = _mm_maddubs_epi16(src_r2_16x8b, coeffCD_16x8b);
    4096             res2_AB_8x16b = _mm_maddubs_epi16(src_r2_16x8b, coeffAB_16x8b);
    4097             res2_CD_8x16b = _mm_maddubs_epi16(src_r3_16x8b, coeffCD_16x8b);
    4098 
    4099             res1_8x16b = _mm_add_epi16(res1_AB_8x16b, res1_CD_8x16b);
    4100             res2_8x16b = _mm_add_epi16(res2_AB_8x16b, res2_CD_8x16b);
    4101             res1_8x16b = _mm_add_epi16(res1_8x16b, round_add32_8x16b);
    4102             res2_8x16b = _mm_add_epi16(res2_8x16b, round_add32_8x16b);
    4103 
             //round to nearest and normalize (weights sum to 64)
    4104             res1_8x16b = _mm_srai_epi16(res1_8x16b, 6);
    4105             res2_8x16b = _mm_srai_epi16(res2_8x16b, 6);
    4106 
    4107             res1_16x8b = _mm_packus_epi16(res1_8x16b, res1_8x16b);
    4108             res2_16x8b = _mm_packus_epi16(res2_8x16b, res2_8x16b);
    4109 
    4110             _mm_storel_epi64((__m128i *)pu1_dst, res1_16x8b);
    4111             _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), res2_16x8b);
    4112 
    4113             src_r1_16x8b = src_r3_16x8b;
    4114 
    4115             ht -= 2;
    4116             pu1_src += src_strd << 1;
    4117             pu1_dst += dst_strd << 1;
    4118         }
    4119         while(ht > 0);
    4120     }
    4121     else // wd == 8
    4122     {
    4123         WORD32 AB, CD;
    4124 
    4125         __m128i src_r1l_16x8b, src_r2l_16x8b;
    4126         __m128i src_r1h_16x8b, src_r2h_16x8b;
    4127 
    4128         __m128i res_l_AB_8x16b, res_l_CD_8x16b;
    4129         __m128i res_h_AB_8x16b, res_h_CD_8x16b;
    4130         __m128i res_l_8x16b, res_h_8x16b, res_16x8b;
    4131 
    4132         __m128i coeffAB_16x8b, coeffCD_16x8b, round_add32_8x16b;
    4133         __m128i const_shuff_16x8b;
    4134 
         //Same packed-weight / shuffle scheme as the wd == 4 path, but each
         //row is processed in two 8-byte halves (l and h).
    4135         AB = (B << 8) + A;
    4136         CD = (D << 8) + C;
    4137 
    4138         coeffAB_16x8b = _mm_set1_epi16(AB);
    4139         coeffCD_16x8b = _mm_set1_epi16(CD);
    4140 
    4141         round_add32_8x16b = _mm_set1_epi16(32);
    4142 
    4143         const_shuff_16x8b = _mm_setr_epi32(0x03010200, 0x05030402, 0x07050604, 0x09070806);
    4144 
    4145         src_r1l_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
    4146         src_r1h_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));
    4147 
    4148         src_r1l_16x8b = _mm_shuffle_epi8(src_r1l_16x8b, const_shuff_16x8b);
    4149         src_r1h_16x8b = _mm_shuffle_epi8(src_r1h_16x8b, const_shuff_16x8b);
    4150 
    4151         pu1_src += src_strd;
    4152 
         //Four rows per iteration; the r1/r2 register pairs alternate roles
         //so every source row is loaded and shuffled exactly once.
    4153         do
    4154         {
    4155             //row 1
    4156             src_r2l_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
    4157             src_r2h_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));
    4158 
    4159             src_r2l_16x8b = _mm_shuffle_epi8(src_r2l_16x8b, const_shuff_16x8b);
    4160             src_r2h_16x8b = _mm_shuffle_epi8(src_r2h_16x8b, const_shuff_16x8b);
    4161 
    4162             res_l_AB_8x16b = _mm_maddubs_epi16(src_r1l_16x8b, coeffAB_16x8b);
    4163             res_h_AB_8x16b = _mm_maddubs_epi16(src_r1h_16x8b, coeffAB_16x8b);
    4164             res_l_CD_8x16b = _mm_maddubs_epi16(src_r2l_16x8b, coeffCD_16x8b);
    4165             res_h_CD_8x16b = _mm_maddubs_epi16(src_r2h_16x8b, coeffCD_16x8b);
    4166 
    4167             res_l_8x16b = _mm_add_epi16(res_l_AB_8x16b, round_add32_8x16b);
    4168             res_h_8x16b = _mm_add_epi16(res_h_AB_8x16b, round_add32_8x16b);
    4169             res_l_8x16b = _mm_add_epi16(res_l_8x16b, res_l_CD_8x16b);
    4170             res_h_8x16b = _mm_add_epi16(res_h_8x16b, res_h_CD_8x16b);
    4171 
    4172             res_l_8x16b = _mm_srai_epi16(res_l_8x16b, 6);
    4173             res_h_8x16b = _mm_srai_epi16(res_h_8x16b, 6);
    4174 
    4175             res_16x8b = _mm_packus_epi16(res_l_8x16b, res_h_8x16b);
    4176 
    4177             _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);
    4178 
    4179             pu1_src += src_strd;
    4180             pu1_dst += dst_strd;
    4181 
    4182             //row 2
    4183             src_r1l_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
    4184             src_r1h_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));
    4185 
    4186             src_r1l_16x8b = _mm_shuffle_epi8(src_r1l_16x8b, const_shuff_16x8b);
    4187             src_r1h_16x8b = _mm_shuffle_epi8(src_r1h_16x8b, const_shuff_16x8b);
    4188 
    4189             res_l_AB_8x16b = _mm_maddubs_epi16(src_r2l_16x8b, coeffAB_16x8b);
    4190             res_h_AB_8x16b = _mm_maddubs_epi16(src_r2h_16x8b, coeffAB_16x8b);
    4191             res_l_CD_8x16b = _mm_maddubs_epi16(src_r1l_16x8b, coeffCD_16x8b);
    4192             res_h_CD_8x16b = _mm_maddubs_epi16(src_r1h_16x8b, coeffCD_16x8b);
    4193 
    4194             res_l_8x16b = _mm_add_epi16(res_l_AB_8x16b, round_add32_8x16b);
    4195             res_h_8x16b = _mm_add_epi16(res_h_AB_8x16b, round_add32_8x16b);
    4196             res_l_8x16b = _mm_add_epi16(res_l_8x16b, res_l_CD_8x16b);
    4197             res_h_8x16b = _mm_add_epi16(res_h_8x16b, res_h_CD_8x16b);
    4198 
    4199             res_l_8x16b = _mm_srai_epi16(res_l_8x16b, 6);
    4200             res_h_8x16b = _mm_srai_epi16(res_h_8x16b, 6);
    4201 
    4202             res_16x8b = _mm_packus_epi16(res_l_8x16b, res_h_8x16b);
    4203 
    4204             _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);
    4205 
    4206             pu1_src += src_strd;
    4207             pu1_dst += dst_strd;
    4208 
    4209             //row 3
    4210             src_r2l_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
    4211             src_r2h_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));
    4212 
    4213             src_r2l_16x8b = _mm_shuffle_epi8(src_r2l_16x8b, const_shuff_16x8b);
    4214             src_r2h_16x8b = _mm_shuffle_epi8(src_r2h_16x8b, const_shuff_16x8b);
    4215 
    4216             res_l_AB_8x16b = _mm_maddubs_epi16(src_r1l_16x8b, coeffAB_16x8b);
    4217             res_h_AB_8x16b = _mm_maddubs_epi16(src_r1h_16x8b, coeffAB_16x8b);
    4218             res_l_CD_8x16b = _mm_maddubs_epi16(src_r2l_16x8b, coeffCD_16x8b);
    4219             res_h_CD_8x16b = _mm_maddubs_epi16(src_r2h_16x8b, coeffCD_16x8b);
    4220 
    4221             res_l_8x16b = _mm_add_epi16(res_l_AB_8x16b, round_add32_8x16b);
    4222             res_h_8x16b = _mm_add_epi16(res_h_AB_8x16b, round_add32_8x16b);
    4223             res_l_8x16b = _mm_add_epi16(res_l_8x16b, res_l_CD_8x16b);
    4224             res_h_8x16b = _mm_add_epi16(res_h_8x16b, res_h_CD_8x16b);
    4225 
    4226             res_l_8x16b = _mm_srai_epi16(res_l_8x16b, 6);
    4227             res_h_8x16b = _mm_srai_epi16(res_h_8x16b, 6);
    4228 
    4229             res_16x8b = _mm_packus_epi16(res_l_8x16b, res_h_8x16b);
    4230 
    4231             _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);
    4232 
    4233             pu1_src += src_strd;
    4234             pu1_dst += dst_strd;
    4235 
    4236             //row 4 (its pixels become row 1 of the next iteration)
    4237             src_r1l_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
    4238             src_r1h_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));
    4239 
    4240             src_r1l_16x8b = _mm_shuffle_epi8(src_r1l_16x8b, const_shuff_16x8b);
    4241             src_r1h_16x8b = _mm_shuffle_epi8(src_r1h_16x8b, const_shuff_16x8b);
    4242 
    4243             res_l_AB_8x16b = _mm_maddubs_epi16(src_r2l_16x8b, coeffAB_16x8b);
    4244             res_h_AB_8x16b = _mm_maddubs_epi16(src_r2h_16x8b, coeffAB_16x8b);
    4245             res_l_CD_8x16b = _mm_maddubs_epi16(src_r1l_16x8b, coeffCD_16x8b);
    4246             res_h_CD_8x16b = _mm_maddubs_epi16(src_r1h_16x8b, coeffCD_16x8b);
    4247 
    4248             res_l_8x16b = _mm_add_epi16(res_l_AB_8x16b, round_add32_8x16b);
    4249             res_h_8x16b = _mm_add_epi16(res_h_AB_8x16b, round_add32_8x16b);
    4250             res_l_8x16b = _mm_add_epi16(res_l_8x16b, res_l_CD_8x16b);
    4251             res_h_8x16b = _mm_add_epi16(res_h_8x16b, res_h_CD_8x16b);
    4252 
    4253             res_l_8x16b = _mm_srai_epi16(res_l_8x16b, 6);
    4254             res_h_8x16b = _mm_srai_epi16(res_h_8x16b, 6);
    4255 
    4256             res_16x8b = _mm_packus_epi16(res_l_8x16b, res_h_8x16b);
    4257 
    4258             _mm_storeu_si128((__m128i *)pu1_dst, res_16x8b);
    4259 
    4260             ht -= 4;
    4261             pu1_src += src_strd;
    4262             pu1_dst += dst_strd;
    4263         }
    4264         while(ht > 0);
    4265     }
    4266 }
   4267