Home | History | Annotate | Download | only in x86
      1 /******************************************************************************
      2  *
      3  * Copyright (C) 2015 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at:
      8  *
      9  * http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  *****************************************************************************
     18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 */
     20 /**
     21  *******************************************************************************
     22  * @file
     23  *  ih264e_half_pel_ssse3.c
     24  *
     25  * @brief
 *  Contains the x86 SSSE3 intrinsic function definitions for the 6-tap
 *  horizontal filter and the cascaded 2D (vertical followed by horizontal)
 *  filter used in motion estimation in the H264 encoder.
     28  *
     29  * @author
     30  *  Ittiam
     31  *
     32  * @par List of Functions:
     33  *  ih264e_sixtapfilter_horz_ssse3
     34  *  ih264e_sixtap_filter_2dvh_vert_ssse3
     35  *
     36  * @remarks
     37  *  None
     38  *
     39  *******************************************************************************
     40  */
     41 
     42 /*****************************************************************************/
     43 /* File Includes                                                             */
     44 /*****************************************************************************/
     45 
     46 /* System include files */
     47 #include <stdio.h>
     48 #include <assert.h>
     49 #include <limits.h>
     50 
     51 /* User include files */
     52 #include "ih264_typedefs.h"
     53 #include "ithread.h"
     54 #include "ih264_platform_macros.h"
     55 #include "ih264_defs.h"
     56 #include "ih264e_half_pel.h"
     57 #include "ih264_macros.h"
     58 #include "ih264e_debug.h"
     59 #include "ih264_inter_pred_filters.h"
     60 #include "ih264_mem_fns.h"
     61 #include "ih264_padding.h"
     62 #include "ih264_intra_pred_filters.h"
     63 #include "ih264_deblk_edge_filters.h"
     64 
     65 
     66 /*****************************************************************************/
     67 /* Function Definitions                                                      */
     68 /*****************************************************************************/
     69 /*
     70 *******************************************************************************
     71 *
     72 * @brief
*  Inter prediction luma filter for horizontal input (filter run for
*  width = 17 and height = 16)
*
* @par Description:
*  Applies a 6 tap horizontal filter. The output is clipped to 8 bits. See
*  sec. 8.4.2.2.1 titled "Luma sample interpolation process".
     79 *
     80 * @param[in] pu1_src
     81 *  UWORD8 pointer to the source
     82 *
     83 * @param[out] pu1_dst
     84 *  UWORD8 pointer to the destination
     85 *
     86 * @param[in] src_strd
     87 *  integer source stride
     88 *
     89 * @param[in] dst_strd
     90 *  integer destination stride
     91 *
     92 * @returns
     93 *  None
     94 *
     95 * @remarks
     96 *  None
     97 *
     98 *******************************************************************************
     99 */
    100 void ih264e_sixtapfilter_horz_ssse3(UWORD8 *pu1_src,
    101                                     UWORD8 *pu1_dst,
    102                                     WORD32 src_strd,
    103                                     WORD32 dst_strd)
    104 {
    105     WORD32 ht;
    106     WORD32 tmp;
    107 
    108     __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
    109     __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
    110 
    111     __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
    112     __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
    113 
    114     __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
    115     __m128i const_val16_8x16b;
    116 
    117     ht = 16;
    118     pu1_src -= 2; // the filter input starts from x[-2] (till x[3])
    119 
    120     coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
    121     coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
    122     coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
    123                                                  //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
    124     const_val16_8x16b = _mm_set1_epi16(16);
    125 
    126     //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
    127     //Row0 :                         b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
    128     //b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels.
    129 
    130     do
    131     {
    132         src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);                     //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
    133         src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8));               //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
    134 
    135         src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1);                      //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
    136         src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1);                      //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
    137 
    138         src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
    139         src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
    140 
    141         res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b);    //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
    142                                                                                  //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
    143         res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b);    //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
    144                                                                                  //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
    145 
    146         src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                          //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
    147         src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                          //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
    148 
    149         src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                  //a3 a4 a5 a6 a7 a8 a9....a15 0  0  0
    150         src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                  //b3 b4 b5 b6 b7 b8 b9....b15 0  0  0
    151 
    152         src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
    153         src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10
    154 
    155         res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b);    //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
    156                                                                                  //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
    157         res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b);    //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3
    158                                                                                  //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
    159 
    160         src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2);                          //a4 a5 a6 a7 a8 a9....a15 0  0  0  0
    161         src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2);                          //b4 b5 b6 b7 b8 b9....b15 0  0  0  0
    162 
    163         src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2);                  //a5 a6 a7 a8 a9....a15 0  0  0  0  0
    164         src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2);                  //b5 b6 b7 b8 b9....b15 0  0  0  0  0
    165 
    166         src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b);     //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
    167         src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b);     //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
    168 
    169         res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b);    //a4*c4+a5*c5 a5*c4+a6*c5  a6*c4+a7*c5   a7*c4+a8*c5
    170                                                                                  //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
    171         res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b);    //b4*c4+b5*c5 b5*c4+b6*c5  b6*c4+b7*c5   b7*c4+b8*c5
    172                                                                                  //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
    173         res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
    174         res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
    175         res_r0_t3_8x16b = _mm_add_epi16(res_r0_t3_8x16b, const_val16_8x16b);
    176         res_r1_t3_8x16b = _mm_add_epi16(res_r1_t3_8x16b, const_val16_8x16b);
    177         res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
    178         res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);
    179 
    180         tmp = ((pu1_src[18] + pu1_src[19]) << 2) - pu1_src[17] - pu1_src[20];
    181         tmp = pu1_src[16] + pu1_src[21] + (tmp << 2) + tmp;
    182 
    183         res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5);                    //shifting right by 5 bits.
    184         res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5);
    185         tmp = (tmp + 16) >> 5;
    186 
    187         src_r0_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r1_t1_8x16b);
    188         pu1_dst[16] = CLIP_U8(tmp);
    189 
    190         _mm_storeu_si128((__m128i *)pu1_dst, src_r0_16x8b);
    191 
    192         ht--;
    193         pu1_src += src_strd;
    194         pu1_dst += dst_strd;
    195     }
    196     while(ht > 0);
    197 }
    198 
    199 /*
    200 *******************************************************************************
    201 *
    202 * @brief
    203 *   This function implements a two stage cascaded six tap filter. It
    204 *    applies the six tap filter in the vertical direction on the
    205 *    predictor values, followed by applying the same filter in the
    206 *    horizontal direction on the output of the first stage. The six tap
    207 *    filtering operation is described in sec 8.4.2.2.1 titled "Luma sample
    208 *    interpolation process" (Filter run for width = 17 and height =17)
    209 *
    210 * @par Description:
*    The function interpolates the predictors first in the vertical direction
*    and then in the horizontal direction to output the (1/2,1/2) position
*    samples. The output of the first stage of the filter is stored in the
*    buffer pointed to by pi4_pred1 in 16 bit precision.
    215 *
    216 * @param[in] pu1_src
    217 *  UWORD8 pointer to the source
    218 *
* @param[out] pu1_dst1
*  UWORD8 pointer to the destination (vertical filtered output)
*
* @param[out] pu1_dst2
*  UWORD8 pointer to the destination (output after applying the horizontal
*  filter to the intermediate vertical output)
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride, used for both pu1_dst1 and pu1_dst2
*
* @param[out] pi4_pred1
*  Pointer to the intermediate buffer; the vertical filter output is written
*  to it as 16 bit values
*
* @param[in] pred1_strd
*  integer stride of pi4_pred1, in WORD32 units
    237 *
    238 * @returns
    239 *  None
    240 *
    241 * @remarks
    242 *  None
    243 *
    244 *******************************************************************************
    245 */
    246 void ih264e_sixtap_filter_2dvh_vert_ssse3(UWORD8 *pu1_src,
    247                                           UWORD8 *pu1_dst1,
    248                                           UWORD8 *pu1_dst2,
    249                                           WORD32 src_strd,
    250                                           WORD32 dst_strd,
    251                                           WORD32 *pi4_pred1,
    252                                           WORD32 pred1_strd)
    253 {
    254     WORD32 ht;
    255     WORD16 *pi2_pred1;
    256 
    257     ht = 17;
    258     pi2_pred1 = (WORD16 *)pi4_pred1;
    259     pred1_strd = pred1_strd << 1;
    260 
    261     // Vertical 6-tap filter
    262     {
    263         __m128i src1_r0_16x8b, src1_r1_16x8b, src1_r2_16x8b;
    264         __m128i src1_r3_16x8b, src1_r4_16x8b, src1_r5_16x8b;
    265         __m128i src2_r0_16x8b, src2_r1_16x8b, src2_r2_16x8b;
    266         __m128i src2_r3_16x8b, src2_r4_16x8b, src2_r5_16x8b;
    267 
    268         __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
    269 
    270         __m128i res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
    271         __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
    272 
    273         coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
    274         coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
    275         coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
    276                                                      //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
    277 
    278         pu1_src -= 2;
    279         pu1_src -= src_strd << 1; // the filter input starts from x[-2] (till x[3])
    280 
    281         // Loading first five rows to start first row processing.
    282         // 22 values loaded in each row.
    283         src1_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
    284         src2_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));
    285         pu1_src += src_strd;
    286 
    287         src1_r1_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
    288         src2_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));
    289         pu1_src += src_strd;
    290 
    291         src1_r2_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
    292         src2_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));
    293         pu1_src += src_strd;
    294 
    295         src1_r3_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
    296         src2_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));
    297         pu1_src += src_strd;
    298 
    299         src1_r4_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
    300         src2_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));
    301         pu1_src += src_strd;
    302 
    303         do
    304         {
    305             src1_r5_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
    306             src2_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));
    307 
    308             src_r0r1_16x8b = _mm_unpacklo_epi8(src1_r0_16x8b, src1_r1_16x8b);
    309             src_r2r3_16x8b = _mm_unpacklo_epi8(src1_r2_16x8b, src1_r3_16x8b);
    310             src_r4r5_16x8b = _mm_unpacklo_epi8(src1_r4_16x8b, src1_r5_16x8b);
    311 
    312             res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
    313             res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
    314             res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
    315 
    316             res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
    317             res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
    318 
    319             _mm_storeu_si128((__m128i *)pi2_pred1, res_t1_8x16b);
    320 
    321             src_r0r1_16x8b = _mm_unpackhi_epi8(src1_r0_16x8b, src1_r1_16x8b);
    322             src_r2r3_16x8b = _mm_unpackhi_epi8(src1_r2_16x8b, src1_r3_16x8b);
    323             src_r4r5_16x8b = _mm_unpackhi_epi8(src1_r4_16x8b, src1_r5_16x8b);
    324 
    325             res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
    326             res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
    327             res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
    328 
    329             res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
    330             res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
    331 
    332             _mm_storeu_si128((__m128i *)(pi2_pred1 + 8), res_t1_8x16b);
    333 
    334             src_r0r1_16x8b = _mm_unpacklo_epi8(src2_r0_16x8b, src2_r1_16x8b);
    335             src_r2r3_16x8b = _mm_unpacklo_epi8(src2_r2_16x8b, src2_r3_16x8b);
    336             src_r4r5_16x8b = _mm_unpacklo_epi8(src2_r4_16x8b, src2_r5_16x8b);
    337 
    338             res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
    339             res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
    340             res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
    341 
    342             res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
    343             res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
    344 
    345             _mm_storeu_si128((__m128i *)(pi2_pred1 + 14), res_t1_8x16b);
    346 
    347             src1_r0_16x8b = src1_r1_16x8b;
    348             src1_r1_16x8b = src1_r2_16x8b;
    349             src1_r2_16x8b = src1_r3_16x8b;
    350             src1_r3_16x8b = src1_r4_16x8b;
    351             src1_r4_16x8b = src1_r5_16x8b;
    352 
    353             src2_r0_16x8b = src2_r1_16x8b;
    354             src2_r1_16x8b = src2_r2_16x8b;
    355             src2_r2_16x8b = src2_r3_16x8b;
    356             src2_r3_16x8b = src2_r4_16x8b;
    357             src2_r4_16x8b = src2_r5_16x8b;
    358 
    359             ht--;
    360             pu1_src += src_strd;
    361             pi2_pred1 += pred1_strd;
    362         }
    363         while(ht > 0);
    364     }
    365 
    366     ht = 17;
    367     pi2_pred1 = (WORD16 *)pi4_pred1;
    368 
    369     // Horizontal 6-tap filter
    370     {
    371         WORD32 temp;
    372 
    373         __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b;
    374         __m128i src_r4_8x16b, src_r5_8x16b;
    375         __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b;
    376         __m128i res_vert1_8x16b, res_vert2_8x16b, res_16x8b;
    377 
    378         __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
    379         __m128i res_c0_8x16b, res_c1_8x16b;
    380 
    381         __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
    382         __m128i const_val512_4x32b, const_val16_8x16b;
    383 
    384         coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001); //c0 c1 c0 c1 c0 c1 c0 c1
    385         coeff2_3_8x16b = _mm_set1_epi32(0x00140014); //c2 c3 c2 c3 c2 c3 c2 c3
    386         coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB); //c4 c5 c4 c5 c4 c5 c4 c5
    387                                                      //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
    388         const_val512_4x32b = _mm_set1_epi32(512);
    389         const_val16_8x16b = _mm_set1_epi16(16);
    390 
    391         do
    392         {
    393             src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1));
    394             src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 1));
    395             src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 2));
    396             src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 3));
    397             src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 4));
    398             src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 5));
    399 
    400             res_vert1_8x16b = _mm_add_epi16(src_r2_8x16b, const_val16_8x16b);
    401             res_vert1_8x16b = _mm_srai_epi16(res_vert1_8x16b, 5); //shifting right by 5 bits.
    402 
    403             src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
    404             src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
    405             src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
    406 
    407             res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
    408             res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
    409             res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
    410 
    411             res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
    412             res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
    413             res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
    414             res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
    415 
    416             src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
    417             src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
    418             src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);
    419 
    420             res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
    421             res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
    422             res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
    423 
    424             res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
    425             res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
    426             res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
    427             res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
    428 
    429             res_c0_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
    430 
    431             src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8));
    432             src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 1));
    433             src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 2));
    434             src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 3));
    435             src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 4));
    436             src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 5));
    437 
    438             res_vert2_8x16b = _mm_add_epi16(src_r2_8x16b, const_val16_8x16b);
    439             res_vert2_8x16b = _mm_srai_epi16(res_vert2_8x16b, 5); //shifting right by 5 bits.
    440 
    441             src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
    442             src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
    443             src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
    444 
    445             res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
    446             res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
    447             res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
    448 
    449             res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
    450             res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
    451             res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
    452             res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b ,10);
    453 
    454             src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
    455             src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
    456             src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);
    457 
    458             res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
    459             res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
    460             res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
    461 
    462             res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
    463             res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
    464             res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
    465             res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
    466 
    467             res_c1_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
    468 
    469             res_16x8b = _mm_packus_epi16(res_vert1_8x16b, res_vert2_8x16b);
    470             _mm_storeu_si128((__m128i *)pu1_dst1, res_16x8b);
    471             pu1_dst1[16] = CLIP_U8((pi2_pred1[18] + 16) >> 5);
    472 
    473             res_16x8b = _mm_packus_epi16(res_c0_8x16b, res_c1_8x16b);
    474             _mm_storeu_si128((__m128i *)pu1_dst2, res_16x8b);
    475             temp = ((pi2_pred1[18] + pi2_pred1[19]) << 2) - pi2_pred1[17] - pi2_pred1[20];
    476             temp = pi2_pred1[16] + pi2_pred1[21] + (temp << 2) + temp;
    477             pu1_dst2[16] = CLIP_U8((temp + 512) >> 10);
    478 
    479             ht--;
    480             pi2_pred1 += pred1_strd;
    481             pu1_dst1 += dst_strd;
    482             pu1_dst2 += dst_strd;
    483         }
    484         while(ht > 0);
    485     }
    486 }
    487