Home | History | Annotate | Download | only in x86
      1 /******************************************************************************
      2  *
      3  * Copyright (C) 2015 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at:
      8  *
      9  * http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  *****************************************************************************
     18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 */
     20 /*****************************************************************************/
     21 /*                                                                           */
     22 /*  File Name         : ih264_deblk_luma_ssse3.c                             */
     23 /*                                                                           */
     24 /*  Description       : Contains function definitions for deblocking         */
     25 /*                                                                           */
     26 /*  List of Functions : ih264_deblk_luma_vert_bs4_ssse3()                    */
     27 /*                      ih264_deblk_luma_horz_bs4_ssse3()                    */
     28 /*                      ih264_deblk_luma_vert_bslt4_ssse3()                  */
     29 /*                      ih264_deblk_luma_horz_bslt4_ssse3()                  */
     30 /*                      ih264_deblk_luma_vert_bs4_mbaff_ssse3()              */
     31 /*                      ih264_deblk_luma_vert_bslt4_mbaff_ssse3()            */
     32 /*                                                                           */
     33 /*  Issues / Problems : None                                                 */
     34 /*                                                                           */
     35 /*  Revision History  :                                                      */
     36 /*                                                                           */
     37 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
     38 /*         12 02 2015   Naveen Kumar P  Added luma deblocking ssse3          */
     39 /*                                      intrinsics                           */
     40 /*                                                                           */
     41 /*****************************************************************************/
     42 
     43 /*****************************************************************************/
     44 /* File Includes                                                             */
     45 /*****************************************************************************/
     46 
     47 /* System include files */
     48 #include <stdio.h>
     49 
     50 /* User include files */
     51 #include "ih264_typedefs.h"
     52 #include "ih264_platform_macros.h"
     53 #include "ih264_deblk_edge_filters.h"
     54 #include "ih264_macros.h"
     55 
     56 /*****************************************************************************/
     57 /* Function Definitions                                                      */
     58 /*****************************************************************************/
     59 
     60 /*****************************************************************************/
     61 /*                                                                           */
     62 /*  Function Name : ih264_deblk_luma_vert_bs4_ssse3()                        */
     63 /*                                                                           */
     64 /*  Description   : This function performs filtering of a luma block         */
     65 /*                  vertical edge when the boundary strength is set to 4.    */
     66 /*                                                                           */
     67 /*  Inputs        : pu1_src    - pointer to the src sample q0                */
     68 /*                  src_strd   - source stride                               */
     69 /*                  alpha      - alpha value for the boundary                */
     70 /*                  beta       - beta value for the boundary                 */
     71 /*                                                                           */
     72 /*  Globals       : None                                                     */
     73 /*                                                                           */
     74 /*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
     75 /*                  title "Filtering process for edges for bS equal to 4" in */
     76 /*                  ITU T Rec H.264.                                         */
     77 /*                                                                           */
     78 /*  Outputs       : pu1_src    - filtered samples are written back in place  */
     79 /*                                                                           */
     80 /*  Returns       : None                                                     */
     81 /*                                                                           */
     82 /*  Issues        : None                                                     */
     83 /*                                                                           */
     84 /*  Revision History:                                                        */
     85 /*                                                                           */
     86 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
     87 /*         12 02 2015   Naveen Kumar P  Initial version                      */
     88 /*                                                                           */
     89 /*****************************************************************************/
     90 void ih264_deblk_luma_vert_bs4_ssse3(UWORD8 *pu1_src,
     91                                      WORD32 src_strd,
     92                                      WORD32 alpha,
     93                                      WORD32 beta)
     94 {
            // Filters 16 rows across a vertical luma edge, in place.
            // For every row r (0..15) the 8 bytes at pu1_src - 4 + r * src_strd are
            // treated as p3 p2 p1 p0 | q0 q1 q2 q3 (pu1_src itself points at q0).
            // Steps:
            //   1. load the 16x8 byte tile and transpose it so that each register
            //      holds one pixel column (p3..q3) for all 16 rows,
            //   2. build per-pixel byte masks (flag1..flag4) from alpha and beta,
            //   3. compute the candidate filtered values in 16-bit lanes, 8 rows at a
            //      time, and pack them back to bytes,
            //   4. merge candidates and original pixels under the masks,
            //   5. transpose back to row order and store 8 bytes per row.
     95     __m128i zero = _mm_setzero_si128();
     96     __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
     97     __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
     98     __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
     99     __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
    100     __m128i q0_16x8_1;
    101     __m128i p0_16x8_1;
    102     __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
    103     __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
    104     __m128i temp1, temp2, temp3, temp4, temp5, temp6;
    105     __m128i Alpha_8x16, Beta_8x16;
    106     __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
    107     __m128i const_val2_16x8 = _mm_set1_epi16(2);
    108     __m128i line1, line2, line3, line4, line5, line6, line7, line8;
    109
            // Broadcast the thresholds to all eight 16-bit lanes.
    110     Alpha_8x16 = _mm_set1_epi16(alpha);
    111     Beta_8x16 = _mm_set1_epi16(beta);
    112
            // Rows 0..7: load 8 bytes per row, starting 4 bytes before pu1_src, into
            // the low 64 bits of each register.
    113     line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
    114     line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
    115     line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
    116     line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
    117     line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
    118     line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
    119     line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
    120     line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));
    121
            // Transpose rows 0..7: interleave bytes of row pairs ...
    122     temp1 = _mm_unpacklo_epi8(line1, line2);
    123     temp2 = _mm_unpacklo_epi8(line3, line4);
    124     temp3 = _mm_unpacklo_epi8(line5, line6);
    125     temp4 = _mm_unpacklo_epi8(line7, line8);
    126
            // ... then 16-bit pairs (line1/line3: columns 0..3, line2/line4: columns 4..7) ...
    127     line1 = _mm_unpacklo_epi16(temp1, temp2);
    128     line2 = _mm_unpackhi_epi16(temp1, temp2);
    129     line3 = _mm_unpacklo_epi16(temp3, temp4);
    130     line4 = _mm_unpackhi_epi16(temp3, temp4);
    131
            // ... then 32-bit groups. The *_8x16 names are only scratch here: each
            // register holds two 8-byte columns of rows 0..7
            // (p1_8x16 = columns 0,1; p0_8x16 = columns 2,3;
            //  q0_8x16 = columns 4,5; q1_8x16 = columns 6,7).
    132     p1_8x16 = _mm_unpacklo_epi32(line1, line3);
    133     p0_8x16 = _mm_unpackhi_epi32(line1, line3);
    134     q0_8x16 = _mm_unpacklo_epi32(line2, line4);
    135     q1_8x16 = _mm_unpackhi_epi32(line2, line4);
    136
            // Rows 8..15: same load and transpose sequence.
    137     line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 8 * src_strd));
    138     line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 9 * src_strd));
    139     line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 10 * src_strd));
    140     line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 11 * src_strd));
    141     line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 12 * src_strd));
    142     line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 13 * src_strd));
    143     line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 14 * src_strd));
    144     line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 15 * src_strd));
    145
    146     temp1 = _mm_unpacklo_epi8(line1, line2);
    147     temp2 = _mm_unpacklo_epi8(line3, line4);
    148     temp3 = _mm_unpacklo_epi8(line5, line6);
    149     temp4 = _mm_unpacklo_epi8(line7, line8);
    150
    151     line1 = _mm_unpacklo_epi16(temp1, temp2);
    152     line2 = _mm_unpackhi_epi16(temp1, temp2);
    153     line3 = _mm_unpacklo_epi16(temp3, temp4);
    154     line4 = _mm_unpackhi_epi16(temp3, temp4);
    155
            // temp1..temp4 = columns 0,1 / 2,3 / 4,5 / 6,7 of rows 8..15.
    156     temp1 = _mm_unpacklo_epi32(line1, line3);
    157     temp2 = _mm_unpackhi_epi32(line1, line3);
    158     temp3 = _mm_unpacklo_epi32(line2, line4);
    159     temp4 = _mm_unpackhi_epi32(line2, line4);
    160
            // Join both halves: every *_16x8 register now holds one pixel column for
            // all 16 rows (rows 0..7 in the low 8 bytes, rows 8..15 in the high 8 bytes).
    161     p3_16x8 = _mm_unpacklo_epi64(p1_8x16, temp1);
    162     p2_16x8 = _mm_unpackhi_epi64(p1_8x16, temp1);
    163     q2_16x8 = _mm_unpacklo_epi64(q1_8x16, temp4);
    164     q3_16x8 = _mm_unpackhi_epi64(q1_8x16, temp4);
    165     p1_16x8 = _mm_unpacklo_epi64(p0_8x16, temp2);
    166     p0_16x8 = _mm_unpackhi_epi64(p0_8x16, temp2);
    167     q0_16x8 = _mm_unpacklo_epi64(q0_8x16, temp3);
    168     q1_16x8 = _mm_unpackhi_epi64(q0_8x16, temp3);
    169
    170     // Cond1: ABS(p0 - q0) < alpha
            // One of the two saturating byte differences is always zero, so their
            // sum is the absolute difference.
    171     temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
    172     temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
    173     temp1 = _mm_add_epi8(temp1, temp2);
    174
            // Zero-extend the differences to 16-bit lanes before comparing.
    175     temp2 = _mm_unpacklo_epi8(temp1, zero);
    176     temp1 = _mm_unpackhi_epi8(temp1, zero);
    177
    178     temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
    179     temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
    180
            // Compare results are all-ones or zero per lane; pack them back to one
            // mask byte (0xFF / 0x00) per pixel.
    181     flag1_16x8 = _mm_packs_epi16(temp2, temp1);
    182
    183     // Cond2: ABS(q1 - q0) < beta (same abs-diff / widen / compare / pack pattern)
    184     temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
    185     temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
    186     temp1 = _mm_add_epi8(temp1, temp2);
    187
    188     temp2 = _mm_unpacklo_epi8(temp1, zero);
    189     temp1 = _mm_unpackhi_epi8(temp1, zero);
    190
    191     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    192     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
    193
    194     flag2_16x8 = _mm_packs_epi16(temp2, temp1);
    195
            // flag1 = Cond1 && Cond2
    196     flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
    197
    198     // Cond3: ABS(p1 - p0) < beta
    199     temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
    200     temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
    201     temp1 = _mm_add_epi8(temp1, temp2);
    202
    203     temp2 = _mm_unpacklo_epi8(temp1, zero);
    204     temp1 = _mm_unpackhi_epi8(temp1, zero);
    205
    206     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    207     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
    208
    209     flag2_16x8 = _mm_packs_epi16(temp2, temp1);
    210
    211     // flag1 = (ABS(p0 - q0) < alpha) && (ABS(q1 - q0) < beta) && (ABS(p1 - p0) < beta)
            // Every later mask is derived from flag1, so pixels with flag1 clear are
            // left unchanged.
    212     flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
    213
    214     // (ABS(p0 - q0) < ((alpha >> 2) + 2))
            // Alpha_8x16 is overwritten with (alpha >> 2) + 2 below; the plain alpha
            // value is not used again in this function.
    215     temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
    216     temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
    217     temp1 = _mm_add_epi8(temp1, temp2);
    218     Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
    219     Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);
    220
    221     temp2 = _mm_unpacklo_epi8(temp1, zero);
    222     temp1 = _mm_unpackhi_epi8(temp1, zero);
    223     temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
    224     temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
    225
            // flag2 = flag1 && (ABS(p0 - q0) < ((alpha >> 2) + 2))
    226     flag2_16x8 = _mm_packs_epi16(temp2, temp1);
    227     flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
    228
    229     // (ABS(p2 - p0) < beta)
    230     temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
    231     temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
    232     temp1 = _mm_add_epi8(temp1, temp2);
    233
    234     temp2 = _mm_unpacklo_epi8(temp1, zero);
    235     temp1 = _mm_unpackhi_epi8(temp1, zero);
    236     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    237     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
    238
            // flag3 = flag2 && (ABS(p2 - p0) < beta): selects the p0_2/p1_2/p2_2 values
    239     flag3_16x8 = _mm_packs_epi16(temp2, temp1);
    240     flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);
    241
    242     // (ABS(q2 - q0) < beta)
    243     temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
    244     temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
    245     temp1 = _mm_add_epi8(temp1, temp2);
    246
    247     temp2 = _mm_unpacklo_epi8(temp1, zero);
    248     temp1 = _mm_unpackhi_epi8(temp1, zero);
    249     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    250     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
    251
            // flag4 = flag2 && (ABS(q2 - q0) < beta): selects the q0_2/q1_2/q2_2 values
    252     flag4_16x8 = _mm_packs_epi16(temp2, temp1);
    253     flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);
    254
    255     // First 8 pixels (rows 0..7): zero-extend each column to 16-bit lanes
    256     p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
    257     p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
    258     p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
    259     p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
    260     q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
    261     q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
    262     q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
    263     q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);
    264
    265     // p0_1 = (2*p1 + p0 + q1 + 2) >> 2 and q0_1 = (2*q1 + q0 + p1 + 2) >> 2
            // temp5 (p0 + q1 + 2) and temp6 (p1 + q0 + 2) are kept and extended by the
            // following sections, so the statement order matters.
    266     temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
    267     temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
    268     temp5 = _mm_add_epi16(temp1, const_val2_16x8);
    269     temp6 = _mm_add_epi16(temp2, const_val2_16x8);
    270     temp3 = _mm_slli_epi16(p1_8x16, 1);
    271     temp4 = _mm_slli_epi16(q1_8x16, 1);
    272     temp1 = _mm_add_epi16(temp5, temp3);
    273     temp2 = _mm_add_epi16(temp6, temp4);
    274     p0_16x8_1 = _mm_srai_epi16(temp1, 2);
    275     q0_16x8_1 = _mm_srai_epi16(temp2, 2);
    276
    277     // p1_2 = (p2 + p1 + p0 + q0 + 2) >> 2 and q1_2 = (q2 + q1 + q0 + p0 + 2) >> 2
            // Afterwards temp6 = p0 + p1 + q0 + 2 and temp5 = p0 + q0 + q1 + 2.
    278     temp6 = _mm_add_epi16(temp6, p0_8x16);
    279     temp5 = _mm_add_epi16(temp5, q0_8x16);
    280     temp1 = _mm_add_epi16(temp6, p2_8x16);
    281     temp2 = _mm_add_epi16(temp5, q2_8x16);
    282     p1_16x8_2 = _mm_srai_epi16(temp1, 2);
    283     q1_16x8_2 = _mm_srai_epi16(temp2, 2);
    284
    285     // p0_2 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3 and
            // q0_2 = (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4) >> 3
            // (temp3 / temp4 still hold 2*p1 / 2*q1 from the p0_1 section)
    286     temp1 = _mm_add_epi16(temp3, p2_8x16);
    287     temp2 = _mm_add_epi16(temp4, q2_8x16);
    288     temp1 = _mm_add_epi16(temp1, q1_8x16);
    289     temp2 = _mm_add_epi16(temp2, p1_8x16);
    290     temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
    291     temp3 = _mm_slli_epi16(temp3, 1);
    292     temp1 = _mm_add_epi16(temp1, temp3);
    293     temp2 = _mm_add_epi16(temp2, temp3);
    294     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
    295     temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
    296     p0_16x8_2 = _mm_srai_epi16(temp1, 3);
    297     q0_16x8_2 = _mm_srai_epi16(temp2, 3);
    298
    299     // p2_2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3 and
            // q2_2 = (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
            // (temp6 / temp5 already include +2, so adding 2 again gives the +4 rounding term)
    300     temp1 = _mm_add_epi16(temp6, const_val2_16x8);
    301     temp2 = _mm_add_epi16(temp5, const_val2_16x8);
    302     temp3 = _mm_slli_epi16(p2_8x16, 1);
    303     temp4 = _mm_slli_epi16(q2_8x16, 1);
    304     temp3 = _mm_add_epi16(p2_8x16, temp3);
    305     temp4 = _mm_add_epi16(q2_8x16, temp4);
    306     temp5 = _mm_slli_epi16(p3_8x16, 1);
    307     temp6 = _mm_slli_epi16(q3_8x16, 1);
    308     temp1 = _mm_add_epi16(temp1, temp3);
    309     temp2 = _mm_add_epi16(temp2, temp4);
    310     temp1 = _mm_add_epi16(temp1, temp5);
    311     temp2 = _mm_add_epi16(temp2, temp6);
    312     p2_16x8_2 = _mm_srai_epi16(temp1, 3);
    313     q2_16x8_2 = _mm_srai_epi16(temp2, 3);
    314
    315     // Second 8 pixels (rows 8..15) and packing with first 8 pixels
            // The same formulas are evaluated on the high halves; each result is then
            // packed (unsigned saturation) behind the rows 0..7 result to form a full
            // 16-byte vector.
    316     p3_8x16 = _mm_unpackhi_epi8(p3_16x8, zero);
    317     p2_8x16 = _mm_unpackhi_epi8(p2_16x8, zero);
    318     p1_8x16 = _mm_unpackhi_epi8(p1_16x8, zero);
    319     p0_8x16 = _mm_unpackhi_epi8(p0_16x8, zero);
    320     q0_8x16 = _mm_unpackhi_epi8(q0_16x8, zero);
    321     q1_8x16 = _mm_unpackhi_epi8(q1_16x8, zero);
    322     q2_8x16 = _mm_unpackhi_epi8(q2_16x8, zero);
    323     q3_8x16 = _mm_unpackhi_epi8(q3_16x8, zero);
    324
    325     // p0_1 and q0_1 (rows 8..15)
    326     temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
    327     temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
    328     temp5 = _mm_add_epi16(temp1, const_val2_16x8);
    329     temp6 = _mm_add_epi16(temp2, const_val2_16x8);
    330     temp3 = _mm_slli_epi16(p1_8x16, 1);
    331     temp4 = _mm_slli_epi16(q1_8x16, 1);
    332     temp1 = _mm_add_epi16(temp5, temp3);
    333     temp2 = _mm_add_epi16(temp6, temp4);
    334     temp1 = _mm_srai_epi16(temp1, 2);
    335     temp2 = _mm_srai_epi16(temp2, 2);
    336     p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, temp1);
    337     q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, temp2);
    338
    339     // p1_2 and q1_2 (rows 8..15)
    340     temp6 = _mm_add_epi16(temp6, p0_8x16);
    341     temp5 = _mm_add_epi16(temp5, q0_8x16);
    342     temp1 = _mm_add_epi16(temp6, p2_8x16);
    343     temp2 = _mm_add_epi16(temp5, q2_8x16);
    344     temp1 = _mm_srai_epi16(temp1, 2);
    345     temp2 = _mm_srai_epi16(temp2, 2);
    346     p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, temp1);
    347     q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, temp2);
    348
    349     // p0_2 and q0_2 (rows 8..15)
    350     temp1 = _mm_add_epi16(temp3, p2_8x16);
    351     temp2 = _mm_add_epi16(temp4, q2_8x16);
    352     temp1 = _mm_add_epi16(temp1, q1_8x16);
    353     temp2 = _mm_add_epi16(temp2, p1_8x16);
    354     temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
    355     temp3 = _mm_slli_epi16(temp3, 1);
    356     temp1 = _mm_add_epi16(temp1, temp3);
    357     temp2 = _mm_add_epi16(temp2, temp3);
    358     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
    359     temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
    360     temp1 = _mm_srai_epi16(temp1, 3);
    361     temp2 = _mm_srai_epi16(temp2, 3);
    362     p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, temp1);
    363     q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, temp2);
    364
    365     // p2_2 and q2_2 (rows 8..15)
    366     temp1 = _mm_add_epi16(temp6, const_val2_16x8);
    367     temp2 = _mm_add_epi16(temp5, const_val2_16x8);
    368     temp3 = _mm_slli_epi16(p2_8x16, 1);
    369     temp4 = _mm_slli_epi16(q2_8x16, 1);
    370     temp3 = _mm_add_epi16(p2_8x16, temp3);
    371     temp4 = _mm_add_epi16(q2_8x16, temp4);
    372     temp5 = _mm_slli_epi16(p3_8x16, 1);
    373     temp6 = _mm_slli_epi16(q3_8x16, 1);
    374     temp1 = _mm_add_epi16(temp1, temp3);
    375     temp2 = _mm_add_epi16(temp2, temp4);
    376     temp1 = _mm_add_epi16(temp1, temp5);
    377     temp2 = _mm_add_epi16(temp2, temp6);
    378     temp1 = _mm_srai_epi16(temp1, 3);
    379     temp2 = _mm_srai_epi16(temp2, 3);
    380     p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, temp1);
    381     q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, temp2);
    382
    383     // p0 and q0: take p0_1 / q0_1 where flag1 is set, keep the original elsewhere
            // Merge pattern used for every pixel below: (orig & ~mask) + (new & mask).
            // The two terms never have a set bit in common, so the byte add simply
            // combines them.
    384     p0_16x8 = _mm_and_si128(p0_16x8,
    385                             _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
    386     p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
    387     p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
    388     q0_16x8 = _mm_and_si128(q0_16x8,
    389                             _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
    390     q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
    391     q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);
    392
    393     // p0 and q0: override with p0_2 (flag3) / q0_2 (flag4) where those masks are set
    394     p0_16x8 = _mm_and_si128(p0_16x8,
    395                             _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    396     p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
    397     p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
    398     q0_16x8 = _mm_and_si128(q0_16x8,
    399                             _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    400     q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
    401     q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);
    402
    403     // p1 and q1: replaced by p1_2 (flag3) / q1_2 (flag4)
    404     p1_16x8 = _mm_and_si128(p1_16x8,
    405                             _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    406     p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
    407     p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
    408     q1_16x8 = _mm_and_si128(q1_16x8,
    409                             _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    410     q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
    411     q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);
    412
    413     // p2 and q2: replaced by p2_2 (flag3) / q2_2 (flag4)
    414     p2_16x8 = _mm_and_si128(p2_16x8,
    415                             _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    416     p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
    417     p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
    418     q2_16x8 = _mm_and_si128(q2_16x8,
    419                             _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    420     q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
    421     q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);
    422
            // Transpose back, rows 0..7: interleave the low halves of neighbouring
            // columns. p3 and q3 are never modified and are written back as loaded.
    423     temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
    424     temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8);
    425     temp3 = _mm_unpacklo_epi8(q0_16x8, q1_16x8);
    426     temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);
    427
            // The *_8x16 names are scratch again: p3_8x16 / p2_8x16 hold the p-side
            // 4 bytes of rows 0..3 / 4..7, q2_8x16 / q3_8x16 the q-side 4 bytes.
    428     p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
    429     p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
    430     q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
    431     q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);
    432
            // Each odd-numbered lineN holds two complete 8-byte rows; the 8-byte
            // right shift moves the second row into the low half for the 64-bit store.
    433     line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
    434     line2 = _mm_srli_si128(line1, 8);
    435     line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
    436     line4 = _mm_srli_si128(line3, 8);
    437     line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
    438     line6 = _mm_srli_si128(line5, 8);
    439     line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
    440     line8 = _mm_srli_si128(line7, 8);
    441
            // Write rows 0..7 back to the locations they were loaded from.
    442     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
    443     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
    444     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
    445     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
    446     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
    447     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
    448     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
    449     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);
    450
            // Transpose back and store rows 8..15 (high halves of the column registers).
    451     temp1 = _mm_unpackhi_epi8(p3_16x8, p2_16x8);
    452     temp2 = _mm_unpackhi_epi8(p1_16x8, p0_16x8);
    453     temp3 = _mm_unpackhi_epi8(q0_16x8, q1_16x8);
    454     temp4 = _mm_unpackhi_epi8(q2_16x8, q3_16x8);
    455
    456     p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
    457     p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
    458     q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
    459     q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);
    460
    461     line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
    462     line2 = _mm_srli_si128(line1, 8);
    463     line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
    464     line4 = _mm_srli_si128(line3, 8);
    465     line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
    466     line6 = _mm_srli_si128(line5, 8);
    467     line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
    468     line8 = _mm_srli_si128(line7, 8);
    469
    470     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 8 * src_strd), line1);
    471     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 9 * src_strd), line2);
    472     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 10 * src_strd), line3);
    473     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 11 * src_strd), line4);
    474     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 12 * src_strd), line5);
    475     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 13 * src_strd), line6);
    476     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 14 * src_strd), line7);
    477     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 15 * src_strd), line8);
    478
    479 }
    480 
    481 /*****************************************************************************/
    482 /*                                                                           */
    483 /*  Function Name : ih264_deblk_luma_horz_bs4_ssse3()                        */
    484 /*                                                                           */
    485 /*  Description   : This function performs filtering of a luma block         */
    486 /*                  horizontal edge when the boundary strength is set to 4.  */
    487 /*                                                                           */
    488 /*  Inputs        : pu1_src    - pointer to the src sample q0                */
    489 /*                  src_strd   - source stride                               */
    490 /*                  alpha      - alpha value for the boundary                */
    491 /*                  beta       - beta value for the boundary                 */
    492 /*                                                                           */
    493 /*  Globals       : None                                                     */
    494 /*                                                                           */
    495 /*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
    496 /*                  title "Filtering process for edges for bS equal to 4" in */
    497 /*                  ITU T Rec H.264.                                         */
    498 /*                                                                           */
    499 /*  Outputs       : None                                                     */
    500 /*                                                                           */
    501 /*  Returns       : None                                                     */
    502 /*                                                                           */
    503 /*  Issues        : None                                                     */
    504 /*                                                                           */
    505 /*  Revision History:                                                        */
    506 /*                                                                           */
    507 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
    508 /*         12 02 2015   Naveen Kumar P  Initial version                      */
    509 /*                                                                           */
    510 /*****************************************************************************/
    511 void ih264_deblk_luma_horz_bs4_ssse3(UWORD8 *pu1_src,
    512                                      WORD32 src_strd,
    513                                      WORD32 alpha,
    514                                      WORD32 beta)
    515 {
    516     WORD16 i16_posP3, i16_posP2, i16_posP1, i16_posP0;
    517     WORD16 i16_posQ1, i16_posQ2, i16_posQ3;
    518     UWORD8 *pu1_HorzPixel;
    519     __m128i zero = _mm_setzero_si128();
    520     __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
    521     __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
    522     __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
    523     __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
    524     __m128i q0_16x8_1;
    525     __m128i p0_16x8_1;
    526     __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
    527     __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
    528     __m128i temp1, temp2, temp3, temp4, temp5, temp6;
    529     __m128i Alpha_8x16, Beta_8x16;
    530     __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
    531     __m128i const_val2_16x8 = _mm_set1_epi16(2);
    532 
    533     pu1_HorzPixel = pu1_src - (src_strd << 2);
    534 
    535     i16_posQ1 = src_strd;
    536     i16_posQ2 = X2(src_strd);
    537     i16_posQ3 = X3(src_strd);
    538     i16_posP0 = X3(src_strd);
    539     i16_posP1 = X2(src_strd);
    540     i16_posP2 = src_strd;
    541     i16_posP3 = 0;
    542 
    543     Alpha_8x16 = _mm_set1_epi16(alpha);
    544     Beta_8x16 = _mm_set1_epi16(beta);
    545 
    546     p3_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP3));
    547     p2_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP2));
    548     p1_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP1));
    549     p0_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP0));
    550     q0_16x8 = _mm_loadu_si128((__m128i *)(pu1_src));
    551     q1_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ1));
    552     q2_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ2));
    553     q3_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ3));
    554 
    555     //Cond1 (ABS(p0 - q0) < alpha)
    556     temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
    557     temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
    558     temp1 = _mm_add_epi8(temp1, temp2);
    559 
    560     temp2 = _mm_unpacklo_epi8(temp1, zero);
    561     temp1 = _mm_unpackhi_epi8(temp1, zero);
    562 
    563     temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
    564     temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
    565 
    566     flag1_16x8 = _mm_packs_epi16(temp2, temp1);
    567 
    568     //Cond2 (ABS(q1 - q0) < beta)
    569     temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
    570     temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
    571     temp1 = _mm_add_epi8(temp1, temp2);
    572 
    573     temp2 = _mm_unpacklo_epi8(temp1, zero);
    574     temp1 = _mm_unpackhi_epi8(temp1, zero);
    575 
    576     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    577     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
    578 
    579     flag2_16x8 = _mm_packs_epi16(temp2, temp1);
    580 
    581     flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
    582 
    583     //Cond3 (ABS(p1 - p0) < beta)
    584     temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
    585     temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
    586     temp1 = _mm_add_epi8(temp1, temp2);
    587 
    588     temp2 = _mm_unpacklo_epi8(temp1, zero);
    589     temp1 = _mm_unpackhi_epi8(temp1, zero);
    590 
    591     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    592     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
    593 
    594     flag2_16x8 = _mm_packs_epi16(temp2, temp1);
    595 
    596     // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
    597     flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
    598 
    599     // (ABS(p0 - q0) < ((alpha >> 2) + 2))
    600     temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
    601     temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
    602     temp1 = _mm_add_epi8(temp1, temp2);
    603     Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
    604     Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);
    605 
    606     temp2 = _mm_unpacklo_epi8(temp1, zero);
    607     temp1 = _mm_unpackhi_epi8(temp1, zero);
    608     temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
    609     temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
    610 
    611     flag2_16x8 = _mm_packs_epi16(temp2, temp1);
    612     flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
    613 
    614     // (ABS(p2 - p0) < beta)
    615     temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
    616     temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
    617     temp1 = _mm_add_epi8(temp1, temp2);
    618 
    619     temp2 = _mm_unpacklo_epi8(temp1, zero);
    620     temp1 = _mm_unpackhi_epi8(temp1, zero);
    621     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    622     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
    623 
    624     flag3_16x8 = _mm_packs_epi16(temp2, temp1);
    625     flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);
    626 
    627     // (ABS(q2 - q0) < beta)
    628     temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
    629     temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
    630     temp1 = _mm_add_epi8(temp1, temp2);
    631 
    632     temp2 = _mm_unpacklo_epi8(temp1, zero);
    633     temp1 = _mm_unpackhi_epi8(temp1, zero);
    634     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    635     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
    636 
    637     flag4_16x8 = _mm_packs_epi16(temp2, temp1);
    638     flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);
    639 
    640     // First 8 pixels
    641     p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
    642     p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
    643     p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
    644     p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
    645     q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
    646     q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
    647     q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
    648     q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);
    649 
    650     // p0_1 and q0_1
    651     temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
    652     temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
    653     temp5 = _mm_add_epi16(temp1, const_val2_16x8);
    654     temp6 = _mm_add_epi16(temp2, const_val2_16x8);
    655     temp3 = _mm_slli_epi16(p1_8x16, 1);
    656     temp4 = _mm_slli_epi16(q1_8x16, 1);
    657     temp1 = _mm_add_epi16(temp5, temp3);
    658     temp2 = _mm_add_epi16(temp6, temp4);
    659     p0_16x8_1 = _mm_srai_epi16(temp1, 2);
    660     q0_16x8_1 = _mm_srai_epi16(temp2, 2);
    661 
    662     // p1_2 and q1_2
    663     temp6 = _mm_add_epi16(temp6, p0_8x16);
    664     temp5 = _mm_add_epi16(temp5, q0_8x16);
    665     temp1 = _mm_add_epi16(temp6, p2_8x16);
    666     temp2 = _mm_add_epi16(temp5, q2_8x16);
    667     p1_16x8_2 = _mm_srai_epi16(temp1, 2);
    668     q1_16x8_2 = _mm_srai_epi16(temp2, 2);
    669 
    670     // p0_2 and q0_2
    671     temp1 = _mm_add_epi16(temp3, p2_8x16);
    672     temp2 = _mm_add_epi16(temp4, q2_8x16);
    673     temp1 = _mm_add_epi16(temp1, q1_8x16);
    674     temp2 = _mm_add_epi16(temp2, p1_8x16);
    675     temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
    676     temp3 = _mm_slli_epi16(temp3, 1);
    677     temp1 = _mm_add_epi16(temp1, temp3);
    678     temp2 = _mm_add_epi16(temp2, temp3);
    679     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
    680     temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
    681     p0_16x8_2 = _mm_srai_epi16(temp1, 3);
    682     q0_16x8_2 = _mm_srai_epi16(temp2, 3);
    683 
    684     // p2_2 and q2_2
    685     temp1 = _mm_add_epi16(temp6, const_val2_16x8);
    686     temp2 = _mm_add_epi16(temp5, const_val2_16x8);
    687     temp3 = _mm_slli_epi16(p2_8x16, 1);
    688     temp4 = _mm_slli_epi16(q2_8x16, 1);
    689     temp3 = _mm_add_epi16(p2_8x16, temp3);
    690     temp4 = _mm_add_epi16(q2_8x16, temp4);
    691     temp5 = _mm_slli_epi16(p3_8x16, 1);
    692     temp6 = _mm_slli_epi16(q3_8x16, 1);
    693     temp1 = _mm_add_epi16(temp1, temp3);
    694     temp2 = _mm_add_epi16(temp2, temp4);
    695     temp1 = _mm_add_epi16(temp1, temp5);
    696     temp2 = _mm_add_epi16(temp2, temp6);
    697     p2_16x8_2 = _mm_srai_epi16(temp1, 3);
    698     q2_16x8_2 = _mm_srai_epi16(temp2, 3);
    699 
    700     // Second 8 pixels and packing with first 8 pixels
    701     p3_8x16 = _mm_unpackhi_epi8(p3_16x8, zero);
    702     p2_8x16 = _mm_unpackhi_epi8(p2_16x8, zero);
    703     p1_8x16 = _mm_unpackhi_epi8(p1_16x8, zero);
    704     p0_8x16 = _mm_unpackhi_epi8(p0_16x8, zero);
    705     q0_8x16 = _mm_unpackhi_epi8(q0_16x8, zero);
    706     q1_8x16 = _mm_unpackhi_epi8(q1_16x8, zero);
    707     q2_8x16 = _mm_unpackhi_epi8(q2_16x8, zero);
    708     q3_8x16 = _mm_unpackhi_epi8(q3_16x8, zero);
    709 
    710     // p0_1 and q0_1
    711     temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
    712     temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
    713     temp5 = _mm_add_epi16(temp1, const_val2_16x8);
    714     temp6 = _mm_add_epi16(temp2, const_val2_16x8);
    715     temp3 = _mm_slli_epi16(p1_8x16, 1);
    716     temp4 = _mm_slli_epi16(q1_8x16, 1);
    717     temp1 = _mm_add_epi16(temp5, temp3);
    718     temp2 = _mm_add_epi16(temp6, temp4);
    719     temp1 = _mm_srai_epi16(temp1, 2);
    720     temp2 = _mm_srai_epi16(temp2, 2);
    721     p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, temp1);
    722     q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, temp2);
    723 
    724     // p1_2 and q1_2
    725     temp6 = _mm_add_epi16(temp6, p0_8x16);
    726     temp5 = _mm_add_epi16(temp5, q0_8x16);
    727     temp1 = _mm_add_epi16(temp6, p2_8x16);
    728     temp2 = _mm_add_epi16(temp5, q2_8x16);
    729     temp1 = _mm_srai_epi16(temp1, 2);
    730     temp2 = _mm_srai_epi16(temp2, 2);
    731     p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, temp1);
    732     q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, temp2);
    733 
    734     // p0_2 and q0_2
    735     temp1 = _mm_add_epi16(temp3, p2_8x16);
    736     temp2 = _mm_add_epi16(temp4, q2_8x16);
    737     temp1 = _mm_add_epi16(temp1, q1_8x16);
    738     temp2 = _mm_add_epi16(temp2, p1_8x16);
    739     temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
    740     temp3 = _mm_slli_epi16(temp3, 1);
    741     temp1 = _mm_add_epi16(temp1, temp3);
    742     temp2 = _mm_add_epi16(temp2, temp3);
    743     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
    744     temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
    745     temp1 = _mm_srai_epi16(temp1, 3);
    746     temp2 = _mm_srai_epi16(temp2, 3);
    747     p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, temp1);
    748     q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, temp2);
    749 
    750     // p2_2 and q2_2
    751     temp1 = _mm_add_epi16(temp6, const_val2_16x8);
    752     temp2 = _mm_add_epi16(temp5, const_val2_16x8);
    753     temp3 = _mm_slli_epi16(p2_8x16, 1);
    754     temp4 = _mm_slli_epi16(q2_8x16, 1);
    755     temp3 = _mm_add_epi16(p2_8x16, temp3);
    756     temp4 = _mm_add_epi16(q2_8x16, temp4);
    757     temp5 = _mm_slli_epi16(p3_8x16, 1);
    758     temp6 = _mm_slli_epi16(q3_8x16, 1);
    759     temp1 = _mm_add_epi16(temp1, temp3);
    760     temp2 = _mm_add_epi16(temp2, temp4);
    761     temp1 = _mm_add_epi16(temp1, temp5);
    762     temp2 = _mm_add_epi16(temp2, temp6);
    763     temp1 = _mm_srai_epi16(temp1, 3);
    764     temp2 = _mm_srai_epi16(temp2, 3);
    765     p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, temp1);
    766     q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, temp2);
    767 
    768     // p0 and q0
    769     p0_16x8 = _mm_and_si128(p0_16x8,
    770                             _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
    771     p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
    772     p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
    773     q0_16x8 = _mm_and_si128(q0_16x8,
    774                             _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
    775     q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
    776     q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);
    777 
    778     // p0 and q0
    779     p0_16x8 = _mm_and_si128(p0_16x8,
    780                             _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    781     p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
    782     p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
    783     q0_16x8 = _mm_and_si128(q0_16x8,
    784                             _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    785     q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
    786     q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);
    787 
    788     // p1 and q1
    789     p1_16x8 = _mm_and_si128(p1_16x8,
    790                             _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    791     p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
    792     p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
    793     q1_16x8 = _mm_and_si128(q1_16x8,
    794                             _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    795     q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
    796     q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);
    797 
    798     // p2 and q2
    799     p2_16x8 = _mm_and_si128(p2_16x8,
    800                             _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    801     p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
    802     p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
    803     q2_16x8 = _mm_and_si128(q2_16x8,
    804                             _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    805     q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
    806     q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);
    807 
    808     _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP2), p2_16x8);
    809     _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP1), p1_16x8);
    810     _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP0), p0_16x8);
    811 
    812     _mm_storeu_si128((__m128i *)(pu1_src), q0_16x8);
    813     _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ1), q1_16x8);
    814     _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ2), q2_16x8);
    815 
    816 }
    817 
    818 /*****************************************************************************/
    819 /*                                                                           */
    820 /*  Function Name : ih264_deblk_luma_vert_bslt4_ssse3()                      */
    821 /*                                                                           */
    822 /*  Description   : This function performs filtering of a luma block         */
    823 /*                  vertical edge when the boundary strength is less than 4. */
        /*                  Each group of 8 rows is transposed, filtered as          */
        /*                  columns and transposed back before being stored.         */
    824 /*                                                                           */
    825 /*  Inputs        : pu1_src       - pointer to the src sample q0             */
    826 /*                  src_strd      - source stride                            */
    827 /*                  alpha         - alpha value for the boundary             */
    828 /*                  beta          - beta value for the boundary              */
    829 /*                  u4_bs         - packed Boundary strength array           */
        /*                                  MSB byte first: one bS per 4 rows        */
    830 /*                  pu1_cliptab   - tc0_table (indexed by each bS value)     */
    831 /*                                                                           */
    832 /*  Globals       : None                                                     */
    833 /*                                                                           */
    834 /*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
    835 /*                  title "Filtering process for edges for bS less than 4"   */
    836 /*                  in ITU T Rec H.264.                                      */
    837 /*                                                                           */
    838 /*  Outputs       : Filtered rows are stored back in place                   */
    839 /*                                                                           */
    840 /*  Returns       : None                                                     */
    841 /*                                                                           */
    842 /*  Issues        : None                                                     */
    843 /*                                                                           */
    844 /*  Revision History:                                                        */
    845 /*                                                                           */
    846 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
    847 /*         12 02 2015   Naveen Kumar P  Initial version                      */
    848 /*                                                                           */
    849 /*****************************************************************************/
    850 void ih264_deblk_luma_vert_bslt4_ssse3(UWORD8 *pu1_src,
    851                                        WORD32 src_strd,
    852                                        WORD32 alpha,
    853                                        WORD32 beta,
    854                                        UWORD32 u4_bs,
    855                                        const UWORD8 *pu1_cliptab)
    856 {
    857     UWORD8 u1_Bs, u1_Bs1;
    858 
    859     WORD32 j = 0;
    860 
    861     __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
    862     __m128i int1, int2, int3, int4, high1, high2;
    863     __m128i flag, flag1, i_C, i_C0;
    864     __m128i i_Ap, i_Aq, diff, const1, const2, in_macro, in_macrotemp, temp,
    865                     temp1;
    866     __m128i zero = _mm_setzero_si128();
    867 
            // j is the byte offset of the first row of the current group of 8 rows.
            // For a positive src_strd the body runs twice: j = 0 and j = 8 * src_strd.
    868     for(j = 0; j <= 8 * src_strd; j += 8 * src_strd)
    869     {
    870         //Transpose: read 8 bytes per row starting 3 bytes left of pu1_src,
                //widen them to 16 bits and turn the 8x8 tile so that every
                //register ends up holding one pixel column of the 8 rows.
    871         linea = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + j));
    872         lineb = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + src_strd + j));
    873         linec = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 2 * src_strd + j));
    874         lined = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 3 * src_strd + j));
    875 
    876         linea = _mm_unpacklo_epi8(linea, zero);
    877         lineb = _mm_unpacklo_epi8(lineb, zero);
    878         linec = _mm_unpacklo_epi8(linec, zero);
    879         lined = _mm_unpacklo_epi8(lined, zero);
    880 
                //Interleave rows 0..3 (16-bit elements)
    881         int1 = _mm_unpacklo_epi16(linea, lineb);
    882         lineb = _mm_unpackhi_epi16(linea, lineb);
    883 
    884         int2 = _mm_unpacklo_epi16(linec, lined);
    885         lined = _mm_unpackhi_epi16(linec, lined);
    886 
    887         linea = _mm_unpacklo_epi16(int1, int2);
    888         int1 = _mm_unpackhi_epi16(int1, int2);
    889 
    890         linec = _mm_unpacklo_epi16(lineb, lined);
    891         high1 = _mm_unpackhi_epi16(lineb, lined);
    892 
                //Same steps for rows 4..7
    893         linee = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 4 * src_strd + j));
    894         linef = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 5 * src_strd + j));
    895         lineg = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 6 * src_strd + j));
    896         lineh = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 7 * src_strd + j));
    897 
    898         linee = _mm_unpacklo_epi8(linee, zero);
    899         linef = _mm_unpacklo_epi8(linef, zero);
    900         lineg = _mm_unpacklo_epi8(lineg, zero);
    901         lineh = _mm_unpacklo_epi8(lineh, zero);
    902 
    903         int2 = _mm_unpacklo_epi16(linee, linef);
    904         linef = _mm_unpackhi_epi16(linee, linef);
    905 
    906         int3 = _mm_unpacklo_epi16(lineg, lineh);
    907         lineh = _mm_unpackhi_epi16(lineg, lineh);
    908 
    909         linee = _mm_unpacklo_epi16(int2, int3);
    910         int2 = _mm_unpackhi_epi16(int2, int3);
    911 
    912         lineg = _mm_unpacklo_epi16(linef, lineh);
    913         high2 = _mm_unpackhi_epi16(linef, lineh);
    914 
                //Merge the two half-tiles into full columns
    915         int4 = _mm_unpacklo_epi16(linea, linee);
    916         lineb = _mm_unpackhi_epi16(linea, linee);
    917 
    918         int3 = _mm_unpacklo_epi16(int1, int2);
    919         lined = _mm_unpackhi_epi16(int1, int2);
    920 
    921         int2 = _mm_unpacklo_epi16(linec, lineg);
    922         linef = _mm_unpackhi_epi16(linec, lineg);
    923 
    924         linea = int4;
    925         linec = int3;
    926         linee = int2;
    927 
    928         lineg = _mm_unpacklo_epi16(high1, high2);
    929         lineh = _mm_unpackhi_epi16(high1, high2);
    930 
    931         //end of transpose
                //linea..lineh now hold the columns at pu1_src - 3 .. pu1_src + 4.
                //With pu1_src at q0: linea = p2, lineb = p1, linec = p0,
                //lined = q0, linee = q1, linef = q2. lineg and lineh are only
                //carried through to the inverse transpose.
                //Lane order inside each register is row 0, 4, 2, 6, 1, 5, 3, 7, so
                //even lanes belong to rows 0..3 and odd lanes to rows 4..7.
    932 
                //Top byte of u4_bs serves rows 0..3 of this group, the next byte
                //rows 4..7; the shift moves the following two bytes into place
                //for the next pass.
    933         u1_Bs = (u4_bs >> 24) & 0xff;
    934         u1_Bs1 = (u4_bs >> 16) & 0xff;
    935         u4_bs <<= 16;
    936 
                //Alternating arguments match the lane order above: even lanes get
                //u1_Bs, odd lanes get u1_Bs1.
    937         flag1 = _mm_set_epi16(u1_Bs1, u1_Bs, u1_Bs1, u1_Bs, u1_Bs1, u1_Bs,
    938                               u1_Bs1, u1_Bs);
    939         flag1 = _mm_cmpeq_epi16(flag1, zero); //All ones where bS == 0
    940         flag1 = _mm_xor_si128(flag1, _mm_set1_epi16(0xFFFF)); //Invert: all ones where bS != 0
    941 
                //Per-lane clipping value C0 looked up from the table with the bS
    942         i_C0 = _mm_set_epi16(pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
    943                              pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
    944                              pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
    945                              pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs]);
    946 
    947         diff = _mm_subs_epi16(linec, lined); //Condn 1: ABS(p0 - q0) < alpha
    948         diff = _mm_abs_epi16(diff);
    949         const1 = _mm_set1_epi16(alpha);
    950         flag = _mm_cmpgt_epi16(const1, diff);
    951 
    952         diff = _mm_subs_epi16(linee, lined); //Condn 2: ABS(q1 - q0) < beta
    953         diff = _mm_abs_epi16(diff);
    954         const1 = _mm_set1_epi16(beta);
    955         flag = _mm_and_si128(flag, _mm_cmpgt_epi16(const1, diff));
    956 
    957         diff = _mm_subs_epi16(lineb, linec); //Condn 3: ABS(p1 - p0) < beta
    958         diff = _mm_abs_epi16(diff);
    959         flag = _mm_and_si128(flag, _mm_cmpgt_epi16(const1, diff)); //const1 holds beta from here on
    960 
    961         flag = _mm_and_si128(flag, flag1); //Final flag: bS != 0 and the 3 conditions above
    962 
    963         //C = C0 + (Ap < beta) + (Aq < beta), with Ap = ABS(p2 - p0), Aq = ABS(q2 - q0)
    964         i_Ap = _mm_subs_epi16(linea, linec);
    965         i_Ap = _mm_abs_epi16(i_Ap);
    966         const2 = _mm_cmpgt_epi16(const1, i_Ap);
    967         const2 = _mm_subs_epi16(zero, const2); //0 - 0xFFFF gives 1, 0 - 0 gives 0
    968         i_C = _mm_add_epi16(i_C0, const2);
    969 
    970         i_Aq = _mm_subs_epi16(linef, lined);
    971         i_Aq = _mm_abs_epi16(i_Aq);
    972         const2 = _mm_cmpgt_epi16(const1, i_Aq);
    973         const2 = _mm_subs_epi16(zero, const2);
    974         i_C = _mm_add_epi16(i_C, const2);
    975 
    976         //in_macro = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3
    977         diff = _mm_subs_epi16(lined, linec);
    978         diff = _mm_slli_epi16(diff, 2);
    979         const2 = _mm_subs_epi16(lineb, linee);
    980         diff = _mm_add_epi16(diff, const2);
    981         const2 = _mm_set1_epi16(4);
    982         diff = _mm_add_epi16(diff, const2);
    983         in_macro = _mm_srai_epi16(diff, 3);
    984 
    985         in_macro = _mm_min_epi16(i_C, in_macro); //CLIP3 to [-C, C]: upper bound first
    986         i_C = _mm_subs_epi16(zero, i_C);
    987         in_macro = _mm_max_epi16(i_C, in_macro);
    988 
    989         //New p0 = p0 + in_macro where flag is set, old p0 elsewhere
    990         in_macrotemp = _mm_add_epi16(linec, in_macro);
    991         in_macrotemp = _mm_and_si128(in_macrotemp, flag);
    992         temp = _mm_and_si128(linec,
    993                              _mm_xor_si128(flag, _mm_set1_epi16(0xFFFF)));
    994         temp = _mm_add_epi16(temp, in_macrotemp);
    995         //temp keeps the new p0 column as 16-bit values; it is copied into
    996         //linec only after p1/q1 have been derived from the old p0/q0.
    997 
                //New q0 = q0 - in_macro where flag is set, old q0 elsewhere
    998         in_macrotemp = _mm_subs_epi16(lined, in_macro);
    999         in_macrotemp = _mm_and_si128(in_macrotemp, flag);
   1000         temp1 = _mm_and_si128(lined,
   1001                               _mm_xor_si128(flag, _mm_set1_epi16(0xFFFF)));
   1002         temp1 = _mm_add_epi16(temp1, in_macrotemp);
   1003         //temp1 keeps the new q0 column as 16-bit values; it is copied into
   1004         //lined only after p1/q1 have been derived from the old p0/q0.
   1005 
   1006         //If Ap<Beta: p1 += CLIP3(-C0, C0, (p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1)
   1007         flag1 = _mm_cmpgt_epi16(const1, i_Ap);
   1008         flag1 = _mm_and_si128(flag, flag1);
   1009         in_macrotemp = _mm_add_epi16(linec, lined);
   1010         in_macrotemp = _mm_add_epi16(in_macrotemp, _mm_set1_epi16(1));
   1011         in_macrotemp = _mm_srai_epi16(in_macrotemp, 1); //(p0 + q0 + 1) >> 1, reused for q1 below
   1012         in_macro = _mm_add_epi16(in_macrotemp, linea);
   1013         in_macro = _mm_subs_epi16(in_macro, _mm_slli_epi16(lineb, 1));
   1014         in_macro = _mm_srai_epi16(in_macro, 1);
   1015 
   1016         in_macro = _mm_min_epi16(i_C0, in_macro); //CLIP3 to [-C0, C0]
   1017         i_C0 = _mm_subs_epi16(zero, i_C0); //i_C0 now holds -C0
   1018         in_macro = _mm_max_epi16(i_C0, in_macro);
   1019 
   1020         in_macro = _mm_and_si128(in_macro, flag1); //Zero the correction where p1 must stay
   1021         lineb = _mm_add_epi16(lineb, in_macro);
   1022         //lineb (p1) stays 16 bits wide here; the packus before the stores
   1023         //narrows it back to bytes.
   1024 
                //If Aq<Beta: q1 += CLIP3(-C0, C0, (q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1)
   1025         flag1 = _mm_cmpgt_epi16(const1, i_Aq);
   1026         flag1 = _mm_and_si128(flag, flag1);
   1027         in_macro = _mm_add_epi16(in_macrotemp, linef);
   1028         in_macro = _mm_subs_epi16(in_macro, _mm_slli_epi16(linee, 1));
   1029         in_macro = _mm_srai_epi16(in_macro, 1);
   1030 
   1031         i_C0 = _mm_abs_epi16(i_C0); //Back to +C0 after the negation above
   1032         in_macro = _mm_min_epi16(i_C0, in_macro); //CLIP3 to [-C0, C0]
   1033         i_C0 = _mm_subs_epi16(zero, i_C0);
   1034         in_macro = _mm_max_epi16(i_C0, in_macro);
   1035 
   1036         in_macro = _mm_and_si128(in_macro, flag1); //Zero the correction where q1 must stay
   1037         linee = _mm_add_epi16(linee, in_macro);
   1038         //linee (q1) stays 16 bits wide here; the packus before the stores
   1039         //narrows it back to bytes.
   1040         linec = temp; //Commit new p0
   1041         lined = temp1; //Commit new q0
   1042         //End of filtering
   1043 
                //Inverse transpose: turn the 8 column registers back into 8 row
                //registers (this also undoes the permuted lane order).
   1044         int1 = _mm_unpacklo_epi16(linea, linee);
   1045         linee = _mm_unpackhi_epi16(linea, linee);
   1046 
   1047         int2 = _mm_unpacklo_epi16(linec, lineg);
   1048         lineg = _mm_unpackhi_epi16(linec, lineg);
   1049 
   1050         linea = _mm_unpacklo_epi16(int1, int2);
   1051         int3 = _mm_unpackhi_epi16(int1, int2);
   1052 
   1053         linec = _mm_unpacklo_epi16(linee, lineg);
   1054         lineg = _mm_unpackhi_epi16(linee, lineg);
   1055 
   1056         int1 = _mm_unpacklo_epi16(lineb, linef);
   1057         linef = _mm_unpackhi_epi16(lineb, linef);
   1058 
   1059         int2 = _mm_unpacklo_epi16(lined, lineh);
   1060         lineh = _mm_unpackhi_epi16(lined, lineh);
   1061 
   1062         lineb = _mm_unpacklo_epi16(int1, int2);
   1063         int4 = _mm_unpackhi_epi16(int1, int2);
   1064 
   1065         lined = _mm_unpacklo_epi16(linef, lineh);
   1066         lineh = _mm_unpackhi_epi16(linef, lineh);
   1067 
   1068         int1 = _mm_unpackhi_epi16(linea, lineb);
   1069         linea = _mm_unpacklo_epi16(linea, lineb);
   1070 
   1071         int2 = _mm_unpacklo_epi16(int3, int4);
   1072         high1 = _mm_unpackhi_epi16(int3, int4);
   1073 
   1074         lineb = _mm_unpacklo_epi16(linec, lined);
   1075         linef = _mm_unpackhi_epi16(linec, lined);
   1076 
   1077         lined = _mm_unpacklo_epi16(lineg, lineh);
   1078         lineh = _mm_unpackhi_epi16(lineg, lineh);
   1079 
   1080         linee = int1;
   1081         lineg = high1;
   1082         linec = int2;
   1083         //End of inverse transpose: linea..lineh are rows 0..7 of this group again
   1084 
   1085         //Packs and stores: narrow each row to bytes with unsigned saturation and
                //write all 8 bytes back to where the row was loaded from.
   1086         linea = _mm_packus_epi16(linea, zero);
   1087         _mm_storel_epi64((__m128i *)(pu1_src - 3 + j), linea);
   1088 
   1089         lineb = _mm_packus_epi16(lineb, zero);
   1090         _mm_storel_epi64((__m128i *)(pu1_src - 3 + src_strd + j), lineb);
   1091 
   1092         linec = _mm_packus_epi16(linec, zero);
   1093         _mm_storel_epi64((__m128i *)(pu1_src - 3 + 2 * src_strd + j), linec);
   1094 
   1095         lined = _mm_packus_epi16(lined, zero);
   1096         _mm_storel_epi64((__m128i *)(pu1_src - 3 + 3 * src_strd + j), lined);
   1097 
   1098         linee = _mm_packus_epi16(linee, zero);
   1099         _mm_storel_epi64((__m128i *)(pu1_src - 3 + 4 * src_strd + j), linee);
   1100 
   1101         linef = _mm_packus_epi16(linef, zero);
   1102         _mm_storel_epi64((__m128i *)(pu1_src - 3 + 5 * src_strd + j), linef);
   1103 
   1104         lineg = _mm_packus_epi16(lineg, zero);
   1105         _mm_storel_epi64((__m128i *)(pu1_src - 3 + 6 * src_strd + j), lineg);
   1106 
   1107         lineh = _mm_packus_epi16(lineh, zero);
   1108         _mm_storel_epi64((__m128i *)(pu1_src - 3 + 7 * src_strd + j), lineh);
   1109 
   1110     }
   1111 }
   1112 
   1113 /*****************************************************************************/
   1114 /*                                                                           */
   1115 /*  Function Name : ih264_deblk_luma_horz_bslt4_ssse3()                      */
   1116 /*                                                                           */
   1117 /*  Description   : This function performs filtering of a luma block         */
   1118 /*                  horizontal edge when boundary strength is less than 4.   */
   1119 /*                                                                           */
   1120 /*  Inputs        : pu1_src       - pointer to the src sample q0             */
   1121 /*                  src_strd      - source stride                            */
   1122 /*                  alpha         - alpha value for the boundary             */
   1123 /*                  beta          - beta value for the boundary              */
   1124 /*                  u4_bs         - packed Boundary strength array           */
   1125 /*                  pu1_cliptab   - tc0_table                                */
   1126 /*                                                                           */
   1127 /*  Globals       : None                                                     */
   1128 /*                                                                           */
   1129 /*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
   1130 /*                  title "Filtering process for edges for bS less than 4"   */
   1131 /*                  in ITU T Rec H.264.                                      */
   1132 /*                                                                           */
   1133 /*  Outputs       : None                                                     */
   1134 /*                                                                           */
   1135 /*  Returns       : None                                                     */
   1136 /*                                                                           */
   1137 /*  Issues        : None                                                     */
   1138 /*                                                                           */
   1139 /*  Revision History:                                                        */
   1140 /*                                                                           */
   1141 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
   1142 /*         12 02 2015   Naveen Kumar P  Initial version                      */
   1143 /*                                                                           */
   1144 /*****************************************************************************/
   1145 void ih264_deblk_luma_horz_bslt4_ssse3(UWORD8 *pu1_src,
   1146                                        WORD32 src_strd,
   1147                                        WORD32 alpha,
   1148                                        WORD32 beta,
   1149                                        UWORD32 u4_bs,
   1150                                        const UWORD8 *pu1_cliptab)
   1151 {
   1152     WORD16 i16_posP2, i16_posP1, i16_posP0, i16_posQ1, i16_posQ2;
   1153     UWORD8 *pu1_HorzPixel;
   1154     __m128i zero = _mm_setzero_si128();
   1155     __m128i bs_flag_16x8b, C0_16x8, C0_8x16, C0_hi_8x16, C_8x16, C_hi_8x16;
   1156     __m128i q0_16x8, q1_16x8, q2_16x8, p0_16x8, p1_16x8, p2_16x8;
   1157     __m128i temp1, temp2;
   1158     __m128i Alpha_8x16, Beta_8x16, flag1_16x8, flag2_16x8, flag3_16x8;
   1159     __m128i in_macro_16x8, in_macro_hi_16x8;
   1160     __m128i const_val4_8x16;
   1161     UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
   1162     UWORD8 clip0, clip1, clip2, clip3;
   1163 
   1164     pu1_HorzPixel = pu1_src - (src_strd << 2);
   1165 
   1166     i16_posQ1 = src_strd;
   1167     i16_posQ2 = X2(src_strd);
   1168     i16_posP0 = X3(src_strd);
   1169     i16_posP1 = X2(src_strd);
   1170     i16_posP2 = src_strd;
   1171 
   1172     q0_16x8 = _mm_loadu_si128((__m128i *)(pu1_src));
   1173     q1_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ1));
   1174 
   1175     u1_Bs0 = (u4_bs >> 24) & 0xff;
   1176     u1_Bs1 = (u4_bs >> 16) & 0xff;
   1177     u1_Bs2 = (u4_bs >> 8) & 0xff;
   1178     u1_Bs3 = (u4_bs >> 0) & 0xff;
   1179     clip0 = pu1_cliptab[u1_Bs0];
   1180     clip1 = pu1_cliptab[u1_Bs1];
   1181     clip2 = pu1_cliptab[u1_Bs2];
   1182     clip3 = pu1_cliptab[u1_Bs3];
   1183 
   1184     Alpha_8x16 = _mm_set1_epi16(alpha);
   1185     Beta_8x16 = _mm_set1_epi16(beta);
   1186 
   1187     bs_flag_16x8b = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
   1188                                  u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
   1189                                  u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
   1190 
   1191     C0_16x8 = _mm_set_epi8(clip3, clip3, clip3, clip3, clip2, clip2, clip2,
   1192                            clip2, clip1, clip1, clip1, clip1, clip0, clip0,
   1193                            clip0, clip0);
   1194 
   1195     bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero);
   1196     bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask
   1197     C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero);
   1198     C0_hi_8x16 = _mm_unpackhi_epi8(C0_16x8, zero);
   1199 
   1200     p1_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP1));
   1201     p0_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP0));
   1202     p2_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP2));
   1203     q2_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ2));
   1204 
   1205     //Cond1 (ABS(p0 - q0) < alpha)
   1206     temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
   1207     temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
   1208     temp1 = _mm_add_epi8(temp1, temp2);
   1209 
   1210     temp2 = _mm_unpacklo_epi8(temp1, zero);
   1211     temp1 = _mm_unpackhi_epi8(temp1, zero);
   1212 
   1213     temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
   1214     temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
   1215 
   1216     flag1_16x8 = _mm_packs_epi16(temp2, temp1);
   1217     flag1_16x8 = _mm_and_si128(flag1_16x8, bs_flag_16x8b);
   1218 
   1219     //Cond2 (ABS(q1 - q0) < beta)
   1220     temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
   1221     temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
   1222     temp1 = _mm_add_epi8(temp1, temp2);
   1223 
   1224     temp2 = _mm_unpacklo_epi8(temp1, zero);
   1225     temp1 = _mm_unpackhi_epi8(temp1, zero);
   1226 
   1227     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
   1228     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
   1229 
   1230     flag2_16x8 = _mm_packs_epi16(temp2, temp1);
   1231 
   1232     flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
   1233 
   1234     //Cond3 (ABS(p1 - p0) < beta)
   1235     temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
   1236     temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
   1237     temp1 = _mm_add_epi8(temp1, temp2);
   1238 
   1239     temp2 = _mm_unpacklo_epi8(temp1, zero);
   1240     temp1 = _mm_unpackhi_epi8(temp1, zero);
   1241 
   1242     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
   1243     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
   1244 
   1245     flag2_16x8 = _mm_packs_epi16(temp2, temp1);
   1246 
   1247     // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
   1248     flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
   1249 
   1250     // (ABS(p2 - p0) < beta)
   1251     temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
   1252     temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
   1253     temp1 = _mm_add_epi8(temp1, temp2);
   1254 
   1255     temp2 = _mm_unpacklo_epi8(temp1, zero);
   1256     temp1 = _mm_unpackhi_epi8(temp1, zero);
   1257     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
   1258     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
   1259 
   1260     flag2_16x8 = _mm_packs_epi16(temp2, temp1);
   1261     flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
   1262 
   1263     temp2 = _mm_subs_epi16(zero, temp2);
   1264     temp1 = _mm_subs_epi16(zero, temp1);
   1265 
   1266     C_8x16 = _mm_add_epi16(C0_8x16, temp2);
   1267     C_hi_8x16 = _mm_add_epi16(C0_hi_8x16, temp1);
   1268 
   1269     // (ABS(q2 - q0) < beta)
   1270     temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
   1271     temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
   1272     temp1 = _mm_add_epi8(temp1, temp2);
   1273 
   1274     temp2 = _mm_unpacklo_epi8(temp1, zero);
   1275     temp1 = _mm_unpackhi_epi8(temp1, zero);
   1276     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
   1277     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
   1278 
   1279     flag3_16x8 = _mm_packs_epi16(temp2, temp1);
   1280     flag3_16x8 = _mm_and_si128(flag1_16x8, flag3_16x8);
   1281 
   1282     temp2 = _mm_subs_epi16(zero, temp2);
   1283     temp1 = _mm_subs_epi16(zero, temp1);
   1284 
   1285     C_8x16 = _mm_add_epi16(C_8x16, temp2);
   1286     C_hi_8x16 = _mm_add_epi16(C_hi_8x16, temp1);
   1287 
   1288     const_val4_8x16 = _mm_set1_epi16(4);
   1289     temp1 = _mm_subs_epi16(_mm_unpacklo_epi8(q0_16x8, zero),
   1290                            _mm_unpacklo_epi8(p0_16x8, zero));
   1291     temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p1_16x8, zero),
   1292                            _mm_unpacklo_epi8(q1_16x8, zero));
   1293     temp1 = _mm_slli_epi16(temp1, 2);
   1294     temp1 = _mm_add_epi16(temp1, temp2);
   1295     temp1 = _mm_add_epi16(temp1, const_val4_8x16);
   1296     in_macro_16x8 = _mm_srai_epi16(temp1, 3);
   1297 
   1298     temp1 = _mm_subs_epi16(_mm_unpackhi_epi8(q0_16x8, zero),
   1299                            _mm_unpackhi_epi8(p0_16x8, zero));
   1300     temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(p1_16x8, zero),
   1301                            _mm_unpackhi_epi8(q1_16x8, zero));
   1302     temp1 = _mm_slli_epi16(temp1, 2);
   1303     temp1 = _mm_add_epi16(temp1, temp2);
   1304     temp1 = _mm_add_epi16(temp1, const_val4_8x16);
   1305     in_macro_hi_16x8 = _mm_srai_epi16(temp1, 3);
   1306 
   1307     in_macro_16x8 = _mm_min_epi16(C_8x16, in_macro_16x8); //CLIP3
   1308     in_macro_hi_16x8 = _mm_min_epi16(C_hi_8x16, in_macro_hi_16x8); //CLIP3
   1309     C_8x16 = _mm_subs_epi16(zero, C_8x16);
   1310     C_hi_8x16 = _mm_subs_epi16(zero, C_hi_8x16);
   1311     in_macro_16x8 = _mm_max_epi16(C_8x16, in_macro_16x8); //CLIP3
   1312     in_macro_hi_16x8 = _mm_max_epi16(C_hi_8x16, in_macro_hi_16x8); //CLIP3
   1313 
   1314     temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p0_16x8, zero), in_macro_16x8);
   1315     temp2 = _mm_add_epi16(_mm_unpackhi_epi8(p0_16x8, zero), in_macro_hi_16x8);
   1316 
   1317     temp1 = _mm_packus_epi16(temp1, temp2);
   1318 
   1319     temp1 = _mm_and_si128(temp1, flag1_16x8);
   1320     temp2 = _mm_and_si128(p0_16x8,
   1321                           _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
   1322 
   1323     temp1 = _mm_add_epi8(temp1, temp2);
   1324 
   1325     _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP0), temp1);
   1326 
   1327     temp1 = _mm_sub_epi16(_mm_unpacklo_epi8(q0_16x8, zero), in_macro_16x8);
   1328     temp2 = _mm_sub_epi16(_mm_unpackhi_epi8(q0_16x8, zero), in_macro_hi_16x8);
   1329 
   1330     temp1 = _mm_packus_epi16(temp1, temp2);
   1331 
   1332     temp1 = _mm_and_si128(temp1, flag1_16x8);
   1333     temp2 = _mm_and_si128(q0_16x8,
   1334                           _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
   1335 
   1336     temp1 = _mm_add_epi8(temp1, temp2);
   1337     _mm_storeu_si128((__m128i *)(pu1_src), temp1);
   1338 
   1339     //if(Ap < Beta)
   1340     temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
   1341                           _mm_unpacklo_epi8(p0_16x8, zero));
   1342     temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(p1_16x8, zero), 1);
   1343     //temp2 = _mm_subs_epi16(zero,temp2);
   1344     temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p2_16x8, zero), temp2);
   1345     temp2 = _mm_add_epi16(temp1, temp2);
   1346     in_macro_16x8 = _mm_srai_epi16(temp2, 1);
   1347 
   1348     temp1 = _mm_avg_epu16(_mm_unpackhi_epi8(q0_16x8, zero),
   1349                           _mm_unpackhi_epi8(p0_16x8, zero));
   1350     temp2 = _mm_slli_epi16(_mm_unpackhi_epi8(p1_16x8, zero), 1);
   1351     //temp2 = _mm_subs_epi16(zero,temp2);
   1352     temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(p2_16x8, zero), temp2);
   1353     temp2 = _mm_add_epi16(temp1, temp2);
   1354     in_macro_hi_16x8 = _mm_srai_epi16(temp2, 1);
   1355 
   1356     in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
   1357     in_macro_hi_16x8 = _mm_min_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
   1358     C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
   1359     C0_hi_8x16 = _mm_subs_epi16(zero, C0_hi_8x16);
   1360     in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
   1361     in_macro_hi_16x8 = _mm_max_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
   1362 
   1363     temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p1_16x8, zero), in_macro_16x8);
   1364     temp2 = _mm_add_epi16(_mm_unpackhi_epi8(p1_16x8, zero), in_macro_hi_16x8);
   1365 
   1366     temp1 = _mm_packus_epi16(temp1, temp2);
   1367 
   1368     temp1 = _mm_and_si128(temp1, flag2_16x8);
   1369     temp2 = _mm_and_si128(p1_16x8,
   1370                           _mm_xor_si128(flag2_16x8, _mm_set1_epi16(0xFFFF)));
   1371     temp1 = _mm_add_epi8(temp1, temp2);
   1372     _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP1), temp1);
   1373 
   1374     //if(Aq < Beta)
   1375     temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
   1376                           _mm_unpacklo_epi8(p0_16x8, zero));
   1377     temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(q1_16x8, zero), 1);
   1378     //temp2 = _mm_slli_epi16 (temp2, 1);
   1379     temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(q2_16x8, zero), temp2);
   1380     temp2 = _mm_add_epi16(temp1, temp2);
   1381     in_macro_16x8 = _mm_srai_epi16(temp2, 1);
   1382 
   1383     temp1 = _mm_avg_epu16(_mm_unpackhi_epi8(q0_16x8, zero),
   1384                           _mm_unpackhi_epi8(p0_16x8, zero));
   1385     temp2 = _mm_slli_epi16(_mm_unpackhi_epi8(q1_16x8, zero), 1);
   1386     //temp2 = _mm_slli_epi16 (temp2, 1);
   1387     temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(q2_16x8, zero), temp2);
   1388     temp2 = _mm_add_epi16(temp1, temp2);
   1389     in_macro_hi_16x8 = _mm_srai_epi16(temp2, 1);
   1390 
   1391     in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
   1392     in_macro_hi_16x8 = _mm_max_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
   1393     C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
   1394     C0_hi_8x16 = _mm_subs_epi16(zero, C0_hi_8x16);
   1395     in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
   1396     in_macro_hi_16x8 = _mm_min_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
   1397 
   1398     temp1 = _mm_add_epi16(_mm_unpacklo_epi8(q1_16x8, zero), in_macro_16x8);
   1399     temp2 = _mm_add_epi16(_mm_unpackhi_epi8(q1_16x8, zero), in_macro_hi_16x8);
   1400 
   1401     temp1 = _mm_packus_epi16(temp1, temp2);
   1402 
   1403     temp1 = _mm_and_si128(temp1, flag3_16x8);
   1404     temp2 = _mm_and_si128(q1_16x8,
   1405                           _mm_xor_si128(flag3_16x8, _mm_set1_epi16(0xFFFF)));
   1406     temp1 = _mm_add_epi8(temp1, temp2);
   1407 
   1408     _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ1), temp1);
   1409 
   1410 }
   1411 
   1412 /*****************************************************************************/
   1413 /*                                                                           */
   1414 /*  Function Name : ih264_deblk_luma_vert_bs4_mbaff_ssse3()                  */
   1415 /*                                                                           */
   1416 /*  Description   : This function performs filtering of a luma block         */
   1417 /*                  vertical edge when boundary strength is set to 4.        */
   1418 /*                                                                           */
   1419 /*  Inputs        : pu1_src       - pointer to the src sample q0             */
   1420 /*                  src_strd      - source stride                            */
   1421 /*                  alpha         - alpha value for the boundary             */
   1422 /*                  beta          - beta value for the boundary              */
   1423 /*                                                                           */
   1424 /*  Globals       : None                                                     */
   1425 /*                                                                           */
   1426 /*  Processing    : When the function is called twice, this operation is as  */
   1427 /*                  described in Sec. 8.7.2.4 under the title "Filtering     */
   1428 /*                  process for edges for bS equal to 4" in ITU T Rec H.264. */
   1429 /*                                                                           */
   1430 /*  Outputs       : Filtered pixels are written back in place                */
   1431 /*                                                                           */
   1432 /*  Returns       : None                                                     */
   1433 /*                                                                           */
   1434 /*  Issues        : None                                                     */
   1435 /*                                                                           */
   1436 /*  Revision History:                                                        */
   1437 /*                                                                           */
   1438 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
   1439 /*         12 02 2015   Naveen Kumar P  Initial version                      */
   1440 /*                                                                           */
   1441 /*****************************************************************************/
   1442 void ih264_deblk_luma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src,
   1443                                            WORD32 src_strd,
   1444                                            WORD32 alpha,
   1445                                            WORD32 beta)
   1446 {
            // Overview:
            //  1. Load 8 bytes (pu1_src[-4] .. pu1_src[3]) from each of 8 rows
            //     spaced src_strd apart, i.e. p3 p2 p1 p0 | q0 q1 q2 q3 per row.
            //  2. Transpose the 8x8 byte tile so that each pN/qN register holds one
            //     column (8 pixels, one per row) in its low 64 bits.
            //  3. Build per-pixel byte masks from the alpha/beta threshold tests.
            //  4. Compute the candidate filtered values in 16-bit lanes, pack them
            //     to bytes and blend them with the unfiltered pixels via the masks.
            //  5. Transpose back and store the 8x8 tile over the same locations.
   1447     __m128i zero = _mm_setzero_si128();
   1448     __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
   1449     __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
   1450     __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
   1451     __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
   1452     __m128i q0_16x8_1;
   1453     __m128i p0_16x8_1;
   1454     __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
   1455     __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
   1456     __m128i temp1, temp2, temp3, temp4, temp5, temp6;
   1457     __m128i Alpha_8x16, Beta_8x16;
   1458     __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
   1459     __m128i const_val2_16x8 = _mm_set1_epi16(2);
   1460     __m128i line1, line2, line3, line4, line5, line6, line7, line8;
   1461 
            // Broadcast the thresholds to all eight 16-bit lanes.
   1462     Alpha_8x16 = _mm_set1_epi16(alpha);
   1463     Beta_8x16 = _mm_set1_epi16(beta);
   1464 
            // Load 8 bytes per row, starting 4 bytes to the left of pu1_src.
   1465     line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
   1466     line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
   1467     line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
   1468     line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
   1469     line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
   1470     line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
   1471     line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
   1472     line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));
   1473 
            // Transpose, stage 1: interleave the bytes of adjacent row pairs.
   1474     temp1 = _mm_unpacklo_epi8(line1, line2);
   1475     temp2 = _mm_unpacklo_epi8(line3, line4);
   1476     temp3 = _mm_unpacklo_epi8(line5, line6);
   1477     temp4 = _mm_unpacklo_epi8(line7, line8);
   1478 
            // Transpose, stage 2: interleave 16-bit pairs. line1/line2 now hold
            // columns 0-3 / 4-7 of rows 0-3, line3/line4 the same for rows 4-7.
   1479     line1 = _mm_unpacklo_epi16(temp1, temp2);
   1480     line2 = _mm_unpackhi_epi16(temp1, temp2);
   1481     line3 = _mm_unpacklo_epi16(temp3, temp4);
   1482     line4 = _mm_unpackhi_epi16(temp3, temp4);
   1483 
            // Transpose, stage 3: interleave 32-bit groups. Each result holds two
            // complete columns (low 64 bits, high 64 bits). The *_8x16 variables
            // are only scratch here and hold bytes:
            //   p1_8x16 = {p3, p2}, p0_8x16 = {p1, p0},
            //   q0_8x16 = {q0, q1}, q1_8x16 = {q2, q3}
   1484     p1_8x16 = _mm_unpacklo_epi32(line1, line3);
   1485     p0_8x16 = _mm_unpackhi_epi32(line1, line3);
   1486     q0_8x16 = _mm_unpacklo_epi32(line2, line4);
   1487     q1_8x16 = _mm_unpackhi_epi32(line2, line4);
   1488 
            // Split into one column per register: 8 pixels in the low 64 bits,
            // the high 64 bits are zero.
   1489     p3_16x8 = _mm_unpacklo_epi64(p1_8x16, zero);
   1490     p2_16x8 = _mm_unpackhi_epi64(p1_8x16, zero);
   1491     q2_16x8 = _mm_unpacklo_epi64(q1_8x16, zero);
   1492     q3_16x8 = _mm_unpackhi_epi64(q1_8x16, zero);
   1493     p1_16x8 = _mm_unpacklo_epi64(p0_8x16, zero);
   1494     p0_16x8 = _mm_unpackhi_epi64(p0_8x16, zero);
   1495     q0_16x8 = _mm_unpacklo_epi64(q0_8x16, zero);
   1496     q1_16x8 = _mm_unpackhi_epi64(q0_8x16, zero);
   1497 
            // Mask idiom used by every threshold test below:
            //  - |a - b| on unsigned bytes: one of the two saturating differences
            //    is 0, so their sum is the absolute difference.
            //  - the bytes are zero-extended to 16 bits so the signed 16-bit
            //    compare against the broadcast threshold is valid.
            //  - _mm_packs_epi16 saturates the 0xFFFF/0x0000 compare results to
            //    0xFF/0x00, giving one mask byte per pixel.
            // Only the low 8 bytes of each mask correspond to real pixels.
   1498     //Cond1 (ABS(p0 - q0) < alpha)
   1499     temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
   1500     temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
   1501     temp1 = _mm_add_epi8(temp1, temp2);
   1502 
   1503     temp2 = _mm_unpacklo_epi8(temp1, zero);
   1504     temp1 = _mm_unpackhi_epi8(temp1, zero);
   1505 
   1506     temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
   1507     temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
   1508 
   1509     flag1_16x8 = _mm_packs_epi16(temp2, temp1);
   1510 
   1511     //Cond2 (ABS(q1 - q0) < beta)
   1512     temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
   1513     temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
   1514     temp1 = _mm_add_epi8(temp1, temp2);
   1515 
   1516     temp2 = _mm_unpacklo_epi8(temp1, zero);
   1517     temp1 = _mm_unpackhi_epi8(temp1, zero);
   1518 
   1519     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
   1520     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
   1521 
   1522     flag2_16x8 = _mm_packs_epi16(temp2, temp1);
   1523 
   1524     flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
   1525 
   1526     //Cond3 (ABS(p1 - p0) < beta)
   1527     temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
   1528     temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
   1529     temp1 = _mm_add_epi8(temp1, temp2);
   1530 
   1531     temp2 = _mm_unpacklo_epi8(temp1, zero);
   1532     temp1 = _mm_unpackhi_epi8(temp1, zero);
   1533 
   1534     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
   1535     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
   1536 
   1537     flag2_16x8 = _mm_packs_epi16(temp2, temp1);
   1538 
   1539     // flag1 = (ABS(p0 - q0) < alpha) && (ABS(q1 - q0) < beta) && (ABS(p1 - p0) < beta)
            // i.e. the mask of pixels for which the edge is filtered at all.
   1540     flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
   1541 
   1542     // (ABS(p0 - q0) < ((alpha >> 2) + 2))
            // Alpha_8x16 is overwritten with (alpha >> 2) + 2 here; the original
            // alpha is not used again below.
   1543     temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
   1544     temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
   1545     temp1 = _mm_add_epi8(temp1, temp2);
   1546     Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
   1547     Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);
   1548 
   1549     temp2 = _mm_unpacklo_epi8(temp1, zero);
   1550     temp1 = _mm_unpackhi_epi8(temp1, zero);
   1551     temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
   1552     temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
   1553 
            // flag2 = flag1 && (ABS(p0 - q0) < ((alpha >> 2) + 2))
   1554     flag2_16x8 = _mm_packs_epi16(temp2, temp1);
   1555     flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
   1556 
   1557     // (ABS(p2 - p0) < beta)
   1558     temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
   1559     temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
   1560     temp1 = _mm_add_epi8(temp1, temp2);
   1561 
   1562     temp2 = _mm_unpacklo_epi8(temp1, zero);
   1563     temp1 = _mm_unpackhi_epi8(temp1, zero);
   1564     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
   1565     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
   1566 
            // flag3 = flag2 && (ABS(p2 - p0) < beta): selects the p0_2/p1_2/p2_2
            // results for the p side.
   1567     flag3_16x8 = _mm_packs_epi16(temp2, temp1);
   1568     flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);
   1569 
   1570     // (ABS(q2 - q0) < beta)
   1571     temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
   1572     temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
   1573     temp1 = _mm_add_epi8(temp1, temp2);
   1574 
   1575     temp2 = _mm_unpacklo_epi8(temp1, zero);
   1576     temp1 = _mm_unpackhi_epi8(temp1, zero);
   1577     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
   1578     temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
   1579 
            // flag4 = flag2 && (ABS(q2 - q0) < beta): selects the q0_2/q1_2/q2_2
            // results for the q side.
   1580     flag4_16x8 = _mm_packs_epi16(temp2, temp1);
   1581     flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);
   1582 
   1583     // Widen the 8 pixels of every column (low 64 bits) to 16-bit lanes for the
            // filter arithmetic. There is no second half: only 8 rows are processed.
   1584     p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
   1585     p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
   1586     p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
   1587     p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
   1588     q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
   1589     q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
   1590     q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
   1591     q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);
   1592 
   1593     // p0_1 and q0_1
            //   p0_1 = (2*p1 + p0 + q1 + 2) >> 2
            //   q0_1 = (2*q1 + q0 + p1 + 2) >> 2
            // temp5 = p0 + q1 + 2, temp6 = p1 + q0 + 2, temp3 = 2*p1 and
            // temp4 = 2*q1 are reused by the sections that follow.
   1594     temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
   1595     temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
   1596     temp5 = _mm_add_epi16(temp1, const_val2_16x8);
   1597     temp6 = _mm_add_epi16(temp2, const_val2_16x8);
   1598     temp3 = _mm_slli_epi16(p1_8x16, 1);
   1599     temp4 = _mm_slli_epi16(q1_8x16, 1);
   1600     temp1 = _mm_add_epi16(temp5, temp3);
   1601     temp2 = _mm_add_epi16(temp6, temp4);
   1602     p0_16x8_1 = _mm_srai_epi16(temp1, 2);
   1603     q0_16x8_1 = _mm_srai_epi16(temp2, 2);
   1604 
   1605     // p1_2 and q1_2
            //   p1_2 = (p2 + p1 + p0 + q0 + 2) >> 2
            //   q1_2 = (q2 + q1 + q0 + p0 + 2) >> 2
            // After this section temp6 = p1 + p0 + q0 + 2, temp5 = q1 + q0 + p0 + 2.
   1606     temp6 = _mm_add_epi16(temp6, p0_8x16);
   1607     temp5 = _mm_add_epi16(temp5, q0_8x16);
   1608     temp1 = _mm_add_epi16(temp6, p2_8x16);
   1609     temp2 = _mm_add_epi16(temp5, q2_8x16);
   1610     p1_16x8_2 = _mm_srai_epi16(temp1, 2);
   1611     q1_16x8_2 = _mm_srai_epi16(temp2, 2);
   1612 
   1613     // p0_2 and q0_2
            //   p0_2 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
            //   q0_2 = (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4) >> 3
   1614     temp1 = _mm_add_epi16(temp3, p2_8x16);
   1615     temp2 = _mm_add_epi16(temp4, q2_8x16);
   1616     temp1 = _mm_add_epi16(temp1, q1_8x16);
   1617     temp2 = _mm_add_epi16(temp2, p1_8x16);
   1618     temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
   1619     temp3 = _mm_slli_epi16(temp3, 1);
   1620     temp1 = _mm_add_epi16(temp1, temp3);
   1621     temp2 = _mm_add_epi16(temp2, temp3);
   1622     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
   1623     temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
   1624     p0_16x8_2 = _mm_srai_epi16(temp1, 3);
   1625     q0_16x8_2 = _mm_srai_epi16(temp2, 3);
   1626 
   1627     // p2_2 and q2_2
            //   p2_2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
            //   q2_2 = (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3
            // temp6/temp5 already contain a rounding term of 2, hence only 2 more
            // is added to reach the +4.
   1628     temp1 = _mm_add_epi16(temp6, const_val2_16x8);
   1629     temp2 = _mm_add_epi16(temp5, const_val2_16x8);
   1630     temp3 = _mm_slli_epi16(p2_8x16, 1);
   1631     temp4 = _mm_slli_epi16(q2_8x16, 1);
   1632     temp3 = _mm_add_epi16(p2_8x16, temp3);
   1633     temp4 = _mm_add_epi16(q2_8x16, temp4);
   1634     temp5 = _mm_slli_epi16(p3_8x16, 1);
   1635     temp6 = _mm_slli_epi16(q3_8x16, 1);
   1636     temp1 = _mm_add_epi16(temp1, temp3);
   1637     temp2 = _mm_add_epi16(temp2, temp4);
   1638     temp1 = _mm_add_epi16(temp1, temp5);
   1639     temp2 = _mm_add_epi16(temp2, temp6);
   1640     p2_16x8_2 = _mm_srai_epi16(temp1, 3);
   1641     q2_16x8_2 = _mm_srai_epi16(temp2, 3);
   1642 
            // Pack all candidates back to unsigned bytes: 8 results in the low
            // 64 bits, zeros in the high 64 bits.
   1643     // p0_1 and q0_1
   1644     p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, zero);
   1645     q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, zero);
   1646 
   1647     // p1_2 and q1_2
   1648     p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, zero);
   1649     q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, zero);
   1650 
   1651     // p0_2 and q0_2
   1652     p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, zero);
   1653     q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, zero);
   1654 
   1655     // p2_2 and q2_2
   1656     p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, zero);
   1657     q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, zero);
   1658 
            // Blend idiom: (old & ~mask) + (new & mask). The two terms never have
            // a set bit in common, so the byte add behaves as a bitwise OR.
   1659     // p0 and q0: take p0_1 / q0_1 wherever flag1 is set
   1660     p0_16x8 = _mm_and_si128(p0_16x8,
   1661                             _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
   1662     p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
   1663     p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
   1664     q0_16x8 = _mm_and_si128(q0_16x8,
   1665                             _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
   1666     q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
   1667     q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);
   1668 
   1669     // p0 and q0: override with p0_2 / q0_2 wherever flag3 / flag4 is set
   1670     p0_16x8 = _mm_and_si128(p0_16x8,
   1671                             _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
   1672     p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
   1673     p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
   1674     q0_16x8 = _mm_and_si128(q0_16x8,
   1675                             _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
   1676     q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
   1677     q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);
   1678 
   1679     // p1 and q1: replaced only wherever flag3 / flag4 is set
   1680     p1_16x8 = _mm_and_si128(p1_16x8,
   1681                             _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
   1682     p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
   1683     p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
   1684     q1_16x8 = _mm_and_si128(q1_16x8,
   1685                             _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
   1686     q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
   1687     q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);
   1688 
   1689     // p2 and q2: replaced only wherever flag3 / flag4 is set
   1690     p2_16x8 = _mm_and_si128(p2_16x8,
   1691                             _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
   1692     p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
   1693     p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
   1694     q2_16x8 = _mm_and_si128(q2_16x8,
   1695                             _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
   1696     q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
   1697     q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);
   1698 
            // Transpose back, stage 1: interleave neighbouring columns byte-wise.
            // p3 and q3 are never modified and are written back unchanged.
   1699     temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
   1700     temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8);
   1701     temp3 = _mm_unpacklo_epi8(q0_16x8, q1_16x8);
   1702     temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);
   1703 
            // Stage 2: the *_8x16 variables are scratch again. p3_8x16/p2_8x16 hold
            // the p3..p0 halves of rows 0-3 / 4-7, q2_8x16/q3_8x16 the q0..q3
            // halves of rows 0-3 / 4-7.
   1704     p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
   1705     p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
   1706     q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
   1707     q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);
   1708 
            // Stage 3: each unpack yields two full rows (low and high 64 bits); the
            // 8-byte right shift moves the second row into the low half for the
            // 64-bit store.
   1709     line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
   1710     line2 = _mm_srli_si128(line1, 8);
   1711     line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
   1712     line4 = _mm_srli_si128(line3, 8);
   1713     line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
   1714     line6 = _mm_srli_si128(line5, 8);
   1715     line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
   1716     line8 = _mm_srli_si128(line7, 8);
   1717 
            // Write the 8 rows back over the locations they were loaded from.
   1718     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
   1719     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
   1720     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
   1721     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
   1722     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
   1723     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
   1724     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
   1725     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);
   1726 
   1727 }
   1728 
   1729 /*****************************************************************************/
   1730 /*                                                                           */
   1731 /*  Function Name : ih264_deblk_luma_vert_bslt4_mbaff_ssse3()                */
   1732 /*                                                                           */
   1733 /*  Description   : This function performs filtering of a luma block         */
   1734 /*                  vertical edge when boundary strength is less than 4.     */
   1735 /*                                                                           */
   1736 /*  Inputs        : pu1_src       - pointer to the src sample q0             */
   1737 /*                  src_strd      - source stride                            */
   1738 /*                  alpha         - alpha value for the boundary             */
   1739 /*                  beta          - beta value for the boundary              */
   1740 /*                  u4_bs         - packed Boundary strength array           */
   1741 /*                  pu1_cliptab   - tc0_table                                */
   1742 /*                                                                           */
   1743 /*  Globals       : None                                                     */
   1744 /*                                                                           */
   1745 /*  Processing    : When the function is called twice, this operation is as  */
   1746 /*                  described in Sec. 8.7.2.3 under the title "Filtering     */
   1747 /*                  process for edges for bS less than 4" in ITU T Rec H.264.*/
   1748 /*                                                                           */
   1749 /*  Outputs       : None                                                     */
   1750 /*                                                                           */
   1751 /*  Returns       : None                                                     */
   1752 /*                                                                           */
   1753 /*  Issues        : None                                                     */
   1754 /*                                                                           */
   1755 /*  Revision History:                                                        */
   1756 /*                                                                           */
   1757 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
   1758 /*         12 02 2015   Naveen Kumar P  Initial version                      */
   1759 /*                                                                           */
   1760 /*****************************************************************************/
   1761 void ih264_deblk_luma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src,
   1762                                              WORD32 src_strd,
   1763                                              WORD32 alpha,
   1764                                              WORD32 beta,
   1765                                              UWORD32 u4_bs,
   1766                                              const UWORD8 *pu1_cliptab)
   1767 {
   1768     __m128i zero = _mm_setzero_si128();
   1769     __m128i bs_flag_16x8b, C0_16x8, C0_8x16, C_8x16;
   1770     __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
   1771     __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
   1772     __m128i temp1, temp2, temp3, temp4;
   1773     __m128i Alpha_8x16, Beta_8x16, flag1_16x8, flag2_16x8, flag3_16x8;
   1774     __m128i in_macro_16x8;
   1775     __m128i const_val4_8x16;
   1776     UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
   1777     UWORD8 clip0, clip1, clip2, clip3;
   1778     __m128i line1, line2, line3, line4, line5, line6, line7, line8;
   1779     __m128i q0_16x8_1, q1_16x8_1, q0_16x8_2;
   1780     __m128i p0_16x8_1, p1_16x8_1, p0_16x8_2;
   1781 
   1782     line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
   1783     line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
   1784     line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
   1785     line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
   1786     line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
   1787     line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
   1788     line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
   1789     line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));
   1790 
   1791     temp1 = _mm_unpacklo_epi8(line1, line2);
   1792     temp2 = _mm_unpacklo_epi8(line3, line4);
   1793     temp3 = _mm_unpacklo_epi8(line5, line6);
   1794     temp4 = _mm_unpacklo_epi8(line7, line8);
   1795 
   1796     line1 = _mm_unpacklo_epi16(temp1, temp2);
   1797     line2 = _mm_unpackhi_epi16(temp1, temp2);
   1798     line3 = _mm_unpacklo_epi16(temp3, temp4);
   1799     line4 = _mm_unpackhi_epi16(temp3, temp4);
   1800 
   1801     temp1 = _mm_unpacklo_epi32(line1, line3);
   1802     temp2 = _mm_unpackhi_epi32(line1, line3);
   1803     temp3 = _mm_unpacklo_epi32(line2, line4);
   1804     temp4 = _mm_unpackhi_epi32(line2, line4);
   1805 
   1806     p3_16x8 = _mm_unpacklo_epi64(temp1, zero);
   1807     p2_16x8 = _mm_unpackhi_epi64(temp1, zero);
   1808     q2_16x8 = _mm_unpacklo_epi64(temp4, zero);
   1809     q3_16x8 = _mm_unpackhi_epi64(temp4, zero);
   1810     p1_16x8 = _mm_unpacklo_epi64(temp2, zero);
   1811     p0_16x8 = _mm_unpackhi_epi64(temp2, zero);
   1812     q0_16x8 = _mm_unpacklo_epi64(temp3, zero);
   1813     q1_16x8 = _mm_unpackhi_epi64(temp3, zero);
   1814 
   1815     u1_Bs0 = (u4_bs >> 24) & 0xff;
   1816     u1_Bs1 = (u4_bs >> 16) & 0xff;
   1817     u1_Bs2 = (u4_bs >> 8) & 0xff;
   1818     u1_Bs3 = (u4_bs >> 0) & 0xff;
   1819     clip0 = pu1_cliptab[u1_Bs0];
   1820     clip1 = pu1_cliptab[u1_Bs1];
   1821     clip2 = pu1_cliptab[u1_Bs2];
   1822     clip3 = pu1_cliptab[u1_Bs3];
   1823 
   1824     Alpha_8x16 = _mm_set1_epi16(alpha);
   1825     Beta_8x16 = _mm_set1_epi16(beta);
   1826 
   1827     bs_flag_16x8b = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, u1_Bs3, u1_Bs3, u1_Bs2,
   1828                                  u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs0, u1_Bs0);
   1829 
   1830     C0_16x8 = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, clip3, clip3, clip2, clip2,
   1831                            clip1, clip1, clip0, clip0);
   1832 
   1833     bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero);
   1834     bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask
   1835     C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero);
   1836 
   1837     //Cond1 (ABS(p0 - q0) < alpha)
   1838     temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
   1839     temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
   1840     temp1 = _mm_add_epi8(temp1, temp2);
   1841 
   1842     temp2 = _mm_unpacklo_epi8(temp1, zero);
   1843     temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
   1844 
   1845     flag1_16x8 = _mm_packs_epi16(temp2, zero);
   1846     flag1_16x8 = _mm_and_si128(flag1_16x8, bs_flag_16x8b);
   1847 
   1848     //Cond2 (ABS(q1 - q0) < beta)
   1849     temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
   1850     temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
   1851     temp1 = _mm_add_epi8(temp1, temp2);
   1852 
   1853     temp2 = _mm_unpacklo_epi8(temp1, zero);
   1854     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
   1855 
   1856     flag2_16x8 = _mm_packs_epi16(temp2, zero);
   1857     flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
   1858 
   1859     //Cond3 (ABS(p1 - p0) < beta)
   1860     temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
   1861     temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
   1862     temp1 = _mm_add_epi8(temp1, temp2);
   1863 
   1864     temp2 = _mm_unpacklo_epi8(temp1, zero);
   1865     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
   1866 
   1867     flag2_16x8 = _mm_packs_epi16(temp2, zero);
   1868 
   1869     // Filter mask: (bs != 0) && (ABS(p0 - q0) < alpha) && (ABS(q1 - q0) < beta) && (ABS(p1 - p0) < beta)
   1870     flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
   1871 
   1872     // (ABS(p2 - p0) < beta)
   1873     temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
   1874     temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
   1875     temp1 = _mm_add_epi8(temp1, temp2);
   1876 
   1877     temp2 = _mm_unpacklo_epi8(temp1, zero);
   1878     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
   1879 
   1880     flag2_16x8 = _mm_packs_epi16(temp2, zero);
   1881     flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
   1882 
   1883     temp2 = _mm_subs_epi16(zero, temp2);
   1884 
   1885     C_8x16 = _mm_add_epi16(C0_8x16, temp2);
   1886 
   1887     // (ABS(q2 - q0) < beta)
   1888     temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
   1889     temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
   1890     temp1 = _mm_add_epi8(temp1, temp2);
   1891 
   1892     temp2 = _mm_unpacklo_epi8(temp1, zero);
   1893     temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
   1894 
   1895     flag3_16x8 = _mm_packs_epi16(temp2, zero);
   1896     flag3_16x8 = _mm_and_si128(flag1_16x8, flag3_16x8);
   1897 
   1898     temp2 = _mm_subs_epi16(zero, temp2);
   1899 
   1900     C_8x16 = _mm_add_epi16(C_8x16, temp2);
   1901 
   1902     const_val4_8x16 = _mm_set1_epi16(4);
   1903     temp1 = _mm_subs_epi16(_mm_unpacklo_epi8(q0_16x8, zero),
   1904                            _mm_unpacklo_epi8(p0_16x8, zero));
   1905     temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p1_16x8, zero),
   1906                            _mm_unpacklo_epi8(q1_16x8, zero));
   1907     temp1 = _mm_slli_epi16(temp1, 2);
   1908     temp1 = _mm_add_epi16(temp1, temp2);
   1909     temp1 = _mm_add_epi16(temp1, const_val4_8x16);
   1910     in_macro_16x8 = _mm_srai_epi16(temp1, 3);
   1911 
   1912     in_macro_16x8 = _mm_min_epi16(C_8x16, in_macro_16x8); //CLIP3
   1913     C_8x16 = _mm_subs_epi16(zero, C_8x16);
   1914     in_macro_16x8 = _mm_max_epi16(C_8x16, in_macro_16x8); //CLIP3
   1915 
   1916     // p0
   1917     temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p0_16x8, zero), in_macro_16x8);
   1918 
   1919     temp1 = _mm_packus_epi16(temp1, zero);
   1920 
   1921     p0_16x8_1 = _mm_and_si128(temp1, flag1_16x8);
   1922     p0_16x8_2 = _mm_and_si128(
   1923                     p0_16x8, _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
   1924 
   1925     p0_16x8_1 = _mm_add_epi8(p0_16x8_1, p0_16x8_2);
   1926 
   1927     // q0
   1928     temp1 = _mm_sub_epi16(_mm_unpacklo_epi8(q0_16x8, zero), in_macro_16x8);
   1929 
   1930     temp1 = _mm_packus_epi16(temp1, zero);
   1931 
   1932     q0_16x8_1 = _mm_and_si128(temp1, flag1_16x8);
   1933     q0_16x8_2 = _mm_and_si128(
   1934                     q0_16x8, _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));
   1935 
   1936     q0_16x8_1 = _mm_add_epi8(q0_16x8_1, q0_16x8_2);
   1937 
   1938     //if(Ap < Beta)
   1939     temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
   1940                           _mm_unpacklo_epi8(p0_16x8, zero));
   1941     temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(p1_16x8, zero), 1);
   1942     //temp2 = _mm_subs_epi16(zero,temp2);
   1943     temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p2_16x8, zero), temp2);
   1944     temp2 = _mm_add_epi16(temp1, temp2);
   1945     in_macro_16x8 = _mm_srai_epi16(temp2, 1);
   1946 
   1947     in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
   1948     C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
   1949     in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
   1950 
   1951     // p1
   1952     temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p1_16x8, zero), in_macro_16x8);
   1953 
   1954     temp1 = _mm_packus_epi16(temp1, zero);
   1955 
   1956     p1_16x8_1 = _mm_and_si128(temp1, flag2_16x8);
   1957     p1_16x8 = _mm_and_si128(p1_16x8,
   1958                             _mm_xor_si128(flag2_16x8, _mm_set1_epi16(0xFFFF)));
   1959     p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_1);
   1960 
   1961     //if(Aq < Beta)
   1962     temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
   1963                           _mm_unpacklo_epi8(p0_16x8, zero));
   1964     temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(q1_16x8, zero), 1);
   1965     //temp2 = _mm_slli_epi16 (temp2, 1);
   1966     temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(q2_16x8, zero), temp2);
   1967     temp2 = _mm_add_epi16(temp1, temp2);
   1968     in_macro_16x8 = _mm_srai_epi16(temp2, 1);
   1969 
   1970     in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
   1971     C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
   1972     in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
   1973 
   1974     temp1 = _mm_add_epi16(_mm_unpacklo_epi8(q1_16x8, zero), in_macro_16x8);
   1975 
   1976     // q1
   1977     temp1 = _mm_packus_epi16(temp1, zero);
   1978 
   1979     q1_16x8_1 = _mm_and_si128(temp1, flag3_16x8);
   1980     q1_16x8 = _mm_and_si128(q1_16x8,
   1981                             _mm_xor_si128(flag3_16x8, _mm_set1_epi16(0xFFFF)));
   1982     q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_1);
   1983 
   1984     temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
   1985     temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8_1);
   1986     temp3 = _mm_unpacklo_epi8(q0_16x8_1, q1_16x8);
   1987     temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);
   1988 
   1989     line7 = _mm_unpacklo_epi16(temp1, temp2);
   1990     temp1 = _mm_unpackhi_epi16(temp1, temp2);
   1991     line8 = _mm_unpacklo_epi16(temp3, temp4);
   1992     temp2 = _mm_unpackhi_epi16(temp3, temp4);
   1993 
   1994     line1 = _mm_unpacklo_epi32(line7, line8);
   1995     line2 = _mm_srli_si128(line1, 8);
   1996     line3 = _mm_unpackhi_epi32(line7, line8);
   1997     line4 = _mm_srli_si128(line3, 8);
   1998     line5 = _mm_unpacklo_epi32(temp1, temp2);
   1999     line6 = _mm_srli_si128(line5, 8);
   2000     line7 = _mm_unpackhi_epi32(temp1, temp2);
   2001     line8 = _mm_srli_si128(line7, 8);
   2002 
   2003     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
   2004     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
   2005     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
   2006     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
   2007     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
   2008     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
   2009     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
   2010     _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);
   2011 }
   2012 
   2013