Home | History | Annotate | Download | only in x86
      1 /******************************************************************************
      2  *
      3  * Copyright (C) 2015 The Android Open Source Project
      4  *
      5  * Licensed under the Apache License, Version 2.0 (the "License");
      6  * you may not use this file except in compliance with the License.
      7  * You may obtain a copy of the License at:
      8  *
      9  * http://www.apache.org/licenses/LICENSE-2.0
     10  *
     11  * Unless required by applicable law or agreed to in writing, software
     12  * distributed under the License is distributed on an "AS IS" BASIS,
     13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14  * See the License for the specific language governing permissions and
     15  * limitations under the License.
     16  *
     17  *****************************************************************************
     18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 */
     20 /*****************************************************************************/
     21 /*                                                                           */
     22 /*  File Name         : ih264_deblk_chroma_ssse3.c                           */
     23 /*                                                                           */
     24 /*  Description       : Contains function definitions for deblocking         */
     25 /*                                                                           */
     26 /*  List of Functions : ih264_deblk_chroma_vert_bs4_ssse3()                  */
     27 /*                      ih264_deblk_chroma_horz_bs4_ssse3()                  */
     28 /*                      ih264_deblk_chroma_vert_bslt4_ssse3()                */
     29 /*                      ih264_deblk_chroma_horz_bslt4_ssse3()                */
     30 /*                      ih264_deblk_chroma_vert_bs4_mbaff_ssse3()            */
     31 /*                      ih264_deblk_chroma_vert_bslt4_mbaff_ssse3()          */
     32 /*                                                                           */
     33 /*  Issues / Problems : None                                                 */
     34 /*                                                                           */
     35 /*  Revision History  :                                                      */
     36 /*                                                                           */
     37 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
     38 /*         12 02 2015   Naveen Kumar P  Added chroma deblocking ssse3        */
     39 /*                                      intrinsics                           */
     40 /*                                                                           */
     41 /*****************************************************************************/
     42 
     43 /*****************************************************************************/
     44 /* File Includes                                                             */
     45 /*****************************************************************************/
     46 
     47 /* System include files */
     48 #include <stdio.h>
     49 
     50 /* User include files */
     51 #include "ih264_typedefs.h"
     52 #include "ih264_platform_macros.h"
     53 #include "ih264_deblk_edge_filters.h"
     54 #include "ih264_macros.h"
     55 
     56 /*****************************************************************************/
     57 /* Function Definitions                                                      */
     58 /*****************************************************************************/
     59 
     60 /*****************************************************************************/
     61 /*                                                                           */
     62 /*  Function Name : ih264_deblk_chroma_vert_bs4_ssse3()                      */
     63 /*                                                                           */
     64 /*  Description   : This function performs filtering of a chroma block       */
     65 /*                  vertical edge when the boundary strength is set to 4 in  */
     66 /*                  high profile.                                            */
     67 /*                                                                           */
     68 /*  Inputs        : pu1_src    - pointer to the src sample q0 of U           */
     69 /*                  src_strd   - source stride                               */
     70 /*                  alpha_cb   - alpha value for the boundary in U           */
     71 /*                  beta_cb    - beta value for the boundary in U            */
     72 /*                  alpha_cr   - alpha value for the boundary in V           */
     73 /*                  beta_cr    - beta value for the boundary in V            */
     74 /*                                                                           */
     75 /*  Globals       : None                                                     */
     76 /*                                                                           */
     77 /*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
     78 /*                  title "Filtering process for edges for bS equal to 4" in */
     79 /*                  ITU T Rec H.264 with alpha and beta values different in  */
     80 /*                  U and V.                                                 */
     81 /*                                                                           */
     82 /*  Outputs       : None                                                     */
     83 /*                                                                           */
     84 /*  Returns       : None                                                     */
     85 /*                                                                           */
     86 /*  Issues        : None                                                     */
     87 /*                                                                           */
     88 /*  Revision History:                                                        */
     89 /*                                                                           */
     90 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
     91 /*         12 02 2015   Naveen Kumar P  Initial version                      */
     92 /*                                                                           */
     93 /*****************************************************************************/
     94 void ih264_deblk_chroma_vert_bs4_ssse3(UWORD8 *pu1_src,
     95                                        WORD32 src_strd,
     96                                        WORD32 alpha_cb,
     97                                        WORD32 beta_cb,
     98                                        WORD32 alpha_cr,
     99                                        WORD32 beta_cr)
    100 {
    101     UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
    102     WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
    103     WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
    104     __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
    105     __m128i temp1, temp2, temp3, temp4;
    106 
    107     __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
    108     __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
    109     __m128i flag1, flag2;
    110     __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
    111     __m128i zero = _mm_setzero_si128();
    112     __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
    113 
    114     /* Load and transpose the pixel values */
    115     linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
    116     lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
    117     linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
    118     lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
    119     linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
    120     linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd));
    121     lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
    122     lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd));
    123 
    124     temp1 = _mm_unpacklo_epi16(linea, lineb);
    125     temp2 = _mm_unpacklo_epi16(linec, lined);
    126     temp3 = _mm_unpacklo_epi16(linee, linef);
    127     temp4 = _mm_unpacklo_epi16(lineg, lineh);
    128 
    129     p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2);
    130     p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4);
    131     q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2);
    132     q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4);
    133 
    134     p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16);
    135     p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16);
    136     q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16);
    137     q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16);
    138     /* End of transpose */
    139 
    140     q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
    141     q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
    142     p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
    143     p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
    144 
    145     diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
    146     diff = _mm_abs_epi16(diff);
    147     alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    148     flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
    149 
    150     diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
    151     diff = _mm_abs_epi16(diff);
    152     beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    153     flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
    154 
    155     diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
    156     diff = _mm_abs_epi16(diff);
    157     flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
    158 
    159     temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
    160     temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
    161     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    162     temp1 = _mm_add_epi16(temp1, temp2);
    163     p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
    164 
    165     temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
    166     temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
    167     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    168     temp1 = _mm_add_epi16(temp1, temp2);
    169     q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
    170 
    171     q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
    172     q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
    173     p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
    174     p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
    175 
    176     diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
    177     diff = _mm_abs_epi16(diff);
    178     alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    179     flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
    180 
    181     diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
    182     diff = _mm_abs_epi16(diff);
    183     beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    184     flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
    185 
    186     diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
    187     diff = _mm_abs_epi16(diff);
    188     flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
    189 
    190     temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
    191     temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
    192     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    193     temp1 = _mm_add_epi16(temp1, temp2);
    194     p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
    195 
    196     temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
    197     temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
    198     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    199     temp1 = _mm_add_epi16(temp1, temp2);
    200     q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
    201 
    202     p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
    203     q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
    204 
    205     flag1 = _mm_packs_epi16(flag1, flag2);
    206 
    207     p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
    208                                  _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    209     p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
    210     p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
    211 
    212     q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
    213                                  _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    214     q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
    215     q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
    216 
    217     /* Inverse-transpose and store back */
    218     temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
    219     temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8);
    220     temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
    221     temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8);
    222 
    223     linea = _mm_unpacklo_epi32(temp1, temp3);
    224     lineb = _mm_srli_si128(linea, 8);
    225     linec = _mm_unpackhi_epi32(temp1, temp3);
    226     lined = _mm_srli_si128(linec, 8);
    227     linee = _mm_unpacklo_epi32(temp2, temp4);
    228     linef = _mm_srli_si128(linee, 8);
    229     lineg = _mm_unpackhi_epi32(temp2, temp4);
    230     lineh = _mm_srli_si128(lineg, 8);
    231 
    232     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
    233     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
    234     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
    235     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
    236     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
    237     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
    238     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
    239     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);
    240 
    241 }
    242 
    243 /*****************************************************************************/
    244 /*                                                                           */
    245 /*  Function Name : ih264_deblk_chroma_horz_bs4_ssse3()                      */
    246 /*                                                                           */
    247 /*  Description   : This function performs filtering of a chroma block       */
    248 /*                  horizontal edge when the boundary strength is set to 4   */
    249 /*                  in high profile.                                         */
    250 /*                                                                           */
    251 /*  Inputs        : pu1_src    - pointer to the src sample q0 of U           */
    252 /*                  src_strd   - source stride                               */
    253 /*                  alpha_cb   - alpha value for the boundary in U           */
    254 /*                  beta_cb    - beta value for the boundary in U            */
    255 /*                  alpha_cr   - alpha value for the boundary in V           */
    256 /*                  beta_cr    - beta value for the boundary in V            */
    257 /*                                                                           */
    258 /*  Globals       : None                                                     */
    259 /*                                                                           */
    260 /*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
    261 /*                  title "Filtering process for edges for bS equal to 4" in */
    262 /*                  ITU T Rec H.264 with alpha and beta values different in  */
    263 /*                  U and V.                                                 */
    264 /*                                                                           */
    265 /*  Outputs       : None                                                     */
    266 /*                                                                           */
    267 /*  Returns       : None                                                     */
    268 /*                                                                           */
    269 /*  Issues        : None                                                     */
    270 /*                                                                           */
    271 /*  Revision History:                                                        */
    272 /*                                                                           */
    273 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
    274 /*         12 02 2015   Naveen Kumar P  Initial version                      */
    275 /*                                                                           */
    276 /*****************************************************************************/
    277 void ih264_deblk_chroma_horz_bs4_ssse3(UWORD8 *pu1_src,
    278                                        WORD32 src_strd,
    279                                        WORD32 alpha_cb,
    280                                        WORD32 beta_cb,
    281                                        WORD32 alpha_cr,
    282                                        WORD32 beta_cr)
    283 {
    284     UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
    285     WORD16 i16_posP1, i16_posP0, i16_posQ1;
    286 
    287     UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */
    288     WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
    289     WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
    290     __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
    291     __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
    292     __m128i flag1, flag2;
    293     __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
    294     __m128i zero = _mm_setzero_si128();
    295     __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
    296     __m128i temp1, temp2;
    297 
    298     pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);
    299 
    300     i16_posQ1 = src_strd;
    301     i16_posP0 = src_strd;
    302     i16_posP1 = 0;
    303 
    304     q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv));
    305     q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1));
    306     p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1));
    307     p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0));
    308 
    309     q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
    310     q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
    311     p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
    312     p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
    313 
    314     diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
    315     diff = _mm_abs_epi16(diff);
    316     alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    317     flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
    318 
    319     diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
    320     diff = _mm_abs_epi16(diff);
    321     beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    322     flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
    323 
    324     diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
    325     diff = _mm_abs_epi16(diff);
    326     flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
    327 
    328     temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
    329     temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
    330     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    331     temp1 = _mm_add_epi16(temp1, temp2);
    332     p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
    333 
    334     temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
    335     temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
    336     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    337     temp1 = _mm_add_epi16(temp1, temp2);
    338     q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
    339 
    340     q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
    341     q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
    342     p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
    343     p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
    344 
    345     diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
    346     diff = _mm_abs_epi16(diff);
    347     alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    348     flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
    349 
    350     diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
    351     diff = _mm_abs_epi16(diff);
    352     beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    353     flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
    354 
    355     diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
    356     diff = _mm_abs_epi16(diff);
    357     flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
    358 
    359     temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
    360     temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
    361     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    362     temp1 = _mm_add_epi16(temp1, temp2);
    363     p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
    364 
    365     temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
    366     temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
    367     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    368     temp1 = _mm_add_epi16(temp1, temp2);
    369     q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
    370 
    371     p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
    372     q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
    373 
    374     flag1 = _mm_packs_epi16(flag1, flag2);
    375 
    376     p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
    377                                  _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    378     p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
    379     p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
    380     _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1);
    381 
    382     q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
    383                                  _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    384     q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
    385     q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
    386     _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1);
    387 
    388 }
    389 
    390 /*****************************************************************************/
    391 /*                                                                           */
    392 /*  Function Name : ih264_deblk_chroma_vert_bslt4_ssse3()                    */
    393 /*                                                                           */
    394 /*  Description   : This function performs filtering of a chroma block       */
    395 /*                  vertical edge when the boundary strength is less than 4  */
    396 /*                  in high profile.                                         */
    397 /*                                                                           */
    398 /*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
    399 /*                  src_strd         - source stride                         */
    400 /*                  alpha_cb         - alpha value for the boundary in U     */
    401 /*                  beta_cb          - beta value for the boundary in U      */
    402 /*                  alpha_cr         - alpha value for the boundary in V     */
    403 /*                  beta_cr          - beta value for the boundary in V      */
    404 /*                  u4_bs            - packed Boundary strength array        */
    405 /*                  pu1_cliptab_cb   - tc0_table for U                       */
    406 /*                  pu1_cliptab_cr   - tc0_table for V                       */
    407 /*                                                                           */
    408 /*  Globals       : None                                                     */
    409 /*                                                                           */
    410 /*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
    411 /*                  title "Filtering process for edges for bS less than 4"   */
    412 /*                  in ITU T Rec H.264 with alpha and beta values different  */
    413 /*                  in U and V.                                              */
    414 /*                                                                           */
    415 /*  Outputs       : None                                                     */
    416 /*                                                                           */
    417 /*  Returns       : None                                                     */
    418 /*                                                                           */
    419 /*  Issues        : None                                                     */
    420 /*                                                                           */
    421 /*  Revision History:                                                        */
    422 /*                                                                           */
    423 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
    424 /*         12 02 2015   Naveen Kumar P  Initial version                      */
    425 /*                                                                           */
    426 /*****************************************************************************/
    427 void ih264_deblk_chroma_vert_bslt4_ssse3(UWORD8 *pu1_src,
    428                                          WORD32 src_strd,
    429                                          WORD32 alpha_cb,
    430                                          WORD32 beta_cb,
    431                                          WORD32 alpha_cr,
    432                                          WORD32 beta_cr,
    433                                          UWORD32 u4_bs,
    434                                          const UWORD8 *pu1_cliptab_cb,
    435                                          const UWORD8 *pu1_cliptab_cr)
    436 {
    437     UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
    438     UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
    439     WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
    440     WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
    441     __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
    442     __m128i temp1, temp2, temp3, temp4;
    443 
    444     __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
    445     __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
    446     __m128i flag_bs, flag1, flag2;
    447     __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
    448     __m128i zero = _mm_setzero_si128();
    449     __m128i C0_uv_8x16;
    450     __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
    451 
    452     u1_Bs0 = (u4_bs >> 24) & 0xff;
    453     u1_Bs1 = (u4_bs >> 16) & 0xff;
    454     u1_Bs2 = (u4_bs >> 8) & 0xff;
    455     u1_Bs3 = (u4_bs >> 0) & 0xff;
    456 
    457     flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
    458                            u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
    459                            u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
    460     flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
    461     flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask
    462 
    463     /* Load and transpose the pixel values */
    464     linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
    465     lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
    466     linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
    467     lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
    468     linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
    469     linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd));
    470     lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
    471     lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd));
    472 
    473     temp1 = _mm_unpacklo_epi16(linea, lineb);
    474     temp2 = _mm_unpacklo_epi16(linec, lined);
    475     temp3 = _mm_unpacklo_epi16(linee, linef);
    476     temp4 = _mm_unpacklo_epi16(lineg, lineh);
    477 
    478     p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2);
    479     p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4);
    480     q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2);
    481     q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4);
    482 
    483     p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16);
    484     p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16);
    485     q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16);
    486     q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16);
    487     /* End of transpose */
    488 
    489     q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
    490     q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
    491     p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
    492     p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
    493 
    494     diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
    495     diff = _mm_abs_epi16(diff);
    496     alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    497     flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
    498 
    499     diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
    500     diff = _mm_abs_epi16(diff);
    501     beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    502     flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
    503 
    504     diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
    505     diff = _mm_abs_epi16(diff);
    506     flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
    507 
    508     diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
    509     diff = _mm_slli_epi16(diff, 2);
    510     diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
    511     diff = _mm_add_epi16(diff, diff1);
    512     diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
    513     in_macro = _mm_srai_epi16(diff, 3);
    514 
    515     C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
    516                                pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
    517                                pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
    518                                pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
    519 
    520     C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
    521 
    522     in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
    523     C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
    524     in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
    525 
    526     p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
    527     q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);
    528 
    529     q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
    530     q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
    531     p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
    532     p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
    533 
    534     diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
    535     diff = _mm_abs_epi16(diff);
    536     alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    537     flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
    538 
    539     diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
    540     diff = _mm_abs_epi16(diff);
    541     beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    542     flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
    543 
    544     diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
    545     diff = _mm_abs_epi16(diff);
    546     flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
    547 
    548     diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
    549     diff = _mm_slli_epi16(diff, 2);
    550     diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
    551     diff = _mm_add_epi16(diff, diff1);
    552     diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
    553     in_macro = _mm_srai_epi16(diff, 3);
    554 
    555     C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
    556                                pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
    557                                pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
    558                                pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);
    559 
    560     C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
    561 
    562     in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
    563     C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
    564     in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
    565 
    566     p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro);
    567     q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro);
    568 
    569     p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
    570     q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
    571 
    572     flag1 = _mm_packs_epi16(flag1, flag2);
    573     flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
    574 
    575     p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
    576                                  _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    577     p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
    578     p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
    579 
    580     q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
    581                                  _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    582     q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
    583     q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
    584 
    585     /* Inverse-transpose and store back */
    586     temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
    587     temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8);
    588     temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
    589     temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8);
    590 
    591     linea = _mm_unpacklo_epi32(temp1, temp3);
    592     lineb = _mm_srli_si128(linea, 8);
    593     linec = _mm_unpackhi_epi32(temp1, temp3);
    594     lined = _mm_srli_si128(linec, 8);
    595     linee = _mm_unpacklo_epi32(temp2, temp4);
    596     linef = _mm_srli_si128(linee, 8);
    597     lineg = _mm_unpackhi_epi32(temp2, temp4);
    598     lineh = _mm_srli_si128(lineg, 8);
    599 
    600     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
    601     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
    602     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
    603     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
    604     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
    605     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
    606     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
    607     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);
    608 
    609 }
    610 
    611 /*****************************************************************************/
    612 /*                                                                           */
    613 /*  Function Name : ih264_deblk_chroma_horz_bslt4_ssse3()                    */
    614 /*                                                                           */
    615 /*  Description   : This function performs filtering of a chroma block       */
    616 /*                  horizontal edge when the boundary strength is less than  */
    617 /*                  4 in high profile.                                       */
    618 /*                                                                           */
    619 /*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
    620 /*                  src_strd         - source stride                         */
    621 /*                  alpha_cb         - alpha value for the boundary in U     */
    622 /*                  beta_cb          - beta value for the boundary in U      */
    623 /*                  alpha_cr         - alpha value for the boundary in V     */
    624 /*                  beta_cr          - beta value for the boundary in V      */
    625 /*                  u4_bs            - packed Boundary strength array        */
    626 /*                  pu1_cliptab_cb   - tc0_table for U                       */
    627 /*                  pu1_cliptab_cr   - tc0_table for V                       */
    628 /*                                                                           */
    629 /*  Globals       : None                                                     */
    630 /*                                                                           */
    631 /*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
    632 /*                  title "Filtering process for edges for bS less than 4"   */
    633 /*                  in ITU T Rec H.264 with alpha and beta values different  */
    634 /*                  in U and V.                                              */
    635 /*                                                                           */
    636 /*  Outputs       : None                                                     */
    637 /*                                                                           */
    638 /*  Returns       : None                                                     */
    639 /*                                                                           */
    640 /*  Issues        : None                                                     */
    641 /*                                                                           */
    642 /*  Revision History:                                                        */
    643 /*                                                                           */
    644 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
    645 /*         12 02 2015   Naveen Kumar P  Initial version                      */
    646 /*                                                                           */
    647 /*****************************************************************************/
    648 void ih264_deblk_chroma_horz_bslt4_ssse3(UWORD8 *pu1_src,
    649                                          WORD32 src_strd,
    650                                          WORD32 alpha_cb,
    651                                          WORD32 beta_cb,
    652                                          WORD32 alpha_cr,
    653                                          WORD32 beta_cr,
    654                                          UWORD32 u4_bs,
    655                                          const UWORD8 *pu1_cliptab_cb,
    656                                          const UWORD8 *pu1_cliptab_cr)
    657 {
    658     UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
    659     WORD16 i16_posP1, i16_posP0, i16_posQ1;
    660     UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
    661 
    662     UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */
    663     WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
    664     WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
    665     __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
    666     __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
    667     __m128i flag_bs, flag1, flag2;
    668     __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
    669     __m128i zero = _mm_setzero_si128();
    670     __m128i C0_uv_8x16;
    671     __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
    672 
    673     pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);
    674 
    675     i16_posQ1 = src_strd;
    676     i16_posP0 = src_strd;
    677     i16_posP1 = 0;
    678 
    679     u1_Bs0 = (u4_bs >> 24) & 0xff;
    680     u1_Bs1 = (u4_bs >> 16) & 0xff;
    681     u1_Bs2 = (u4_bs >> 8) & 0xff;
    682     u1_Bs3 = (u4_bs >> 0) & 0xff;
    683 
    684     flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
    685                            u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
    686                            u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
    687     flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
    688     flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask
    689 
    690     q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv));
    691     q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1));
    692     p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1));
    693     p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0));
    694 
    695     q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
    696     q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
    697     p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
    698     p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
    699 
    700     diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
    701     diff = _mm_abs_epi16(diff);
    702     alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    703     flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
    704 
    705     diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
    706     diff = _mm_abs_epi16(diff);
    707     beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    708     flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
    709 
    710     diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
    711     diff = _mm_abs_epi16(diff);
    712     flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
    713 
    714     diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
    715     diff = _mm_slli_epi16(diff, 2);
    716     diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
    717     diff = _mm_add_epi16(diff, diff1);
    718     diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
    719     in_macro = _mm_srai_epi16(diff, 3);
    720 
    721     C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
    722                                pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
    723                                pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
    724                                pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
    725 
    726     C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
    727 
    728     in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
    729     C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
    730     in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
    731 
    732     p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
    733     q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);
    734 
    735     q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
    736     q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
    737     p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
    738     p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
    739 
    740     diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
    741     diff = _mm_abs_epi16(diff);
    742     alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    743     flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
    744 
    745     diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
    746     diff = _mm_abs_epi16(diff);
    747     beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    748     flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
    749 
    750     diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
    751     diff = _mm_abs_epi16(diff);
    752     flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
    753 
    754     diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
    755     diff = _mm_slli_epi16(diff, 2);
    756     diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
    757     diff = _mm_add_epi16(diff, diff1);
    758     diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
    759     in_macro = _mm_srai_epi16(diff, 3);
    760 
    761     C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
    762                                pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
    763                                pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
    764                                pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);
    765 
    766     C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
    767 
    768     in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
    769     C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
    770     in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
    771 
    772     p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro);
    773     q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro);
    774 
    775     p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
    776     q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
    777 
    778     flag1 = _mm_packs_epi16(flag1, flag2);
    779     flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
    780 
    781     p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
    782                                  _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    783     p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
    784     p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
    785     _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1);
    786 
    787     q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
    788                                  _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    789     q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
    790     q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
    791     _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1);
    792 
    793 }
    794 
    795 /*****************************************************************************/
    796 /*                                                                           */
    797 /*  Function Name : ih264_deblk_chroma_vert_bs4_mbaff_ssse3()                */
    798 /*                                                                           */
    799 /*  Description   : This function performs filtering of a chroma block       */
    800 /*                  vertical edge when boundary strength is set to 4 in high */
    801 /*                  profile.                                                 */
    802 /*                                                                           */
    803 /*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
    804 /*                  src_strd         - source stride                         */
    805 /*                  alpha_cb         - alpha value for the boundary in U     */
    806 /*                  beta_cb          - beta value for the boundary in U      */
    807 /*                  alpha_cr         - alpha value for the boundary in V     */
    808 /*                  beta_cr          - beta value for the boundary in V      */
    809 /*                  u4_bs            - packed Boundary strength array        */
    810 /*                  pu1_cliptab_cb   - tc0_table for U                       */
    811 /*                  pu1_cliptab_cr   - tc0_table for V                       */
    812 /*                                                                           */
    813 /*  Globals       : None                                                     */
    814 /*                                                                           */
    815 /*  Processing    : When the function is called twice, this operation is as  */
    816 /*                  described in Sec. 8.7.2.4 under the title "Filtering     */
    817 /*                  process for edges for bS equal to 4" in ITU T Rec H.264  */
    818 /*                  with alpha and beta values different in U and V.         */
    819 /*                                                                           */
    820 /*  Outputs       : None                                                     */
    821 /*                                                                           */
    822 /*  Returns       : None                                                     */
    823 /*                                                                           */
    824 /*  Issues        : None                                                     */
    825 /*                                                                           */
    826 /*  Revision History:                                                        */
    827 /*                                                                           */
    828 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
    829 /*         12 02 2015   Naveen Kumar P  Initial version                      */
    830 /*                                                                           */
    831 /*****************************************************************************/
    832 void ih264_deblk_chroma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src,
    833                                              WORD32 src_strd,
    834                                              WORD32 alpha_cb,
    835                                              WORD32 beta_cb,
    836                                              WORD32 alpha_cr,
    837                                              WORD32 beta_cr)
    838 {
    839     UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
    840     WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
    841     WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
    842     __m128i linea, lineb, linec, lined;
    843     __m128i temp1, temp2;
    844 
    845     __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
    846     __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
    847     __m128i flag1;
    848     __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
    849     __m128i zero = _mm_setzero_si128();
    850     __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
    851 
    852     /* Load and transpose the pixel values */
    853     linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
    854     lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
    855     linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
    856     lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
    857 
    858     temp1 = _mm_unpacklo_epi16(linea, lineb);
    859     temp2 = _mm_unpacklo_epi16(linec, lined);
    860 
    861     p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2);
    862     p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8);
    863     q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2);
    864     q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8);
    865     /* End of transpose */
    866 
    867     q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
    868     q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
    869     p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
    870     p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
    871 
    872     diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
    873     diff = _mm_abs_epi16(diff);
    874     alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    875     flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
    876 
    877     diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
    878     diff = _mm_abs_epi16(diff);
    879     beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    880     flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
    881 
    882     diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
    883     diff = _mm_abs_epi16(diff);
    884     flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
    885 
    886     temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
    887     temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
    888     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    889     temp1 = _mm_add_epi16(temp1, temp2);
    890     p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
    891 
    892     temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
    893     temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
    894     temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    895     temp1 = _mm_add_epi16(temp1, temp2);
    896     q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
    897 
    898     p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1);
    899     q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1);
    900 
    901     flag1 = _mm_packs_epi16(flag1, flag1);
    902 
    903     p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
    904                                  _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    905     p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
    906     p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
    907 
    908     q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
    909                                  _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    910     q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
    911     q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
    912 
    913     /* Inverse-transpose and store back */
    914     temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
    915     temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
    916 
    917     linea = _mm_unpacklo_epi32(temp1, temp2);
    918     lineb = _mm_srli_si128(linea, 8);
    919     linec = _mm_unpackhi_epi32(temp1, temp2);
    920     lined = _mm_srli_si128(linec, 8);
    921 
    922     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
    923     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
    924     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
    925     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
    926 
    927 }
    928 
    929 /*****************************************************************************/
    930 /*                                                                           */
    931 /*  Function Name : ih264_deblk_chroma_vert_bslt4_mbaff_ssse3()              */
    932 /*                                                                           */
    933 /*  Description   : This function performs filtering of a chroma block       */
    934 /*                  vertical edge when boundary strength is less than 4 in   */
    935 /*                  high profile.                                            */
    936 /*                                                                           */
    937 /*  Inputs        : pu1_src          - pointer to the src sample q0 of U     */
    938 /*                  src_strd         - source stride                         */
    939 /*                  alpha_cb         - alpha value for the boundary in U     */
    940 /*                  beta_cb          - beta value for the boundary in U      */
    941 /*                  alpha_cr         - alpha value for the boundary in V     */
    942 /*                  beta_cr          - beta value for the boundary in V      */
    943 /*                  u4_bs            - packed Boundary strength array        */
    944 /*                  pu1_cliptab_cb   - tc0_table for U                       */
    945 /*                  pu1_cliptab_cr   - tc0_table for V                       */
    946 /*                                                                           */
    947 /*  Globals       : None                                                     */
    948 /*                                                                           */
    949 /*  Processing    : When the function is called twice, this operation is as  */
    950 /*                  described in Sec. 8.7.2.4 under the title "Filtering     */
    951 /*                  process for edges for bS less than 4" in ITU T Rec H.264 */
    952 /*                  with alpha and beta values different in U and V.         */
    953 /*                                                                           */
    954 /*  Outputs       : None                                                     */
    955 /*                                                                           */
    956 /*  Returns       : None                                                     */
    957 /*                                                                           */
    958 /*  Issues        : None                                                     */
    959 /*                                                                           */
    960 /*  Revision History:                                                        */
    961 /*                                                                           */
    962 /*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
    963 /*         12 02 2015   Naveen Kumar P  Initial version                      */
    964 /*                                                                           */
    965 /*****************************************************************************/
    966 void ih264_deblk_chroma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src,
    967                                                WORD32 src_strd,
    968                                                WORD32 alpha_cb,
    969                                                WORD32 beta_cb,
    970                                                WORD32 alpha_cr,
    971                                                WORD32 beta_cr,
    972                                                UWORD32 u4_bs,
    973                                                const UWORD8 *pu1_cliptab_cb,
    974                                                const UWORD8 *pu1_cliptab_cr)
    975 {
    976     UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
    977     UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
    978     WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
    979     WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
    980     __m128i linea, lineb, linec, lined;
    981     __m128i temp1, temp2;
    982 
    983     __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
    984     __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
    985     __m128i flag_bs, flag1;
    986     __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
    987     __m128i zero = _mm_setzero_si128();
    988     __m128i C0_uv_8x16;
    989     __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
    990 
    991     u1_Bs0 = (u4_bs >> 24) & 0xff;
    992     u1_Bs1 = (u4_bs >> 16) & 0xff;
    993     u1_Bs2 = (u4_bs >> 8) & 0xff;
    994     u1_Bs3 = (u4_bs >> 0) & 0xff;
    995 
    996     flag_bs = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, u1_Bs3, u1_Bs3, u1_Bs2,
    997                            u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs0, u1_Bs0);
    998     flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
    999     flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask
   1000 
   1001     /* Load and transpose the pixel values */
   1002     linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
   1003     lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
   1004     linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
   1005     lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
   1006 
   1007     temp1 = _mm_unpacklo_epi16(linea, lineb);
   1008     temp2 = _mm_unpacklo_epi16(linec, lined);
   1009 
   1010     p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2);
   1011     p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8);
   1012     q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2);
   1013     q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8);
   1014     /* End of transpose */
   1015 
   1016     q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
   1017     q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
   1018     p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
   1019     p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
   1020 
   1021     diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
   1022     diff = _mm_abs_epi16(diff);
   1023     alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
   1024     flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
   1025 
   1026     diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
   1027     diff = _mm_abs_epi16(diff);
   1028     beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
   1029     flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
   1030 
   1031     diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
   1032     diff = _mm_abs_epi16(diff);
   1033     flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
   1034 
   1035     diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
   1036     diff = _mm_slli_epi16(diff, 2);
   1037     diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
   1038     diff = _mm_add_epi16(diff, diff1);
   1039     diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
   1040     in_macro = _mm_srai_epi16(diff, 3);
   1041 
   1042     C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
   1043                                pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
   1044                                pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
   1045                                pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);
   1046 
   1047     C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));
   1048 
   1049     in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
   1050     C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
   1051     in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);
   1052 
   1053     p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
   1054     q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);
   1055 
   1056     p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1);
   1057     q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1);
   1058 
   1059     flag1 = _mm_packs_epi16(flag1, flag1);
   1060     flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)
   1061 
   1062     p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
   1063                                  _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
   1064     p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
   1065     p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
   1066 
   1067     q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
   1068                                  _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
   1069     q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
   1070     q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
   1071 
   1072     /* Inverse-transpose and store back */
   1073     temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
   1074     temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
   1075 
   1076     linea = _mm_unpacklo_epi32(temp1, temp2);
   1077     lineb = _mm_srli_si128(linea, 8);
   1078     linec = _mm_unpackhi_epi32(temp1, temp2);
   1079     lined = _mm_srli_si128(linec, 8);
   1080 
   1081     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
   1082     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
   1083     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
   1084     _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
   1085 
   1086 }
   1087 
   1088