Home | History | Annotate | Download | only in x86
      1 /******************************************************************************
      2 *
      3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 *
      5 * Licensed under the Apache License, Version 2.0 (the "License");
      6 * you may not use this file except in compliance with the License.
      7 * You may obtain a copy of the License at:
      8 *
      9 * http://www.apache.org/licenses/LICENSE-2.0
     10 *
     11 * Unless required by applicable law or agreed to in writing, software
     12 * distributed under the License is distributed on an "AS IS" BASIS,
     13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 * See the License for the specific language governing permissions and
     15 * limitations under the License.
     16 *
     17 ******************************************************************************/
     18 /**
     19 *******************************************************************************
     20 * @file
     21 *  ihevc_deblck_atom_intr.c
     22 *
     23 * @brief
     24 *  Contains function definitions for deblocking filters
     25 *
     26 * @author
     27 *  Rishab
     28 *
     29 * @par List of Functions:
     30 *   - ihevc_deblk_luma_vert_ssse3()
     31 *   - ihevc_deblk_luma_horz_ssse3()
     32 *   - ihevc_deblk_chroma_vert_ssse3()
     33 *   - ihevc_deblk_chroma_horz_ssse3()
     34 *
     35 * @remarks
     36 *  None
     37 *
     38 *******************************************************************************
     39 */
     40 #include <stdlib.h>
     41 #include <stdio.h>
     42 #include <assert.h>
     43 #include "ihevc_typedefs.h"
     44 #include "ihevc_platform_macros.h"
     45 #include "ihevc_macros.h"
     46 #include "ihevc_deblk.h"
     47 #include "ihevc_deblk_tables.h"
     48 #include "ihevc_debug.h"
     49 
     50 #include "ihevc_tables_x86_intr.h"
     51 
     52 #include <immintrin.h>
     53 /**
     54 *******************************************************************************
     55 *
     56 * @brief
     57 *       Decision process and filtering for the luma block vertical edge.
     58 *
     59 * @par Description:
     60 *     The decision process for the luma block vertical edge is  carried out and
     61 *     an appropriate filter is applied. The  boundary filter strength, bs should
     62 *     be greater than 0.  The pcm flags and the transquant bypass flags should
     63 *     be  taken care of by the calling function.
     64 *
     65 * @param[in] pu1_src
     66 *  Pointer to the src sample q(0,0)
     67 *
     68 * @param[in] src_strd
     69 *  Source stride
     70 *
     71 * @param[in] bs
     72 *  Boundary filter strength of q(0,0)
     73 *
     74 * @param[in] quant_param_p
     75 *  quantization parameter of p block
     76 *
     77 * @param[in] quant_param_q
     78 *  quantization parameter of p block
     79 *
     80 * @param[in] beta_offset_div2
     81 *
     82 *
     83 * @param[in] tc_offset_div2
     84 *
     85 *
     86 * @param[in] filter_flag_p
     87 *  flag whether to filter the p block
     88 *
     89 * @param[in] filter_flag_q
     90 *  flag whether to filter the q block
     91 *
     92 * @returns
     93 *
     94 * @remarks
     95 *  None
     96 *
     97 *******************************************************************************
     98 */
     99 
void ihevc_deblk_luma_vert_ssse3(UWORD8 *pu1_src,
                                 WORD32 src_strd,
                                 WORD32 bs,
                                 WORD32 quant_param_p,
                                 WORD32 quant_param_q,
                                 WORD32 beta_offset_div2,
                                 WORD32 tc_offset_div2,
                                 WORD32 filter_flag_p,
                                 WORD32 filter_flag_q)
{
    WORD32 qp_luma, beta_indx, tc_indx;
    WORD32 beta, tc;
    WORD32 d, dp, dq, d_sam0, d_sam3;

    WORD32 d3, d0, de_0, de_1, de_2, de_3;
    WORD32 de, dep, deq;
    /* one register per row; each row holds p3..p0 q0..q3 (8 pixels across the edge) */
    __m128i src_row0_8x16b, src_row1_8x16b, src_row2_8x16b, src_row3_8x16b;


    {
        __m128i src_tmp_8x16b, coef_8x16b, mask_d_result_4x32b, mask_de_result_8x16b;
        __m128i mask_16x8b, temp_coef0_8x16b, temp_coef1_8x16b;



        ASSERT((bs > 0) && (bs <= 3));
        ASSERT(filter_flag_p || filter_flag_q);

        /* average QP of the p and q blocks selects the beta/tc thresholds */
        qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
        beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);

        /* BS based on implementation can take value 3 if it is intra/inter edge          */
        /* based on BS, tc index is calculated by adding 2 * ( bs - 1) to QP and tc_offset */
        /* for BS = 1 adding factor is (0*2), BS = 2 or 3 adding factor is (1*2)          */
        /* the above desired functionality is achieved by doing (2*(bs>>1))               */

        tc_indx = CLIP3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53);

        beta = gai4_ihevc_beta_table[beta_indx];
        tc = gai4_ihevc_tc_table[tc_indx];
        /* tc == 0 means no filtering for this edge at all */
        if(0 == tc)
        {
            return;
        }
        /* only rows 0 and 3 are needed for the on/off and strong/weak decision */
        src_row0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 4));
        src_row3_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd));

        coef_8x16b = _mm_load_si128((__m128i *)(coef_d));
        mask_16x8b =  _mm_load_si128((__m128i *)(shuffle_d));

        /* rows 0 and 3 packed into one register: low qword = row0, high qword = row3 */
        src_tmp_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row3_8x16b);
        mask_de_result_8x16b = _mm_shuffle_epi8(src_tmp_8x16b, mask_16x8b);

        /* p2-2p1+p0 / q2-2q1+q0 style second differences via signed byte madd */
        mask_d_result_4x32b = _mm_maddubs_epi16(src_tmp_8x16b, coef_8x16b);


        //to get all 1's of 8 bit in (1)
        temp_coef0_8x16b = _mm_cmpeq_epi16(src_tmp_8x16b, src_tmp_8x16b);
        temp_coef1_8x16b = _mm_srli_epi16(temp_coef0_8x16b, 15);
        //accumulating values for dp3 dq3 , dp0 dq0 values
        mask_d_result_4x32b = _mm_madd_epi16(mask_d_result_4x32b, temp_coef1_8x16b);

        temp_coef1_8x16b = _mm_packus_epi16(temp_coef1_8x16b, temp_coef1_8x16b);
        // to get all 1,-1 sets of 16 bits in (0)
        temp_coef0_8x16b = _mm_unpacklo_epi8(temp_coef0_8x16b, temp_coef1_8x16b);
        //q33-q30,p33-p30,q03-q00,p03-p00,0,q30-p30,0,q00-p00
        mask_de_result_8x16b = _mm_maddubs_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
        //to get 16 bit 1's
        temp_coef0_8x16b = _mm_srli_epi16(temp_coef1_8x16b, 8);


        // dq3 dp3 dq0 dp0
        mask_d_result_4x32b = _mm_abs_epi32(mask_d_result_4x32b);
        mask_16x8b = _mm_shuffle_epi32(mask_d_result_4x32b, 0xec);
        mask_d_result_4x32b = _mm_shuffle_epi32(mask_d_result_4x32b, 0x49);
        // dq dp d3 d0
        mask_d_result_4x32b = _mm_add_epi32(mask_d_result_4x32b, mask_16x8b);
        //|q33-q30|,|p33-p30|,|q03-q00|,|p03-p00|,0,|q30-p30|,0,|q00-p00|
        mask_de_result_8x16b = _mm_abs_epi16(mask_de_result_8x16b);
        //|q33-q30|+|p33-p30|,|q03-q00|+|p03-p00|,0+|q30-p30|,0+|q00-p00|
        mask_de_result_8x16b = _mm_madd_epi16(mask_de_result_8x16b, temp_coef0_8x16b);

        ///store back in a single variable
        temp_coef0_8x16b = _mm_srli_si128(mask_d_result_4x32b, 4);
        temp_coef1_8x16b = _mm_srli_si128(mask_d_result_4x32b, 8);
        mask_16x8b = _mm_srli_si128(mask_d_result_4x32b, 12);

        /* extract the scalar activity measures from the SIMD lanes */
        d0 = _mm_cvtsi128_si32(mask_d_result_4x32b);
        d3 = _mm_cvtsi128_si32(temp_coef0_8x16b);
        dp = _mm_cvtsi128_si32(temp_coef1_8x16b);
        dq = _mm_cvtsi128_si32(mask_16x8b);
        //getting d
        d = d0 + d3;

        ///store back in a single variable
        temp_coef0_8x16b = _mm_srli_si128(mask_de_result_8x16b, 4);
        temp_coef1_8x16b = _mm_srli_si128(mask_de_result_8x16b, 8);
        mask_16x8b = _mm_srli_si128(mask_de_result_8x16b, 12);

        /* strong-filter decision terms for rows 0 and 3 */
        de_0 = _mm_cvtsi128_si32(mask_de_result_8x16b);
        de_1 = _mm_cvtsi128_si32(temp_coef0_8x16b);
        de_2 = _mm_cvtsi128_si32(temp_coef1_8x16b);
        de_3 = _mm_cvtsi128_si32(mask_16x8b);

        /* de: 0 = no filtering, 1 = normal (weak) filter, 2 = strong filter */
        de = 0;
        dep = 0;
        deq = 0;
        if(d < beta)
        {
            /* strong-filter check on row 0 */
            d_sam0 = 0;
            if((2 * d0 < (beta >> 2))
                            && (de_2 < (beta >> 3))
                            && (de_0 < ((5 * tc + 1) >> 1)))
            {
                d_sam0 = 1;
            }

            /* strong-filter check on row 3 */
            d_sam3 = 0;
            if((2 * d3 < (beta >> 2))
                            && (de_3 < (beta >> 3))
                            && de_1 < ((5 * tc + 1) >> 1))
            {
                d_sam3 = 1;
            }

            /* strong filter only when both sample rows pass */
            de = (d_sam0 & d_sam3) + 1;
            /* side activity decides whether p1/q1 are modified by the weak filter */
            dep = (dp < (beta + (beta >> 1)) >> 3) ? 1 : 0;
            deq = (dq < (beta + (beta >> 1)) >> 3) ? 1 : 0;
            if(tc <= 1)
            {
                dep = 0;
                deq = 0;
            }
        }

    }

    if(de != 0)
    {


        /* decision used only rows 0 and 3; fetch the remaining two rows now */
        src_row1_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + src_strd));
        src_row2_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd));

        if(de == 2)
        {
            /* ---- strong filter: modifies p2..p0 and q0..q2, clipped to +/-2*tc ---- */
            __m128i temp_pq_str0_16x8b;
            __m128i temp_pq1_str0_16x8b, temp_pq1_str1_16x8b;
            __m128i temp_pq2_str0_16x8b;
            __m128i temp_pq_str1_16x8b;
            __m128i temp_str0_16x8b, temp_str1_16x8b, temp_str2_16x8b, temp_str3_16x8b;
            __m128i temp_max0_16x8b, temp_max1_16x8b, temp_min0_16x8b, temp_min1_16x8b;
            __m128i const2_8x16b, const2tc_8x16b;
            LWORD64 mask, tc2;
            /* strong filter clip range is +/- 2*tc */
            tc = tc << 1;
            /* per-side enable mask: sign bits select q (bit 63) and p (bit 31) halves */
            mask = (((LWORD64)filter_flag_q) << 63) | (((LWORD64)filter_flag_p) << 31);
            tc2 = ((LWORD64)tc);

            const2_8x16b = _mm_cmpeq_epi16(src_row0_8x16b, src_row0_8x16b);
            //q'0-q'1-2 ,p'0-p'1-2
            src_row0_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row2_8x16b);
            src_row1_8x16b = _mm_unpacklo_epi64(src_row1_8x16b, src_row3_8x16b);

            const2_8x16b = _mm_srli_epi16(const2_8x16b, 15);
            temp_pq_str0_16x8b = _mm_srli_epi64(src_row0_8x16b, 16);
            temp_pq_str1_16x8b = _mm_srli_epi64(src_row1_8x16b, 16);
            //arranged x x x x x x x x q31 q30 q1 q10 p30 p31 p10 p11 , x x x x x x x x q21 q20 q01 q00 p20 p21 p00 p01
            temp_str0_16x8b = _mm_unpacklo_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);
            temp_str1_16x8b = _mm_unpackhi_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);

            const2_8x16b = _mm_packus_epi16(const2_8x16b, const2_8x16b);
            //arranged q31 q30 q21 q20 q1 q10 q01 q00 p30 p31 p20 p21 p10 p11 p00 p01
            temp_pq_str0_16x8b = _mm_unpacklo_epi32(temp_str0_16x8b, temp_str1_16x8b);

            temp_pq_str0_16x8b = _mm_maddubs_epi16(temp_pq_str0_16x8b, const2_8x16b);

            //q'1-2, p'1-2
            temp_pq1_str0_16x8b = _mm_srli_epi64(src_row0_8x16b, 8);
            temp_pq1_str1_16x8b = _mm_srli_epi64(src_row1_8x16b, 8);

            temp_str2_16x8b = _mm_unpacklo_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
            temp_str3_16x8b = _mm_unpackhi_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);

            temp_str2_16x8b = _mm_shuffle_epi32(temp_str2_16x8b, 0x58);
            temp_str3_16x8b = _mm_shuffle_epi32(temp_str3_16x8b, 0x58);
            // q30 p30 q20 p20 q10 p10 q01 q00 p30 q20 p20 q10 p10 q01 q00 p00
            temp_pq1_str0_16x8b = _mm_unpackhi_epi32(temp_str2_16x8b, temp_str3_16x8b);
            // q32 q31 q22 q21 q12 q11 q02 q01 p32 p31 p22 p21 p12 p11 p02 p01
            temp_pq1_str1_16x8b = _mm_unpacklo_epi32(temp_str2_16x8b, temp_str3_16x8b);

            temp_pq1_str0_16x8b = _mm_maddubs_epi16(temp_pq1_str0_16x8b, const2_8x16b);
            temp_pq1_str1_16x8b = _mm_maddubs_epi16(temp_pq1_str1_16x8b, const2_8x16b);

            //clipping mask design
            temp_str1_16x8b = _mm_setzero_si128();
            temp_str0_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
            const2tc_8x16b  = _mm_loadl_epi64((__m128i *)(&tc2));
            temp_str0_16x8b = _mm_shuffle_epi32(temp_str0_16x8b, 0x44);
            /* broadcast low byte of 2*tc to all 16 lanes */
            const2tc_8x16b  = _mm_shuffle_epi8(const2tc_8x16b, temp_str1_16x8b);

            //clipping mask design
            temp_str0_16x8b = _mm_srai_epi32(temp_str0_16x8b, 31);
            /* zero out 2*tc on any side whose filter flag is off: no clip range => no change */
            const2tc_8x16b = _mm_and_si128(const2tc_8x16b, temp_str0_16x8b);
            //calculating Clipping MAX for all pixel values.
            temp_max0_16x8b = _mm_adds_epu8(src_row0_8x16b, const2tc_8x16b);
            temp_max1_16x8b = _mm_adds_epu8(src_row1_8x16b, const2tc_8x16b);


            //q'2-q'0-2,p'2-p'0-2
            temp_pq2_str0_16x8b = _mm_unpacklo_epi16(src_row0_8x16b, src_row2_8x16b);
            temp_str3_16x8b     = _mm_unpacklo_epi16(src_row1_8x16b, src_row3_8x16b);

            temp_pq2_str0_16x8b = _mm_shuffle_epi32(temp_pq2_str0_16x8b, 0x5c);
            temp_str3_16x8b     = _mm_shuffle_epi32(temp_str3_16x8b, 0x5c);

            const2_8x16b = _mm_slli_epi16(const2_8x16b, 1);
            //arranged q33 q32 q23 q22 q13 q12 q03 q02 p33 p32 p23 p22 p13 p12 p03 p02
            temp_str3_16x8b = _mm_unpacklo_epi16(temp_pq2_str0_16x8b, temp_str3_16x8b);

            temp_pq2_str0_16x8b = _mm_maddubs_epi16(temp_str3_16x8b, const2_8x16b);

            //calculating Clipping MIN for all pixel values.
            temp_min0_16x8b = _mm_subs_epu8(src_row0_8x16b, const2tc_8x16b);
            temp_min1_16x8b = _mm_subs_epu8(src_row1_8x16b, const2tc_8x16b);
            //q'0-q'1-2 ,p'0-p'1-2
            temp_pq_str1_16x8b = _mm_shuffle_epi32(temp_pq_str0_16x8b, 0x4e);
            temp_pq_str0_16x8b = _mm_add_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);
            //q'1-2 p'1-2
            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
            //to get 2 in 16 bit
            const2_8x16b = _mm_srli_epi16(const2_8x16b, 8);
            //to get q33 q23 q13 q03, p33 p23 p13 p03
            temp_pq1_str1_16x8b = _mm_slli_epi16(temp_str3_16x8b, 8);
            temp_pq_str1_16x8b = _mm_srli_epi16(temp_str3_16x8b, 8);
            temp_pq1_str1_16x8b = _mm_srli_epi16(temp_pq1_str1_16x8b, 8);

            //q'1, p'1 (adding 2)
            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, const2_8x16b);
            //q'0-q'1,p'0-p'1
            temp_pq_str0_16x8b = _mm_add_epi16(temp_pq_str0_16x8b, const2_8x16b);
            //q'2-q'1,p'2-p'1
            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq2_str0_16x8b, const2_8x16b);
            //q'0 = (q'0-q'1)+q'1 ,p'0 = (p'0-p'1)+p'1;
            temp_pq_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq_str0_16x8b);
            //q'2 = (q'2-q'1)+q'1 ,p'2 = (p'2-p'1)+p'1;
            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq2_str0_16x8b);

            //normalisation of all modified pixels
            temp_pq_str0_16x8b  = _mm_srai_epi16(temp_pq_str0_16x8b, 3);
            temp_pq1_str0_16x8b = _mm_srai_epi16(temp_pq1_str0_16x8b, 2);
            temp_pq2_str0_16x8b = _mm_srai_epi16(temp_pq2_str0_16x8b, 3);

            /* ---- transpose the filtered results back into row order ---- */
            //getting p0 p1 together and p2 p3 together
            temp_str0_16x8b = _mm_unpacklo_epi16(temp_pq1_str0_16x8b, temp_pq_str0_16x8b);
            temp_str2_16x8b = _mm_unpacklo_epi16(temp_pq1_str1_16x8b, temp_pq2_str0_16x8b);
            //getting q1 q0 together and  q3 q2 together
            temp_pq_str0_16x8b = _mm_unpackhi_epi16(temp_pq_str0_16x8b, temp_pq1_str0_16x8b);
            temp_pq2_str0_16x8b = _mm_unpackhi_epi16(temp_pq2_str0_16x8b, temp_pq_str1_16x8b);
            //getting p's of row0 row1 together and of row2 row3 together
            temp_pq_str1_16x8b = _mm_unpacklo_epi32(temp_str2_16x8b, temp_str0_16x8b);
            temp_str2_16x8b    = _mm_unpackhi_epi32(temp_str2_16x8b, temp_str0_16x8b);
            //getting q's of row0 row1 together and of row2 row3 together
            temp_str0_16x8b    = _mm_unpacklo_epi32(temp_pq_str0_16x8b, temp_pq2_str0_16x8b);
            temp_pq_str0_16x8b = _mm_unpackhi_epi32(temp_pq_str0_16x8b, temp_pq2_str0_16x8b);
            //getting values for respective rows in 16 bit
            src_row0_8x16b = _mm_unpacklo_epi64(temp_pq_str1_16x8b, temp_str0_16x8b);
            src_row1_8x16b = _mm_unpackhi_epi64(temp_pq_str1_16x8b, temp_str0_16x8b);
            src_row2_8x16b = _mm_unpacklo_epi64(temp_str2_16x8b, temp_pq_str0_16x8b);
            src_row3_8x16b = _mm_unpackhi_epi64(temp_str2_16x8b, temp_pq_str0_16x8b);
            //packing values to 8 bit
            src_row0_8x16b = _mm_packus_epi16(src_row0_8x16b, src_row2_8x16b);
            src_row1_8x16b = _mm_packus_epi16(src_row1_8x16b, src_row3_8x16b);
            /* clamp each output pixel to original +/- 2*tc (disabled sides clamp to original) */
            //Clipping MAX
            src_row0_8x16b = _mm_min_epu8(src_row0_8x16b, temp_max0_16x8b);
            src_row1_8x16b = _mm_min_epu8(src_row1_8x16b, temp_max1_16x8b);
            //Clipping MIN
            src_row0_8x16b = _mm_max_epu8(src_row0_8x16b, temp_min0_16x8b);
            src_row1_8x16b = _mm_max_epu8(src_row1_8x16b, temp_min1_16x8b);
            //separating row 2 and row 3
            src_row2_8x16b = _mm_srli_si128(src_row0_8x16b, 8);
            src_row3_8x16b = _mm_srli_si128(src_row1_8x16b, 8);

        }

        else
        {
            /* ---- normal (weak) filter: delta0 on p0/q0, optional delta1 on p1/q1 ---- */

            __m128i tmp_delta0_8x16b, tmp_delta1_8x16b, tmp_delta2_8x16b, tmp_delta3_8x16b;
            __m128i tmp0_const_8x16b, tmp1_const_8x16b, tmp2_const_8x16b, tmp3_const_8x16b;
            __m128i coefdelta_0_8x16b, mask_pq_8x16b;
            __m128i const2_8x16b, consttc_8x16b;

            LWORD64 mask1;
            /* sign-bit mask per 16-bit lane: q1 (deq), q0, p0, p1 (dep) enables */
            mask1 = (((LWORD64)(filter_flag_q & deq)) << 63) | (((LWORD64)filter_flag_q) << 47) | (((LWORD64)filter_flag_p) << 31) | (((LWORD64)(filter_flag_p & dep)) << 15);

            consttc_8x16b = _mm_set1_epi32(tc);


            src_row0_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row1_8x16b);
            src_row2_8x16b = _mm_unpacklo_epi64(src_row2_8x16b, src_row3_8x16b);

            tmp_delta2_8x16b = _mm_srli_epi64(src_row0_8x16b, 16);
            tmp_delta3_8x16b = _mm_srli_epi64(src_row2_8x16b, 16);

            tmp_delta2_8x16b = _mm_shuffle_epi32(tmp_delta2_8x16b, 0x08);
            tmp_delta3_8x16b = _mm_shuffle_epi32(tmp_delta3_8x16b, 0x08);
            //arranged q31 q30 p30 p31  q21 q20 p20 p21  q1 q10 p10 p11 q01 q00 p00 p01
            tmp_delta2_8x16b = _mm_unpacklo_epi64(tmp_delta2_8x16b, tmp_delta3_8x16b);

            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_de1);
            // (-3q1+9q0),(-9p0+3p1)
            tmp_delta3_8x16b = _mm_maddubs_epi16(tmp_delta2_8x16b, coefdelta_0_8x16b);
            //converting to 16 bit
            consttc_8x16b = _mm_packs_epi32(consttc_8x16b, consttc_8x16b);
            //getting -tc store
            tmp1_const_8x16b = _mm_cmpeq_epi32(consttc_8x16b, consttc_8x16b);
            //calc 10 *tc = 2*tc +8*tc ; 2*tc
            tmp2_const_8x16b = _mm_slli_epi16(consttc_8x16b, 1);
            //calc 10 *tc = 2*tc +8*tc ; 8*tc
            tmp0_const_8x16b = _mm_slli_epi16(consttc_8x16b, 3);
            //getting -tc store
            tmp3_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp1_const_8x16b);
            //calc 10 *tc
            tmp2_const_8x16b = _mm_add_epi16(tmp2_const_8x16b, tmp0_const_8x16b);
            //const 1
            const2_8x16b = _mm_srli_epi16(tmp1_const_8x16b, 15);
            tmp_delta0_8x16b = _mm_madd_epi16(tmp_delta3_8x16b, const2_8x16b);
            const2_8x16b = _mm_srli_epi32(tmp1_const_8x16b, 31);
            //getting the mask values
            mask_pq_8x16b = _mm_loadl_epi64((__m128i *)(&mask1));
            //loaded coef for delta1 calculation
            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_dep1);
            //(-2q1+q0),(p0-2p1)
            tmp_delta3_8x16b = _mm_maddubs_epi16(tmp_delta2_8x16b, coefdelta_0_8x16b);
            //const 8
            const2_8x16b = _mm_slli_epi32(const2_8x16b, 3);
            //rearranging the mask values
            mask_pq_8x16b = _mm_unpacklo_epi64(mask_pq_8x16b, mask_pq_8x16b);
            //normalisation of the filter: delta0 = (9(q0-p0)-3(q1-p1)+8)>>4
            tmp_delta0_8x16b = _mm_add_epi32(tmp_delta0_8x16b, const2_8x16b);
            tmp_delta0_8x16b = _mm_srai_epi32(tmp_delta0_8x16b, 4);

            //getting deltaq0
            tmp_delta2_8x16b = _mm_sign_epi32(tmp_delta0_8x16b, tmp1_const_8x16b);
            //packing  d3q d2q d1q d0q d3p d2p d1p d0p
            tmp_delta0_8x16b = _mm_packs_epi32(tmp_delta0_8x16b, tmp_delta2_8x16b);
            //absolute delta
            tmp_delta2_8x16b = _mm_abs_epi16(tmp_delta0_8x16b);
            //Clipping of delta0
            tmp_delta0_8x16b = _mm_min_epi16(tmp_delta0_8x16b, consttc_8x16b);
            //mask for |delta| < 10*tc (lane filtered only when this holds)
            tmp0_const_8x16b = _mm_cmpgt_epi16(tmp2_const_8x16b, tmp_delta2_8x16b);
            //Clipping of delta0
            tmp_delta0_8x16b = _mm_max_epi16(tmp_delta0_8x16b, tmp3_const_8x16b);


            //delta 1 calc starts

            //getting q32 q22 q12 q02 p32 p12 p22 p02
            tmp2_const_8x16b = _mm_loadl_epi64((__m128i *)(shuffle0));
            tmp_delta2_8x16b = _mm_shuffle_epi8(src_row0_8x16b, tmp2_const_8x16b);
            tmp_delta1_8x16b =  _mm_shuffle_epi8(src_row2_8x16b, tmp2_const_8x16b);
            tmp_delta1_8x16b = _mm_unpacklo_epi32(tmp_delta2_8x16b, tmp_delta1_8x16b);
            //constant 1
            const2_8x16b = _mm_srli_epi16(tmp1_const_8x16b, 15);
            //tc>>1 16 bit (delta1 clip range is +/- tc>>1)
            consttc_8x16b = _mm_srai_epi16(consttc_8x16b, 1);

            //getting -tc>>1 store  16 bit
            tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp1_const_8x16b);
            //2*delta0
            tmp2_const_8x16b = _mm_add_epi16(tmp_delta0_8x16b, tmp_delta0_8x16b);

            //getting  all respective q's and p's together
            tmp3_const_8x16b = _mm_load_si128((__m128i *)(shuffle1));
            tmp_delta3_8x16b = _mm_shuffle_epi8(tmp_delta3_8x16b, tmp3_const_8x16b);
            //final adds for deltap1 and deltaq1
            tmp_delta3_8x16b = _mm_add_epi16(tmp_delta3_8x16b, const2_8x16b);
            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, tmp2_const_8x16b);
            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, tmp_delta3_8x16b);
            tmp2_const_8x16b = _mm_setzero_si128();
            tmp_delta1_8x16b = _mm_srai_epi16(tmp_delta1_8x16b, 2);

            // clipping delta1
            tmp_delta1_8x16b = _mm_min_epi16(tmp_delta1_8x16b, consttc_8x16b);
            // clipping delta1
            tmp_delta1_8x16b = _mm_max_epi16(tmp_delta1_8x16b, tmp1_const_8x16b);

            //getting the mask ready
            mask_pq_8x16b = _mm_srai_epi16(mask_pq_8x16b, 15);
            //masking of the delta values |delta|<10*tc
            tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, tmp0_const_8x16b);
            tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, tmp0_const_8x16b);
            //packing dq1 dq0 dp0 dp1
            tmp1_const_8x16b = _mm_unpacklo_epi16(tmp_delta1_8x16b, tmp_delta0_8x16b);
            tmp_delta0_8x16b = _mm_unpackhi_epi16(tmp_delta0_8x16b, tmp_delta1_8x16b);
            tmp_delta1_8x16b = _mm_unpackhi_epi32(tmp1_const_8x16b, tmp_delta0_8x16b);
            tmp_delta0_8x16b = _mm_unpacklo_epi32(tmp1_const_8x16b, tmp_delta0_8x16b);

            //masking of the delta values dep, deq , filter_p ,filter_q
            tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, mask_pq_8x16b);
            tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, mask_pq_8x16b);
            //converting 8bit to 16 bit (zero-extend; tmp2_const_8x16b is zero here)
            src_row0_8x16b = _mm_unpacklo_epi8(src_row0_8x16b, tmp2_const_8x16b);
            src_row1_8x16b = _mm_unpacklo_epi8(src_row1_8x16b, tmp2_const_8x16b);
            src_row2_8x16b = _mm_unpacklo_epi8(src_row2_8x16b, tmp2_const_8x16b);
            src_row3_8x16b = _mm_unpacklo_epi8(src_row3_8x16b, tmp2_const_8x16b);
            //shuffle values loaded
            tmp0_const_8x16b = _mm_load_si128((__m128i *)shuffle2);
            tmp1_const_8x16b = _mm_load_si128((__m128i *)shuffle3);
            //arranging each row delta in different registers
            tmp_delta3_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, tmp1_const_8x16b);
            tmp_delta2_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, tmp0_const_8x16b);
            tmp_delta1_8x16b = _mm_shuffle_epi8(tmp_delta0_8x16b, tmp1_const_8x16b);
            tmp_delta0_8x16b = _mm_shuffle_epi8(tmp_delta0_8x16b, tmp0_const_8x16b);

            //adding the respective delta
            src_row3_8x16b = _mm_add_epi16(tmp_delta3_8x16b, src_row3_8x16b);
            src_row2_8x16b = _mm_add_epi16(tmp_delta2_8x16b, src_row2_8x16b);
            src_row1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, src_row1_8x16b);
            src_row0_8x16b = _mm_add_epi16(tmp_delta0_8x16b, src_row0_8x16b);
            //saturating to 8 bit
            src_row2_8x16b = _mm_packus_epi16(src_row2_8x16b, src_row3_8x16b);
            src_row0_8x16b = _mm_packus_epi16(src_row0_8x16b, src_row1_8x16b);
            //separating different rows
            src_row1_8x16b = _mm_srli_si128(src_row0_8x16b, 8);
            src_row3_8x16b = _mm_srli_si128(src_row2_8x16b, 8);
        }

        /* write the four filtered rows (p3..q3) back to the picture buffer */
        _mm_storel_epi64((__m128i *)(pu1_src - 4), src_row0_8x16b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + src_strd), src_row1_8x16b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd), src_row2_8x16b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd), src_row3_8x16b);
    }
}
    535 
    536 void ihevc_deblk_luma_horz_ssse3(UWORD8 *pu1_src,
    537                                  WORD32 src_strd,
    538                                  WORD32 bs,
    539                                  WORD32 quant_param_p,
    540                                  WORD32 quant_param_q,
    541                                  WORD32 beta_offset_div2,
    542                                  WORD32 tc_offset_div2,
    543                                  WORD32 filter_flag_p,
    544                                  WORD32 filter_flag_q)
    545 {
    546     WORD32 qp_luma, beta_indx, tc_indx;
    547     WORD32 beta, tc;
    548 
    549     WORD32 d0, d3, dp, dq, d;
    550     WORD32 de_0, de_1, de_2, de_3;
    551     WORD32 d_sam0, d_sam3;
    552     WORD32 de, dep, deq;
    553 
    554     __m128i src_q0_8x16b, src_q1_8x16b, src_p0_8x16b, src_p1_8x16b, src_q2_8x16b;
    555     __m128i tmp_pq_str1_8x16b, src_p2_8x16b, tmp_pq_str0_8x16b;
    556 
    557 
    558 
    559 
    560     {
    561         __m128i src_tmp_p_0_8x16b, src_tmp_p_1_8x16b, src_tmp_q_0_8x16b, src_tmp_q_1_8x16b;
    562         __m128i coef_8x16b, mask_d_result_4x32b, mask_de_result_8x16b;
    563         __m128i mask_16x8b, temp_coef0_8x16b, temp_coef1_8x16b;
    564 
    565         ASSERT((bs > 0));
    566         ASSERT(filter_flag_p || filter_flag_q);
    567 
    568         qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
    569         beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);
    570 
    571         /* BS based on implementation can take value 3 if it is intra/inter egde          */
    572         /* based on BS, tc index is calcuated by adding 2 * ( bs - 1) to QP and tc_offset */
    573         /* for BS = 1 adding factor is (0*2), BS = 2 or 3 adding factor is (1*2)          */
    574         /* the above desired functionallity is achieved by doing (2*(bs>>1))              */
    575 
    576         tc_indx = CLIP3(qp_luma + 2 * (bs >> 1) + (tc_offset_div2 << 1), 0, 53);
    577 
    578         beta = gai4_ihevc_beta_table[beta_indx];
    579         tc = gai4_ihevc_tc_table[tc_indx];
    580         if(0 == tc)
    581         {
    582             return;
    583         }
    584         src_q0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src));
    585         src_q1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
    586         src_p0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - src_strd));
    587         src_p1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 2 * src_strd));
    588         src_q2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd));
    589         tmp_pq_str1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd));
    590         src_p2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 3 * src_strd));
    591         tmp_pq_str0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 4 * src_strd));
    592 
    593 
    594         src_tmp_p_0_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
    595         src_tmp_p_1_8x16b = _mm_unpacklo_epi8(tmp_pq_str0_8x16b, src_p2_8x16b);
    596 
    597         src_tmp_q_0_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
    598         src_tmp_q_1_8x16b = _mm_unpacklo_epi8(src_q2_8x16b, tmp_pq_str1_8x16b);
    599 
    600         src_tmp_p_0_8x16b = _mm_unpacklo_epi16(src_tmp_p_1_8x16b, src_tmp_p_0_8x16b);
    601         src_tmp_q_0_8x16b = _mm_unpacklo_epi16(src_tmp_q_0_8x16b, src_tmp_q_1_8x16b);
    602 
    603         src_tmp_p_0_8x16b = _mm_shuffle_epi32(src_tmp_p_0_8x16b, 0x6c);
    604         src_tmp_q_0_8x16b = _mm_shuffle_epi32(src_tmp_q_0_8x16b, 0x6c);
    605 
    606         coef_8x16b = _mm_load_si128((__m128i *)(coef_d));
    607         mask_16x8b =  _mm_load_si128((__m128i *)(shuffle_d));
    608 
    609         src_tmp_p_0_8x16b = _mm_unpacklo_epi32(src_tmp_p_0_8x16b, src_tmp_q_0_8x16b);
    610         //WORD32 shuffle_d[4]={0x80800403,0x80800c0b,0x03000704,0x0b080f0c};
    611         mask_de_result_8x16b = _mm_shuffle_epi8(src_tmp_p_0_8x16b, mask_16x8b);
    612 
    613         mask_d_result_4x32b = _mm_maddubs_epi16(src_tmp_p_0_8x16b, coef_8x16b);
    614 
    615 
    616         //to get all 1's of 8 bit in (1)
    617         temp_coef0_8x16b = _mm_cmpeq_epi16(src_tmp_p_0_8x16b, src_tmp_p_0_8x16b);
    618         temp_coef1_8x16b = _mm_srli_epi16(temp_coef0_8x16b, 15);
        //accumulating values for dp3 dq3, dp0 dq0 values
    620         mask_d_result_4x32b = _mm_madd_epi16(mask_d_result_4x32b, temp_coef1_8x16b);
    621 
    622         temp_coef1_8x16b = _mm_packus_epi16(temp_coef1_8x16b, temp_coef1_8x16b);
    623         // to get all 1,-1 sets of 16 bits in (0)
    624         temp_coef0_8x16b = _mm_unpacklo_epi8(temp_coef0_8x16b, temp_coef1_8x16b);
    625         //q33-q30,p33-p30,q03-q00,p03-p00,0,q30-p30,0,q00-p00
    626         mask_de_result_8x16b = _mm_maddubs_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
    627         //to get 16 bit 1's
    628         temp_coef0_8x16b = _mm_srli_epi16(temp_coef1_8x16b, 8);
    629 
    630 
    631         // dq3 dp3 dq0 dp0
    632         mask_d_result_4x32b = _mm_abs_epi32(mask_d_result_4x32b);
    633         mask_16x8b = _mm_shuffle_epi32(mask_d_result_4x32b, 0xec);
    634         mask_d_result_4x32b = _mm_shuffle_epi32(mask_d_result_4x32b, 0x49);
    635         // dq dp d3 d0
    636         mask_d_result_4x32b = _mm_add_epi32(mask_d_result_4x32b, mask_16x8b);
    637         //|q33-q30|,|p33-p30|,|q03-q00|,|p03-p00|,0,|q30-p30|,0,|q00-p00|
    638         mask_de_result_8x16b = _mm_abs_epi16(mask_de_result_8x16b);
    639         //|q33-q30|+|p33-p30|,|q03-q00|+|p03-p00|,0+|q30-p30|,0+|q00-p00|
    640         mask_de_result_8x16b = _mm_madd_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
    641 
    642         ///store back in a single variable
    643         temp_coef0_8x16b = _mm_srli_si128(mask_d_result_4x32b, 4);
    644         temp_coef1_8x16b = _mm_srli_si128(mask_d_result_4x32b, 8);
    645         mask_16x8b = _mm_srli_si128(mask_d_result_4x32b, 12);
    646 
    647         d0 = _mm_cvtsi128_si32(mask_d_result_4x32b);
    648         d3 = _mm_cvtsi128_si32(temp_coef0_8x16b);
    649         dp = _mm_cvtsi128_si32(temp_coef1_8x16b);
    650         dq = _mm_cvtsi128_si32(mask_16x8b);
    651         //getting d
    652         d = d0 + d3;
    653 
    654         ///store back in a single variable
    655         temp_coef0_8x16b = _mm_srli_si128(mask_de_result_8x16b, 4);
    656         temp_coef1_8x16b = _mm_srli_si128(mask_de_result_8x16b, 8);
    657         mask_16x8b = _mm_srli_si128(mask_de_result_8x16b, 12);
    658 
    659         de_0 = _mm_cvtsi128_si32(mask_de_result_8x16b);
    660         de_1 = _mm_cvtsi128_si32(temp_coef0_8x16b);
    661         de_2 = _mm_cvtsi128_si32(temp_coef1_8x16b);
    662         de_3 = _mm_cvtsi128_si32(mask_16x8b);
    663 
    664         de = 0;
    665         dep = 0;
    666         deq = 0;
    667         if(d < beta)
    668         {
    669             d_sam0 = 0;
    670             if((2 * d0 < (beta >> 2))
    671                             && (de_2 < (beta >> 3))
    672                             && (de_0 < ((5 * tc + 1) >> 1)))
    673             {
    674                 d_sam0 = 1;
    675             }
    676 
    677             d_sam3 = 0;
    678             if((2 * d3 < (beta >> 2))
    679                             && (de_3 < (beta >> 3))
    680                             && de_1 < ((5 * tc + 1) >> 1))
    681             {
    682                 d_sam3 = 1;
    683             }
    684 
    685             de = (d_sam0 & d_sam3) + 1;
    686             dep = (dp < (beta + (beta >> 1)) >> 3) ? 1 : 0;
    687             deq = (dq < (beta + (beta >> 1)) >> 3) ? 1 : 0;
    688             if(tc <= 1)
    689             {
    690                 dep = 0;
    691                 deq = 0;
    692             }
    693         }
    694 
    695     }
    696 
    697     if(de != 0)
    698     {
    699 
    700         if(2 == de)
    701         {
    702 
    703             __m128i temp_pq0_str0_16x8b;
    704             __m128i temp_pq1_str0_16x8b, temp_pq1_str1_16x8b;
    705             __m128i temp_pq2_str0_16x8b;
    706             __m128i temp_str0_16x8b, temp_str1_16x8b;
    707             __m128i const2_8x16b, const2tc_8x16b;
    708 
    709             LWORD64 mask, tc2;
    710             tc = tc << 1;
    711             mask = (((LWORD64)filter_flag_q) << 63) | (((LWORD64)filter_flag_p) << 31);
    712             tc2 = ((LWORD64)tc);
    713 
    714             const2_8x16b = _mm_cmpeq_epi16(src_p1_8x16b, src_p1_8x16b);
    715             //q'0-q'1-2 ,p'0-p'1-2
    716             temp_pq0_str0_16x8b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
    717             temp_str0_16x8b   = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
    718             const2_8x16b = _mm_srli_epi16(const2_8x16b, 15);
    719             //arranged q31 q30 q21 q20 q1 q10 q01 q00 p30 p31 p20 p21 p10 p11 p00 p01
    720             temp_pq0_str0_16x8b = _mm_unpacklo_epi64(temp_pq0_str0_16x8b, temp_str0_16x8b);
    721 
    722             const2_8x16b = _mm_packus_epi16(const2_8x16b, const2_8x16b);
    723             temp_pq0_str0_16x8b = _mm_maddubs_epi16(temp_pq0_str0_16x8b, const2_8x16b);
    724 
    725             //q'1-2, p'1-2
    726             temp_pq1_str0_16x8b = _mm_unpacklo_epi8(src_p0_8x16b, src_q0_8x16b);
    727             temp_pq1_str1_16x8b = _mm_unpacklo_epi8(src_q1_8x16b, src_q2_8x16b);
    728             temp_str1_16x8b = _mm_unpacklo_epi8(src_p1_8x16b, src_p2_8x16b);
    729             // q30 p30 q20 p20 q10 p10 q01 q00 p30 q20 p20 q10 p10 q01 q00 p00
    730             temp_pq1_str0_16x8b = _mm_unpacklo_epi64(temp_pq1_str0_16x8b, temp_pq1_str0_16x8b);
    731             // q32 q31 q22 q21 q12 q11 q02 q01 p32 p31 p22 p21 p12 p11 p02 p01
    732             temp_pq1_str1_16x8b = _mm_unpacklo_epi64(temp_str1_16x8b, temp_pq1_str1_16x8b);
    733 
    734             temp_pq1_str0_16x8b = _mm_maddubs_epi16(temp_pq1_str0_16x8b, const2_8x16b);
    735             temp_pq1_str1_16x8b = _mm_maddubs_epi16(temp_pq1_str1_16x8b, const2_8x16b);
    736 
    737             //clipping mask design
    738             temp_str1_16x8b = _mm_setzero_si128();
    739             temp_str0_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
    740             const2tc_8x16b  = _mm_loadl_epi64((__m128i *)(&tc2));
    741             temp_str0_16x8b = _mm_shuffle_epi32(temp_str0_16x8b, 0x44);
    742             const2tc_8x16b  = _mm_shuffle_epi8(const2tc_8x16b, temp_str1_16x8b);
    743 
    744             //clipping mask design
    745             temp_str0_16x8b = _mm_srai_epi32(temp_str0_16x8b, 31);
    746             const2tc_8x16b = _mm_and_si128(const2tc_8x16b, temp_str0_16x8b);
    747             //calculating Clipping MAX for all pixel values.
    748             src_p0_8x16b = _mm_unpacklo_epi32(src_p0_8x16b, src_q0_8x16b);
    749             src_q0_8x16b = _mm_unpacklo_epi32(src_p1_8x16b, src_q1_8x16b);
    750             //for clipping calc
    751             src_p1_8x16b = _mm_unpacklo_epi64(src_p0_8x16b, src_q0_8x16b);
    752             //saving the unmodified data of q1 p1 q0 p0
    753             src_q1_8x16b = _mm_unpackhi_epi64(src_p0_8x16b, src_q0_8x16b);
    754             //CLIpping MAX and MIN for q1 p1 q0 p0
    755             src_p0_8x16b = _mm_adds_epu8(src_p1_8x16b, const2tc_8x16b);
    756             src_p1_8x16b = _mm_subs_epu8(src_p1_8x16b, const2tc_8x16b);
    757 
    758 
    759             //q'2-q'0-2,p'2-p'0-2
    760             tmp_pq_str0_8x16b = _mm_unpacklo_epi8(src_p2_8x16b, tmp_pq_str0_8x16b);
    761             temp_pq2_str0_16x8b = _mm_unpacklo_epi8(src_q2_8x16b, tmp_pq_str1_8x16b);
    762             const2_8x16b = _mm_slli_epi16(const2_8x16b, 1);
    763             //arranged q33 q32 q23 q22 q13 q12 q03 q02 p32 p33 p22 p23 p12 p13 p02 p03
    764             temp_pq2_str0_16x8b = _mm_unpacklo_epi64(tmp_pq_str0_8x16b, temp_pq2_str0_16x8b);
    765             src_p2_8x16b = _mm_unpacklo_epi32(src_p2_8x16b, src_q2_8x16b);
    766             temp_pq2_str0_16x8b = _mm_maddubs_epi16(temp_pq2_str0_16x8b, const2_8x16b);
    767 
    768             //calculating Clipping MAX and MIN for p2 and q2 .
    769             tmp_pq_str0_8x16b = _mm_adds_epu8(src_p2_8x16b, const2tc_8x16b);
    770             tmp_pq_str1_8x16b = _mm_subs_epu8(src_p2_8x16b, const2tc_8x16b);
    771             //q'0-q'1-2 ,p'0-p'1-2
    772             temp_str0_16x8b = _mm_shuffle_epi32(temp_pq0_str0_16x8b, 0x4e);
    773             temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq0_str0_16x8b, temp_str0_16x8b);
    774             //q'1-2 p'1-2
    775             temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
    776             //to get 2 in 16 bit
    777             const2_8x16b = _mm_srli_epi16(const2_8x16b, 8);
    778 
    779 
    780             //q'1, p'1 (adding 2)
    781             temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, const2_8x16b);
    782             //q'0-q'1,p'0-p'1
    783             temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq0_str0_16x8b, const2_8x16b);
    784             //q'2-q'1,p'2-p'1
    785             temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq2_str0_16x8b, const2_8x16b);
    786             //q'0 = (q'0-q'1)+q'1 ,p'0 = (p'0-p'1)+p'1;
    787             temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq0_str0_16x8b);
    788             //q'2 = (q'2-q'1)+q'1 ,p'2 = (p'2-p'1)+p'1;
    789             temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq2_str0_16x8b);
    790 
    791             //normalisation of all modified pixels
    792             temp_pq0_str0_16x8b = _mm_srai_epi16(temp_pq0_str0_16x8b, 3);
    793             temp_pq1_str0_16x8b = _mm_srai_epi16(temp_pq1_str0_16x8b, 2);
    794             temp_pq2_str0_16x8b = _mm_srai_epi16(temp_pq2_str0_16x8b, 3);
    795             //q'1 p'1 q'0 p'0
    796             temp_pq0_str0_16x8b = _mm_packus_epi16(temp_pq0_str0_16x8b, temp_pq1_str0_16x8b);
    797             temp_pq2_str0_16x8b = _mm_packus_epi16(temp_pq2_str0_16x8b, temp_pq2_str0_16x8b);
    798             //pack with the unmodified data of q2 and p2
    799             src_p2_8x16b = _mm_unpackhi_epi64(temp_pq2_str0_16x8b, src_p2_8x16b);
    800             //Clipping MAX and MIN for q'1 p'1 q'0 p'0 and q'2  p'2
    801             temp_pq0_str0_16x8b = _mm_min_epu8(temp_pq0_str0_16x8b, src_p0_8x16b);
    802             src_p2_8x16b = _mm_min_epu8(src_p2_8x16b, tmp_pq_str0_8x16b);
    803             temp_pq0_str0_16x8b = _mm_max_epu8(temp_pq0_str0_16x8b, src_p1_8x16b);
    804             src_p2_8x16b = _mm_max_epu8(src_p2_8x16b, tmp_pq_str1_8x16b);
    805             //Reshuffling q'1 p'1 q'0 p'0 along with unmodified data
    806             src_p0_8x16b = _mm_unpacklo_epi32(temp_pq0_str0_16x8b, src_q1_8x16b);
    807             src_p1_8x16b = _mm_unpackhi_epi32(temp_pq0_str0_16x8b, src_q1_8x16b);
    808             src_p2_8x16b = _mm_shuffle_epi32(src_p2_8x16b, 0xd8);
    809             src_q0_8x16b = _mm_srli_si128(src_p0_8x16b, 8);
    810             src_q1_8x16b = _mm_srli_si128(src_p1_8x16b, 8);
    811             src_q2_8x16b = _mm_srli_si128(src_p2_8x16b, 8);
    812 
    813             _mm_storel_epi64((__m128i *)(pu1_src - 3 * src_strd), src_p2_8x16b);
    814             _mm_storel_epi64((__m128i *)(pu1_src - 2 * src_strd), src_p1_8x16b);
    815             _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_8x16b);
    816             _mm_storel_epi64((__m128i *)(pu1_src), src_q0_8x16b);
    817             _mm_storel_epi64((__m128i *)(pu1_src + src_strd), src_q1_8x16b);
    818             _mm_storel_epi64((__m128i *)(pu1_src + 2 * src_strd), src_q2_8x16b);
    819 
    820 
    821         }
    822 
    823         else
    824         {
    825 
    826             __m128i tmp_delta0_8x16b, tmp_delta1_8x16b;
    827             __m128i tmp0_const_8x16b, tmp1_const_8x16b, tmp2_const_8x16b;
    828             __m128i coefdelta_0_8x16b;
    829             __m128i const2_8x16b, consttc_8x16b;
    830 
    831             LWORD64 maskp0, maskp1, maskq0, maskq1;
    832             maskp0 = (LWORD64)filter_flag_p;
    833             maskq0 = (LWORD64)filter_flag_q;
    834             maskp1 = (LWORD64)dep;
    835             maskq1 = (LWORD64)deq;
    836             consttc_8x16b = _mm_set1_epi32(tc);
    837 
    838             tmp_delta0_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
    839             tmp_delta1_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
    840             //arranged q31 q30 p30 p31  q21 q20 p20 p21  q1 q10 p10 p11 q01 q00 p00 p01
    841             tmp_delta1_8x16b = _mm_unpacklo_epi16(tmp_delta0_8x16b, tmp_delta1_8x16b);
    842 
    843             coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_de1);
    844             // (-3q1+9q0),(-9p0+3p1)
    845             tmp_delta0_8x16b = _mm_maddubs_epi16(tmp_delta1_8x16b, coefdelta_0_8x16b);
    846 
    847             //getting -tc store
    848             tmp2_const_8x16b = _mm_cmpeq_epi32(consttc_8x16b, consttc_8x16b);
    849 
    850             //getting tc in 16 bit
    851             consttc_8x16b = _mm_packs_epi32(consttc_8x16b, consttc_8x16b);
    852             //calc 10 *tc = 2*tc +8*tc ; 2*tc
    853             tmp_pq_str0_8x16b = _mm_slli_epi16(consttc_8x16b, 1);
    854             //calc 10 *tc = 2*tc +8*tc ; 8*tc
    855             tmp_pq_str1_8x16b = _mm_slli_epi16(consttc_8x16b, 3);
    856 
    857             //const 1
    858             const2_8x16b = _mm_srli_epi16(tmp2_const_8x16b, 15);
    859             //calc 10 *tc
    860             tmp_pq_str0_8x16b = _mm_add_epi16(tmp_pq_str0_8x16b, tmp_pq_str1_8x16b);
    861             //delta0 without normalisation and clipping
    862             tmp_delta0_8x16b = _mm_madd_epi16(tmp_delta0_8x16b, const2_8x16b);
    863 
    864             const2_8x16b = _mm_srli_epi32(tmp2_const_8x16b, 31);
    865 
    866             //loaded coef for delta1 calculation
    867             coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_dep1);
    868             //(-2q1+q0),(p0-2p1)
    869             tmp_delta1_8x16b = _mm_maddubs_epi16(tmp_delta1_8x16b, coefdelta_0_8x16b);
    870             //const 8
    871             const2_8x16b = _mm_slli_epi32(const2_8x16b, 3);
    872 
    873             //normalisation of the filter
    874             tmp_delta0_8x16b = _mm_add_epi32(tmp_delta0_8x16b, const2_8x16b);
    875             tmp_delta0_8x16b = _mm_srai_epi32(tmp_delta0_8x16b, 4);
    876 
    877             //getting deltaq0
    878             tmp_pq_str1_8x16b = _mm_sign_epi32(tmp_delta0_8x16b, tmp2_const_8x16b);
    879             //getting -tc
    880             tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp2_const_8x16b);
    881             //packing  d03q d02q d01q d0q d03p d02p d01p d00p
    882             tmp_delta0_8x16b = _mm_packs_epi32(tmp_delta0_8x16b, tmp_pq_str1_8x16b);
    883             //absolute delta
    884             tmp_pq_str1_8x16b = _mm_abs_epi16(tmp_delta0_8x16b);
    885 
    886             //Clipping of delta0
    887             tmp_delta0_8x16b = _mm_min_epi16(tmp_delta0_8x16b, consttc_8x16b);
    888             //tc>>1 16 bit
    889             consttc_8x16b = _mm_srai_epi16(consttc_8x16b, 1);
    890             //Clipping of delta0
    891             tmp_delta0_8x16b = _mm_max_epi16(tmp_delta0_8x16b, tmp1_const_8x16b);
    892 
    893             //(-tc)>>1 16 bit
    894             tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp2_const_8x16b);
    895             //mask for |delta| < 10*tc
    896             tmp_pq_str0_8x16b = _mm_cmpgt_epi16(tmp_pq_str0_8x16b, tmp_pq_str1_8x16b);
    897             //delta 1 calc starts
    898 
    899             //getting q32 q22 q12 q02 p32 p12 p22 p02
    900             tmp0_const_8x16b = _mm_setzero_si128();
    901             src_q2_8x16b = _mm_unpacklo_epi8(src_q2_8x16b, tmp0_const_8x16b);
    902             src_p2_8x16b = _mm_unpacklo_epi8(src_p2_8x16b, tmp0_const_8x16b);
    903             src_p2_8x16b = _mm_unpacklo_epi64(src_p2_8x16b, src_q2_8x16b);
    904             //constant 1
    905             const2_8x16b = _mm_srli_epi16(tmp2_const_8x16b, 15);
    906             //2*delta0
    907             tmp2_const_8x16b = _mm_add_epi16(tmp_delta0_8x16b, tmp_delta0_8x16b);
    908             //getting  all respective q's and p's together
    909             coefdelta_0_8x16b = _mm_load_si128((__m128i *)(shuffle1));
    910             tmp_delta1_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, coefdelta_0_8x16b);
    911             //final adds for deltap1 and deltaq1
    912             tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, const2_8x16b);
    913             src_p2_8x16b = _mm_add_epi16(src_p2_8x16b, tmp2_const_8x16b);
    914             tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, src_p2_8x16b);
    915             tmp_delta1_8x16b = _mm_srai_epi16(tmp_delta1_8x16b, 2);
    916 
    917             //mask0= (((LWORD64)filter_flag_q)<<63)| (((LWORD64)filter_flag_p)<<31);
    918             tmp_pq_str1_8x16b = _mm_loadl_epi64((__m128i *)(&(maskq0)));
    919             src_p2_8x16b = _mm_loadl_epi64((__m128i *)(&(maskp0)));
    920 
    921             //   src_p2_8x16b = _mm_set_epi32(filter_flag_q,filter_flag_p,filter_flag_q,filter_flag_p);
    922             //mask1= (((LWORD64)(filter_flag_q&deq))<<63)|(((LWORD64)(filter_flag_p & dep))<<31);
    923             src_q2_8x16b = _mm_loadl_epi64((__m128i *)(&(maskq1)));
    924             coefdelta_0_8x16b = _mm_loadl_epi64((__m128i *)(&(maskp1)));
    925 
    926             src_p2_8x16b = _mm_unpacklo_epi32(src_p2_8x16b, tmp_pq_str1_8x16b);
    927             src_q2_8x16b = _mm_unpacklo_epi32(coefdelta_0_8x16b, src_q2_8x16b);
    928             //src_q2_8x16b = _mm_set_epi32(deq,dep,deq,dep);
    929             src_q2_8x16b = _mm_and_si128(src_q2_8x16b, src_p2_8x16b);
    930 
    931             //rearranging the mask values
    932             src_q2_8x16b = _mm_shuffle_epi32(src_q2_8x16b, 0x50);
    933             src_p2_8x16b = _mm_shuffle_epi32(src_p2_8x16b, 0x50);
    934 
    935             src_q2_8x16b = _mm_slli_epi32(src_q2_8x16b, 31);
    936             src_p2_8x16b = _mm_slli_epi32(src_p2_8x16b, 31);
    937             src_q2_8x16b = _mm_srai_epi32(src_q2_8x16b, 31);
    938             src_p2_8x16b = _mm_srai_epi32(src_p2_8x16b, 31);
    939 
    940             //combining mask delta1
    941             tmp_pq_str1_8x16b = _mm_and_si128(tmp_pq_str0_8x16b, src_q2_8x16b);
    942             // clipping delta1
    943             tmp_delta1_8x16b = _mm_min_epi16(tmp_delta1_8x16b, consttc_8x16b);
            //combining mask delta0
    945             tmp_pq_str0_8x16b = _mm_and_si128(tmp_pq_str0_8x16b, src_p2_8x16b);
    946             // clipping delta1
    947             tmp_delta1_8x16b = _mm_max_epi16(tmp_delta1_8x16b, tmp1_const_8x16b);
    948 
    949 
    950             //masking of the delta values |delta|<10*tc
    951             tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, tmp_pq_str1_8x16b);
    952             tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, tmp_pq_str0_8x16b);
            //separating p and q delta 0 and adding p0 and q0
    954             tmp_pq_str0_8x16b = _mm_unpacklo_epi64(tmp_delta0_8x16b, tmp0_const_8x16b);
    955             tmp_pq_str1_8x16b = _mm_unpackhi_epi64(tmp_delta0_8x16b, tmp0_const_8x16b);
    956             src_p0_8x16b = _mm_unpacklo_epi8(src_p0_8x16b, tmp0_const_8x16b);
    957             src_q0_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, tmp0_const_8x16b);
    958             src_p0_8x16b = _mm_add_epi16(src_p0_8x16b, tmp_pq_str0_8x16b);
    959             src_q0_8x16b = _mm_add_epi16(src_q0_8x16b, tmp_pq_str1_8x16b);
            //separating p and q delta 0 and adding p1 and q1
    961             tmp_pq_str0_8x16b = _mm_unpacklo_epi64(tmp_delta1_8x16b, tmp0_const_8x16b);
    962             tmp_pq_str1_8x16b = _mm_unpackhi_epi64(tmp_delta1_8x16b, tmp0_const_8x16b);
    963             src_p1_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, tmp0_const_8x16b);
    964             src_q1_8x16b = _mm_unpacklo_epi8(src_q1_8x16b, tmp0_const_8x16b);
    965             src_p1_8x16b = _mm_add_epi16(src_p1_8x16b, tmp_pq_str0_8x16b);
    966             src_q1_8x16b = _mm_add_epi16(src_q1_8x16b, tmp_pq_str1_8x16b);
    967             //packing p1 q1 and p0 q0 to 8 bit
    968             src_p1_8x16b = _mm_packus_epi16(src_p1_8x16b, src_q1_8x16b);
    969             src_p0_8x16b = _mm_packus_epi16(src_p0_8x16b, src_q0_8x16b);
    970 
    971             src_q1_8x16b = _mm_srli_si128(src_p1_8x16b, 8);
    972             src_q0_8x16b = _mm_srli_si128(src_p0_8x16b, 8);
    973 
    974             _mm_storel_epi64((__m128i *)(pu1_src - 2 * src_strd), src_p1_8x16b);
    975             _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_8x16b);
    976             _mm_storel_epi64((__m128i *)(pu1_src), src_q0_8x16b);
    977             _mm_storel_epi64((__m128i *)(pu1_src + src_strd), src_q1_8x16b);
    978 
    979 
    980         }
    981 
    982 
    983 
    984     }
    985 
    986 }
    987 
/**
*******************************************************************************
*
* @brief
*     Deblocking filter for a vertical chroma edge (SSSE3 path).
*
* @par Description:
*     Filters the pixel on each side of a vertical chroma edge (p0 and q0)
*     for four rows of interleaved U/V samples in one pass. Per-plane clip
*     thresholds tc_u / tc_v are derived from the average of the two block
*     QPs plus the per-plane QP offset; the filter is the (1, -4, 4, -1)
*     tap noted in the body, normalized, clipped to [-tc, tc], and applied
*     only on the side(s) whose filter flag is set.
*     NOTE(review): per the comment below, callers are assumed to invoke
*     this only when boundary strength is 2 -- confirm at call sites.
*
* @param[in] pu1_src
*     Pointer to the first (q-side) byte of the top row at the edge; the
*     code reads and writes 4 bytes (two interleaved UV pairs) on either
*     side of this address for each of the 4 rows.
*
* @param[in] src_strd
*     Source buffer stride, in bytes.
*
* @param[in] quant_param_p
*     Quantization parameter of the block on the p (left) side.
*
* @param[in] quant_param_q
*     Quantization parameter of the block on the q (right) side.
*
* @param[in] qp_offset_u
*     QP offset applied for the U plane before the chroma QP mapping.
*
* @param[in] qp_offset_v
*     QP offset applied for the V plane before the chroma QP mapping.
*
* @param[in] tc_offset_div2
*     Half of the tc offset; doubled before being added to the tc index.
*
* @param[in] filter_flag_p
*     Non-zero if the p side of the edge may be modified.
*
* @param[in] filter_flag_q
*     Non-zero if the q side of the edge may be modified.
*
* @returns None
*
*******************************************************************************
*/
void ihevc_deblk_chroma_vert_ssse3(UWORD8 *pu1_src,
                                   WORD32 src_strd,
                                   WORD32 quant_param_p,
                                   WORD32 quant_param_q,
                                   WORD32 qp_offset_u,
                                   WORD32 qp_offset_v,
                                   WORD32 tc_offset_div2,
                                   WORD32 filter_flag_p,
                                   WORD32 filter_flag_q)
{
    WORD32 qp_indx_u, qp_chroma_u;
    WORD32 qp_indx_v, qp_chroma_v;
    WORD32 tc_indx_u, tc_u;
    WORD32 tc_indx_v, tc_v;

    __m128i src_row_0_16x8b, tmp_pxl_0_16x8b, src_row_2_16x8b, tmp_pxl_1_16x8b;
    /* calling this with both flags clear would be pointless */
    ASSERT(filter_flag_p || filter_flag_q);

    /* chroma processing is done only if BS is 2             */
    /* this function is assumed to be called only if BS is 2 */
    /* average QP of the two blocks plus the per-plane offset, then the    */
    /* chroma QP mapping: negative values pass through, values above 57    */
    /* map to qp - 6, everything else goes through gai4_ihevc_qp_table     */
    qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);

    qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);

    /* tc index: the "+ 2" term is 2 * (bs - 1) with bs assumed to be 2 */
    tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_u = gai4_ihevc_tc_table[tc_indx_u];

    tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_v = gai4_ihevc_tc_table[tc_indx_v];

    /* nothing to do when both planes have a zero clip threshold */
    if(0 == tc_u && 0 == tc_v)
    {
        return;
    }
    /* each row load picks up 8 bytes: the two UV pairs (p1, p0) left of */
    /* the edge and the two UV pairs (q0, q1) right of it                */
    src_row_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - 4));
    tmp_pxl_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd - 4));
    src_row_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd - 4));
    tmp_pxl_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd - 4));

    {
        LWORD64 mask_tc, mask_flag, mask;
        __m128i delta_vu0_16x8b, delta_vu1_16x8b;
        __m128i mask_tc_16x8, mask_16x8b, mask_flag_p_16x8b, mask_flag_q_16x8b;
        __m128i min_0_16x8b;
        __m128i const_16x8b;
        /* flags parked in bits 31 and 63 so that an arithmetic shift right */
        /* by 31 later turns each into an all-ones/all-zero 32-bit mask     */
        mask_flag = (((LWORD64)filter_flag_p) << 31) | (((LWORD64)filter_flag_q) << 63);
        /* per-plane clip thresholds packed as adjacent 16-bit lanes */
        mask_tc = (((LWORD64)tc_v) << 16) | ((LWORD64)tc_u);
        /* byte mask that keeps the untouched p1/q1 pairs of each row when */
        /* the filtered p0/q0 values are merged back in                    */
        mask = 0xffff00000000ffffLL;

        /* rows 0+1 in one register, rows 2+3 in the other */
        src_row_0_16x8b = _mm_unpacklo_epi64(src_row_0_16x8b, tmp_pxl_0_16x8b);
        src_row_2_16x8b = _mm_unpacklo_epi64(src_row_2_16x8b, tmp_pxl_1_16x8b);

        mask_16x8b = _mm_load_si128((__m128i *)(shuffle_uv));
        // qv11 qu11 qv10 qu10 qv01 qu01 qv00 qu00 pv10 pu10 pv11 pu11 pv00 pu00 pv01 pu01
        // qv31 qu31 qv30 qu30 qv21 qu21 qv20 qu20 pv30 pu30 pv31 pu31 pv20 pu20 pv21 pu21
        delta_vu0_16x8b = _mm_shuffle_epi8(src_row_0_16x8b, mask_16x8b);
        delta_vu1_16x8b = _mm_shuffle_epi8(src_row_2_16x8b, mask_16x8b);

        /* regroup: all p pixels of the 4 rows / all q pixels of the 4 rows */
        tmp_pxl_0_16x8b = _mm_unpacklo_epi64(delta_vu0_16x8b, delta_vu1_16x8b);
        tmp_pxl_1_16x8b = _mm_unpackhi_epi64(delta_vu0_16x8b, delta_vu1_16x8b);
        // pv30 pv31 pu30 pu31 pv20 pv21 pu20 pu21 pv10 pv11 pu10 pu11 pv00 pv01 pu00 pu01
        // qv31 qv30 qu31 qu30 qv21 qv20 qu21 qu20 qv11 qv10 qu11 qu10 qv01 qv00 qu01 qu00
        delta_vu0_16x8b = _mm_load_si128((__m128i *)delta0);
        delta_vu1_16x8b = _mm_load_si128((__m128i *)delta1);

        /* weighted byte pairs accumulated into 16-bit partial deltas */
        delta_vu0_16x8b = _mm_maddubs_epi16(tmp_pxl_0_16x8b, delta_vu0_16x8b);
        delta_vu1_16x8b = _mm_maddubs_epi16(tmp_pxl_1_16x8b, delta_vu1_16x8b);

        //generating offset 4
        const_16x8b = _mm_cmpeq_epi16(tmp_pxl_0_16x8b, tmp_pxl_0_16x8b);
        // filter flag mask and tc mask
        mask_tc_16x8 = _mm_loadl_epi64((__m128i *)(&mask_tc));
        mask_flag_q_16x8b = _mm_loadl_epi64((__m128i *)(&mask_flag));

        /* broadcast the tc pair to every 32-bit lane */
        mask_tc_16x8 = _mm_shuffle_epi32(mask_tc_16x8, 0x00);
        mask_flag_q_16x8b = _mm_srai_epi32(mask_flag_q_16x8b, 31);
        //-tc
        min_0_16x8b = _mm_sign_epi16(mask_tc_16x8, const_16x8b);
        //converting const 1
        const_16x8b = _mm_srli_epi16(const_16x8b, 15);

        //filterp and filterq flag
        mask_flag_p_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x00);
        mask_flag_q_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x55);

        //modified delta with a filter (1 -4 4 -1) available in 16 bit
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, delta_vu1_16x8b);
        //converting const 4
        const_16x8b = _mm_slli_epi16(const_16x8b, 2);

        mask_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
        //offset addition
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, const_16x8b);
        //eliminating q1
        tmp_pxl_1_16x8b = _mm_slli_epi16(tmp_pxl_1_16x8b, 8);

        const_16x8b = _mm_setzero_si128();
        //filter after normalisation
        delta_vu0_16x8b = _mm_srai_epi16(delta_vu0_16x8b, 3);
        /* replicate the keep-p1/q1 byte mask into both 64-bit halves */
        mask_16x8b = _mm_shuffle_epi32(mask_16x8b, 0x44);

        //clipping MAX
        delta_vu0_16x8b = _mm_min_epi16(delta_vu0_16x8b, mask_tc_16x8);
        //getting p0 and eliminating p1
        tmp_pxl_0_16x8b = _mm_srli_epi16(tmp_pxl_0_16x8b, 8);
        //clipping MIN
        delta_vu0_16x8b = _mm_max_epi16(delta_vu0_16x8b, min_0_16x8b);
        //getting q0
        tmp_pxl_1_16x8b = _mm_srli_epi16(tmp_pxl_1_16x8b, 8);
        //masking filter flag: a disabled side gets a zero delta
        delta_vu1_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_q_16x8b);
        delta_vu0_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_p_16x8b);

        // q-delta ,p+delta
        tmp_pxl_1_16x8b = _mm_sub_epi16(tmp_pxl_1_16x8b, delta_vu1_16x8b);
        tmp_pxl_0_16x8b = _mm_add_epi16(tmp_pxl_0_16x8b, delta_vu0_16x8b);
        //merging q0 and p0 of respective rows
        delta_vu1_16x8b = _mm_unpackhi_epi32(tmp_pxl_0_16x8b, tmp_pxl_1_16x8b);
        delta_vu0_16x8b = _mm_unpacklo_epi32(tmp_pxl_0_16x8b, tmp_pxl_1_16x8b);
        // row 0 and row 1 packed , row2 and row3 packed
        delta_vu0_16x8b = _mm_packus_epi16(delta_vu0_16x8b, const_16x8b);
        delta_vu1_16x8b = _mm_packus_epi16(delta_vu1_16x8b, const_16x8b);
        //removing older pixel values
        src_row_0_16x8b = _mm_and_si128(src_row_0_16x8b, mask_16x8b);
        src_row_2_16x8b = _mm_and_si128(src_row_2_16x8b, mask_16x8b);
        //arranging modified pixels
        delta_vu0_16x8b = _mm_shuffle_epi32(delta_vu0_16x8b, 0xd8);
        delta_vu1_16x8b = _mm_shuffle_epi32(delta_vu1_16x8b, 0xd8);
        /* shift the new p0/q0 pairs into the byte positions cleared above */
        delta_vu0_16x8b = _mm_slli_epi64(delta_vu0_16x8b, 16);
        delta_vu1_16x8b = _mm_slli_epi64(delta_vu1_16x8b, 16);
        //plugging the modified values
        src_row_0_16x8b = _mm_or_si128(src_row_0_16x8b, delta_vu0_16x8b);
        src_row_2_16x8b = _mm_or_si128(src_row_2_16x8b, delta_vu1_16x8b);


        //getting values for row 1 and row 3
        tmp_pxl_0_16x8b = _mm_srli_si128(src_row_0_16x8b, 8);
        tmp_pxl_1_16x8b = _mm_srli_si128(src_row_2_16x8b, 8);

        _mm_storel_epi64((__m128i *)(pu1_src - 4), src_row_0_16x8b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + src_strd), tmp_pxl_0_16x8b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd), src_row_2_16x8b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd), tmp_pxl_1_16x8b);
    }



}
   1138 
/**
*******************************************************************************
*
* @brief
*  Deblocks a horizontal chroma edge using SSSE3 intrinsics. Four pixel
*  positions along the edge are filtered; U and V samples are processed
*  together from the same 8-byte loads (the interleaved pu/pv byte layout
*  noted below assumes semi-planar UV-interleaved chroma — confirm against
*  the caller).
*
* @par Description:
*  Derives per-plane chroma QP from the average luma QP of the p/q blocks,
*  looks up tc for U and V, computes the HEVC chroma delta, clips it to
*  [-tc, tc] and applies p0' = p0 + delta, q0' = q0 - delta, gated per side
*  by filter_flag_p / filter_flag_q.
*
* @param[in] pu1_src
*  Pointer to the first sample of the q side of the edge (row 0 of the
*  block below the edge)
*
* @param[in] src_strd
*  Source stride in bytes
*
* @param[in] quant_param_p
*  Luma QP of the block above the edge (p side)
*
* @param[in] quant_param_q
*  Luma QP of the block below the edge (q side)
*
* @param[in] qp_offset_u
*  Chroma QP offset for the U plane (added before the chroma QP mapping)
*
* @param[in] qp_offset_v
*  Chroma QP offset for the V plane
*
* @param[in] tc_offset_div2
*  Slice-level tc offset, already divided by 2
*
* @param[in] filter_flag_p
*  1 => filter the p side (row above the edge), 0 => leave it untouched
*
* @param[in] filter_flag_q
*  1 => filter the q side (row below the edge), 0 => leave it untouched
*
* @returns None
*
*******************************************************************************
*/
void ihevc_deblk_chroma_horz_ssse3(UWORD8 *pu1_src,
                                   WORD32 src_strd,
                                   WORD32 quant_param_p,
                                   WORD32 quant_param_q,
                                   WORD32 qp_offset_u,
                                   WORD32 qp_offset_v,
                                   WORD32 tc_offset_div2,
                                   WORD32 filter_flag_p,
                                   WORD32 filter_flag_q)
{
    WORD32 qp_indx_u, qp_chroma_u;
    WORD32 qp_indx_v, qp_chroma_v;
    WORD32 tc_indx_u, tc_u;
    WORD32 tc_indx_v, tc_v;


    /* tmp_p0 = p1 row, src_p0 = p0 row, src_q0 = q0 row, tmp_q0 = q1 row */
    __m128i tmp_p0_16x8b, src_p0_16x8b, src_q0_16x8b, tmp_q0_16x8b;

    /* At least one side must be filtered, otherwise the call is pointless */
    ASSERT(filter_flag_p || filter_flag_q);

    /* chroma processing is done only if BS is 2             */
    /* this function is assumed to be called only if BS is 2 */

    /* Chroma QP derivation: average the two luma QPs, add the plane offset,
     * then map through the chroma QP table. Indices above 57 bypass the
     * table (qp - 6); negative indices are kept as-is. */
    qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);

    qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);

    /* tc lookup per plane; "+ 2" is the BS==2 term, index clipped to [0, 53] */
    tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_u = gai4_ihevc_tc_table[tc_indx_u];

    tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_v = gai4_ihevc_tc_table[tc_indx_v];

    /* With tc == 0 for both planes the delta clips to 0 — nothing to do */
    if(0 == tc_u && 0 == tc_v)
    {
        return;
    }
    /* Load 8 bytes (4 UV pairs) from each of the two rows above and the two
     * rows below the edge: p1, p0, q0, q1 */
    tmp_p0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - 2 * src_strd));
    src_p0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - src_strd));
    src_q0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
    tmp_q0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

    {
        LWORD64 mask_tc, mask_flag;
        __m128i delta_vu0_16x8b, delta_vu1_16x8b;
        __m128i mask_tc_16x8, mask_16x8b, mask_flag_p_16x8b, mask_flag_q_16x8b;
        __m128i min_0_16x8b;
        __m128i const_16x8b;
        /* Pack the filter flags into the sign bits of the two 32-bit halves
         * of one qword (bit 31 = p flag, bit 63 = q flag) so a single
         * arithmetic right shift by 31 turns each into an all-ones/all-zeros
         * dword mask below. */
        mask_flag = (((LWORD64)filter_flag_p) << 31) | (((LWORD64)filter_flag_q) << 63);
        /* (tc_u, tc_v) as adjacent 16-bit values, broadcast later so every
         * U lane clips against tc_u and every V lane against tc_v */
        mask_tc = (((LWORD64)tc_v) << 16) | ((LWORD64)tc_u);

        /* Interleave p1 with p0 and q0 with q1 so each byte pair feeds one
         * maddubs lane: (p1,p0) and (q0,q1) per sample */
        tmp_p0_16x8b = _mm_unpacklo_epi8(tmp_p0_16x8b, src_p0_16x8b);
        tmp_q0_16x8b = _mm_unpacklo_epi8(src_q0_16x8b, tmp_q0_16x8b);

        // pv30 pv31 pu30 pu31 pv20 pv21 pu20 pu21 pv10 pv11 pu10 pu11 pv00 pv01 pu00 pu01
        // qv31 qv30 qu31 qu30 qv21 qv20 qu21 qu20 qv11 qv10 qu11 qu10 qv01 qv00 qu01 qu00
        /* delta0/delta1 (ihevc_tables_x86_intr.h) hold the signed filter
         * taps; presumably +/-1 and +/-4 so the two madd results sum to the
         * (1 -4 4 -1) chroma filter — confirm against the tables */
        delta_vu0_16x8b = _mm_load_si128((__m128i *)delta0);
        delta_vu1_16x8b = _mm_load_si128((__m128i *)delta1);

        /* maddubs: per lane, (unsigned pixel byte) * (signed tap byte) pairs
         * summed into 16 bits — partial deltas for the p and q halves */
        delta_vu0_16x8b = _mm_maddubs_epi16(tmp_p0_16x8b, delta_vu0_16x8b);
        delta_vu1_16x8b = _mm_maddubs_epi16(tmp_q0_16x8b, delta_vu1_16x8b);


        // filter flag mask and tc mask
        mask_tc_16x8 = _mm_loadl_epi64((__m128i *)(&mask_tc));
        mask_flag_q_16x8b = _mm_loadl_epi64((__m128i *)(&mask_flag));

        //generating offset 4
        /* cmpeq(x, x) => all ones; reused below to build constants 1 and 4 */
        const_16x8b = _mm_cmpeq_epi16(tmp_p0_16x8b, tmp_p0_16x8b);
        // filter flag mask and tc mask
        /* broadcast the (tc_u, tc_v) dword to every lane */
        mask_tc_16x8 = _mm_shuffle_epi32(mask_tc_16x8, 0x00);
        /* sign bits 31/63 -> full dword masks: dword0 = p mask, dword1 = q mask */
        mask_flag_q_16x8b = _mm_srai_epi32(mask_flag_q_16x8b, 31);
        //-tc
        /* _mm_sign_epi16 with an all-negative operand negates mask_tc_16x8,
         * giving the lower clip bound -tc per lane */
        min_0_16x8b = _mm_sign_epi16(mask_tc_16x8, const_16x8b);
        //converting const 1
        /* all-ones >> 15 (logical) = 0x0001 in every 16-bit lane */
        const_16x8b = _mm_srli_epi16(const_16x8b, 15);

        //filterp
        /* broadcast dword0 (p-side mask) across the register */
        mask_flag_p_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x00);


        //converting const 4
        /* 1 << 2 = 4: the rounding offset before the >> 3 normalisation */
        const_16x8b = _mm_slli_epi16(const_16x8b, 2);
        //modified delta with a filter (1 -4 4 -1) available in 16 bit
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, delta_vu1_16x8b);

        //filterq flag
        /* broadcast dword1 (q-side mask) across the register */
        mask_flag_q_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x55);
        //offset addition
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, const_16x8b);
        mask_16x8b = _mm_setzero_si128();
        //filter after normalisation
        /* delta = (p1 - 4*p0 + 4*q0 - q1 + 4) >> 3, arithmetic shift keeps sign */
        delta_vu0_16x8b = _mm_srai_epi16(delta_vu0_16x8b, 3);

        //converting p0 to 16bit
        src_p0_16x8b = _mm_unpacklo_epi8(src_p0_16x8b, mask_16x8b);
        //clipping MAX
        delta_vu0_16x8b = _mm_min_epi16(delta_vu0_16x8b, mask_tc_16x8);
        //converting q0 to 16bit
        src_q0_16x8b = _mm_unpacklo_epi8(src_q0_16x8b, mask_16x8b);
        //clipping MIN
        /* delta now clipped to [-tc, tc] per plane */
        delta_vu0_16x8b = _mm_max_epi16(delta_vu0_16x8b, min_0_16x8b);

        //masking filter flag
        /* zero the delta on any side whose filter flag is 0 */
        delta_vu1_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_q_16x8b);
        delta_vu0_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_p_16x8b);

        // q-delta ,p+delta
        src_q0_16x8b = _mm_sub_epi16(src_q0_16x8b, delta_vu1_16x8b);
        src_p0_16x8b = _mm_add_epi16(src_p0_16x8b, delta_vu0_16x8b);

        // p0 and q0 packed
        /* packus saturates back to [0, 255] unsigned bytes */
        src_q0_16x8b = _mm_packus_epi16(src_q0_16x8b, mask_16x8b);
        src_p0_16x8b = _mm_packus_epi16(src_p0_16x8b, mask_16x8b);



        /* Write the filtered p0 and q0 rows back in place */
        _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_16x8b);
        _mm_storel_epi64((__m128i *)(pu1_src), src_q0_16x8b);

    }


}
   1264