/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_deblk_atom_intr.c
*
* @brief
*  Contains function definitions for deblocking filters
*
* @author
*  Rishab
*
* @par List of Functions:
*   - ihevc_deblk_luma_vert_ssse3()
*   - ihevc_deblk_luma_horz_ssse3()
*   - ihevc_deblk_chroma_vert_ssse3()
*   - ihevc_deblk_chroma_horz_ssse3()
*
* @remarks
*  None
*
*******************************************************************************
*/
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include "ihevc_typedefs.h"
#include "ihevc_platform_macros.h"
#include "ihevc_macros.h"
#include "ihevc_deblk.h"
#include "ihevc_deblk_tables.h"
#include "ihevc_debug.h"

#include "ihevc_tables_x86_intr.h"

#include <immintrin.h>
/**
*******************************************************************************
*
* @brief
*     Decision process and filtering for the luma block vertical edge.
*
* @par Description:
*     The decision process for the luma block vertical edge is carried out
*     and an appropriate filter is applied. The boundary filter strength, bs,
*     should be greater than 0. The pcm flags and the transquant bypass flags
*     should be taken care of by the calling function.
*
* @param[in] pu1_src
*  Pointer to the src sample q(0,0)
*
* @param[in] src_strd
*  Source stride
*
* @param[in] bs
*  Boundary filter strength of q(0,0)
*
* @param[in] quant_param_p
*  quantization parameter of p block
*
* @param[in] quant_param_q
*  quantization parameter of q block
*
* @param[in] beta_offset_div2
*  beta offset, divided by 2
*
* @param[in] tc_offset_div2
*  tc offset, divided by 2
*
* @param[in] filter_flag_p
*  flag whether to filter the p block
*
* @param[in] filter_flag_q
*  flag whether to filter the q block
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_deblk_luma_vert_ssse3(UWORD8 *pu1_src,
                                 WORD32 src_strd,
                                 WORD32 bs,
                                 WORD32 quant_param_p,
                                 WORD32 quant_param_q,
                                 WORD32 beta_offset_div2,
                                 WORD32 tc_offset_div2,
                                 WORD32 filter_flag_p,
                                 WORD32 filter_flag_q)
{
    WORD32 qp_luma, beta_indx, tc_indx;
    WORD32 beta, tc;
    WORD32 d, dp, dq, d_sam0, d_sam3;

    WORD32 d3, d0, de_0, de_1, de_2, de_3;
    WORD32 de, dep, deq;
    __m128i src_row0_8x16b, src_row1_8x16b, src_row2_8x16b, src_row3_8x16b;


    {
        __m128i src_tmp_8x16b, coef_8x16b, mask_d_result_4x32b, mask_de_result_8x16b;
        __m128i mask_16x8b, temp_coef0_8x16b, temp_coef1_8x16b;



        ASSERT((bs > 0) && (bs <= 3));
        ASSERT(filter_flag_p || filter_flag_q);

        qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
        beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);

        /* BS based on implementation can take value 3 if it is intra/inter edge */
        /* based on BS, tc index is calculated by adding 2 * (bs - 1) to QP and tc_offset */
        /* for BS = 1 adding factor is (0*2), BS = 2 or 3 adding factor is (1*2) */
        /* the above desired functionality is achieved by doing (2*(bs>>1)) */

        tc_indx = CLIP3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53);

        beta = gai4_ihevc_beta_table[beta_indx];
        tc = gai4_ihevc_tc_table[tc_indx];
        if(0 == tc)
        {
            return;
        }
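        /*
         * A scalar sketch of the decision metrics that the SSSE3 code below
         * computes (per the HEVC deblocking decision process; the names
         * p0_0, q2_3 etc. are illustrative, with the second index being the
         * row):
         *
         *   dp0 = ABS(p2_0 - 2 * p1_0 + p0_0);  dq0 = ABS(q2_0 - 2 * q1_0 + q0_0);
         *   dp3 = ABS(p2_3 - 2 * p1_3 + p0_3);  dq3 = ABS(q2_3 - 2 * q1_3 + q0_3);
         *   d0 = dp0 + dq0;   d3 = dp3 + dq3;
         *   dp = dp0 + dp3;   dq = dq0 + dq3;
         *   d  = d0 + d3;     // any filtering happens only when d < beta
         *
         * The strong filter (de == 2) is chosen only when, for i in {0, 3}:
         *   2 * d_i < (beta >> 2) &&
         *   ABS(p3_i - p0_i) + ABS(q0_i - q3_i) < (beta >> 3) &&
         *   ABS(p0_i - q0_i) < ((5 * tc + 1) >> 1)
         * The same metrics are reused by the horizontal variant below.
         */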
        src_row0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 4));
        src_row3_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd));

        coef_8x16b = _mm_load_si128((__m128i *)(coef_d));
        mask_16x8b = _mm_load_si128((__m128i *)(shuffle_d));

        src_tmp_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row3_8x16b);
        mask_de_result_8x16b = _mm_shuffle_epi8(src_tmp_8x16b, mask_16x8b);

        mask_d_result_4x32b = _mm_maddubs_epi16(src_tmp_8x16b, coef_8x16b);


        //to get all 1's of 8 bit in (1)
        temp_coef0_8x16b = _mm_cmpeq_epi16(src_tmp_8x16b, src_tmp_8x16b);
        temp_coef1_8x16b = _mm_srli_epi16(temp_coef0_8x16b, 15);
        //accumulating values for dp3 dq3, dp0 dq0
        mask_d_result_4x32b = _mm_madd_epi16(mask_d_result_4x32b, temp_coef1_8x16b);

        temp_coef1_8x16b = _mm_packus_epi16(temp_coef1_8x16b, temp_coef1_8x16b);
        // to get all 1,-1 sets of 16 bits in (0)
        temp_coef0_8x16b = _mm_unpacklo_epi8(temp_coef0_8x16b, temp_coef1_8x16b);
        //q33-q30,p33-p30,q03-q00,p03-p00,0,q30-p30,0,q00-p00
        mask_de_result_8x16b = _mm_maddubs_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
        //to get 16 bit 1's
        temp_coef0_8x16b = _mm_srli_epi16(temp_coef1_8x16b, 8);


        // dq3 dp3 dq0 dp0
        mask_d_result_4x32b = _mm_abs_epi32(mask_d_result_4x32b);
        mask_16x8b = _mm_shuffle_epi32(mask_d_result_4x32b, 0xec);
        mask_d_result_4x32b = _mm_shuffle_epi32(mask_d_result_4x32b, 0x49);
        // dq dp d3 d0
        mask_d_result_4x32b = _mm_add_epi32(mask_d_result_4x32b, mask_16x8b);
        //|q33-q30|,|p33-p30|,|q03-q00|,|p03-p00|,0,|q30-p30|,0,|q00-p00|
        mask_de_result_8x16b = _mm_abs_epi16(mask_de_result_8x16b);
        //|q33-q30|+|p33-p30|,|q03-q00|+|p03-p00|,0+|q30-p30|,0+|q00-p00|
        mask_de_result_8x16b = _mm_madd_epi16(mask_de_result_8x16b, temp_coef0_8x16b);

        ///store back in a single variable
        temp_coef0_8x16b = _mm_srli_si128(mask_d_result_4x32b, 4);
        temp_coef1_8x16b = _mm_srli_si128(mask_d_result_4x32b, 8);
        mask_16x8b = _mm_srli_si128(mask_d_result_4x32b, 12);

        d0 = _mm_cvtsi128_si32(mask_d_result_4x32b);
        d3 = _mm_cvtsi128_si32(temp_coef0_8x16b);
        dp = _mm_cvtsi128_si32(temp_coef1_8x16b);
        dq = _mm_cvtsi128_si32(mask_16x8b);
        //getting d
        d = d0 + d3;

        ///store back in a single variable
        temp_coef0_8x16b = _mm_srli_si128(mask_de_result_8x16b, 4);
        temp_coef1_8x16b = _mm_srli_si128(mask_de_result_8x16b, 8);
        mask_16x8b = _mm_srli_si128(mask_de_result_8x16b, 12);

        de_0 = _mm_cvtsi128_si32(mask_de_result_8x16b);
        de_1 = _mm_cvtsi128_si32(temp_coef0_8x16b);
        de_2 = _mm_cvtsi128_si32(temp_coef1_8x16b);
        de_3 = _mm_cvtsi128_si32(mask_16x8b);

        de = 0;
        dep = 0;
        deq = 0;
        if(d < beta)
        {
            d_sam0 = 0;
            if((2 * d0 < (beta >> 2))
                            && (de_2 < (beta >> 3))
                            && (de_0 < ((5 * tc + 1) >> 1)))
            {
                d_sam0 = 1;
            }

            d_sam3 = 0;
            if((2 * d3 < (beta >> 2))
                            && (de_3 < (beta >> 3))
                            && (de_1 < ((5 * tc + 1) >> 1)))
            {
                d_sam3 = 1;
            }

            de = (d_sam0 & d_sam3) + 1;
            dep = (dp < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
            deq = (dq < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
            if(tc <= 1)
            {
                dep = 0;
                deq = 0;
            }
        }

    }

    if(de != 0)
    {


        src_row1_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + src_strd));
        src_row2_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd));

        if(de == 2)
        {
            __m128i temp_pq_str0_16x8b;
            __m128i temp_pq1_str0_16x8b, temp_pq1_str1_16x8b;
            __m128i temp_pq2_str0_16x8b;
            __m128i temp_pq_str1_16x8b;
            __m128i temp_str0_16x8b, temp_str1_16x8b, temp_str2_16x8b, temp_str3_16x8b;
            __m128i temp_max0_16x8b, temp_max1_16x8b, temp_min0_16x8b, temp_min1_16x8b;
            __m128i const2_8x16b, const2tc_8x16b;
            LWORD64 mask, tc2;
            tc = tc << 1;
            mask = (((LWORD64)filter_flag_q) << 63) | (((LWORD64)filter_flag_p) << 31);
            tc2 = ((LWORD64)tc);
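            /*
             * Scalar form of the strong filter computed below (per the HEVC
             * spec); each output is additionally clipped to +/- 2*tc around
             * the original pixel, which is why tc was doubled above:
             *   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
             *   p1' = (p2 + p1 + p0 + q0 + 2) >> 2
             *   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
             * and the q outputs are the mirror images with p and q swapped.
             */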
            const2_8x16b = _mm_cmpeq_epi16(src_row0_8x16b, src_row0_8x16b);
            //q'0-q'1-2 ,p'0-p'1-2
            src_row0_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row2_8x16b);
            src_row1_8x16b = _mm_unpacklo_epi64(src_row1_8x16b, src_row3_8x16b);

            const2_8x16b = _mm_srli_epi16(const2_8x16b, 15);
            temp_pq_str0_16x8b = _mm_srli_epi64(src_row0_8x16b, 16);
            temp_pq_str1_16x8b = _mm_srli_epi64(src_row1_8x16b, 16);
            //arranged x x x x x x x x q31 q30 q11 q10 p30 p31 p10 p11 , x x x x x x x x q21 q20 q01 q00 p20 p21 p00 p01
            temp_str0_16x8b = _mm_unpacklo_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);
            temp_str1_16x8b = _mm_unpackhi_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);

            const2_8x16b = _mm_packus_epi16(const2_8x16b, const2_8x16b);
            //arranged q31 q30 q21 q20 q11 q10 q01 q00 p30 p31 p20 p21 p10 p11 p00 p01
            temp_pq_str0_16x8b = _mm_unpacklo_epi32(temp_str0_16x8b, temp_str1_16x8b);

            temp_pq_str0_16x8b = _mm_maddubs_epi16(temp_pq_str0_16x8b, const2_8x16b);

            //q'1-2, p'1-2
            temp_pq1_str0_16x8b = _mm_srli_epi64(src_row0_8x16b, 8);
            temp_pq1_str1_16x8b = _mm_srli_epi64(src_row1_8x16b, 8);

            temp_str2_16x8b = _mm_unpacklo_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
            temp_str3_16x8b = _mm_unpackhi_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);

            temp_str2_16x8b = _mm_shuffle_epi32(temp_str2_16x8b, 0x58);
            temp_str3_16x8b = _mm_shuffle_epi32(temp_str3_16x8b, 0x58);
            // q30 p30 q20 p20 q10 p10 q01 q00 p30 q20 p20 q10 p10 q01 q00 p00
            temp_pq1_str0_16x8b = _mm_unpackhi_epi32(temp_str2_16x8b, temp_str3_16x8b);
            // q32 q31 q22 q21 q12 q11 q02 q01 p32 p31 p22 p21 p12 p11 p02 p01
            temp_pq1_str1_16x8b = _mm_unpacklo_epi32(temp_str2_16x8b, temp_str3_16x8b);

            temp_pq1_str0_16x8b = _mm_maddubs_epi16(temp_pq1_str0_16x8b, const2_8x16b);
            temp_pq1_str1_16x8b = _mm_maddubs_epi16(temp_pq1_str1_16x8b, const2_8x16b);

            //clipping mask design
            temp_str1_16x8b = _mm_setzero_si128();
            temp_str0_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
            const2tc_8x16b = _mm_loadl_epi64((__m128i *)(&tc2));
            temp_str0_16x8b = _mm_shuffle_epi32(temp_str0_16x8b, 0x44);
            const2tc_8x16b = _mm_shuffle_epi8(const2tc_8x16b, temp_str1_16x8b);

            //clipping mask design
            temp_str0_16x8b = _mm_srai_epi32(temp_str0_16x8b, 31);
            const2tc_8x16b = _mm_and_si128(const2tc_8x16b, temp_str0_16x8b);
            //calculating Clipping MAX for all pixel values.
            temp_max0_16x8b = _mm_adds_epu8(src_row0_8x16b, const2tc_8x16b);
            temp_max1_16x8b = _mm_adds_epu8(src_row1_8x16b, const2tc_8x16b);


            //q'2-q'0-2,p'2-p'0-2
            temp_pq2_str0_16x8b = _mm_unpacklo_epi16(src_row0_8x16b, src_row2_8x16b);
            temp_str3_16x8b = _mm_unpacklo_epi16(src_row1_8x16b, src_row3_8x16b);

            temp_pq2_str0_16x8b = _mm_shuffle_epi32(temp_pq2_str0_16x8b, 0x5c);
            temp_str3_16x8b = _mm_shuffle_epi32(temp_str3_16x8b, 0x5c);

            const2_8x16b = _mm_slli_epi16(const2_8x16b, 1);
            //arranged q33 q32 q23 q22 q13 q12 q03 q02 p33 p32 p23 p22 p13 p12 p03 p02
            temp_str3_16x8b = _mm_unpacklo_epi16(temp_pq2_str0_16x8b, temp_str3_16x8b);

            temp_pq2_str0_16x8b = _mm_maddubs_epi16(temp_str3_16x8b, const2_8x16b);
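            /*
             * Note: the strong-filter outputs are range limited by clipping
             * each filtered pixel to [orig - 2*tc, orig + 2*tc]
             * (const2tc_8x16b holds 2*tc); the MAX bounds were built above
             * with saturating adds and the MIN bounds follow below.
             */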
            //calculating Clipping MIN for all pixel values.
            temp_min0_16x8b = _mm_subs_epu8(src_row0_8x16b, const2tc_8x16b);
            temp_min1_16x8b = _mm_subs_epu8(src_row1_8x16b, const2tc_8x16b);
            //q'0-q'1-2 ,p'0-p'1-2
            temp_pq_str1_16x8b = _mm_shuffle_epi32(temp_pq_str0_16x8b, 0x4e);
            temp_pq_str0_16x8b = _mm_add_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);
            //q'1-2 p'1-2
            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
            //to get 2 in 16 bit
            const2_8x16b = _mm_srli_epi16(const2_8x16b, 8);
            //to get q33 q23 q13 q03, p33 p23 p13 p03
            temp_pq1_str1_16x8b = _mm_slli_epi16(temp_str3_16x8b, 8);
            temp_pq_str1_16x8b = _mm_srli_epi16(temp_str3_16x8b, 8);
            temp_pq1_str1_16x8b = _mm_srli_epi16(temp_pq1_str1_16x8b, 8);

            //q'1, p'1 (adding 2)
            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, const2_8x16b);
            //q'0-q'1,p'0-p'1
            temp_pq_str0_16x8b = _mm_add_epi16(temp_pq_str0_16x8b, const2_8x16b);
            //q'2-q'1,p'2-p'1
            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq2_str0_16x8b, const2_8x16b);
            //q'0 = (q'0-q'1)+q'1 ,p'0 = (p'0-p'1)+p'1;
            temp_pq_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq_str0_16x8b);
            //q'2 = (q'2-q'1)+q'1 ,p'2 = (p'2-p'1)+p'1;
            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq2_str0_16x8b);

            //normalisation of all modified pixels
            temp_pq_str0_16x8b = _mm_srai_epi16(temp_pq_str0_16x8b, 3);
            temp_pq1_str0_16x8b = _mm_srai_epi16(temp_pq1_str0_16x8b, 2);
            temp_pq2_str0_16x8b = _mm_srai_epi16(temp_pq2_str0_16x8b, 3);

            //getting p0 p1 together and p2 p3 together
            temp_str0_16x8b = _mm_unpacklo_epi16(temp_pq1_str0_16x8b, temp_pq_str0_16x8b);
            temp_str2_16x8b = _mm_unpacklo_epi16(temp_pq1_str1_16x8b, temp_pq2_str0_16x8b);
            //getting q1 q0 together and q3 q2 together
            temp_pq_str0_16x8b = _mm_unpackhi_epi16(temp_pq_str0_16x8b, temp_pq1_str0_16x8b);
            temp_pq2_str0_16x8b = _mm_unpackhi_epi16(temp_pq2_str0_16x8b, temp_pq_str1_16x8b);
            //getting p's of row0 row1 together and of row2 row3 together
            temp_pq_str1_16x8b = _mm_unpacklo_epi32(temp_str2_16x8b, temp_str0_16x8b);
            temp_str2_16x8b = _mm_unpackhi_epi32(temp_str2_16x8b, temp_str0_16x8b);
            //getting q's of row0 row1 together and of row2 row3 together
            temp_str0_16x8b = _mm_unpacklo_epi32(temp_pq_str0_16x8b, temp_pq2_str0_16x8b);
            temp_pq_str0_16x8b = _mm_unpackhi_epi32(temp_pq_str0_16x8b, temp_pq2_str0_16x8b);
            //getting values for respective rows in 16 bit
            src_row0_8x16b = _mm_unpacklo_epi64(temp_pq_str1_16x8b, temp_str0_16x8b);
            src_row1_8x16b = _mm_unpackhi_epi64(temp_pq_str1_16x8b, temp_str0_16x8b);
            src_row2_8x16b = _mm_unpacklo_epi64(temp_str2_16x8b, temp_pq_str0_16x8b);
            src_row3_8x16b = _mm_unpackhi_epi64(temp_str2_16x8b, temp_pq_str0_16x8b);
            //packing values to 8 bit
            src_row0_8x16b = _mm_packus_epi16(src_row0_8x16b, src_row2_8x16b);
            src_row1_8x16b = _mm_packus_epi16(src_row1_8x16b, src_row3_8x16b);
            //Clipping MAX
            src_row0_8x16b = _mm_min_epu8(src_row0_8x16b, temp_max0_16x8b);
            src_row1_8x16b = _mm_min_epu8(src_row1_8x16b, temp_max1_16x8b);
            //Clipping MIN
            src_row0_8x16b = _mm_max_epu8(src_row0_8x16b, temp_min0_16x8b);
            src_row1_8x16b = _mm_max_epu8(src_row1_8x16b, temp_min1_16x8b);
            //separating row 2 and row 3
            src_row2_8x16b = _mm_srli_si128(src_row0_8x16b, 8);
            src_row3_8x16b = _mm_srli_si128(src_row1_8x16b, 8);

        }
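        /*
         * Scalar form of the normal (weak) filter implemented in the else
         * branch, per the HEVC spec:
         *   delta = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4
         * Applied only when ABS(delta) < 10 * tc:
         *   p0' = CLIP_U8(p0 + CLIP3(-tc, tc, delta))
         *   q0' = CLIP_U8(q0 - CLIP3(-tc, tc, delta))
         */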
        else
        {

            __m128i tmp_delta0_8x16b, tmp_delta1_8x16b, tmp_delta2_8x16b, tmp_delta3_8x16b;
            __m128i tmp0_const_8x16b, tmp1_const_8x16b, tmp2_const_8x16b, tmp3_const_8x16b;
            __m128i coefdelta_0_8x16b, mask_pq_8x16b;
            __m128i const2_8x16b, consttc_8x16b;

            LWORD64 mask1;
            mask1 = (((LWORD64)(filter_flag_q & deq)) << 63) | (((LWORD64)filter_flag_q) << 47) | (((LWORD64)filter_flag_p) << 31) | (((LWORD64)(filter_flag_p & dep)) << 15);

            consttc_8x16b = _mm_set1_epi32(tc);


            src_row0_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row1_8x16b);
            src_row2_8x16b = _mm_unpacklo_epi64(src_row2_8x16b, src_row3_8x16b);

            tmp_delta2_8x16b = _mm_srli_epi64(src_row0_8x16b, 16);
            tmp_delta3_8x16b = _mm_srli_epi64(src_row2_8x16b, 16);

            tmp_delta2_8x16b = _mm_shuffle_epi32(tmp_delta2_8x16b, 0x08);
            tmp_delta3_8x16b = _mm_shuffle_epi32(tmp_delta3_8x16b, 0x08);
            //arranged q31 q30 p30 p31 q21 q20 p20 p21 q11 q10 p10 p11 q01 q00 p00 p01
            tmp_delta2_8x16b = _mm_unpacklo_epi64(tmp_delta2_8x16b, tmp_delta3_8x16b);

            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_de1);
            // (-3q1+9q0),(-9p0+3p1)
            tmp_delta3_8x16b = _mm_maddubs_epi16(tmp_delta2_8x16b, coefdelta_0_8x16b);
            //converting to 16 bit
            consttc_8x16b = _mm_packs_epi32(consttc_8x16b, consttc_8x16b);
            //getting -tc store
            tmp1_const_8x16b = _mm_cmpeq_epi32(consttc_8x16b, consttc_8x16b);
            //calc 10 *tc = 2*tc +8*tc ; 2*tc
            tmp2_const_8x16b = _mm_slli_epi16(consttc_8x16b, 1);
            //calc 10 *tc = 2*tc +8*tc ; 8*tc
            tmp0_const_8x16b = _mm_slli_epi16(consttc_8x16b, 3);
            //getting -tc store
            tmp3_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp1_const_8x16b);
            //calc 10 *tc
            tmp2_const_8x16b = _mm_add_epi16(tmp2_const_8x16b, tmp0_const_8x16b);
            //const 1
            const2_8x16b = _mm_srli_epi16(tmp1_const_8x16b, 15);
            tmp_delta0_8x16b = _mm_madd_epi16(tmp_delta3_8x16b, const2_8x16b);
            const2_8x16b = _mm_srli_epi32(tmp1_const_8x16b, 31);
            //getting the mask values
            mask_pq_8x16b = _mm_loadl_epi64((__m128i *)(&mask1));
            //loaded coef for delta1 calculation
            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_dep1);
            //(-2q1+q0),(p0-2p1)
            tmp_delta3_8x16b = _mm_maddubs_epi16(tmp_delta2_8x16b, coefdelta_0_8x16b);
            //const 8
            const2_8x16b = _mm_slli_epi32(const2_8x16b, 3);
            //rearranging the mask values
            mask_pq_8x16b = _mm_unpacklo_epi64(mask_pq_8x16b, mask_pq_8x16b);
            //normalisation of the filter
            tmp_delta0_8x16b = _mm_add_epi32(tmp_delta0_8x16b, const2_8x16b);
            tmp_delta0_8x16b = _mm_srai_epi32(tmp_delta0_8x16b, 4);

            //getting deltaq0
            tmp_delta2_8x16b = _mm_sign_epi32(tmp_delta0_8x16b, tmp1_const_8x16b);
            //packing d3q d2q d1q d0q d3p d2p d1p d0p
            tmp_delta0_8x16b = _mm_packs_epi32(tmp_delta0_8x16b, tmp_delta2_8x16b);
            //absolute delta
            tmp_delta2_8x16b = _mm_abs_epi16(tmp_delta0_8x16b);
            //Clipping of delta0
            tmp_delta0_8x16b = _mm_min_epi16(tmp_delta0_8x16b, consttc_8x16b);
            //mask for |delta| < 10*tc
            tmp0_const_8x16b = _mm_cmpgt_epi16(tmp2_const_8x16b, tmp_delta2_8x16b);
            //Clipping of delta0
            tmp_delta0_8x16b = _mm_max_epi16(tmp_delta0_8x16b, tmp3_const_8x16b);


            //delta 1 calc starts

            //getting q32 q22 q12 q02 p32 p12 p22 p02
            tmp2_const_8x16b = _mm_loadl_epi64((__m128i *)(shuffle0));
            tmp_delta2_8x16b = _mm_shuffle_epi8(src_row0_8x16b, tmp2_const_8x16b);
            tmp_delta1_8x16b = _mm_shuffle_epi8(src_row2_8x16b, tmp2_const_8x16b);
            tmp_delta1_8x16b = _mm_unpacklo_epi32(tmp_delta2_8x16b, tmp_delta1_8x16b);
            //constant 1
            const2_8x16b = _mm_srli_epi16(tmp1_const_8x16b, 15);
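            /*
             * Scalar form of the optional p1/q1 (delta1) correction computed
             * next, per the HEVC spec, clipped to +/- (tc >> 1) and gated by
             * dep/deq through the mask built above:
             *   dp1 = CLIP3(-(tc >> 1), tc >> 1, (((p2 + p0 + 1) >> 1) - p1 + delta) >> 1)
             *   dq1 = CLIP3(-(tc >> 1), tc >> 1, (((q2 + q0 + 1) >> 1) - q1 - delta) >> 1)
             */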
            //tc>>1 16 bit
            consttc_8x16b = _mm_srai_epi16(consttc_8x16b, 1);

            //getting -tc>>1 store 16 bit
            tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp1_const_8x16b);
            //2*delta0
            tmp2_const_8x16b = _mm_add_epi16(tmp_delta0_8x16b, tmp_delta0_8x16b);

            //getting all respective q's and p's together
            tmp3_const_8x16b = _mm_load_si128((__m128i *)(shuffle1));
            tmp_delta3_8x16b = _mm_shuffle_epi8(tmp_delta3_8x16b, tmp3_const_8x16b);
            //final adds for deltap1 and deltaq1
            tmp_delta3_8x16b = _mm_add_epi16(tmp_delta3_8x16b, const2_8x16b);
            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, tmp2_const_8x16b);
            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, tmp_delta3_8x16b);
            tmp2_const_8x16b = _mm_setzero_si128();
            tmp_delta1_8x16b = _mm_srai_epi16(tmp_delta1_8x16b, 2);

            // clipping delta1
            tmp_delta1_8x16b = _mm_min_epi16(tmp_delta1_8x16b, consttc_8x16b);
            // clipping delta1
            tmp_delta1_8x16b = _mm_max_epi16(tmp_delta1_8x16b, tmp1_const_8x16b);

            //getting the mask ready
            mask_pq_8x16b = _mm_srai_epi16(mask_pq_8x16b, 15);
            //masking of the delta values |delta|<10*tc
            tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, tmp0_const_8x16b);
            tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, tmp0_const_8x16b);
            //packing dq1 dq0 dp0 dp1
            tmp1_const_8x16b = _mm_unpacklo_epi16(tmp_delta1_8x16b, tmp_delta0_8x16b);
            tmp_delta0_8x16b = _mm_unpackhi_epi16(tmp_delta0_8x16b, tmp_delta1_8x16b);
            tmp_delta1_8x16b = _mm_unpackhi_epi32(tmp1_const_8x16b, tmp_delta0_8x16b);
            tmp_delta0_8x16b = _mm_unpacklo_epi32(tmp1_const_8x16b, tmp_delta0_8x16b);

            //masking of the delta values dep, deq , filter_p ,filter_q
            tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, mask_pq_8x16b);
            tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, mask_pq_8x16b);
            //converting 8bit to 16 bit
            src_row0_8x16b = _mm_unpacklo_epi8(src_row0_8x16b, tmp2_const_8x16b);
            src_row1_8x16b = _mm_unpacklo_epi8(src_row1_8x16b, tmp2_const_8x16b);
            src_row2_8x16b = _mm_unpacklo_epi8(src_row2_8x16b, tmp2_const_8x16b);
            src_row3_8x16b = _mm_unpacklo_epi8(src_row3_8x16b, tmp2_const_8x16b);
            //shuffle values loaded
            tmp0_const_8x16b = _mm_load_si128((__m128i *)shuffle2);
            tmp1_const_8x16b = _mm_load_si128((__m128i *)shuffle3);
            //arranging each row delta in different registers
            tmp_delta3_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, tmp1_const_8x16b);
            tmp_delta2_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, tmp0_const_8x16b);
            tmp_delta1_8x16b = _mm_shuffle_epi8(tmp_delta0_8x16b, tmp1_const_8x16b);
            tmp_delta0_8x16b = _mm_shuffle_epi8(tmp_delta0_8x16b, tmp0_const_8x16b);

            //adding the respective delta
            src_row3_8x16b = _mm_add_epi16(tmp_delta3_8x16b, src_row3_8x16b);
            src_row2_8x16b = _mm_add_epi16(tmp_delta2_8x16b, src_row2_8x16b);
            src_row1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, src_row1_8x16b);
            src_row0_8x16b = _mm_add_epi16(tmp_delta0_8x16b, src_row0_8x16b);
            //saturating to 8 bit
            src_row2_8x16b = _mm_packus_epi16(src_row2_8x16b, src_row3_8x16b);
            src_row0_8x16b = _mm_packus_epi16(src_row0_8x16b, src_row1_8x16b);
            //separating different rows
            src_row1_8x16b = _mm_srli_si128(src_row0_8x16b, 8);
            src_row3_8x16b = _mm_srli_si128(src_row2_8x16b, 8);
        }

        _mm_storel_epi64((__m128i *)(pu1_src - 4), src_row0_8x16b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + src_strd), src_row1_8x16b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd), src_row2_8x16b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd), src_row3_8x16b);
    }
}
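/**
*******************************************************************************
*
* @brief
*     Decision process and filtering for the luma block horizontal edge.
*
* @par Description:
*     Mirrors ihevc_deblk_luma_vert_ssse3() for a horizontal edge: the
*     decision process is carried out on the rows above (p) and below (q)
*     the edge and an appropriate filter is applied. The boundary filter
*     strength, bs, should be greater than 0. The pcm flags and the
*     transquant bypass flags should be taken care of by the calling
*     function. Parameters are as in the vertical variant.
*
*******************************************************************************
*/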
void ihevc_deblk_luma_horz_ssse3(UWORD8 *pu1_src,
                                 WORD32 src_strd,
                                 WORD32 bs,
                                 WORD32 quant_param_p,
                                 WORD32 quant_param_q,
                                 WORD32 beta_offset_div2,
                                 WORD32 tc_offset_div2,
                                 WORD32 filter_flag_p,
                                 WORD32 filter_flag_q)
{
    WORD32 qp_luma, beta_indx, tc_indx;
    WORD32 beta, tc;

    WORD32 d0, d3, dp, dq, d;
    WORD32 de_0, de_1, de_2, de_3;
    WORD32 d_sam0, d_sam3;
    WORD32 de, dep, deq;

    __m128i src_q0_8x16b, src_q1_8x16b, src_p0_8x16b, src_p1_8x16b, src_q2_8x16b;
    __m128i tmp_pq_str1_8x16b, src_p2_8x16b, tmp_pq_str0_8x16b;




    {
        __m128i src_tmp_p_0_8x16b, src_tmp_p_1_8x16b, src_tmp_q_0_8x16b, src_tmp_q_1_8x16b;
        __m128i coef_8x16b, mask_d_result_4x32b, mask_de_result_8x16b;
        __m128i mask_16x8b, temp_coef0_8x16b, temp_coef1_8x16b;

        ASSERT((bs > 0));
        ASSERT(filter_flag_p || filter_flag_q);

        qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
        beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);

        /* BS based on implementation can take value 3 if it is intra/inter edge */
        /* based on BS, tc index is calculated by adding 2 * (bs - 1) to QP and tc_offset */
        /* for BS = 1 adding factor is (0*2), BS = 2 or 3 adding factor is (1*2) */
        /* the above desired functionality is achieved by doing (2*(bs>>1)) */

        tc_indx = CLIP3(qp_luma + 2 * (bs >> 1) + (tc_offset_div2 << 1), 0, 53);

        beta = gai4_ihevc_beta_table[beta_indx];
        tc = gai4_ihevc_tc_table[tc_indx];
        if(0 == tc)
        {
            return;
        }
        src_q0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src));
        src_q1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
        src_p0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - src_strd));
        src_p1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 2 * src_strd));
        src_q2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd));
        tmp_pq_str1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd));
        src_p2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 3 * src_strd));
        tmp_pq_str0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 4 * src_strd));


        src_tmp_p_0_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
        src_tmp_p_1_8x16b = _mm_unpacklo_epi8(tmp_pq_str0_8x16b, src_p2_8x16b);

        src_tmp_q_0_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
        src_tmp_q_1_8x16b = _mm_unpacklo_epi8(src_q2_8x16b, tmp_pq_str1_8x16b);

        src_tmp_p_0_8x16b = _mm_unpacklo_epi16(src_tmp_p_1_8x16b, src_tmp_p_0_8x16b);
        src_tmp_q_0_8x16b = _mm_unpacklo_epi16(src_tmp_q_0_8x16b, src_tmp_q_1_8x16b);

        src_tmp_p_0_8x16b = _mm_shuffle_epi32(src_tmp_p_0_8x16b, 0x6c);
        src_tmp_q_0_8x16b = _mm_shuffle_epi32(src_tmp_q_0_8x16b, 0x6c);

        coef_8x16b = _mm_load_si128((__m128i *)(coef_d));
        mask_16x8b = _mm_load_si128((__m128i *)(shuffle_d));

        src_tmp_p_0_8x16b = _mm_unpacklo_epi32(src_tmp_p_0_8x16b, src_tmp_q_0_8x16b);
        //WORD32 shuffle_d[4]={0x80800403,0x80800c0b,0x03000704,0x0b080f0c};
        mask_de_result_8x16b = _mm_shuffle_epi8(src_tmp_p_0_8x16b, mask_16x8b);

        mask_d_result_4x32b = _mm_maddubs_epi16(src_tmp_p_0_8x16b, coef_8x16b);


        //to get all 1's of 8 bit in (1)
        temp_coef0_8x16b = _mm_cmpeq_epi16(src_tmp_p_0_8x16b, src_tmp_p_0_8x16b);
        temp_coef1_8x16b = _mm_srli_epi16(temp_coef0_8x16b, 15);
        //accumulating values for dp3 dq3, dp0 dq0 values
        mask_d_result_4x32b = _mm_madd_epi16(mask_d_result_4x32b, temp_coef1_8x16b);

        temp_coef1_8x16b = _mm_packus_epi16(temp_coef1_8x16b, temp_coef1_8x16b);
        // to get all 1,-1 sets of 16 bits in (0)
        temp_coef0_8x16b = _mm_unpacklo_epi8(temp_coef0_8x16b, temp_coef1_8x16b);
        //q33-q30,p33-p30,q03-q00,p03-p00,0,q30-p30,0,q00-p00
        mask_de_result_8x16b = _mm_maddubs_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
        //to get 16 bit 1's
        temp_coef0_8x16b = _mm_srli_epi16(temp_coef1_8x16b, 8);


        // dq3 dp3 dq0 dp0
        mask_d_result_4x32b = _mm_abs_epi32(mask_d_result_4x32b);
        mask_16x8b = _mm_shuffle_epi32(mask_d_result_4x32b, 0xec);
        mask_d_result_4x32b = _mm_shuffle_epi32(mask_d_result_4x32b, 0x49);
        // dq dp d3 d0
        mask_d_result_4x32b = _mm_add_epi32(mask_d_result_4x32b, mask_16x8b);
        //|q33-q30|,|p33-p30|,|q03-q00|,|p03-p00|,0,|q30-p30|,0,|q00-p00|
        mask_de_result_8x16b = _mm_abs_epi16(mask_de_result_8x16b);
        //|q33-q30|+|p33-p30|,|q03-q00|+|p03-p00|,0+|q30-p30|,0+|q00-p00|
        mask_de_result_8x16b = _mm_madd_epi16(mask_de_result_8x16b, temp_coef0_8x16b);

        ///store back in a single variable
        temp_coef0_8x16b = _mm_srli_si128(mask_d_result_4x32b, 4);
        temp_coef1_8x16b = _mm_srli_si128(mask_d_result_4x32b, 8);
        mask_16x8b = _mm_srli_si128(mask_d_result_4x32b, 12);

        d0 = _mm_cvtsi128_si32(mask_d_result_4x32b);
        d3 = _mm_cvtsi128_si32(temp_coef0_8x16b);
        dp = _mm_cvtsi128_si32(temp_coef1_8x16b);
        dq = _mm_cvtsi128_si32(mask_16x8b);
        //getting d
        d = d0 + d3;

        ///store back in a single variable
        temp_coef0_8x16b = _mm_srli_si128(mask_de_result_8x16b, 4);
        temp_coef1_8x16b = _mm_srli_si128(mask_de_result_8x16b, 8);
        mask_16x8b = _mm_srli_si128(mask_de_result_8x16b, 12);

        de_0 = _mm_cvtsi128_si32(mask_de_result_8x16b);
        de_1 = _mm_cvtsi128_si32(temp_coef0_8x16b);
        de_2 = _mm_cvtsi128_si32(temp_coef1_8x16b);
        de_3 = _mm_cvtsi128_si32(mask_16x8b);

        de = 0;
        dep = 0;
        deq = 0;
        if(d < beta)
        {
            d_sam0 = 0;
            if((2 * d0 < (beta >> 2))
                            && (de_2 < (beta >> 3))
                            && (de_0 < ((5 * tc + 1) >> 1)))
            {
                d_sam0 = 1;
            }

            d_sam3 = 0;
            if((2 * d3 < (beta >> 2))
                            && (de_3 < (beta >> 3))
                            && (de_1 < ((5 * tc + 1) >> 1)))
            {
                d_sam3 = 1;
            }

            de = (d_sam0 & d_sam3) + 1;
            dep = (dp < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
            deq = (dq < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
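            /*
             * dep/deq (computed above) enable the optional p1/q1 taps of the
             * weak filter when the corresponding side is smooth enough, i.e.
             * when dp (resp. dq) < ((beta + (beta >> 1)) >> 3); both are
             * turned off for small tc below.
             */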
            if(tc <= 1)
            {
                dep = 0;
                deq = 0;
            }
        }

    }

    if(de != 0)
    {

        if(2 == de)
        {

            __m128i temp_pq0_str0_16x8b;
            __m128i temp_pq1_str0_16x8b, temp_pq1_str1_16x8b;
            __m128i temp_pq2_str0_16x8b;
            __m128i temp_str0_16x8b, temp_str1_16x8b;
            __m128i const2_8x16b, const2tc_8x16b;

            LWORD64 mask, tc2;
            tc = tc << 1;
            mask = (((LWORD64)filter_flag_q) << 63) | (((LWORD64)filter_flag_p) << 31);
            tc2 = ((LWORD64)tc);

            const2_8x16b = _mm_cmpeq_epi16(src_p1_8x16b, src_p1_8x16b);
            //q'0-q'1-2 ,p'0-p'1-2
            temp_pq0_str0_16x8b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
            temp_str0_16x8b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
            const2_8x16b = _mm_srli_epi16(const2_8x16b, 15);
            //arranged q31 q30 q21 q20 q11 q10 q01 q00 p30 p31 p20 p21 p10 p11 p00 p01
            temp_pq0_str0_16x8b = _mm_unpacklo_epi64(temp_pq0_str0_16x8b, temp_str0_16x8b);

            const2_8x16b = _mm_packus_epi16(const2_8x16b, const2_8x16b);
            temp_pq0_str0_16x8b = _mm_maddubs_epi16(temp_pq0_str0_16x8b, const2_8x16b);

            //q'1-2, p'1-2
            temp_pq1_str0_16x8b = _mm_unpacklo_epi8(src_p0_8x16b, src_q0_8x16b);
            temp_pq1_str1_16x8b = _mm_unpacklo_epi8(src_q1_8x16b, src_q2_8x16b);
            temp_str1_16x8b = _mm_unpacklo_epi8(src_p1_8x16b, src_p2_8x16b);
            // q30 p30 q20 p20 q10 p10 q01 q00 p30 q20 p20 q10 p10 q01 q00 p00
            temp_pq1_str0_16x8b = _mm_unpacklo_epi64(temp_pq1_str0_16x8b, temp_pq1_str0_16x8b);
            // q32 q31 q22 q21 q12 q11 q02 q01 p32 p31 p22 p21 p12 p11 p02 p01
            temp_pq1_str1_16x8b = _mm_unpacklo_epi64(temp_str1_16x8b, temp_pq1_str1_16x8b);

            temp_pq1_str0_16x8b = _mm_maddubs_epi16(temp_pq1_str0_16x8b, const2_8x16b);
            temp_pq1_str1_16x8b = _mm_maddubs_epi16(temp_pq1_str1_16x8b, const2_8x16b);

            //clipping mask design
            temp_str1_16x8b = _mm_setzero_si128();
            temp_str0_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
            const2tc_8x16b = _mm_loadl_epi64((__m128i *)(&tc2));
            temp_str0_16x8b = _mm_shuffle_epi32(temp_str0_16x8b, 0x44);
            const2tc_8x16b = _mm_shuffle_epi8(const2tc_8x16b, temp_str1_16x8b);

            //clipping mask design
            temp_str0_16x8b = _mm_srai_epi32(temp_str0_16x8b, 31);
            const2tc_8x16b = _mm_and_si128(const2tc_8x16b, temp_str0_16x8b);
            //calculating Clipping MAX for all pixel values.
            src_p0_8x16b = _mm_unpacklo_epi32(src_p0_8x16b, src_q0_8x16b);
            src_q0_8x16b = _mm_unpacklo_epi32(src_p1_8x16b, src_q1_8x16b);
            //for clipping calc
            src_p1_8x16b = _mm_unpacklo_epi64(src_p0_8x16b, src_q0_8x16b);
            //saving the unmodified data of q1 p1 q0 p0
            src_q1_8x16b = _mm_unpackhi_epi64(src_p0_8x16b, src_q0_8x16b);
            //Clipping MAX and MIN for q1 p1 q0 p0
            src_p0_8x16b = _mm_adds_epu8(src_p1_8x16b, const2tc_8x16b);
            src_p1_8x16b = _mm_subs_epu8(src_p1_8x16b, const2tc_8x16b);


            //q'2-q'0-2,p'2-p'0-2
            tmp_pq_str0_8x16b = _mm_unpacklo_epi8(src_p2_8x16b, tmp_pq_str0_8x16b);
            temp_pq2_str0_16x8b = _mm_unpacklo_epi8(src_q2_8x16b, tmp_pq_str1_8x16b);
            const2_8x16b = _mm_slli_epi16(const2_8x16b, 1);
            //arranged q33 q32 q23 q22 q13 q12 q03 q02 p32 p33 p22 p23 p12 p13 p02 p03
            temp_pq2_str0_16x8b = _mm_unpacklo_epi64(tmp_pq_str0_8x16b, temp_pq2_str0_16x8b);
            src_p2_8x16b = _mm_unpacklo_epi32(src_p2_8x16b, src_q2_8x16b);
            temp_pq2_str0_16x8b = _mm_maddubs_epi16(temp_pq2_str0_16x8b, const2_8x16b);
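            /*
             * The strong-filter equations here are the same as in
             * ihevc_deblk_luma_vert_ssse3() (see the comment there); only
             * the row-wise data arrangement differs for a horizontal edge.
             */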
            //calculating Clipping MAX and MIN for p2 and q2.
            tmp_pq_str0_8x16b = _mm_adds_epu8(src_p2_8x16b, const2tc_8x16b);
            tmp_pq_str1_8x16b = _mm_subs_epu8(src_p2_8x16b, const2tc_8x16b);
            //q'0-q'1-2 ,p'0-p'1-2
            temp_str0_16x8b = _mm_shuffle_epi32(temp_pq0_str0_16x8b, 0x4e);
            temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq0_str0_16x8b, temp_str0_16x8b);
            //q'1-2 p'1-2
            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
            //to get 2 in 16 bit
            const2_8x16b = _mm_srli_epi16(const2_8x16b, 8);


            //q'1, p'1 (adding 2)
            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, const2_8x16b);
            //q'0-q'1,p'0-p'1
            temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq0_str0_16x8b, const2_8x16b);
            //q'2-q'1,p'2-p'1
            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq2_str0_16x8b, const2_8x16b);
            //q'0 = (q'0-q'1)+q'1 ,p'0 = (p'0-p'1)+p'1;
            temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq0_str0_16x8b);
            //q'2 = (q'2-q'1)+q'1 ,p'2 = (p'2-p'1)+p'1;
            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq2_str0_16x8b);

            //normalisation of all modified pixels
            temp_pq0_str0_16x8b = _mm_srai_epi16(temp_pq0_str0_16x8b, 3);
            temp_pq1_str0_16x8b = _mm_srai_epi16(temp_pq1_str0_16x8b, 2);
            temp_pq2_str0_16x8b = _mm_srai_epi16(temp_pq2_str0_16x8b, 3);
            //q'1 p'1 q'0 p'0
            temp_pq0_str0_16x8b = _mm_packus_epi16(temp_pq0_str0_16x8b, temp_pq1_str0_16x8b);
            temp_pq2_str0_16x8b = _mm_packus_epi16(temp_pq2_str0_16x8b, temp_pq2_str0_16x8b);
            //pack with the unmodified data of q2 and p2
            src_p2_8x16b = _mm_unpackhi_epi64(temp_pq2_str0_16x8b, src_p2_8x16b);
            //Clipping MAX and MIN for q'1 p'1 q'0 p'0 and q'2 p'2
            temp_pq0_str0_16x8b = _mm_min_epu8(temp_pq0_str0_16x8b, src_p0_8x16b);
            src_p2_8x16b = _mm_min_epu8(src_p2_8x16b, tmp_pq_str0_8x16b);
            temp_pq0_str0_16x8b = _mm_max_epu8(temp_pq0_str0_16x8b, src_p1_8x16b);
            src_p2_8x16b = _mm_max_epu8(src_p2_8x16b, tmp_pq_str1_8x16b);
            //Reshuffling q'1 p'1 q'0 p'0 along with unmodified data
            src_p0_8x16b = _mm_unpacklo_epi32(temp_pq0_str0_16x8b, src_q1_8x16b);
            src_p1_8x16b = _mm_unpackhi_epi32(temp_pq0_str0_16x8b, src_q1_8x16b);
            src_p2_8x16b = _mm_shuffle_epi32(src_p2_8x16b, 0xd8);
            src_q0_8x16b = _mm_srli_si128(src_p0_8x16b, 8);
            src_q1_8x16b = _mm_srli_si128(src_p1_8x16b, 8);
            src_q2_8x16b = _mm_srli_si128(src_p2_8x16b, 8);

            _mm_storel_epi64((__m128i *)(pu1_src - 3 * src_strd), src_p2_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src - 2 * src_strd), src_p1_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src), src_q0_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src + src_strd), src_q1_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src + 2 * src_strd), src_q2_8x16b);


        }

        else
        {

            __m128i tmp_delta0_8x16b, tmp_delta1_8x16b;
            __m128i tmp0_const_8x16b, tmp1_const_8x16b, tmp2_const_8x16b;
            __m128i coefdelta_0_8x16b;
            __m128i const2_8x16b, consttc_8x16b;

            LWORD64 maskp0, maskp1, maskq0, maskq1;
            maskp0 = (LWORD64)filter_flag_p;
            maskq0 = (LWORD64)filter_flag_q;
            maskp1 = (LWORD64)dep;
            maskq1 = (LWORD64)deq;
            consttc_8x16b = _mm_set1_epi32(tc);

            tmp_delta0_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
            tmp_delta1_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
            //arranged q31 q30 p30 p31 q21 q20 p20 p21 q11 q10 p10 p11 q01 q00 p00 p01
            tmp_delta1_8x16b = _mm_unpacklo_epi16(tmp_delta0_8x16b, tmp_delta1_8x16b);
            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_de1);
            // (-3q1+9q0),(-9p0+3p1)
            tmp_delta0_8x16b = _mm_maddubs_epi16(tmp_delta1_8x16b, coefdelta_0_8x16b);

            //getting -tc store
            tmp2_const_8x16b = _mm_cmpeq_epi32(consttc_8x16b, consttc_8x16b);

            //getting tc in 16 bit
            consttc_8x16b = _mm_packs_epi32(consttc_8x16b, consttc_8x16b);
            //calc 10 *tc = 2*tc +8*tc ; 2*tc
            tmp_pq_str0_8x16b = _mm_slli_epi16(consttc_8x16b, 1);
            //calc 10 *tc = 2*tc +8*tc ; 8*tc
            tmp_pq_str1_8x16b = _mm_slli_epi16(consttc_8x16b, 3);

            //const 1
            const2_8x16b = _mm_srli_epi16(tmp2_const_8x16b, 15);
            //calc 10 *tc
            tmp_pq_str0_8x16b = _mm_add_epi16(tmp_pq_str0_8x16b, tmp_pq_str1_8x16b);
            //delta0 without normalisation and clipping
            tmp_delta0_8x16b = _mm_madd_epi16(tmp_delta0_8x16b, const2_8x16b);

            const2_8x16b = _mm_srli_epi32(tmp2_const_8x16b, 31);

            //loaded coef for delta1 calculation
            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_dep1);
            //(-2q1+q0),(p0-2p1)
            tmp_delta1_8x16b = _mm_maddubs_epi16(tmp_delta1_8x16b, coefdelta_0_8x16b);
            //const 8
            const2_8x16b = _mm_slli_epi32(const2_8x16b, 3);

            //normalisation of the filter
            tmp_delta0_8x16b = _mm_add_epi32(tmp_delta0_8x16b, const2_8x16b);
            tmp_delta0_8x16b = _mm_srai_epi32(tmp_delta0_8x16b, 4);

            //getting deltaq0
            tmp_pq_str1_8x16b = _mm_sign_epi32(tmp_delta0_8x16b, tmp2_const_8x16b);
            //getting -tc
            tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp2_const_8x16b);
            //packing d03q d02q d01q d00q d03p d02p d01p d00p
            tmp_delta0_8x16b = _mm_packs_epi32(tmp_delta0_8x16b, tmp_pq_str1_8x16b);
            //absolute delta
            tmp_pq_str1_8x16b = _mm_abs_epi16(tmp_delta0_8x16b);

            //Clipping of delta0
            tmp_delta0_8x16b = _mm_min_epi16(tmp_delta0_8x16b, consttc_8x16b);
            //tc>>1 16 bit
            consttc_8x16b = _mm_srai_epi16(consttc_8x16b, 1);
            //Clipping of delta0
            tmp_delta0_8x16b = _mm_max_epi16(tmp_delta0_8x16b, tmp1_const_8x16b);

            //(-tc)>>1 16 bit
            tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp2_const_8x16b);
            //mask for |delta| < 10*tc
            tmp_pq_str0_8x16b = _mm_cmpgt_epi16(tmp_pq_str0_8x16b, tmp_pq_str1_8x16b);
            //delta 1 calc starts

            //getting q32 q22 q12 q02 p32 p12 p22 p02
            tmp0_const_8x16b = _mm_setzero_si128();
            src_q2_8x16b = _mm_unpacklo_epi8(src_q2_8x16b, tmp0_const_8x16b);
            src_p2_8x16b = _mm_unpacklo_epi8(src_p2_8x16b, tmp0_const_8x16b);
            src_p2_8x16b = _mm_unpacklo_epi64(src_p2_8x16b, src_q2_8x16b);
            //constant 1
            const2_8x16b = _mm_srli_epi16(tmp2_const_8x16b, 15);
            //2*delta0
            tmp2_const_8x16b = _mm_add_epi16(tmp_delta0_8x16b, tmp_delta0_8x16b);
            //getting all respective q's and p's together
            coefdelta_0_8x16b = _mm_load_si128((__m128i *)(shuffle1));
            tmp_delta1_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, coefdelta_0_8x16b);
            //final adds for deltap1 and deltaq1
            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, const2_8x16b);
            src_p2_8x16b = _mm_add_epi16(src_p2_8x16b, tmp2_const_8x16b);
            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, src_p2_8x16b);
            tmp_delta1_8x16b = _mm_srai_epi16(tmp_delta1_8x16b, 2);

            //mask0= (((LWORD64)filter_flag_q)<<63)| (((LWORD64)filter_flag_p)<<31);
            tmp_pq_str1_8x16b = _mm_loadl_epi64((__m128i *)(&(maskq0)));
            src_p2_8x16b = _mm_loadl_epi64((__m128i *)(&(maskp0)));

            // src_p2_8x16b = _mm_set_epi32(filter_flag_q,filter_flag_p,filter_flag_q,filter_flag_p);
            //mask1= (((LWORD64)(filter_flag_q&deq))<<63)|(((LWORD64)(filter_flag_p & dep))<<31);
            src_q2_8x16b = _mm_loadl_epi64((__m128i *)(&(maskq1)));
            coefdelta_0_8x16b = _mm_loadl_epi64((__m128i *)(&(maskp1)));

            src_p2_8x16b = _mm_unpacklo_epi32(src_p2_8x16b, tmp_pq_str1_8x16b);
            src_q2_8x16b = _mm_unpacklo_epi32(coefdelta_0_8x16b, src_q2_8x16b);
            //src_q2_8x16b = _mm_set_epi32(deq,dep,deq,dep);
            src_q2_8x16b = _mm_and_si128(src_q2_8x16b, src_p2_8x16b);

            //rearranging the mask values
            src_q2_8x16b = _mm_shuffle_epi32(src_q2_8x16b, 0x50);
            src_p2_8x16b = _mm_shuffle_epi32(src_p2_8x16b, 0x50);

            src_q2_8x16b = _mm_slli_epi32(src_q2_8x16b, 31);
            src_p2_8x16b = _mm_slli_epi32(src_p2_8x16b, 31);
            src_q2_8x16b = _mm_srai_epi32(src_q2_8x16b, 31);
            src_p2_8x16b = _mm_srai_epi32(src_p2_8x16b, 31);

            //combining mask delta1
            tmp_pq_str1_8x16b = _mm_and_si128(tmp_pq_str0_8x16b, src_q2_8x16b);
            // clipping delta1
            tmp_delta1_8x16b = _mm_min_epi16(tmp_delta1_8x16b, consttc_8x16b);
            //combining mask delta0
            tmp_pq_str0_8x16b = _mm_and_si128(tmp_pq_str0_8x16b, src_p2_8x16b);
            // clipping delta1
            tmp_delta1_8x16b = _mm_max_epi16(tmp_delta1_8x16b, tmp1_const_8x16b);


            //masking of the delta values |delta|<10*tc
            tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, tmp_pq_str1_8x16b);
            tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, tmp_pq_str0_8x16b);
            //separating p and q delta 0 and adding p0 and q0
            tmp_pq_str0_8x16b = _mm_unpacklo_epi64(tmp_delta0_8x16b, tmp0_const_8x16b);
            tmp_pq_str1_8x16b = _mm_unpackhi_epi64(tmp_delta0_8x16b, tmp0_const_8x16b);
            src_p0_8x16b = _mm_unpacklo_epi8(src_p0_8x16b, tmp0_const_8x16b);
            src_q0_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, tmp0_const_8x16b);
            src_p0_8x16b = _mm_add_epi16(src_p0_8x16b, tmp_pq_str0_8x16b);
            src_q0_8x16b = _mm_add_epi16(src_q0_8x16b, tmp_pq_str1_8x16b);
            //separating p and q delta 1 and adding p1 and q1
            tmp_pq_str0_8x16b = _mm_unpacklo_epi64(tmp_delta1_8x16b, tmp0_const_8x16b);
            tmp_pq_str1_8x16b = _mm_unpackhi_epi64(tmp_delta1_8x16b, tmp0_const_8x16b);
            src_p1_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, tmp0_const_8x16b);
            src_q1_8x16b = _mm_unpacklo_epi8(src_q1_8x16b, tmp0_const_8x16b);
            src_p1_8x16b = _mm_add_epi16(src_p1_8x16b, tmp_pq_str0_8x16b);
            src_q1_8x16b = _mm_add_epi16(src_q1_8x16b, tmp_pq_str1_8x16b);
            //packing p1 q1 and p0 q0 to 8 bit
            src_p1_8x16b = _mm_packus_epi16(src_p1_8x16b, src_q1_8x16b);
            src_p0_8x16b = _mm_packus_epi16(src_p0_8x16b, src_q0_8x16b);

            src_q1_8x16b = _mm_srli_si128(src_p1_8x16b, 8);
            src_q0_8x16b = _mm_srli_si128(src_p0_8x16b, 8);

            _mm_storel_epi64((__m128i *)(pu1_src - 2 * src_strd), src_p1_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src), src_q0_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src + src_strd), src_q1_8x16b);


        }



    }

}
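/**
*******************************************************************************
*
* @brief
*     Filtering for the chroma block vertical edge.
*
* @par Description:
*     One pixel on each side of the vertical edge is filtered for the
*     interleaved U and V planes. tc is derived independently for U and V
*     from the average QP plus qp_offset_u/qp_offset_v, and filter_flag_p /
*     filter_flag_q gate the p and q sides. The function is expected to be
*     called only when the boundary strength is 2.
*
*******************************************************************************
*/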
void ihevc_deblk_chroma_vert_ssse3(UWORD8 *pu1_src,
                                   WORD32 src_strd,
                                   WORD32 quant_param_p,
                                   WORD32 quant_param_q,
                                   WORD32 qp_offset_u,
                                   WORD32 qp_offset_v,
                                   WORD32 tc_offset_div2,
                                   WORD32 filter_flag_p,
                                   WORD32 filter_flag_q)
{
    WORD32 qp_indx_u, qp_chroma_u;
    WORD32 qp_indx_v, qp_chroma_v;
    WORD32 tc_indx_u, tc_u;
    WORD32 tc_indx_v, tc_v;

    __m128i src_row_0_16x8b, tmp_pxl_0_16x8b, src_row_2_16x8b, tmp_pxl_1_16x8b;
    ASSERT(filter_flag_p || filter_flag_q);

    /* chroma processing is done only if BS is 2 */
    /* this function is assumed to be called only if BS is 2 */
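    /*
     * Scalar form of the chroma deblocking applied below, per the HEVC spec
     * (computed pairwise for the interleaved U and V samples):
     *   delta = CLIP3(-tc, tc, ((((q0 - p0) << 2) + p1 - q1 + 4) >> 3))
     *   p0'   = CLIP_U8(p0 + delta)   // only if filter_flag_p
     *   q0'   = CLIP_U8(q0 - delta)   // only if filter_flag_q
     * The chroma QP below is mapped through gai4_ihevc_qp_table for indices
     * up to 57 and offset by -6 above that.
     */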
    qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);

    qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);

    tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_u = gai4_ihevc_tc_table[tc_indx_u];

    tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_v = gai4_ihevc_tc_table[tc_indx_v];

    if(0 == tc_u && 0 == tc_v)
    {
        return;
    }
    src_row_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - 4));
    tmp_pxl_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd - 4));
    src_row_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd - 4));
    tmp_pxl_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd - 4));

    {
        LWORD64 mask_tc, mask_flag, mask;
        __m128i delta_vu0_16x8b, delta_vu1_16x8b;
        __m128i mask_tc_16x8, mask_16x8b, mask_flag_p_16x8b, mask_flag_q_16x8b;
        __m128i min_0_16x8b;
        __m128i const_16x8b;
        mask_flag = (((LWORD64)filter_flag_p) << 31) | (((LWORD64)filter_flag_q) << 63);
        mask_tc = (((LWORD64)tc_v) << 16) | ((LWORD64)tc_u);
        mask = 0xffff00000000ffffLL;

        src_row_0_16x8b = _mm_unpacklo_epi64(src_row_0_16x8b, tmp_pxl_0_16x8b);
        src_row_2_16x8b = _mm_unpacklo_epi64(src_row_2_16x8b, tmp_pxl_1_16x8b);

        mask_16x8b = _mm_load_si128((__m128i *)(shuffle_uv));
        // qv11 qu11 qv10 qu10 qv01 qu01 qv00 qu00 pv10 pu10 pv11 pu11 pv00 pu00 pv01 pu01
        // qv31 qu31 qv30 qu30 qv21 qu21 qv20 qu20 pv30 pu30 pv31 pu31 pv20 pu20 pv21 pu21
        delta_vu0_16x8b = _mm_shuffle_epi8(src_row_0_16x8b, mask_16x8b);
        delta_vu1_16x8b = _mm_shuffle_epi8(src_row_2_16x8b, mask_16x8b);

        tmp_pxl_0_16x8b = _mm_unpacklo_epi64(delta_vu0_16x8b, delta_vu1_16x8b);
        tmp_pxl_1_16x8b = _mm_unpackhi_epi64(delta_vu0_16x8b, delta_vu1_16x8b);
        // pv30 pv31 pu30 pu31 pv20 pv21 pu20 pu21 pv10 pv11 pu10 pu11 pv00 pv01 pu00 pu01
        // qv31 qv30 qu31 qu30 qv21 qv20 qu21 qu20 qv11 qv10 qu11 qu10 qv01 qv00 qu01 qu00
        delta_vu0_16x8b = _mm_load_si128((__m128i *)delta0);
        delta_vu1_16x8b = _mm_load_si128((__m128i *)delta1);

        delta_vu0_16x8b = _mm_maddubs_epi16(tmp_pxl_0_16x8b, delta_vu0_16x8b);
        delta_vu1_16x8b = _mm_maddubs_epi16(tmp_pxl_1_16x8b, delta_vu1_16x8b);

        //generating offset 4
        const_16x8b = _mm_cmpeq_epi16(tmp_pxl_0_16x8b, tmp_pxl_0_16x8b);
        // filter flag mask and tc mask
        mask_tc_16x8 = _mm_loadl_epi64((__m128i *)(&mask_tc));
        mask_flag_q_16x8b = _mm_loadl_epi64((__m128i *)(&mask_flag));

        mask_tc_16x8 = _mm_shuffle_epi32(mask_tc_16x8, 0x00);
        mask_flag_q_16x8b = _mm_srai_epi32(mask_flag_q_16x8b, 31);
        //-tc
        min_0_16x8b = _mm_sign_epi16(mask_tc_16x8, const_16x8b);
        //converting const 1
        const_16x8b = _mm_srli_epi16(const_16x8b, 15);

        //filterp and filterq flag
        mask_flag_p_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x00);
        mask_flag_q_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x55);

        //modified delta with a filter (1 -4 4 -1) available in 16 bit
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, delta_vu1_16x8b);
        //converting const 4
        const_16x8b = _mm_slli_epi16(const_16x8b, 2);

        mask_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
        //offset addition
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, const_16x8b);
        //eliminating q1
        tmp_pxl_1_16x8b = _mm_slli_epi16(tmp_pxl_1_16x8b, 8);

        const_16x8b = _mm_setzero_si128();
        //filter after normalisation
        delta_vu0_16x8b = _mm_srai_epi16(delta_vu0_16x8b, 3);
        mask_16x8b = _mm_shuffle_epi32(mask_16x8b, 0x44);

        //clipping MAX
        delta_vu0_16x8b = _mm_min_epi16(delta_vu0_16x8b, mask_tc_16x8);
        //getting p0 and eliminating p1
        tmp_pxl_0_16x8b = _mm_srli_epi16(tmp_pxl_0_16x8b, 8);
        //clipping MIN
        delta_vu0_16x8b = _mm_max_epi16(delta_vu0_16x8b, min_0_16x8b);
        //getting q0
        tmp_pxl_1_16x8b = _mm_srli_epi16(tmp_pxl_1_16x8b, 8);
        //masking filter flag
        delta_vu1_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_q_16x8b);
        delta_vu0_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_p_16x8b);

        // q-delta ,p+delta
        tmp_pxl_1_16x8b = _mm_sub_epi16(tmp_pxl_1_16x8b, delta_vu1_16x8b);
        tmp_pxl_0_16x8b = _mm_add_epi16(tmp_pxl_0_16x8b, delta_vu0_16x8b);
        //merging q0 and p0 of respective rows
        delta_vu1_16x8b = _mm_unpackhi_epi32(tmp_pxl_0_16x8b, tmp_pxl_1_16x8b);
        delta_vu0_16x8b = _mm_unpacklo_epi32(tmp_pxl_0_16x8b, tmp_pxl_1_16x8b);
        // row 0 and row 1 packed , row2 and row3 packed
        delta_vu0_16x8b = _mm_packus_epi16(delta_vu0_16x8b, const_16x8b);
        delta_vu1_16x8b = _mm_packus_epi16(delta_vu1_16x8b, const_16x8b);
        //removing older pixel values
        src_row_0_16x8b = _mm_and_si128(src_row_0_16x8b, mask_16x8b);
        src_row_2_16x8b = _mm_and_si128(src_row_2_16x8b, mask_16x8b);
        //arranging modified pixels
        delta_vu0_16x8b = _mm_shuffle_epi32(delta_vu0_16x8b, 0xd8);
        delta_vu1_16x8b = _mm_shuffle_epi32(delta_vu1_16x8b, 0xd8);
        delta_vu0_16x8b = _mm_slli_epi64(delta_vu0_16x8b, 16);
        delta_vu1_16x8b = _mm_slli_epi64(delta_vu1_16x8b, 16);
        //plugging the modified values
        src_row_0_16x8b = _mm_or_si128(src_row_0_16x8b, delta_vu0_16x8b);
        src_row_2_16x8b = _mm_or_si128(src_row_2_16x8b, delta_vu1_16x8b);


        //getting values for row 1 and row 3
        tmp_pxl_0_16x8b = _mm_srli_si128(src_row_0_16x8b, 8);
        tmp_pxl_1_16x8b = _mm_srli_si128(src_row_2_16x8b, 8);

        _mm_storel_epi64((__m128i *)(pu1_src - 4), src_row_0_16x8b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + src_strd), tmp_pxl_0_16x8b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd), src_row_2_16x8b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd), tmp_pxl_1_16x8b);
    }



}
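/**
*******************************************************************************
*
* @brief
*     Filtering for the chroma block horizontal edge.
*
* @par Description:
*     Mirrors ihevc_deblk_chroma_vert_ssse3() for a horizontal edge: one row
*     on each side of the edge is filtered for the interleaved U and V
*     planes, with tc derived independently for U and V and the p and q
*     sides gated by filter_flag_p / filter_flag_q.
*
*******************************************************************************
*/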
void ihevc_deblk_chroma_horz_ssse3(UWORD8 *pu1_src,
                                   WORD32 src_strd,
                                   WORD32 quant_param_p,
                                   WORD32 quant_param_q,
                                   WORD32 qp_offset_u,
                                   WORD32 qp_offset_v,
                                   WORD32 tc_offset_div2,
                                   WORD32 filter_flag_p,
                                   WORD32 filter_flag_q)
{
    WORD32 qp_indx_u, qp_chroma_u;
    WORD32 qp_indx_v, qp_chroma_v;
    WORD32 tc_indx_u, tc_u;
    WORD32 tc_indx_v, tc_v;


    __m128i tmp_p0_16x8b, src_p0_16x8b, src_q0_16x8b, tmp_q0_16x8b;

    ASSERT(filter_flag_p || filter_flag_q);

    /* chroma processing is done only if BS is 2 */
    /* this function is assumed to be called only if BS is 2 */
    qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);

    qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);

    tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_u = gai4_ihevc_tc_table[tc_indx_u];

    tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_v = gai4_ihevc_tc_table[tc_indx_v];

    if(0 == tc_u && 0 == tc_v)
    {
        return;
    }
    tmp_p0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - 2 * src_strd));
    src_p0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - src_strd));
    src_q0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
    tmp_q0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

    {
        LWORD64 mask_tc, mask_flag;
        __m128i delta_vu0_16x8b, delta_vu1_16x8b;
        __m128i mask_tc_16x8, mask_16x8b, mask_flag_p_16x8b, mask_flag_q_16x8b;
        __m128i min_0_16x8b;
        __m128i const_16x8b;
        mask_flag = (((LWORD64)filter_flag_p) << 31) | (((LWORD64)filter_flag_q) << 63);
        mask_tc = (((LWORD64)tc_v) << 16) | ((LWORD64)tc_u);

        tmp_p0_16x8b = _mm_unpacklo_epi8(tmp_p0_16x8b, src_p0_16x8b);
        tmp_q0_16x8b = _mm_unpacklo_epi8(src_q0_16x8b, tmp_q0_16x8b);

        // pv30 pv31 pu30 pu31 pv20 pv21 pu20 pu21 pv10 pv11 pu10 pu11 pv00 pv01 pu00 pu01
        // qv31 qv30 qu31 qu30 qv21 qv20 qu21 qu20 qv11 qv10 qu11 qu10 qv01 qv00 qu01 qu00
        delta_vu0_16x8b = _mm_load_si128((__m128i *)delta0);
        delta_vu1_16x8b = _mm_load_si128((__m128i *)delta1);

        delta_vu0_16x8b = _mm_maddubs_epi16(tmp_p0_16x8b, delta_vu0_16x8b);
        delta_vu1_16x8b = _mm_maddubs_epi16(tmp_q0_16x8b, delta_vu1_16x8b);


        // filter flag mask and tc mask
        mask_tc_16x8 = _mm_loadl_epi64((__m128i *)(&mask_tc));
        mask_flag_q_16x8b = _mm_loadl_epi64((__m128i *)(&mask_flag));

        //generating offset 4
        const_16x8b = _mm_cmpeq_epi16(tmp_p0_16x8b, tmp_p0_16x8b);
        // filter flag mask and tc mask
        mask_tc_16x8 = _mm_shuffle_epi32(mask_tc_16x8, 0x00);
        mask_flag_q_16x8b = _mm_srai_epi32(mask_flag_q_16x8b, 31);
        //-tc
        min_0_16x8b = _mm_sign_epi16(mask_tc_16x8, const_16x8b);
        //converting const 1
        const_16x8b = _mm_srli_epi16(const_16x8b, 15);

        //filterp
        mask_flag_p_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x00);


        //converting const 4
        const_16x8b = _mm_slli_epi16(const_16x8b, 2);
        //modified delta with a filter (1 -4 4 -1) available in 16 bit
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, delta_vu1_16x8b);

        //filterq flag
        mask_flag_q_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x55);
        //offset addition
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, const_16x8b);
        mask_16x8b = _mm_setzero_si128();
        //filter after normalisation
        delta_vu0_16x8b = _mm_srai_epi16(delta_vu0_16x8b, 3);

        //converting p0 to 16bit
        src_p0_16x8b = _mm_unpacklo_epi8(src_p0_16x8b, mask_16x8b);
        //clipping MAX
        delta_vu0_16x8b = _mm_min_epi16(delta_vu0_16x8b, mask_tc_16x8);
        //converting q0 to 16bit
        src_q0_16x8b = _mm_unpacklo_epi8(src_q0_16x8b, mask_16x8b);
        //clipping MIN
        delta_vu0_16x8b = _mm_max_epi16(delta_vu0_16x8b, min_0_16x8b);

        //masking filter flag
        delta_vu1_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_q_16x8b);
        delta_vu0_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_p_16x8b);

        // q-delta ,p+delta
        src_q0_16x8b = _mm_sub_epi16(src_q0_16x8b, delta_vu1_16x8b);
        src_p0_16x8b = _mm_add_epi16(src_p0_16x8b, delta_vu0_16x8b);

        // p0 and q0 packed
        src_q0_16x8b = _mm_packus_epi16(src_q0_16x8b, mask_16x8b);
        src_p0_16x8b = _mm_packus_epi16(src_p0_16x8b, mask_16x8b);



        _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_16x8b);
        _mm_storel_epi64((__m128i *)(pu1_src), src_q0_16x8b);

    }


}