1 /****************************************************************************** 2 * 3 * Copyright (C) 2015 The Android Open Source Project 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ***************************************************************************** 18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore 19 */ 20 /*****************************************************************************/ 21 /* */ 22 /* File Name : ih264_deblk_chroma_ssse3.c */ 23 /* */ 24 /* Description : Contains function definitions for deblocking */ 25 /* */ 26 /* List of Functions : ih264_deblk_chroma_vert_bs4_ssse3() */ 27 /* ih264_deblk_chroma_horz_bs4_ssse3() */ 28 /* ih264_deblk_chroma_vert_bslt4_ssse3() */ 29 /* ih264_deblk_chroma_horz_bslt4_ssse3() */ 30 /* ih264_deblk_chroma_vert_bs4_mbaff_ssse3() */ 31 /* ih264_deblk_chroma_vert_bslt4_mbaff_ssse3() */ 32 /* */ 33 /* Issues / Problems : None */ 34 /* */ 35 /* Revision History : */ 36 /* */ 37 /* DD MM YYYY Author(s) Changes (Describe the changes made) */ 38 /* 12 02 2015 Naveen Kumar P Added chrom deblocking ssse3 */ 39 /* intrinsics */ 40 /* */ 41 /*****************************************************************************/ 42 43 /*****************************************************************************/ 44 /* File Includes */ 45 /*****************************************************************************/ 46 47 /* System include files */ 48 #include <stdio.h> 
49 50 /* User include files */ 51 #include "ih264_typedefs.h" 52 #include "ih264_platform_macros.h" 53 #include "ih264_deblk_edge_filters.h" 54 #include "ih264_macros.h" 55 56 /*****************************************************************************/ 57 /* Function Definitions */ 58 /*****************************************************************************/ 59 60 /*****************************************************************************/ 61 /* */ 62 /* Function Name : ih264_deblk_chroma_vert_bs4_ssse3() */ 63 /* */ 64 /* Description : This function performs filtering of a chroma block */ 65 /* vertical edge when the boundary strength is set to 4 in */ 66 /* high profile. */ 67 /* */ 68 /* Inputs : pu1_src - pointer to the src sample q0 of U */ 69 /* src_strd - source stride */ 70 /* alpha_cb - alpha value for the boundary in U */ 71 /* beta_cb - beta value for the boundary in U */ 72 /* alpha_cr - alpha value for the boundary in V */ 73 /* beta_cr - beta value for the boundary in V */ 74 /* */ 75 /* Globals : None */ 76 /* */ 77 /* Processing : This operation is described in Sec. 8.7.2.4 under the */ 78 /* title "Filtering process for edges for bS equal to 4" in */ 79 /* ITU T Rec H.264 with alpha and beta values different in */ 80 /* U and V. 
                                                           */
/*                                                                           */
/*  Outputs           : None                                                 */
/*                                                                           */
/*  Returns           : None                                                 */
/*                                                                           */
/*  Issues            : None                                                 */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Initial version                      */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_chroma_vert_bs4_ssse3(UWORD8 *pu1_src,
                                       WORD32 src_strd,
                                       WORD32 alpha_cb,
                                       WORD32 beta_cb,
                                       WORD32 alpha_cr,
                                       WORD32 beta_cr)
{
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
    /* Pack the U and V thresholds into one 32-bit word; replicated with     */
    /* _mm_set1_epi32 this yields alternating Cb/Cr 16-bit lanes, matching   */
    /* the interleaved U/V byte layout of the source rows.                   */
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
    __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
    __m128i temp1, temp2, temp3, temp4;

    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
    __m128i flag1, flag2;
    __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
    __m128i zero = _mm_setzero_si128();
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;

    /* Load 8 bytes from each of 8 rows, starting 4 bytes (2 UV pairs)      */
    /* left of the vertical edge, and transpose so that p1/p0/q0/q1 each    */
    /* hold one column of 16 interleaved U/V samples.                       */
    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
    linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
    linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd));
    lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
    lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd));

    temp1 = _mm_unpacklo_epi16(linea, lineb);
    temp2 = _mm_unpacklo_epi16(linec, lined);
    temp3 = _mm_unpacklo_epi16(linee, linef);
    temp4 = _mm_unpacklo_epi16(lineg, lineh);

    p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2);
    p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4);
    q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2);
    q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4);

    p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16);
    p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16);
    q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16);
    q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16);
    /* End of transpose */

    /* Widen the low 8 bytes of each column to 16-bit lanes */
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);

    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); /* Condn 1: |p0 - q0| < alpha */
    diff = _mm_abs_epi16(diff);
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); /* Condn 2: |q1 - q0| < beta */
    diff = _mm_abs_epi16(diff);
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); /* Condn 3: |p1 - p0| < beta */
    diff = _mm_abs_epi16(diff);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    /* bS=4 chroma filter: p0' = (2*p1 + p0 + q1 + 2) >> 2 */
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    temp1 = _mm_add_epi16(temp1, temp2);
    p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);

    /* q0' = (2*q1 + q0 + p1 + 2) >> 2 */
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    temp1 = _mm_add_epi16(temp1, temp2);
    q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);

    /* Repeat the flag computation and the filter for the high 8 bytes */
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);

    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); /* Condn 1: |p0 - q0| < alpha */
    diff = _mm_abs_epi16(diff);
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); /* Condn 2: |q1 - q0| < beta */
    diff = _mm_abs_epi16(diff);
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); /* Condn 3: |p1 - p0| < beta */
    diff = _mm_abs_epi16(diff);
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    temp1 = _mm_add_epi16(temp1, temp2);
    p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);

    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    temp1 = _mm_add_epi16(temp1, temp2);
    q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);

    /* Saturate both halves back to 8-bit */
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);

    flag1 = _mm_packs_epi16(flag1, flag2);

    /* Per-byte select: filtered value where flag is all-ones, original     */
    /* value where flag is zero.                                            */
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);

    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);

    /* Inverse-transpose and store back */
    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
    temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8);
    temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
    temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8);

    linea = _mm_unpacklo_epi32(temp1, temp3);
    lineb = _mm_srli_si128(linea, 8);
    linec = _mm_unpackhi_epi32(temp1, temp3);
    lined = _mm_srli_si128(linec, 8);
    linee = _mm_unpacklo_epi32(temp2, temp4);
    linef = _mm_srli_si128(linee, 8);
    lineg = _mm_unpackhi_epi32(temp2, temp4);
    lineh = _mm_srli_si128(lineg, 8);

    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);

}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_deblk_chroma_horz_bs4_ssse3()                      */
/*                                                                           */
/*  Description   : This function performs filtering of a chroma block       */
/*                  horizontal edge when the boundary strength is set to 4   */
/*                  in high profile.                                         */
/*                                                                           */
/*  Inputs        : pu1_src - pointer to the src sample q0 of U              */
/*                  src_strd - source stride                                 */
/*                  alpha_cb - alpha value for the boundary in U             */
/*                  beta_cb - beta value for the boundary in U               */
/*                  alpha_cr - alpha value for the boundary in V             */
/*                  beta_cr - beta value for the boundary in V               */
/*                                                                           */
/*  Globals       : None                                                     */
/*                                                                           */
/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
/*                  title "Filtering process for edges for bS equal to 4" in */
/*                  ITU T Rec H.264 with alpha and beta values different in  */
/*                  U and V.
                                                           */
/*                                                                           */
/*  Outputs           : None                                                 */
/*                                                                           */
/*  Returns           : None                                                 */
/*                                                                           */
/*  Issues            : None                                                 */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Initial version                      */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_chroma_horz_bs4_ssse3(UWORD8 *pu1_src,
                                       WORD32 src_strd,
                                       WORD32 alpha_cb,
                                       WORD32 beta_cb,
                                       WORD32 alpha_cr,
                                       WORD32 beta_cr)
{
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
    WORD16 i16_posP1, i16_posP0, i16_posQ1; /* Row offsets relative to the edge */

    UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */
    /* Pack the U and V thresholds into one 32-bit word; replicated with     */
    /* _mm_set1_epi32 this yields alternating Cb/Cr 16-bit lanes, matching   */
    /* the interleaved U/V byte layout of the source rows.                   */
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
    __m128i flag1, flag2;
    __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
    __m128i zero = _mm_setzero_si128();
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
    __m128i temp1, temp2;

    /* p1 is two rows above the edge, p0 one row above, q0/q1 at/below it */
    pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);

    i16_posQ1 = src_strd;
    i16_posP0 = src_strd;
    i16_posP1 = 0;

    /* Horizontal edge: the four rows are already contiguous, no transpose */
    q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv));
    q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1));
    p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1));
    p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0));

    /* Widen the low 8 bytes of each row to 16-bit lanes */
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);

    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); /* Condn 1: |p0 - q0| < alpha */
    diff = _mm_abs_epi16(diff);
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); /* Condn 2: |q1 - q0| < beta */
    diff = _mm_abs_epi16(diff);
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); /* Condn 3: |p1 - p0| < beta */
    diff = _mm_abs_epi16(diff);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    /* bS=4 chroma filter: p0' = (2*p1 + p0 + q1 + 2) >> 2 */
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    temp1 = _mm_add_epi16(temp1, temp2);
    p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);

    /* q0' = (2*q1 + q0 + p1 + 2) >> 2 */
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    temp1 = _mm_add_epi16(temp1, temp2);
    q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);

    /* Repeat the flag computation and the filter for the high 8 bytes */
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);

    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); /* Condn 1: |p0 - q0| < alpha */
    diff = _mm_abs_epi16(diff);
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); /* Condn 2: |q1 - q0| < beta */
    diff = _mm_abs_epi16(diff);
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); /* Condn 3: |p1 - p0| < beta */
    diff = _mm_abs_epi16(diff);
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    temp1 = _mm_add_epi16(temp1, temp2);
    p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);

    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    temp1 = _mm_add_epi16(temp1, temp2);
    q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);

    /* Saturate both halves back to 8-bit */
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);

    flag1 = _mm_packs_epi16(flag1, flag2);

    /* Per-byte select: filtered value where flag is all-ones, original     */
    /* value where flag is zero; store the blended rows back.               */
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
    p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
    _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1);

    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
    q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
    _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1);

}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_deblk_chroma_vert_bslt4_ssse3()                    */
/*                                                                           */
/*  Description   : This function performs filtering of a chroma block       */
/*                  vertical edge when the boundary strength is less than 4  */
/*                  in high profile.                                         */
/*                                                                           */
/*  Inputs        : pu1_src - pointer to the src sample q0 of U              */
/*                  src_strd - source stride                                 */
/*                  alpha_cb - alpha value for the boundary in U             */
/*                  beta_cb - beta value for the boundary in U               */
/*                  alpha_cr - alpha value for the boundary in V             */
/*                  beta_cr - beta value for the boundary in V               */
/*                  u4_bs - packed Boundary strength array                   */
/*                  pu1_cliptab_cb - tc0_table for U                         */
/*                  pu1_cliptab_cr - tc0_table for V                         */
/*                                                                           */
/*  Globals       : None                                                     */
/*                                                                           */
/*  Processing    : This operation is described in Sec.
                     8.7.2.3 under the                                       */
/*                  title "Filtering process for edges for bS less than 4"   */
/*                  in ITU T Rec H.264 with alpha and beta values different  */
/*                  in U and V.                                              */
/*                                                                           */
/*  Outputs           : None                                                 */
/*                                                                           */
/*  Returns           : None                                                 */
/*                                                                           */
/*  Issues            : None                                                 */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Initial version                      */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_chroma_vert_bslt4_ssse3(UWORD8 *pu1_src,
                                         WORD32 src_strd,
                                         WORD32 alpha_cb,
                                         WORD32 beta_cb,
                                         WORD32 alpha_cr,
                                         WORD32 beta_cr,
                                         UWORD32 u4_bs,
                                         const UWORD8 *pu1_cliptab_cb,
                                         const UWORD8 *pu1_cliptab_cr)
{
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3; /* Per-edge boundary strengths */
    /* Pack the U and V thresholds into one 32-bit word; replicated with     */
    /* _mm_set1_epi32 this yields alternating Cb/Cr 16-bit lanes, matching   */
    /* the interleaved U/V byte layout of the source rows.                   */
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
    __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
    __m128i temp1, temp2, temp3, temp4;

    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
    __m128i flag_bs, flag1, flag2;
    __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
    __m128i zero = _mm_setzero_si128();
    __m128i C0_uv_8x16; /* tc0-based clip limits, alternating Cb/Cr lanes */
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;

    /* Unpack the four boundary strengths; Bs0 is in the MSB of u4_bs */
    u1_Bs0 = (u4_bs >> 24) & 0xff;
    u1_Bs1 = (u4_bs >> 16) & 0xff;
    u1_Bs2 = (u4_bs >> 8) & 0xff;
    u1_Bs3 = (u4_bs >> 0) & 0xff;

    /* Each bS value covers 4 interleaved U/V bytes (2 rows of the edge) */
    flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
                           u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
                           u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
    flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
    flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask

    /* Load 8 bytes from each of 8 rows, starting 4 bytes (2 UV pairs)      */
    /* left of the vertical edge, and transpose so that p1/p0/q0/q1 each    */
    /* hold one column of 16 interleaved U/V samples.                       */
    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
    linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
    linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd));
    lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
    lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd));

    temp1 = _mm_unpacklo_epi16(linea, lineb);
    temp2 = _mm_unpacklo_epi16(linec, lined);
    temp3 = _mm_unpacklo_epi16(linee, linef);
    temp4 = _mm_unpacklo_epi16(lineg, lineh);

    p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2);
    p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4);
    q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2);
    q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4);

    p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16);
    p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16);
    q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16);
    q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16);
    /* End of transpose */

    /* Widen the low 8 bytes of each column to 16-bit lanes */
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);

    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); /* Condn 1: |p0 - q0| < alpha */
    diff = _mm_abs_epi16(diff);
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); /* Condn 2: |q1 - q0| < beta */
    diff = _mm_abs_epi16(diff);
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); /* Condn 3: |p1 - p0| < beta */
    diff = _mm_abs_epi16(diff);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    /* delta = ((q0 - p0) << 2 + (p1 - q1) + 4) >> 3 */
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
    diff = _mm_slli_epi16(diff, 2);
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
    diff = _mm_add_epi16(diff, diff1);
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
    in_macro = _mm_srai_epi16(diff, 3);

    /* tc0 lookup per bS, interleaved Cr/Cb to match the byte layout */
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);

    /* For chroma, tc = tc0 + 1 */
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));

    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);

    /* p0' = p0 + delta, q0' = q0 - delta */
    p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
    q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);

    /* Repeat for the high 8 bytes, using Bs2/Bs3 clip values */
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);

    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); /* Condn 1: |p0 - q0| < alpha */
    diff = _mm_abs_epi16(diff);
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); /* Condn 2: |q1 - q0| < beta */
    diff = _mm_abs_epi16(diff);
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); /* Condn 3: |p1 - p0| < beta */
    diff = _mm_abs_epi16(diff);
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
    diff = _mm_slli_epi16(diff, 2);
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
    diff = _mm_add_epi16(diff, diff1);
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
    in_macro = _mm_srai_epi16(diff, 3);

    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
                               pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);

    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));

    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);

    p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro);
    q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro);

    /* Saturate both halves back to 8-bit */
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);

    flag1 = _mm_packs_epi16(flag1, flag2);
    flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)

    /* Per-byte select: filtered value where flag is all-ones, original     */
    /* value where flag is zero.                                            */
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);

    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);

    /* Inverse-transpose and store back */
    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
    temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8);
    temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
    temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8);

    linea = _mm_unpacklo_epi32(temp1, temp3);
    lineb = _mm_srli_si128(linea, 8);
    linec = _mm_unpackhi_epi32(temp1, temp3);
    lined = _mm_srli_si128(linec, 8);
    linee = _mm_unpacklo_epi32(temp2, temp4);
    linef = _mm_srli_si128(linee, 8);
    lineg = _mm_unpackhi_epi32(temp2, temp4);
    lineh = _mm_srli_si128(lineg, 8);

    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);

}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_deblk_chroma_horz_bslt4_ssse3()                    */
/*                                                                           */
/*  Description   : This function performs filtering of a chroma block       */
/*                  horizontal edge when the boundary strength is less than  */
/*                  4 in high profile.                                       */
/*                                                                           */
/*  Inputs        : pu1_src - pointer to the src sample q0 of U              */
/*                  src_strd - source stride                                 */
/*                  alpha_cb - alpha value for the boundary in U             */
/*                  beta_cb - beta value for the boundary in U               */
/*                  alpha_cr - alpha value for the boundary in V             */
/*                  beta_cr - beta value for the boundary in V               */
/*                  u4_bs - packed Boundary strength array                   */
/*                  pu1_cliptab_cb - tc0_table for U                         */
/*                  pu1_cliptab_cr - tc0_table for V                         */
/*                                                                           */
/*  Globals       : None                                                     */
/*                                                                           */
/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
/*                  title "Filtering process for edges for bS less than 4"   */
/*                  in ITU T Rec H.264 with alpha and beta values different  */
/*                  in U and V.
                                                           */
/*                                                                           */
/*  Outputs           : None                                                 */
/*                                                                           */
/*  Returns           : None                                                 */
/*                                                                           */
/*  Issues            : None                                                 */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Initial version                      */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_chroma_horz_bslt4_ssse3(UWORD8 *pu1_src,
                                         WORD32 src_strd,
                                         WORD32 alpha_cb,
                                         WORD32 beta_cb,
                                         WORD32 alpha_cr,
                                         WORD32 beta_cr,
                                         UWORD32 u4_bs,
                                         const UWORD8 *pu1_cliptab_cb,
                                         const UWORD8 *pu1_cliptab_cr)
{
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
    WORD16 i16_posP1, i16_posP0, i16_posQ1; /* Row offsets relative to the edge */
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3; /* Per-edge boundary strengths */

    UWORD8 *pu1_HorzPixelUV; /*! < Pointer to the first pixel of the boundary */
    /* Pack the U and V thresholds into one 32-bit word; replicated with     */
    /* _mm_set1_epi32 this yields alternating Cb/Cr 16-bit lanes, matching   */
    /* the interleaved U/V byte layout of the source rows.                   */
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
    __m128i flag_bs, flag1, flag2;
    __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
    __m128i zero = _mm_setzero_si128();
    __m128i C0_uv_8x16; /* tc0-based clip limits, alternating Cb/Cr lanes */
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;

    /* p1 is two rows above the edge, p0 one row above, q0/q1 at/below it */
    pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);

    i16_posQ1 = src_strd;
    i16_posP0 = src_strd;
    i16_posP1 = 0;

    /* Unpack the four boundary strengths; Bs0 is in the MSB of u4_bs */
    u1_Bs0 = (u4_bs >> 24) & 0xff;
    u1_Bs1 = (u4_bs >> 16) & 0xff;
    u1_Bs2 = (u4_bs >> 8) & 0xff;
    u1_Bs3 = (u4_bs >> 0) & 0xff;

    /* Each bS value covers 4 interleaved U/V bytes (2 UV pairs of the edge) */
    flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
                           u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
                           u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
    flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
    flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask

    /* Horizontal edge: the four rows are already contiguous, no transpose */
    q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv));
    q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1));
    p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1));
    p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0));

    /* Widen the low 8 bytes of each row to 16-bit lanes */
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);

    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); /* Condn 1: |p0 - q0| < alpha */
    diff = _mm_abs_epi16(diff);
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); /* Condn 2: |q1 - q0| < beta */
    diff = _mm_abs_epi16(diff);
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); /* Condn 3: |p1 - p0| < beta */
    diff = _mm_abs_epi16(diff);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    /* delta = ((q0 - p0) << 2 + (p1 - q1) + 4) >> 3 */
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
    diff = _mm_slli_epi16(diff, 2);
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
    diff = _mm_add_epi16(diff, diff1);
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
    in_macro = _mm_srai_epi16(diff, 3);

    /* tc0 lookup per bS, interleaved Cr/Cb to match the byte layout */
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);

    /* For chroma, tc = tc0 + 1 */
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));

    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);

    /* p0' = p0 + delta, q0' = q0 - delta */
    p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
    q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);

    /* Repeat for the high 8 bytes, using Bs2/Bs3 clip values */
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);

    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); /* Condn 1: |p0 - q0| < alpha */
    diff = _mm_abs_epi16(diff);
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); /* Condn 2: |q1 - q0| < beta */
    diff = _mm_abs_epi16(diff);
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); /* Condn 3: |p1 - p0| < beta */
    diff = _mm_abs_epi16(diff);
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
    diff = _mm_slli_epi16(diff, 2);
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
    diff = _mm_add_epi16(diff, diff1);
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
    in_macro = _mm_srai_epi16(diff, 3);

    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
                               pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);

    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));

    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);

    p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro);
    q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro);

    /* Saturate both halves back to 8-bit */
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);

    flag1 = _mm_packs_epi16(flag1, flag2);
    flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)

    /* Per-byte select: filtered value where flag is all-ones, original     */
    /* value where flag is zero; store the blended rows back.               */
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
    p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
    _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1);

    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
    q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
    _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1);

}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_deblk_chroma_vert_bs4_mbaff_ssse3()                */
/*                                                                           */
/*  Description   : This function performs filtering of a chroma block       */
/*                  vertical edge when boundary strength is set to 4 in high */
/*                  profile.                                                 */
/*                                                                           */
/*  Inputs        : pu1_src - pointer to the src sample q0 of U              */
/*                  src_strd - source stride                                 */
/*                  alpha_cb - alpha value for the boundary in U             */
/*                  beta_cb - beta value for the boundary in U               */
/*                  alpha_cr - alpha value for the boundary in V             */
/*                  beta_cr - beta value for the boundary in V               */
/*                  u4_bs - packed Boundary strength array                   */
/*                  pu1_cliptab_cb - tc0_table for U                         */
/*                  pu1_cliptab_cr - tc0_table for V                         */
/*                                                                           */
/*  Globals       : None                                                     */
/*                                                                           */
/*  Processing    : When the function is called twice, this operation is as  */
/*                  described in Sec. 8.7.2.4 under the title "Filtering     */
/*                  process for edges for bS equal to 4" in ITU T Rec H.264  */
/*                  with alpha and beta values different in U and V.
*/ 819 /* */ 820 /* Outputs : None */ 821 /* */ 822 /* Returns : None */ 823 /* */ 824 /* Issues : None */ 825 /* */ 826 /* Revision History: */ 827 /* */ 828 /* DD MM YYYY Author(s) Changes (Describe the changes made) */ 829 /* 12 02 2015 Naveen Kumar P Initial version */ 830 /* */ 831 /*****************************************************************************/ 832 void ih264_deblk_chroma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src, 833 WORD32 src_strd, 834 WORD32 alpha_cb, 835 WORD32 beta_cb, 836 WORD32 alpha_cr, 837 WORD32 beta_cr) 838 { 839 UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/ 840 WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb; 841 WORD32 beta_cbcr = (beta_cr << 16) + beta_cb; 842 __m128i linea, lineb, linec, lined; 843 __m128i temp1, temp2; 844 845 __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8; 846 __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16; 847 __m128i flag1; 848 __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8; 849 __m128i zero = _mm_setzero_si128(); 850 __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2; 851 852 /* Load and transpose the pixel values */ 853 linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4)); 854 lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd)); 855 linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd)); 856 lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd)); 857 858 temp1 = _mm_unpacklo_epi16(linea, lineb); 859 temp2 = _mm_unpacklo_epi16(linec, lined); 860 861 p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2); 862 p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8); 863 q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2); 864 q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8); 865 /* End of transpose */ 866 867 q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero); 868 q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero); 869 p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero); 870 p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero); 871 872 diff = 
_mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1 873 diff = _mm_abs_epi16(diff); 874 alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr); 875 flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff); 876 877 diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2 878 diff = _mm_abs_epi16(diff); 879 beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr); 880 flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); 881 882 diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3 883 diff = _mm_abs_epi16(diff); 884 flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff)); 885 886 temp1 = _mm_slli_epi16(p1_uv_8x16, 1); 887 temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16); 888 temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); 889 temp1 = _mm_add_epi16(temp1, temp2); 890 p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2); 891 892 temp1 = _mm_slli_epi16(q1_uv_8x16, 1); 893 temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16); 894 temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2)); 895 temp1 = _mm_add_epi16(temp1, temp2); 896 q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2); 897 898 p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1); 899 q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1); 900 901 flag1 = _mm_packs_epi16(flag1, flag1); 902 903 p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8, 904 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); 905 p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1); 906 p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2); 907 908 q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8, 909 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF))); 910 q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1); 911 q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2); 912 913 /* Inverse-transpose and store back */ 914 temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8); 915 temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8); 916 917 linea = _mm_unpacklo_epi32(temp1, temp2); 918 lineb = _mm_srli_si128(linea, 8); 919 linec = _mm_unpackhi_epi32(temp1, temp2); 920 lined = _mm_srli_si128(linec, 8); 921 922 
_mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea); 923 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb); 924 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec); 925 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined); 926 927 } 928 929 /*****************************************************************************/ 930 /* */ 931 /* Function Name : ih264_deblk_chroma_vert_bslt4_mbaff_ssse3() */ 932 /* */ 933 /* Description : This function performs filtering of a chroma block */ 934 /* vertical edge when boundary strength is less than 4 in */ 935 /* high profile. */ 936 /* */ 937 /* Inputs : pu1_src - pointer to the src sample q0 of U */ 938 /* src_strd - source stride */ 939 /* alpha_cb - alpha value for the boundary in U */ 940 /* beta_cb - beta value for the boundary in U */ 941 /* alpha_cr - alpha value for the boundary in V */ 942 /* beta_cr - beta value for the boundary in V */ 943 /* u4_bs - packed Boundary strength array */ 944 /* pu1_cliptab_cb - tc0_table for U */ 945 /* pu1_cliptab_cr - tc0_table for V */ 946 /* */ 947 /* Globals : None */ 948 /* */ 949 /* Processing : When the function is called twice, this operation is as */ 950 /* described in Sec. 8.7.2.4 under the title "Filtering */ 951 /* process for edges for bS less than 4" in ITU T Rec H.264 */ 952 /* with alpha and beta values different in U and V. 
 */
/*                                                                           */
/*  Outputs           : None                                                 */
/*                                                                           */
/*  Returns           : None                                                 */
/*                                                                           */
/*  Issues            : None                                                 */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Initial version                      */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_chroma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src,
                                               WORD32 src_strd,
                                               WORD32 alpha_cb,
                                               WORD32 beta_cb,
                                               WORD32 alpha_cr,
                                               WORD32 beta_cr,
                                               UWORD32 u4_bs,
                                               const UWORD8 *pu1_cliptab_cb,
                                               const UWORD8 *pu1_cliptab_cr)
{
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U*/
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
    /* Cb in the low 16 bits, Cr in the high 16 bits of each 32-bit word so a
       single 32-bit broadcast matches the interleaved UV lane layout below */
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
    __m128i linea, lineb, linec, lined;       /* one row of 8 bytes each     */
    __m128i temp1, temp2;

    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8; /* byte columns */
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16; /* 16-bit lanes */
    __m128i flag_bs, flag1;                   /* Bs!=0 mask / condition mask */
    __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
    __m128i zero = _mm_setzero_si128();
    __m128i C0_uv_8x16;                       /* per-lane tc0 clip bound     */
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;

    /* Unpack the four per-row boundary strengths; Bs0 (row 0) sits in the
       most significant byte of u4_bs */
    u1_Bs0 = (u4_bs >> 24) & 0xff;
    u1_Bs1 = (u4_bs >> 16) & 0xff;
    u1_Bs2 = (u4_bs >> 8) & 0xff;
    u1_Bs3 = (u4_bs >> 0) & 0xff;

    /* Build a byte mask that is 0xFF for rows with Bs != 0 (each Bs value is
       duplicated to cover the U and V bytes of that row) */
    flag_bs = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, u1_Bs3, u1_Bs3, u1_Bs2,
                           u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs0, u1_Bs0);
    flag_bs = _mm_cmpeq_epi8(flag_bs, zero); //Set flag to 1s and 0s
    flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); //Invert for required mask

    /* Load and transpose the pixel values: 4 rows of 8 bytes (2 UV pairs on
       each side of the vertical edge) become the columns p1 p0 | q0 q1 */
    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));

    temp1 = _mm_unpacklo_epi16(linea, lineb);
    temp2 = _mm_unpacklo_epi16(linec, lined);

    p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2);
    p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8);
    q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2);
    q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8);
    /* End of transpose */

    /* Widen to 16 bits so differences and the delta sum cannot wrap */
    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);

    /* Condition 1: |p0 - q0| < alpha */
    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
    diff = _mm_abs_epi16(diff);
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

    /* Condition 2: |q1 - q0| < beta */
    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
    diff = _mm_abs_epi16(diff);
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    /* Condition 3: |p1 - p0| < beta */
    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
    diff = _mm_abs_epi16(diff);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    /* delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3   (Eq. 8-475) */
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
    diff = _mm_slli_epi16(diff, 2);
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
    diff = _mm_add_epi16(diff, diff1);
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
    in_macro = _mm_srai_epi16(diff, 3);

    /* Per-lane tc0 from the clip tables, indexed by each row's Bs; lane
       order (Cb then Cr, Bs0 row in the lowest lanes) matches flag_bs */
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);

    /* Chroma clip bound is tc0 + 1 (Eq. 8-477) */
    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1));

    /* delta = CLIP3(-C0, C0, delta): min against +C0, then max against -C0 */
    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); //CLIP3
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);

    /* p0' = p0 + delta, q0' = q0 - delta */
    p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
    q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);

    /* Saturating narrow back to bytes clamps p0'/q0' into [0, 255] */
    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1);
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1);

    flag1 = _mm_packs_epi16(flag1, flag1);
    flag1 = _mm_and_si128(flag1, flag_bs); //Final flag (BS condition + other 3 conditions)

    /* Byte-wise select: keep original p0 where flag is clear, filtered
       value where it is set (masked AND + add emulates a blend) */
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);

    /* Same select for q0 */
    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);

    /* Inverse-transpose and store back */
    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
    temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);

    linea = _mm_unpacklo_epi32(temp1, temp2);
    lineb = _mm_srli_si128(linea, 8);
    linec = _mm_unpackhi_epi32(temp1, temp2);
    lined = _mm_srli_si128(linec, 8);

    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);

}