1 /****************************************************************************** 2 * 3 * Copyright (C) 2015 The Android Open Source Project 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ***************************************************************************** 18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore 19 */ 20 /*****************************************************************************/ 21 /* */ 22 /* File Name : ih264_deblk_luma_ssse3.c */ 23 /* */ 24 /* Description : Contains function definitions for deblocking */ 25 /* */ 26 /* List of Functions : ih264_deblk_luma_vert_bs4_ssse3() */ 27 /* ih264_deblk_luma_horz_bs4_ssse3() */ 28 /* ih264_deblk_luma_vert_bslt4_ssse3() */ 29 /* ih264_deblk_luma_horz_bslt4_ssse3() */ 30 /* ih264_deblk_luma_vert_bs4_mbaff_ssse3() */ 31 /* ih264_deblk_luma_vert_bslt4_mbaff_ssse3() */ 32 /* */ 33 /* Issues / Problems : None */ 34 /* */ 35 /* Revision History : */ 36 /* */ 37 /* DD MM YYYY Author(s) Changes (Describe the changes made) */ 38 /* 12 02 2015 Naveen Kumar P Added luma deblocking ssse3 */ 39 /* intrinsics */ 40 /* */ 41 /*****************************************************************************/ 42 43 /*****************************************************************************/ 44 /* File Includes */ 45 /*****************************************************************************/ 46 47 /* System include files */ 48 #include <stdio.h> 49 50 /* User 
include files */ 51 #include "ih264_typedefs.h" 52 #include "ih264_platform_macros.h" 53 #include "ih264_deblk_edge_filters.h" 54 #include "ih264_macros.h" 55 56 /*****************************************************************************/ 57 /* Function Definitions */ 58 /*****************************************************************************/ 59 60 /*****************************************************************************/ 61 /* */ 62 /* Function Name : ih264_deblk_luma_vert_bs4_ssse3() */ 63 /* */ 64 /* Description : This function performs filtering of a luma block */ 65 /* vertical edge when the boundary strength is set to 4. */ 66 /* */ 67 /* Inputs : pu1_src - pointer to the src sample q0 */ 68 /* src_strd - source stride */ 69 /* alpha - alpha value for the boundary */ 70 /* beta - beta value for the boundary */ 71 /* */ 72 /* Globals : None */ 73 /* */ 74 /* Processing : This operation is described in Sec. 8.7.2.4 under the */ 75 /* title "Filtering process for edges for bS equal to 4" in */ 76 /* ITU T Rec H.264. 
*/
/*                                                                           */
/*  Outputs       : None                                                     */
/*                                                                           */
/*  Returns       : None                                                     */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Initial version                      */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_luma_vert_bs4_ssse3(UWORD8 *pu1_src,
                                     WORD32 src_strd,
                                     WORD32 alpha,
                                     WORD32 beta)
{
    __m128i zero = _mm_setzero_si128();
    __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
    __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
    __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
    __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
    __m128i q0_16x8_1;
    __m128i p0_16x8_1;
    __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
    __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
    __m128i temp1, temp2, temp3, temp4, temp5, temp6;
    __m128i Alpha_8x16, Beta_8x16;
    __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
    __m128i const_val2_16x8 = _mm_set1_epi16(2);
    __m128i line1, line2, line3, line4, line5, line6, line7, line8;

    Alpha_8x16 = _mm_set1_epi16(alpha);
    Beta_8x16 = _mm_set1_epi16(beta);

    /* Load rows 0..7: 8 bytes per row straddling the vertical edge        */
    /* (p3 p2 p1 p0 | q0 q1 q2 q3), starting 4 pixels left of the edge.    */
    line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
    line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
    line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
    line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
    line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
    line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
    line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
    line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));

    /* 8x8 byte transpose, stage 1: interleave bytes, words, dwords so      */
    /* that each output register holds one column pair for rows 0..7.       */
    temp1 = _mm_unpacklo_epi8(line1, line2);
    temp2 = _mm_unpacklo_epi8(line3, line4);
    temp3 = _mm_unpacklo_epi8(line5, line6);
    temp4 = _mm_unpacklo_epi8(line7, line8);

    line1 = _mm_unpacklo_epi16(temp1, temp2);
    line2 = _mm_unpackhi_epi16(temp1, temp2);
    line3 = _mm_unpacklo_epi16(temp3, temp4);
    line4 = _mm_unpackhi_epi16(temp3, temp4);

    /* Low halves of (p3,p2),(p1,p0),(q0,q1),(q2,q3) columns for rows 0..7 */
    p1_8x16 = _mm_unpacklo_epi32(line1, line3);
    p0_8x16 = _mm_unpackhi_epi32(line1, line3);
    q0_8x16 = _mm_unpacklo_epi32(line2, line4);
    q1_8x16 = _mm_unpackhi_epi32(line2, line4);

    /* Load and transpose rows 8..15 the same way */
    line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 8 * src_strd));
    line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 9 * src_strd));
    line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 10 * src_strd));
    line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 11 * src_strd));
    line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 12 * src_strd));
    line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 13 * src_strd));
    line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 14 * src_strd));
    line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 15 * src_strd));

    temp1 = _mm_unpacklo_epi8(line1, line2);
    temp2 = _mm_unpacklo_epi8(line3, line4);
    temp3 = _mm_unpacklo_epi8(line5, line6);
    temp4 = _mm_unpacklo_epi8(line7, line8);

    line1 = _mm_unpacklo_epi16(temp1, temp2);
    line2 = _mm_unpackhi_epi16(temp1, temp2);
    line3 = _mm_unpacklo_epi16(temp3, temp4);
    line4 = _mm_unpackhi_epi16(temp3, temp4);

    temp1 = _mm_unpacklo_epi32(line1, line3);
    temp2 = _mm_unpackhi_epi32(line1, line3);
    temp3 = _mm_unpacklo_epi32(line2, line4);
    temp4 = _mm_unpackhi_epi32(line2, line4);

    /* Combine both row groups: each register now holds one full column    */
    /* (16 pixels) of the edge neighbourhood.                              */
    p3_16x8 = _mm_unpacklo_epi64(p1_8x16, temp1);
    p2_16x8 = _mm_unpackhi_epi64(p1_8x16, temp1);
    q2_16x8 = _mm_unpacklo_epi64(q1_8x16, temp4);
    q3_16x8 = _mm_unpackhi_epi64(q1_8x16, temp4);
    p1_16x8 = _mm_unpacklo_epi64(p0_8x16, temp2);
    p0_16x8 = _mm_unpackhi_epi64(p0_8x16, temp2);
    q0_16x8 = _mm_unpacklo_epi64(q0_8x16, temp3);
    q1_16x8 = _mm_unpackhi_epi64(q0_8x16, temp3);

    //Cond1 (ABS(p0 - q0) < alpha)
    /* abs-diff of unsigned bytes via two saturating subtractions: one of  */
    /* the two terms is zero, so their sum is |p0 - q0|.                   */
    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    /* Widen to 16 bits so the signed compare against alpha is safe */
    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);

    flag1_16x8 = _mm_packs_epi16(temp2, temp1);

    //Cond2 (ABS(q1 - q0) < beta)
    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);

    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    //Cond3 (ABS(p1 - p0) < beta)
    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);

    // flag1 = (ABS(p0 - q0) < alpha) && (ABS(q1 - q0) < beta) && (ABS(p1 - p0) < beta)
    // per-byte 0xFF where the edge is filtered at all
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    // (ABS(p0 - q0) < ((alpha >> 2) + 2)) - gate for the strong 4/5-tap filter
    temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
    temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);
    Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
    Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    // (ABS(p2 - p0) < beta) - strong filtering allowed on the p side
    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag3_16x8 = _mm_packs_epi16(temp2, temp1);
    flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);

    // (ABS(q2 - q0) < beta) - strong filtering allowed on the q side
    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag4_16x8 = _mm_packs_epi16(temp2, temp1);
    flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);

    // First 8 pixels: widen to 16-bit lanes so the filter sums cannot overflow
    p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
    p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
    p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
    p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
    q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
    q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
    q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
    q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);

    // p0_1 and q0_1 : weak filter, (2*p1 + p0 + q1 + 2) >> 2 and mirror
    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
    temp3 = _mm_slli_epi16(p1_8x16, 1);
    temp4 = _mm_slli_epi16(q1_8x16, 1);
    temp1 = _mm_add_epi16(temp5, temp3);
    temp2 = _mm_add_epi16(temp6, temp4);
    p0_16x8_1 = _mm_srai_epi16(temp1, 2);
    q0_16x8_1 = _mm_srai_epi16(temp2, 2);

    // p1_2 and q1_2 : (p2 + p1 + p0 + q0 + 2) >> 2 and mirror
    temp6 = _mm_add_epi16(temp6, p0_8x16);
    temp5 = _mm_add_epi16(temp5, q0_8x16);
    temp1 = _mm_add_epi16(temp6, p2_8x16);
    temp2 = _mm_add_epi16(temp5, q2_8x16);
    p1_16x8_2 = _mm_srai_epi16(temp1, 2);
    q1_16x8_2 = _mm_srai_epi16(temp2, 2);

    // p0_2 and q0_2 : strong filter, (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3 and mirror
    temp1 = _mm_add_epi16(temp3, p2_8x16);
    temp2 = _mm_add_epi16(temp4, q2_8x16);
    temp1 = _mm_add_epi16(temp1, q1_8x16);
    temp2 = _mm_add_epi16(temp2, p1_8x16);
    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
    temp3 = _mm_slli_epi16(temp3, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp3);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
    p0_16x8_2 = _mm_srai_epi16(temp1, 3);
    q0_16x8_2 = _mm_srai_epi16(temp2, 3);

    // p2_2 and q2_2 : (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3 and mirror
    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
    temp3 = _mm_slli_epi16(p2_8x16, 1);
    temp4 = _mm_slli_epi16(q2_8x16, 1);
    temp3 = _mm_add_epi16(p2_8x16, temp3);
    temp4 = _mm_add_epi16(q2_8x16, temp4);
    temp5 = _mm_slli_epi16(p3_8x16, 1);
    temp6 = _mm_slli_epi16(q3_8x16, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp4);
    temp1 = _mm_add_epi16(temp1, temp5);
    temp2 = _mm_add_epi16(temp2, temp6);
    p2_16x8_2 = _mm_srai_epi16(temp1, 3);
    q2_16x8_2 = _mm_srai_epi16(temp2, 3);

    // Second 8 pixels and packing with first 8 pixels
    p3_8x16 = _mm_unpackhi_epi8(p3_16x8, zero);
    p2_8x16 = _mm_unpackhi_epi8(p2_16x8, zero);
    p1_8x16 = _mm_unpackhi_epi8(p1_16x8, zero);
    p0_8x16 = _mm_unpackhi_epi8(p0_16x8, zero);
    q0_8x16 = _mm_unpackhi_epi8(q0_16x8, zero);
    q1_8x16 = _mm_unpackhi_epi8(q1_16x8, zero);
    q2_8x16 = _mm_unpackhi_epi8(q2_16x8, zero);
    q3_8x16 = _mm_unpackhi_epi8(q3_16x8, zero);

    // p0_1 and q0_1
    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
    temp3 = _mm_slli_epi16(p1_8x16, 1);
    temp4 = _mm_slli_epi16(q1_8x16, 1);
    temp1 = _mm_add_epi16(temp5, temp3);
    temp2 = _mm_add_epi16(temp6, temp4);
    temp1 = _mm_srai_epi16(temp1, 2);
    temp2 = _mm_srai_epi16(temp2, 2);
    p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, temp1);
    q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, temp2);

    // p1_2 and q1_2
    temp6 = _mm_add_epi16(temp6, p0_8x16);
    temp5 = _mm_add_epi16(temp5, q0_8x16);
    temp1 = _mm_add_epi16(temp6, p2_8x16);
    temp2 = _mm_add_epi16(temp5, q2_8x16);
    temp1 = _mm_srai_epi16(temp1, 2);
    temp2 = _mm_srai_epi16(temp2, 2);
    p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, temp1);
    q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, temp2);

    // p0_2 and q0_2
    temp1 = _mm_add_epi16(temp3, p2_8x16);
    temp2 = _mm_add_epi16(temp4, q2_8x16);
    temp1 = _mm_add_epi16(temp1, q1_8x16);
    temp2 = _mm_add_epi16(temp2, p1_8x16);
    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
    temp3 = _mm_slli_epi16(temp3, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp3);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
    temp1 = _mm_srai_epi16(temp1, 3);
    temp2 = _mm_srai_epi16(temp2, 3);
    p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, temp1);
    q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, temp2);

    // p2_2 and q2_2
    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
    temp3 = _mm_slli_epi16(p2_8x16, 1);
    temp4 = _mm_slli_epi16(q2_8x16, 1);
    temp3 = _mm_add_epi16(p2_8x16, temp3);
    temp4 = _mm_add_epi16(q2_8x16, temp4);
    temp5 = _mm_slli_epi16(p3_8x16, 1);
    temp6 = _mm_slli_epi16(q3_8x16, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp4);
    temp1 = _mm_add_epi16(temp1, temp5);
    temp2 = _mm_add_epi16(temp2, temp6);
    temp1 = _mm_srai_epi16(temp1, 3);
    temp2 = _mm_srai_epi16(temp2, 3);
    p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, temp1);
    q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, temp2);

    // p0 and q0: branchless select, out = (orig & ~flag) | (filtered & flag);
    // the masked halves are disjoint, so add_epi8 acts as OR here.
    p0_16x8 = _mm_and_si128(p0_16x8,
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
    p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
    q0_16x8 = _mm_and_si128(q0_16x8,
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
    q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);

    // p0 and q0: overwrite with the strong-filter result where flag3/flag4 hold
    p0_16x8 = _mm_and_si128(p0_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
    q0_16x8 = _mm_and_si128(q0_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);

    // p1 and q1
    p1_16x8 = _mm_and_si128(p1_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
    p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
    q1_16x8 = _mm_and_si128(q1_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
    q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);

    // p2 and q2
    p2_16x8 = _mm_and_si128(p2_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
    p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
    q2_16x8 = _mm_and_si128(q2_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
    q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);

    /* Transpose the filtered columns back to rows and store rows 0..7 */
    temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
    temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8);
    temp3 = _mm_unpacklo_epi8(q0_16x8, q1_16x8);
    temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);

    p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
    p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
    q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
    q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);

    line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
    line2 = _mm_srli_si128(line1, 8);
    line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
    line4 = _mm_srli_si128(line3, 8);
    line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
    line6 = _mm_srli_si128(line5, 8);
    line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
    line8 = _mm_srli_si128(line7, 8);

    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);

    /* Same back-transpose for rows 8..15 */
    temp1 = _mm_unpackhi_epi8(p3_16x8, p2_16x8);
    temp2 = _mm_unpackhi_epi8(p1_16x8, p0_16x8);
    temp3 = _mm_unpackhi_epi8(q0_16x8, q1_16x8);
    temp4 = _mm_unpackhi_epi8(q2_16x8, q3_16x8);

    p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
    p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
    q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
    q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);

    line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
    line2 = _mm_srli_si128(line1, 8);
    line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
    line4 = _mm_srli_si128(line3, 8);
    line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
    line6 = _mm_srli_si128(line5, 8);
    line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
    line8 = _mm_srli_si128(line7, 8);

    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 8 * src_strd), line1);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 9 * src_strd), line2);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 10 * src_strd), line3);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 11 * src_strd), line4);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 12 * src_strd), line5);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 13 * src_strd), line6);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 14 * src_strd), line7);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 15 * src_strd), line8);

}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_deblk_luma_horz_bs4_ssse3()                        */
/*                                                                           */
/*  Description   : This function performs filtering of a luma block         */
/*                  horizontal edge when the boundary strength is set to 4.  */
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to the src sample q0                  */
/*                  src_strd - source stride                                 */
/*                  alpha    - alpha value for the boundary                  */
/*                  beta     - beta value for the boundary                   */
/*                                                                           */
/*  Globals       : None                                                     */
/*                                                                           */
/*  Processing    : This operation is described in Sec. 8.7.2.4 under the    */
/*                  title "Filtering process for edges for bS equal to 4" in */
/*                  ITU T Rec H.264.                                         */
/*                                                                           */
/*  Outputs       : None                                                     */
/*                                                                           */
/*  Returns       : None                                                     */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Initial version                      */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_luma_horz_bs4_ssse3(UWORD8 *pu1_src,
                                     WORD32 src_strd,
                                     WORD32 alpha,
                                     WORD32 beta)
{
    WORD16 i16_posP3, i16_posP2, i16_posP1, i16_posP0;
    WORD16 i16_posQ1, i16_posQ2, i16_posQ3;
    UWORD8 *pu1_HorzPixel;
    __m128i zero = _mm_setzero_si128();
    __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
    __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
    __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
    __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
    __m128i q0_16x8_1;
    __m128i p0_16x8_1;
    __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
    __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
    __m128i temp1, temp2, temp3, temp4, temp5, temp6;
    __m128i Alpha_8x16, Beta_8x16;
    __m128i flag1_16x8, flag2_16x8, flag3_16x8,
            flag4_16x8;
    __m128i const_val2_16x8 = _mm_set1_epi16(2);

    /* Rows p3..p0 sit above the edge; q0..q3 start at pu1_src */
    pu1_HorzPixel = pu1_src - (src_strd << 2);

    i16_posQ1 = src_strd;
    i16_posQ2 = X2(src_strd);
    i16_posQ3 = X3(src_strd);
    i16_posP0 = X3(src_strd);
    i16_posP1 = X2(src_strd);
    i16_posP2 = src_strd;
    i16_posP3 = 0;

    Alpha_8x16 = _mm_set1_epi16(alpha);
    Beta_8x16 = _mm_set1_epi16(beta);

    /* Horizontal edge: rows are contiguous, so no transpose is needed */
    p3_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP3));
    p2_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP2));
    p1_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP1));
    p0_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP0));
    q0_16x8 = _mm_loadu_si128((__m128i *)(pu1_src));
    q1_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ1));
    q2_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ2));
    q3_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ3));

    //Cond1 (ABS(p0 - q0) < alpha)
    /* abs-diff of unsigned bytes via two saturating subtractions: one of  */
    /* the two terms is zero, so their sum is |p0 - q0|.                   */
    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    /* Widen to 16 bits so the signed compare against alpha is safe */
    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);

    flag1_16x8 = _mm_packs_epi16(temp2, temp1);

    //Cond2 (ABS(q1 - q0) < beta)
    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);

    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    //Cond3 (ABS(p1 - p0) < beta)
    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);

    // flag1 = (ABS(p0 - q0) < alpha) && (ABS(q1 - q0) < beta) && (ABS(p1 - p0) < beta)
    // per-byte 0xFF where the edge is filtered at all
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    // (ABS(p0 - q0) < ((alpha >> 2) + 2)) - gate for the strong 4/5-tap filter
    temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
    temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);
    Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
    Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    // (ABS(p2 - p0) < beta) - strong filtering allowed on the p side
    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag3_16x8 = _mm_packs_epi16(temp2, temp1);
    flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);

    // (ABS(q2 - q0) < beta) - strong filtering allowed on the q side
    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag4_16x8 = _mm_packs_epi16(temp2, temp1);
    flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);

    // First 8 pixels: widen to 16-bit lanes so the filter sums cannot overflow
    p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
    p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
    p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
    p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
    q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
    q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
    q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
    q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);

    // p0_1 and q0_1 : weak filter, (2*p1 + p0 + q1 + 2) >> 2 and mirror
    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
    temp3 = _mm_slli_epi16(p1_8x16, 1);
    temp4 = _mm_slli_epi16(q1_8x16, 1);
    temp1 = _mm_add_epi16(temp5, temp3);
    temp2 = _mm_add_epi16(temp6, temp4);
    p0_16x8_1 = _mm_srai_epi16(temp1, 2);
    q0_16x8_1 = _mm_srai_epi16(temp2, 2);

    // p1_2 and q1_2 : (p2 + p1 + p0 + q0 + 2) >> 2 and mirror
    temp6 = _mm_add_epi16(temp6, p0_8x16);
    temp5 = _mm_add_epi16(temp5, q0_8x16);
    temp1 = _mm_add_epi16(temp6, p2_8x16);
    temp2 = _mm_add_epi16(temp5, q2_8x16);
    p1_16x8_2 = _mm_srai_epi16(temp1, 2);
    q1_16x8_2 = _mm_srai_epi16(temp2, 2);

    // p0_2 and q0_2 : strong filter, (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3 and mirror
    temp1 = _mm_add_epi16(temp3, p2_8x16);
    temp2 = _mm_add_epi16(temp4, q2_8x16);
    temp1 = _mm_add_epi16(temp1, q1_8x16);
    temp2 = _mm_add_epi16(temp2, p1_8x16);
    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
    temp3 = _mm_slli_epi16(temp3, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp3);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
    p0_16x8_2 = _mm_srai_epi16(temp1, 3);
    q0_16x8_2 = _mm_srai_epi16(temp2, 3);

    // p2_2 and q2_2 : (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3 and mirror
    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
    temp3 = _mm_slli_epi16(p2_8x16, 1);
    temp4 = _mm_slli_epi16(q2_8x16, 1);
    temp3 = _mm_add_epi16(p2_8x16, temp3);
    temp4 = _mm_add_epi16(q2_8x16, temp4);
    temp5 = _mm_slli_epi16(p3_8x16, 1);
    temp6 = _mm_slli_epi16(q3_8x16, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp4);
    temp1 = _mm_add_epi16(temp1, temp5);
    temp2 = _mm_add_epi16(temp2, temp6);
    p2_16x8_2 = _mm_srai_epi16(temp1, 3);
    q2_16x8_2 = _mm_srai_epi16(temp2, 3);

    // Second 8 pixels and packing with first 8 pixels
    p3_8x16 = _mm_unpackhi_epi8(p3_16x8, zero);
    p2_8x16 = _mm_unpackhi_epi8(p2_16x8, zero);
    p1_8x16 = _mm_unpackhi_epi8(p1_16x8, zero);
    p0_8x16 = _mm_unpackhi_epi8(p0_16x8, zero);
    q0_8x16 = _mm_unpackhi_epi8(q0_16x8, zero);
    q1_8x16 = _mm_unpackhi_epi8(q1_16x8, zero);
    q2_8x16 = _mm_unpackhi_epi8(q2_16x8, zero);
    q3_8x16 = _mm_unpackhi_epi8(q3_16x8, zero);

    // p0_1 and q0_1
    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
    temp3 = _mm_slli_epi16(p1_8x16, 1);
    temp4 = _mm_slli_epi16(q1_8x16, 1);
    temp1 = _mm_add_epi16(temp5, temp3);
    temp2 = _mm_add_epi16(temp6, temp4);
    temp1 = _mm_srai_epi16(temp1, 2);
    temp2 = _mm_srai_epi16(temp2, 2);
    p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, temp1);
    q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, temp2);

    // p1_2 and q1_2
    temp6 = _mm_add_epi16(temp6, p0_8x16);
    temp5 = _mm_add_epi16(temp5, q0_8x16);
    temp1 = _mm_add_epi16(temp6, p2_8x16);
    temp2 = _mm_add_epi16(temp5, q2_8x16);
    temp1 = _mm_srai_epi16(temp1, 2);
    temp2 = _mm_srai_epi16(temp2, 2);
    p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, temp1);
    q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, temp2);

    // p0_2 and q0_2
    temp1 = _mm_add_epi16(temp3, p2_8x16);
    temp2 = _mm_add_epi16(temp4, q2_8x16);
    temp1 = _mm_add_epi16(temp1, q1_8x16);
    temp2 = _mm_add_epi16(temp2, p1_8x16);
    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
    temp3 = _mm_slli_epi16(temp3, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp3);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
    temp1 = _mm_srai_epi16(temp1, 3);
    temp2 = _mm_srai_epi16(temp2, 3);
    p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, temp1);
    q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, temp2);

    // p2_2 and q2_2
    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
    temp3 = _mm_slli_epi16(p2_8x16, 1);
    temp4 = _mm_slli_epi16(q2_8x16, 1);
    temp3 = _mm_add_epi16(p2_8x16, temp3);
    temp4 = _mm_add_epi16(q2_8x16, temp4);
    temp5 = _mm_slli_epi16(p3_8x16, 1);
    temp6 = _mm_slli_epi16(q3_8x16, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp4);
    temp1 = _mm_add_epi16(temp1, temp5);
    temp2 = _mm_add_epi16(temp2, temp6);
    temp1 = _mm_srai_epi16(temp1, 3);
    temp2 = _mm_srai_epi16(temp2, 3);
    p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, temp1);
    q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, temp2);

    // p0 and q0: branchless select, out = (orig & ~flag) | (filtered & flag);
    // the masked halves are disjoint, so add_epi8 acts as OR here.
    p0_16x8 = _mm_and_si128(p0_16x8,
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
    p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
    q0_16x8 = _mm_and_si128(q0_16x8,
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
    q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);

    // p0 and q0: overwrite with the strong-filter result where flag3/flag4 hold
    p0_16x8 = _mm_and_si128(p0_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
    q0_16x8 = _mm_and_si128(q0_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);

    // p1 and q1
    p1_16x8 = _mm_and_si128(p1_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
    p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
    q1_16x8 = _mm_and_si128(q1_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
    q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);

    // p2 and q2
    p2_16x8 = _mm_and_si128(p2_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
    p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
    q2_16x8 = _mm_and_si128(q2_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
    q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);

    /* p3/q3 are read-only in bS=4 filtering; store only the 6 changed rows */
    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP2), p2_16x8);
    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP1), p1_16x8);
    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP0), p0_16x8);

    _mm_storeu_si128((__m128i *)(pu1_src), q0_16x8);
    _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ1), q1_16x8);
    _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ2), q2_16x8);

}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_deblk_luma_vert_bslt4_ssse3()                      */
/*                                                                           */
/*  Description   : This function performs filtering of a luma block         */
/*                  vertical edge when the boundary strength is less than 4. */
/*                                                                           */
/*  Inputs        : pu1_src  - pointer to the src sample q0                  */
/*                  src_strd - source stride                                 */
/*                  alpha    - alpha value for the boundary                  */
/*                  beta     - beta value for the boundary                   */
/*                  u4_bs    - packed Boundary strength array                */
/*                  pu1_cliptab - tc0_table                                  */
/*                                                                           */
/*  Globals       : None                                                     */
/*                                                                           */
/*  Processing    : This operation is described in Sec. 8.7.2.3 under the    */
/*                  title "Filtering process for edges for bS less than 4"   */
/*                  in ITU T Rec H.264.
*/ 837 /* */ 838 /* Outputs : None */ 839 /* */ 840 /* Returns : None */ 841 /* */ 842 /* Issues : None */ 843 /* */ 844 /* Revision History: */ 845 /* */ 846 /* DD MM YYYY Author(s) Changes (Describe the changes made) */ 847 /* 12 02 2015 Naveen Kumar P Initial version */ 848 /* */ 849 /*****************************************************************************/ 850 void ih264_deblk_luma_vert_bslt4_ssse3(UWORD8 *pu1_src, 851 WORD32 src_strd, 852 WORD32 alpha, 853 WORD32 beta, 854 UWORD32 u4_bs, 855 const UWORD8 *pu1_cliptab) 856 { 857 UWORD8 u1_Bs, u1_Bs1; 858 859 WORD32 j = 0; 860 861 __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh; 862 __m128i int1, int2, int3, int4, high1, high2; 863 __m128i flag, flag1, i_C, i_C0; 864 __m128i i_Ap, i_Aq, diff, const1, const2, in_macro, in_macrotemp, temp, 865 temp1; 866 __m128i zero = _mm_setzero_si128(); 867 868 for(j = 0; j <= 8 * src_strd; j += 8 * src_strd) 869 { 870 //Transpose 871 linea = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + j)); 872 lineb = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + src_strd + j)); 873 linec = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 2 * src_strd + j)); 874 lined = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 3 * src_strd + j)); 875 876 linea = _mm_unpacklo_epi8(linea, zero); 877 lineb = _mm_unpacklo_epi8(lineb, zero); 878 linec = _mm_unpacklo_epi8(linec, zero); 879 lined = _mm_unpacklo_epi8(lined, zero); 880 881 int1 = _mm_unpacklo_epi16(linea, lineb); 882 lineb = _mm_unpackhi_epi16(linea, lineb); 883 884 int2 = _mm_unpacklo_epi16(linec, lined); 885 lined = _mm_unpackhi_epi16(linec, lined); 886 887 linea = _mm_unpacklo_epi16(int1, int2); 888 int1 = _mm_unpackhi_epi16(int1, int2); 889 890 linec = _mm_unpacklo_epi16(lineb, lined); 891 high1 = _mm_unpackhi_epi16(lineb, lined); 892 893 linee = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 4 * src_strd + j)); 894 linef = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 5 * src_strd + j)); 895 lineg = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 6 * 
src_strd + j)); 896 lineh = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 7 * src_strd + j)); 897 898 linee = _mm_unpacklo_epi8(linee, zero); 899 linef = _mm_unpacklo_epi8(linef, zero); 900 lineg = _mm_unpacklo_epi8(lineg, zero); 901 lineh = _mm_unpacklo_epi8(lineh, zero); 902 903 int2 = _mm_unpacklo_epi16(linee, linef); 904 linef = _mm_unpackhi_epi16(linee, linef); 905 906 int3 = _mm_unpacklo_epi16(lineg, lineh); 907 lineh = _mm_unpackhi_epi16(lineg, lineh); 908 909 linee = _mm_unpacklo_epi16(int2, int3); 910 int2 = _mm_unpackhi_epi16(int2, int3); 911 912 lineg = _mm_unpacklo_epi16(linef, lineh); 913 high2 = _mm_unpackhi_epi16(linef, lineh); 914 915 int4 = _mm_unpacklo_epi16(linea, linee); 916 lineb = _mm_unpackhi_epi16(linea, linee); 917 918 int3 = _mm_unpacklo_epi16(int1, int2); 919 lined = _mm_unpackhi_epi16(int1, int2); 920 921 int2 = _mm_unpacklo_epi16(linec, lineg); 922 linef = _mm_unpackhi_epi16(linec, lineg); 923 924 linea = int4; 925 linec = int3; 926 linee = int2; 927 928 lineg = _mm_unpacklo_epi16(high1, high2); 929 lineh = _mm_unpackhi_epi16(high1, high2); 930 931 //end of transpose 932 933 u1_Bs = (u4_bs >> 24) & 0xff; 934 u1_Bs1 = (u4_bs >> 16) & 0xff; 935 u4_bs <<= 16; 936 937 flag1 = _mm_set_epi16(u1_Bs1, u1_Bs, u1_Bs1, u1_Bs, u1_Bs1, u1_Bs, 938 u1_Bs1, u1_Bs); 939 flag1 = _mm_cmpeq_epi16(flag1, zero); //Set flag to 1s and 0s 940 flag1 = _mm_xor_si128(flag1, _mm_set1_epi16(0xFFFF)); //Invert for required mask 941 942 i_C0 = _mm_set_epi16(pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs], 943 pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs], 944 pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs], 945 pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs]); 946 947 diff = _mm_subs_epi16(linec, lined); //Condn 1 948 diff = _mm_abs_epi16(diff); 949 const1 = _mm_set1_epi16(alpha); 950 flag = _mm_cmpgt_epi16(const1, diff); 951 952 diff = _mm_subs_epi16(linee, lined); //Condtn 2 953 diff = _mm_abs_epi16(diff); 954 const1 = _mm_set1_epi16(beta); 955 flag = _mm_and_si128(flag, _mm_cmpgt_epi16(const1, 
diff)); 956 957 diff = _mm_subs_epi16(lineb, linec); //Condtn 3 958 diff = _mm_abs_epi16(diff); 959 flag = _mm_and_si128(flag, _mm_cmpgt_epi16(const1, diff)); //Const 1= Beta from now on 960 961 flag = _mm_and_si128(flag, flag1); //Final flag (ui_B condition + other 3 conditions) 962 963 //Adding Ap<Beta and Aq<Beta 964 i_Ap = _mm_subs_epi16(linea, linec); 965 i_Ap = _mm_abs_epi16(i_Ap); 966 const2 = _mm_cmpgt_epi16(const1, i_Ap); 967 const2 = _mm_subs_epi16(zero, const2); //Make FFFF=1 and 0000=0 968 i_C = _mm_add_epi16(i_C0, const2); 969 970 i_Aq = _mm_subs_epi16(linef, lined); 971 i_Aq = _mm_abs_epi16(i_Aq); 972 const2 = _mm_cmpgt_epi16(const1, i_Aq); 973 const2 = _mm_subs_epi16(zero, const2); 974 i_C = _mm_add_epi16(i_C, const2); 975 976 //Calculate in_macro 977 diff = _mm_subs_epi16(lined, linec); 978 diff = _mm_slli_epi16(diff, 2); 979 const2 = _mm_subs_epi16(lineb, linee); 980 diff = _mm_add_epi16(diff, const2); 981 const2 = _mm_set1_epi16(4); 982 diff = _mm_add_epi16(diff, const2); 983 in_macro = _mm_srai_epi16(diff, 3); 984 985 in_macro = _mm_min_epi16(i_C, in_macro); //CLIP3 986 i_C = _mm_subs_epi16(zero, i_C); 987 in_macro = _mm_max_epi16(i_C, in_macro); 988 989 //Compute and store 990 in_macrotemp = _mm_add_epi16(linec, in_macro); 991 in_macrotemp = _mm_and_si128(in_macrotemp, flag); 992 temp = _mm_and_si128(linec, 993 _mm_xor_si128(flag, _mm_set1_epi16(0xFFFF))); 994 temp = _mm_add_epi16(temp, in_macrotemp); 995 //temp= _mm_packus_epi16 (temp, zero); 996 //_mm_storel_epi64(uc_HorzPixel+i16_posP0+i, in_macrotemp); 997 998 in_macrotemp = _mm_subs_epi16(lined, in_macro); 999 in_macrotemp = _mm_and_si128(in_macrotemp, flag); 1000 temp1 = _mm_and_si128(lined, 1001 _mm_xor_si128(flag, _mm_set1_epi16(0xFFFF))); 1002 temp1 = _mm_add_epi16(temp1, in_macrotemp); 1003 //temp1= _mm_packus_epi16 (temp1, zero); 1004 //_mm_storel_epi64(pu1_src+i, in_macrotemp); 1005 1006 //If Ap<Beta 1007 flag1 = _mm_cmpgt_epi16(const1, i_Ap); 1008 flag1 = _mm_and_si128(flag, flag1); 
1009 in_macrotemp = _mm_add_epi16(linec, lined); 1010 in_macrotemp = _mm_add_epi16(in_macrotemp, _mm_set1_epi16(1)); 1011 in_macrotemp = _mm_srai_epi16(in_macrotemp, 1); 1012 in_macro = _mm_add_epi16(in_macrotemp, linea); 1013 in_macro = _mm_subs_epi16(in_macro, _mm_slli_epi16(lineb, 1)); 1014 in_macro = _mm_srai_epi16(in_macro, 1); 1015 1016 in_macro = _mm_min_epi16(i_C0, in_macro); //CLIP3 1017 i_C0 = _mm_subs_epi16(zero, i_C0); 1018 in_macro = _mm_max_epi16(i_C0, in_macro); 1019 1020 in_macro = _mm_and_si128(in_macro, flag1); 1021 lineb = _mm_add_epi16(lineb, in_macro); 1022 //in_macro= _mm_packus_epi16 (i_p1, zero); 1023 //_mm_storel_epi64(uc_HorzPixel+i16_posP1+i, in_macro); 1024 1025 flag1 = _mm_cmpgt_epi16(const1, i_Aq); 1026 flag1 = _mm_and_si128(flag, flag1); 1027 in_macro = _mm_add_epi16(in_macrotemp, linef); 1028 in_macro = _mm_subs_epi16(in_macro, _mm_slli_epi16(linee, 1)); 1029 in_macro = _mm_srai_epi16(in_macro, 1); 1030 1031 i_C0 = _mm_abs_epi16(i_C0); 1032 in_macro = _mm_min_epi16(i_C0, in_macro); //CLIP3 1033 i_C0 = _mm_subs_epi16(zero, i_C0); 1034 in_macro = _mm_max_epi16(i_C0, in_macro); 1035 1036 in_macro = _mm_and_si128(in_macro, flag1); 1037 linee = _mm_add_epi16(linee, in_macro); 1038 //in_macro= _mm_packus_epi16 (i_q1, zero); 1039 //_mm_storel_epi64(pu1_src+i16_posQ1+i, in_macro); 1040 linec = temp; 1041 lined = temp1; 1042 //End of filtering 1043 1044 int1 = _mm_unpacklo_epi16(linea, linee); 1045 linee = _mm_unpackhi_epi16(linea, linee); 1046 1047 int2 = _mm_unpacklo_epi16(linec, lineg); 1048 lineg = _mm_unpackhi_epi16(linec, lineg); 1049 1050 linea = _mm_unpacklo_epi16(int1, int2); 1051 int3 = _mm_unpackhi_epi16(int1, int2); 1052 1053 linec = _mm_unpacklo_epi16(linee, lineg); 1054 lineg = _mm_unpackhi_epi16(linee, lineg); 1055 1056 int1 = _mm_unpacklo_epi16(lineb, linef); 1057 linef = _mm_unpackhi_epi16(lineb, linef); 1058 1059 int2 = _mm_unpacklo_epi16(lined, lineh); 1060 lineh = _mm_unpackhi_epi16(lined, lineh); 1061 1062 lineb = 
_mm_unpacklo_epi16(int1, int2); 1063 int4 = _mm_unpackhi_epi16(int1, int2); 1064 1065 lined = _mm_unpacklo_epi16(linef, lineh); 1066 lineh = _mm_unpackhi_epi16(linef, lineh); 1067 1068 int1 = _mm_unpackhi_epi16(linea, lineb); 1069 linea = _mm_unpacklo_epi16(linea, lineb); 1070 1071 int2 = _mm_unpacklo_epi16(int3, int4); 1072 high1 = _mm_unpackhi_epi16(int3, int4); 1073 1074 lineb = _mm_unpacklo_epi16(linec, lined); 1075 linef = _mm_unpackhi_epi16(linec, lined); 1076 1077 lined = _mm_unpacklo_epi16(lineg, lineh); 1078 lineh = _mm_unpackhi_epi16(lineg, lineh); 1079 1080 linee = int1; 1081 lineg = high1; 1082 linec = int2; 1083 //End of inverse transpose 1084 1085 //Packs and stores 1086 linea = _mm_packus_epi16(linea, zero); 1087 _mm_storel_epi64((__m128i *)(pu1_src - 3 + j), linea); 1088 1089 lineb = _mm_packus_epi16(lineb, zero); 1090 _mm_storel_epi64((__m128i *)(pu1_src - 3 + src_strd + j), lineb); 1091 1092 linec = _mm_packus_epi16(linec, zero); 1093 _mm_storel_epi64((__m128i *)(pu1_src - 3 + 2 * src_strd + j), linec); 1094 1095 lined = _mm_packus_epi16(lined, zero); 1096 _mm_storel_epi64((__m128i *)(pu1_src - 3 + 3 * src_strd + j), lined); 1097 1098 linee = _mm_packus_epi16(linee, zero); 1099 _mm_storel_epi64((__m128i *)(pu1_src - 3 + 4 * src_strd + j), linee); 1100 1101 linef = _mm_packus_epi16(linef, zero); 1102 _mm_storel_epi64((__m128i *)(pu1_src - 3 + 5 * src_strd + j), linef); 1103 1104 lineg = _mm_packus_epi16(lineg, zero); 1105 _mm_storel_epi64((__m128i *)(pu1_src - 3 + 6 * src_strd + j), lineg); 1106 1107 lineh = _mm_packus_epi16(lineh, zero); 1108 _mm_storel_epi64((__m128i *)(pu1_src - 3 + 7 * src_strd + j), lineh); 1109 1110 } 1111 } 1112 1113 /*****************************************************************************/ 1114 /* */ 1115 /* Function Name : ih264_deblk_luma_horz_bslt4_ssse3() */ 1116 /* */ 1117 /* Description : This function performs filtering of a luma block */ 1118 /* horizontal edge when boundary strength is less than 4. 
*/ 1119 /* */ 1120 /* Inputs : pu1_src - pointer to the src sample q0 */ 1121 /* src_strd - source stride */ 1122 /* alpha - alpha value for the boundary */ 1123 /* beta - beta value for the boundary */ 1124 /* u4_bs - packed Boundary strength array */ 1125 /* pu1_cliptab - tc0_table */ 1126 /* */ 1127 /* Globals : None */ 1128 /* */ 1129 /* Processing : This operation is described in Sec. 8.7.2.3 under the */ 1130 /* title "Filtering process for edges for bS less than 4" */ 1131 /* in ITU T Rec H.264. */ 1132 /* */ 1133 /* Outputs : None */ 1134 /* */ 1135 /* Returns : None */ 1136 /* */ 1137 /* Issues : None */ 1138 /* */ 1139 /* Revision History: */ 1140 /* */ 1141 /* DD MM YYYY Author(s) Changes (Describe the changes made) */ 1142 /* 12 02 2015 Naveen Kumar P Initial version */ 1143 /* */ 1144 /*****************************************************************************/ 1145 void ih264_deblk_luma_horz_bslt4_ssse3(UWORD8 *pu1_src, 1146 WORD32 src_strd, 1147 WORD32 alpha, 1148 WORD32 beta, 1149 UWORD32 u4_bs, 1150 const UWORD8 *pu1_cliptab) 1151 { 1152 WORD16 i16_posP2, i16_posP1, i16_posP0, i16_posQ1, i16_posQ2; 1153 UWORD8 *pu1_HorzPixel; 1154 __m128i zero = _mm_setzero_si128(); 1155 __m128i bs_flag_16x8b, C0_16x8, C0_8x16, C0_hi_8x16, C_8x16, C_hi_8x16; 1156 __m128i q0_16x8, q1_16x8, q2_16x8, p0_16x8, p1_16x8, p2_16x8; 1157 __m128i temp1, temp2; 1158 __m128i Alpha_8x16, Beta_8x16, flag1_16x8, flag2_16x8, flag3_16x8; 1159 __m128i in_macro_16x8, in_macro_hi_16x8; 1160 __m128i const_val4_8x16; 1161 UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3; 1162 UWORD8 clip0, clip1, clip2, clip3; 1163 1164 pu1_HorzPixel = pu1_src - (src_strd << 2); 1165 1166 i16_posQ1 = src_strd; 1167 i16_posQ2 = X2(src_strd); 1168 i16_posP0 = X3(src_strd); 1169 i16_posP1 = X2(src_strd); 1170 i16_posP2 = src_strd; 1171 1172 q0_16x8 = _mm_loadu_si128((__m128i *)(pu1_src)); 1173 q1_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ1)); 1174 1175 u1_Bs0 = (u4_bs >> 24) & 0xff; 1176 u1_Bs1 = (u4_bs 
>> 16) & 0xff; 1177 u1_Bs2 = (u4_bs >> 8) & 0xff; 1178 u1_Bs3 = (u4_bs >> 0) & 0xff; 1179 clip0 = pu1_cliptab[u1_Bs0]; 1180 clip1 = pu1_cliptab[u1_Bs1]; 1181 clip2 = pu1_cliptab[u1_Bs2]; 1182 clip3 = pu1_cliptab[u1_Bs3]; 1183 1184 Alpha_8x16 = _mm_set1_epi16(alpha); 1185 Beta_8x16 = _mm_set1_epi16(beta); 1186 1187 bs_flag_16x8b = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2, 1188 u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1, 1189 u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0); 1190 1191 C0_16x8 = _mm_set_epi8(clip3, clip3, clip3, clip3, clip2, clip2, clip2, 1192 clip2, clip1, clip1, clip1, clip1, clip0, clip0, 1193 clip0, clip0); 1194 1195 bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero); 1196 bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask 1197 C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero); 1198 C0_hi_8x16 = _mm_unpackhi_epi8(C0_16x8, zero); 1199 1200 p1_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP1)); 1201 p0_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP0)); 1202 p2_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP2)); 1203 q2_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ2)); 1204 1205 //Cond1 (ABS(p0 - q0) < alpha) 1206 temp1 = _mm_subs_epu8(q0_16x8, p0_16x8); 1207 temp2 = _mm_subs_epu8(p0_16x8, q0_16x8); 1208 temp1 = _mm_add_epi8(temp1, temp2); 1209 1210 temp2 = _mm_unpacklo_epi8(temp1, zero); 1211 temp1 = _mm_unpackhi_epi8(temp1, zero); 1212 1213 temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2); 1214 temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1); 1215 1216 flag1_16x8 = _mm_packs_epi16(temp2, temp1); 1217 flag1_16x8 = _mm_and_si128(flag1_16x8, bs_flag_16x8b); 1218 1219 //Cond2 (ABS(q1 - q0) < beta) 1220 temp1 = _mm_subs_epu8(q0_16x8, q1_16x8); 1221 temp2 = _mm_subs_epu8(q1_16x8, q0_16x8); 1222 temp1 = _mm_add_epi8(temp1, temp2); 1223 1224 temp2 = _mm_unpacklo_epi8(temp1, zero); 1225 temp1 = _mm_unpackhi_epi8(temp1, zero); 1226 1227 temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2); 
1228 temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1); 1229 1230 flag2_16x8 = _mm_packs_epi16(temp2, temp1); 1231 1232 flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8); 1233 1234 //Cond3 (ABS(p1 - p0) < beta) 1235 temp1 = _mm_subs_epu8(p0_16x8, p1_16x8); 1236 temp2 = _mm_subs_epu8(p1_16x8, p0_16x8); 1237 temp1 = _mm_add_epi8(temp1, temp2); 1238 1239 temp2 = _mm_unpacklo_epi8(temp1, zero); 1240 temp1 = _mm_unpackhi_epi8(temp1, zero); 1241 1242 temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2); 1243 temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1); 1244 1245 flag2_16x8 = _mm_packs_epi16(temp2, temp1); 1246 1247 // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta)) 1248 flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8); 1249 1250 // (ABS(p2 - p0) < beta) 1251 temp1 = _mm_subs_epu8(p0_16x8, p2_16x8); 1252 temp2 = _mm_subs_epu8(p2_16x8, p0_16x8); 1253 temp1 = _mm_add_epi8(temp1, temp2); 1254 1255 temp2 = _mm_unpacklo_epi8(temp1, zero); 1256 temp1 = _mm_unpackhi_epi8(temp1, zero); 1257 temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2); 1258 temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1); 1259 1260 flag2_16x8 = _mm_packs_epi16(temp2, temp1); 1261 flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8); 1262 1263 temp2 = _mm_subs_epi16(zero, temp2); 1264 temp1 = _mm_subs_epi16(zero, temp1); 1265 1266 C_8x16 = _mm_add_epi16(C0_8x16, temp2); 1267 C_hi_8x16 = _mm_add_epi16(C0_hi_8x16, temp1); 1268 1269 // (ABS(q2 - q0) < beta) 1270 temp1 = _mm_subs_epu8(q0_16x8, q2_16x8); 1271 temp2 = _mm_subs_epu8(q2_16x8, q0_16x8); 1272 temp1 = _mm_add_epi8(temp1, temp2); 1273 1274 temp2 = _mm_unpacklo_epi8(temp1, zero); 1275 temp1 = _mm_unpackhi_epi8(temp1, zero); 1276 temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2); 1277 temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1); 1278 1279 flag3_16x8 = _mm_packs_epi16(temp2, temp1); 1280 flag3_16x8 = _mm_and_si128(flag1_16x8, flag3_16x8); 1281 1282 temp2 = _mm_subs_epi16(zero, temp2); 1283 temp1 = _mm_subs_epi16(zero, temp1); 1284 1285 C_8x16 = _mm_add_epi16(C_8x16, 
temp2); 1286 C_hi_8x16 = _mm_add_epi16(C_hi_8x16, temp1); 1287 1288 const_val4_8x16 = _mm_set1_epi16(4); 1289 temp1 = _mm_subs_epi16(_mm_unpacklo_epi8(q0_16x8, zero), 1290 _mm_unpacklo_epi8(p0_16x8, zero)); 1291 temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p1_16x8, zero), 1292 _mm_unpacklo_epi8(q1_16x8, zero)); 1293 temp1 = _mm_slli_epi16(temp1, 2); 1294 temp1 = _mm_add_epi16(temp1, temp2); 1295 temp1 = _mm_add_epi16(temp1, const_val4_8x16); 1296 in_macro_16x8 = _mm_srai_epi16(temp1, 3); 1297 1298 temp1 = _mm_subs_epi16(_mm_unpackhi_epi8(q0_16x8, zero), 1299 _mm_unpackhi_epi8(p0_16x8, zero)); 1300 temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(p1_16x8, zero), 1301 _mm_unpackhi_epi8(q1_16x8, zero)); 1302 temp1 = _mm_slli_epi16(temp1, 2); 1303 temp1 = _mm_add_epi16(temp1, temp2); 1304 temp1 = _mm_add_epi16(temp1, const_val4_8x16); 1305 in_macro_hi_16x8 = _mm_srai_epi16(temp1, 3); 1306 1307 in_macro_16x8 = _mm_min_epi16(C_8x16, in_macro_16x8); //CLIP3 1308 in_macro_hi_16x8 = _mm_min_epi16(C_hi_8x16, in_macro_hi_16x8); //CLIP3 1309 C_8x16 = _mm_subs_epi16(zero, C_8x16); 1310 C_hi_8x16 = _mm_subs_epi16(zero, C_hi_8x16); 1311 in_macro_16x8 = _mm_max_epi16(C_8x16, in_macro_16x8); //CLIP3 1312 in_macro_hi_16x8 = _mm_max_epi16(C_hi_8x16, in_macro_hi_16x8); //CLIP3 1313 1314 temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p0_16x8, zero), in_macro_16x8); 1315 temp2 = _mm_add_epi16(_mm_unpackhi_epi8(p0_16x8, zero), in_macro_hi_16x8); 1316 1317 temp1 = _mm_packus_epi16(temp1, temp2); 1318 1319 temp1 = _mm_and_si128(temp1, flag1_16x8); 1320 temp2 = _mm_and_si128(p0_16x8, 1321 _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF))); 1322 1323 temp1 = _mm_add_epi8(temp1, temp2); 1324 1325 _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP0), temp1); 1326 1327 temp1 = _mm_sub_epi16(_mm_unpacklo_epi8(q0_16x8, zero), in_macro_16x8); 1328 temp2 = _mm_sub_epi16(_mm_unpackhi_epi8(q0_16x8, zero), in_macro_hi_16x8); 1329 1330 temp1 = _mm_packus_epi16(temp1, temp2); 1331 1332 temp1 = _mm_and_si128(temp1, 
flag1_16x8); 1333 temp2 = _mm_and_si128(q0_16x8, 1334 _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF))); 1335 1336 temp1 = _mm_add_epi8(temp1, temp2); 1337 _mm_storeu_si128((__m128i *)(pu1_src), temp1); 1338 1339 //if(Ap < Beta) 1340 temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero), 1341 _mm_unpacklo_epi8(p0_16x8, zero)); 1342 temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(p1_16x8, zero), 1); 1343 //temp2 = _mm_subs_epi16(zero,temp2); 1344 temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p2_16x8, zero), temp2); 1345 temp2 = _mm_add_epi16(temp1, temp2); 1346 in_macro_16x8 = _mm_srai_epi16(temp2, 1); 1347 1348 temp1 = _mm_avg_epu16(_mm_unpackhi_epi8(q0_16x8, zero), 1349 _mm_unpackhi_epi8(p0_16x8, zero)); 1350 temp2 = _mm_slli_epi16(_mm_unpackhi_epi8(p1_16x8, zero), 1); 1351 //temp2 = _mm_subs_epi16(zero,temp2); 1352 temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(p2_16x8, zero), temp2); 1353 temp2 = _mm_add_epi16(temp1, temp2); 1354 in_macro_hi_16x8 = _mm_srai_epi16(temp2, 1); 1355 1356 in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3 1357 in_macro_hi_16x8 = _mm_min_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3 1358 C0_8x16 = _mm_subs_epi16(zero, C0_8x16); 1359 C0_hi_8x16 = _mm_subs_epi16(zero, C0_hi_8x16); 1360 in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3 1361 in_macro_hi_16x8 = _mm_max_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3 1362 1363 temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p1_16x8, zero), in_macro_16x8); 1364 temp2 = _mm_add_epi16(_mm_unpackhi_epi8(p1_16x8, zero), in_macro_hi_16x8); 1365 1366 temp1 = _mm_packus_epi16(temp1, temp2); 1367 1368 temp1 = _mm_and_si128(temp1, flag2_16x8); 1369 temp2 = _mm_and_si128(p1_16x8, 1370 _mm_xor_si128(flag2_16x8, _mm_set1_epi16(0xFFFF))); 1371 temp1 = _mm_add_epi8(temp1, temp2); 1372 _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP1), temp1); 1373 1374 //if(Aq < Beta) 1375 temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero), 1376 _mm_unpacklo_epi8(p0_16x8, zero)); 1377 temp2 = 
_mm_slli_epi16(_mm_unpacklo_epi8(q1_16x8, zero), 1); 1378 //temp2 = _mm_slli_epi16 (temp2, 1); 1379 temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(q2_16x8, zero), temp2); 1380 temp2 = _mm_add_epi16(temp1, temp2); 1381 in_macro_16x8 = _mm_srai_epi16(temp2, 1); 1382 1383 temp1 = _mm_avg_epu16(_mm_unpackhi_epi8(q0_16x8, zero), 1384 _mm_unpackhi_epi8(p0_16x8, zero)); 1385 temp2 = _mm_slli_epi16(_mm_unpackhi_epi8(q1_16x8, zero), 1); 1386 //temp2 = _mm_slli_epi16 (temp2, 1); 1387 temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(q2_16x8, zero), temp2); 1388 temp2 = _mm_add_epi16(temp1, temp2); 1389 in_macro_hi_16x8 = _mm_srai_epi16(temp2, 1); 1390 1391 in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3 1392 in_macro_hi_16x8 = _mm_max_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3 1393 C0_8x16 = _mm_subs_epi16(zero, C0_8x16); 1394 C0_hi_8x16 = _mm_subs_epi16(zero, C0_hi_8x16); 1395 in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3 1396 in_macro_hi_16x8 = _mm_min_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3 1397 1398 temp1 = _mm_add_epi16(_mm_unpacklo_epi8(q1_16x8, zero), in_macro_16x8); 1399 temp2 = _mm_add_epi16(_mm_unpackhi_epi8(q1_16x8, zero), in_macro_hi_16x8); 1400 1401 temp1 = _mm_packus_epi16(temp1, temp2); 1402 1403 temp1 = _mm_and_si128(temp1, flag3_16x8); 1404 temp2 = _mm_and_si128(q1_16x8, 1405 _mm_xor_si128(flag3_16x8, _mm_set1_epi16(0xFFFF))); 1406 temp1 = _mm_add_epi8(temp1, temp2); 1407 1408 _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ1), temp1); 1409 1410 } 1411 1412 /*****************************************************************************/ 1413 /* */ 1414 /* Function Name : ih264_deblk_luma_vert_bs4_mbaff_ssse3() */ 1415 /* */ 1416 /* Description : This function performs filtering of a luma block */ 1417 /* vertical edge when boundary strength is set to 4. 
*/ 1418 /* */ 1419 /* Inputs : pu1_src - pointer to the src sample q0 */ 1420 /* src_strd - source stride */ 1421 /* alpha - alpha value for the boundary */ 1422 /* beta - beta value for the boundary */ 1423 /* */ 1424 /* Globals : None */ 1425 /* */ 1426 /* Processing : When the function is called twice, this operation is as */ 1427 /* described in Sec. 8.7.2.3 under the title "Filtering */ 1428 /* process for edges for bS equal to 4" in ITU T Rec H.264. */ 1429 /* */ 1430 /* Outputs : None */ 1431 /* */ 1432 /* Returns : None */ 1433 /* */ 1434 /* Issues : None */ 1435 /* */ 1436 /* Revision History: */ 1437 /* */ 1438 /* DD MM YYYY Author(s) Changes (Describe the changes made) */ 1439 /* 12 02 2015 Naveen Kumar P Initial version */ 1440 /* */ 1441 /*****************************************************************************/ 1442 void ih264_deblk_luma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src, 1443 WORD32 src_strd, 1444 WORD32 alpha, 1445 WORD32 beta) 1446 { 1447 __m128i zero = _mm_setzero_si128(); 1448 __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8; 1449 __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8; 1450 __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16; 1451 __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16; 1452 __m128i q0_16x8_1; 1453 __m128i p0_16x8_1; 1454 __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2; 1455 __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2; 1456 __m128i temp1, temp2, temp3, temp4, temp5, temp6; 1457 __m128i Alpha_8x16, Beta_8x16; 1458 __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8; 1459 __m128i const_val2_16x8 = _mm_set1_epi16(2); 1460 __m128i line1, line2, line3, line4, line5, line6, line7, line8; 1461 1462 Alpha_8x16 = _mm_set1_epi16(alpha); 1463 Beta_8x16 = _mm_set1_epi16(beta); 1464 1465 line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd)); 1466 line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd)); 1467 line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd)); 1468 line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd)); 1469 
line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd)); 1470 line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd)); 1471 line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd)); 1472 line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd)); 1473 1474 temp1 = _mm_unpacklo_epi8(line1, line2); 1475 temp2 = _mm_unpacklo_epi8(line3, line4); 1476 temp3 = _mm_unpacklo_epi8(line5, line6); 1477 temp4 = _mm_unpacklo_epi8(line7, line8); 1478 1479 line1 = _mm_unpacklo_epi16(temp1, temp2); 1480 line2 = _mm_unpackhi_epi16(temp1, temp2); 1481 line3 = _mm_unpacklo_epi16(temp3, temp4); 1482 line4 = _mm_unpackhi_epi16(temp3, temp4); 1483 1484 p1_8x16 = _mm_unpacklo_epi32(line1, line3); 1485 p0_8x16 = _mm_unpackhi_epi32(line1, line3); 1486 q0_8x16 = _mm_unpacklo_epi32(line2, line4); 1487 q1_8x16 = _mm_unpackhi_epi32(line2, line4); 1488 1489 p3_16x8 = _mm_unpacklo_epi64(p1_8x16, zero); 1490 p2_16x8 = _mm_unpackhi_epi64(p1_8x16, zero); 1491 q2_16x8 = _mm_unpacklo_epi64(q1_8x16, zero); 1492 q3_16x8 = _mm_unpackhi_epi64(q1_8x16, zero); 1493 p1_16x8 = _mm_unpacklo_epi64(p0_8x16, zero); 1494 p0_16x8 = _mm_unpackhi_epi64(p0_8x16, zero); 1495 q0_16x8 = _mm_unpacklo_epi64(q0_8x16, zero); 1496 q1_16x8 = _mm_unpackhi_epi64(q0_8x16, zero); 1497 1498 //Cond1 (ABS(p0 - q0) < alpha) 1499 temp1 = _mm_subs_epu8(q0_16x8, p0_16x8); 1500 temp2 = _mm_subs_epu8(p0_16x8, q0_16x8); 1501 temp1 = _mm_add_epi8(temp1, temp2); 1502 1503 temp2 = _mm_unpacklo_epi8(temp1, zero); 1504 temp1 = _mm_unpackhi_epi8(temp1, zero); 1505 1506 temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2); 1507 temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1); 1508 1509 flag1_16x8 = _mm_packs_epi16(temp2, temp1); 1510 1511 //Cond2 (ABS(q1 - q0) < beta) 1512 temp1 = _mm_subs_epu8(q0_16x8, q1_16x8); 1513 temp2 = _mm_subs_epu8(q1_16x8, q0_16x8); 1514 temp1 = _mm_add_epi8(temp1, temp2); 1515 1516 temp2 = _mm_unpacklo_epi8(temp1, zero); 1517 temp1 = _mm_unpackhi_epi8(temp1, zero); 1518 1519 temp2 = 
_mm_cmpgt_epi16(Beta_8x16, temp2); 1520 temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1); 1521 1522 flag2_16x8 = _mm_packs_epi16(temp2, temp1); 1523 1524 flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8); 1525 1526 //Cond3 (ABS(p1 - p0) < beta) 1527 temp1 = _mm_subs_epu8(p0_16x8, p1_16x8); 1528 temp2 = _mm_subs_epu8(p1_16x8, p0_16x8); 1529 temp1 = _mm_add_epi8(temp1, temp2); 1530 1531 temp2 = _mm_unpacklo_epi8(temp1, zero); 1532 temp1 = _mm_unpackhi_epi8(temp1, zero); 1533 1534 temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2); 1535 temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1); 1536 1537 flag2_16x8 = _mm_packs_epi16(temp2, temp1); 1538 1539 // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta)) 1540 flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8); 1541 1542 // (ABS(p0 - q0) < ((alpha >> 2) + 2)) 1543 temp1 = _mm_subs_epu8(p0_16x8, q0_16x8); 1544 temp2 = _mm_subs_epu8(q0_16x8, p0_16x8); 1545 temp1 = _mm_add_epi8(temp1, temp2); 1546 Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2); 1547 Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8); 1548 1549 temp2 = _mm_unpacklo_epi8(temp1, zero); 1550 temp1 = _mm_unpackhi_epi8(temp1, zero); 1551 temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2); 1552 temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1); 1553 1554 flag2_16x8 = _mm_packs_epi16(temp2, temp1); 1555 flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8); 1556 1557 // (ABS(p2 - p0) < beta) 1558 temp1 = _mm_subs_epu8(p0_16x8, p2_16x8); 1559 temp2 = _mm_subs_epu8(p2_16x8, p0_16x8); 1560 temp1 = _mm_add_epi8(temp1, temp2); 1561 1562 temp2 = _mm_unpacklo_epi8(temp1, zero); 1563 temp1 = _mm_unpackhi_epi8(temp1, zero); 1564 temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2); 1565 temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1); 1566 1567 flag3_16x8 = _mm_packs_epi16(temp2, temp1); 1568 flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8); 1569 1570 // (ABS(q2 - q0) < beta) 1571 temp1 = _mm_subs_epu8(q0_16x8, q2_16x8); 1572 temp2 = _mm_subs_epu8(q2_16x8, q0_16x8); 1573 temp1 = _mm_add_epi8(temp1, 
temp2); 1574 1575 temp2 = _mm_unpacklo_epi8(temp1, zero); 1576 temp1 = _mm_unpackhi_epi8(temp1, zero); 1577 temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2); 1578 temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1); 1579 1580 flag4_16x8 = _mm_packs_epi16(temp2, temp1); 1581 flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8); 1582 1583 // First 8 pixels 1584 p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero); 1585 p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero); 1586 p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero); 1587 p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero); 1588 q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero); 1589 q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero); 1590 q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero); 1591 q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero); 1592 1593 // p0_1 and q0_1 1594 temp1 = _mm_add_epi16(p0_8x16, q1_8x16); 1595 temp2 = _mm_add_epi16(p1_8x16, q0_8x16); 1596 temp5 = _mm_add_epi16(temp1, const_val2_16x8); 1597 temp6 = _mm_add_epi16(temp2, const_val2_16x8); 1598 temp3 = _mm_slli_epi16(p1_8x16, 1); 1599 temp4 = _mm_slli_epi16(q1_8x16, 1); 1600 temp1 = _mm_add_epi16(temp5, temp3); 1601 temp2 = _mm_add_epi16(temp6, temp4); 1602 p0_16x8_1 = _mm_srai_epi16(temp1, 2); 1603 q0_16x8_1 = _mm_srai_epi16(temp2, 2); 1604 1605 // p1_2 and q1_2 1606 temp6 = _mm_add_epi16(temp6, p0_8x16); 1607 temp5 = _mm_add_epi16(temp5, q0_8x16); 1608 temp1 = _mm_add_epi16(temp6, p2_8x16); 1609 temp2 = _mm_add_epi16(temp5, q2_8x16); 1610 p1_16x8_2 = _mm_srai_epi16(temp1, 2); 1611 q1_16x8_2 = _mm_srai_epi16(temp2, 2); 1612 1613 // p0_2 and q0_2 1614 temp1 = _mm_add_epi16(temp3, p2_8x16); 1615 temp2 = _mm_add_epi16(temp4, q2_8x16); 1616 temp1 = _mm_add_epi16(temp1, q1_8x16); 1617 temp2 = _mm_add_epi16(temp2, p1_8x16); 1618 temp3 = _mm_add_epi16(p0_8x16, q0_8x16); 1619 temp3 = _mm_slli_epi16(temp3, 1); 1620 temp1 = _mm_add_epi16(temp1, temp3); 1621 temp2 = _mm_add_epi16(temp2, temp3); 1622 temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4)); 1623 temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4)); 1624 p0_16x8_2 = 
_mm_srai_epi16(temp1, 3); 1625 q0_16x8_2 = _mm_srai_epi16(temp2, 3); 1626 1627 // p2_2 and q2_2 1628 temp1 = _mm_add_epi16(temp6, const_val2_16x8); 1629 temp2 = _mm_add_epi16(temp5, const_val2_16x8); 1630 temp3 = _mm_slli_epi16(p2_8x16, 1); 1631 temp4 = _mm_slli_epi16(q2_8x16, 1); 1632 temp3 = _mm_add_epi16(p2_8x16, temp3); 1633 temp4 = _mm_add_epi16(q2_8x16, temp4); 1634 temp5 = _mm_slli_epi16(p3_8x16, 1); 1635 temp6 = _mm_slli_epi16(q3_8x16, 1); 1636 temp1 = _mm_add_epi16(temp1, temp3); 1637 temp2 = _mm_add_epi16(temp2, temp4); 1638 temp1 = _mm_add_epi16(temp1, temp5); 1639 temp2 = _mm_add_epi16(temp2, temp6); 1640 p2_16x8_2 = _mm_srai_epi16(temp1, 3); 1641 q2_16x8_2 = _mm_srai_epi16(temp2, 3); 1642 1643 // p0_1 and q0_1 1644 p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, zero); 1645 q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, zero); 1646 1647 // p1_2 and q1_2 1648 p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, zero); 1649 q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, zero); 1650 1651 // p0_2 and q0_2 1652 p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, zero); 1653 q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, zero); 1654 1655 // p2_2 and q2_2 1656 p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, zero); 1657 q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, zero); 1658 1659 // p0 and q0 1660 p0_16x8 = _mm_and_si128(p0_16x8, 1661 _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF))); 1662 p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8); 1663 p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1); 1664 q0_16x8 = _mm_and_si128(q0_16x8, 1665 _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF))); 1666 q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8); 1667 q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1); 1668 1669 // p0 and q0 1670 p0_16x8 = _mm_and_si128(p0_16x8, 1671 _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF))); 1672 p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8); 1673 p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2); 1674 q0_16x8 = _mm_and_si128(q0_16x8, 1675 _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF))); 1676 q0_16x8_2 = _mm_and_si128(q0_16x8_2, 
flag4_16x8); 1677 q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2); 1678 1679 // p1 and q1 1680 p1_16x8 = _mm_and_si128(p1_16x8, 1681 _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF))); 1682 p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8); 1683 p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2); 1684 q1_16x8 = _mm_and_si128(q1_16x8, 1685 _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF))); 1686 q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8); 1687 q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2); 1688 1689 // p2 and q2 1690 p2_16x8 = _mm_and_si128(p2_16x8, 1691 _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF))); 1692 p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8); 1693 p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2); 1694 q2_16x8 = _mm_and_si128(q2_16x8, 1695 _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF))); 1696 q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8); 1697 q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2); 1698 1699 temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8); 1700 temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8); 1701 temp3 = _mm_unpacklo_epi8(q0_16x8, q1_16x8); 1702 temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8); 1703 1704 p3_8x16 = _mm_unpacklo_epi16(temp1, temp2); 1705 p2_8x16 = _mm_unpackhi_epi16(temp1, temp2); 1706 q2_8x16 = _mm_unpacklo_epi16(temp3, temp4); 1707 q3_8x16 = _mm_unpackhi_epi16(temp3, temp4); 1708 1709 line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16); 1710 line2 = _mm_srli_si128(line1, 8); 1711 line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16); 1712 line4 = _mm_srli_si128(line3, 8); 1713 line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16); 1714 line6 = _mm_srli_si128(line5, 8); 1715 line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16); 1716 line8 = _mm_srli_si128(line7, 8); 1717 1718 _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1); 1719 _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2); 1720 _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3); 1721 _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4); 1722 _mm_storel_epi64((__m128i *)(pu1_src - 4 + 
4 * src_strd), line5); 1723 _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6); 1724 _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7); 1725 _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8); 1726 1727 } 1728 1729 /*****************************************************************************/ 1730 /* */ 1731 /* Function Name : ih264_deblk_luma_vert_bslt4_mbaff_ssse3() */ 1732 /* */ 1733 /* Description : This function performs filtering of a luma block */ 1734 /* vertical edge when boundary strength is less than 4. */ 1735 /* */ 1736 /* Inputs : pu1_src - pointer to the src sample q0 */ 1737 /* src_strd - source stride */ 1738 /* alpha - alpha value for the boundary */ 1739 /* beta - beta value for the boundary */ 1740 /* u4_bs - packed Boundary strength array */ 1741 /* pu1_cliptab - tc0_table */ 1742 /* */ 1743 /* Globals : None */ 1744 /* */ 1745 /* Processing : When the function is called twice, this operation is as */ 1746 /* described in Sec. 
                        8.7.2.3 under the title "Filtering           */
/*                     process for edges for bS less than 4" in ITU T Rec H.264.*/
/*                                                                           */
/*  Outputs           : None                                                 */
/*                                                                           */
/*  Returns           : None                                                 */
/*                                                                           */
/*  Issues            : None                                                 */
/*                                                                           */
/*  Revision History: */
/*                                                                           */
/*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
/*         12 02 2015   Naveen Kumar P  Initial version                      */
/*                                                                           */
/*****************************************************************************/
void ih264_deblk_luma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src,
                                             WORD32 src_strd,
                                             WORD32 alpha,
                                             WORD32 beta,
                                             UWORD32 u4_bs,
                                             const UWORD8 *pu1_cliptab)
{
    __m128i zero = _mm_setzero_si128();
    __m128i bs_flag_16x8b, C0_16x8, C0_8x16, C_8x16;
    __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
    __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
    __m128i temp1, temp2, temp3, temp4;
    __m128i Alpha_8x16, Beta_8x16, flag1_16x8, flag2_16x8, flag3_16x8;
    __m128i in_macro_16x8;
    __m128i const_val4_8x16;
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
    UWORD8 clip0, clip1, clip2, clip3;
    __m128i line1, line2, line3, line4, line5, line6, line7, line8;
    __m128i q0_16x8_1, q1_16x8_1, q0_16x8_2;
    __m128i p0_16x8_1, p1_16x8_1, p0_16x8_2;

    /* Load the 8 rows of the edge: each row holds p3..p0 q0..q3 (8 bytes),
       with pu1_src pointing at q0, hence the -4 offset. */
    line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
    line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
    line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
    line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
    line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
    line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
    line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
    line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));

    /* 8x8 byte transpose: convert row layout to one register per pixel
       column (p3..q3), so the vertical edge can be filtered with
       horizontal SIMD operations. */
    temp1 = _mm_unpacklo_epi8(line1, line2);
    temp2 = _mm_unpacklo_epi8(line3, line4);
    temp3 = _mm_unpacklo_epi8(line5, line6);
    temp4 = _mm_unpacklo_epi8(line7, line8);

    line1 = _mm_unpacklo_epi16(temp1, temp2);
    line2 = _mm_unpackhi_epi16(temp1, temp2);
    line3 = _mm_unpacklo_epi16(temp3, temp4);
    line4 = _mm_unpackhi_epi16(temp3, temp4);

    temp1 = _mm_unpacklo_epi32(line1, line3);
    temp2 = _mm_unpackhi_epi32(line1, line3);
    temp3 = _mm_unpacklo_epi32(line2, line4);
    temp4 = _mm_unpackhi_epi32(line2, line4);

    /* Only the low 8 bytes of each register are meaningful (8 rows). */
    p3_16x8 = _mm_unpacklo_epi64(temp1, zero);
    p2_16x8 = _mm_unpackhi_epi64(temp1, zero);
    q2_16x8 = _mm_unpacklo_epi64(temp4, zero);
    q3_16x8 = _mm_unpackhi_epi64(temp4, zero);
    p1_16x8 = _mm_unpacklo_epi64(temp2, zero);
    p0_16x8 = _mm_unpackhi_epi64(temp2, zero);
    q0_16x8 = _mm_unpacklo_epi64(temp3, zero);
    q1_16x8 = _mm_unpackhi_epi64(temp3, zero);

    /* Unpack the four boundary-strength values (MSB first) and look up the
       corresponding tc0 clip thresholds. */
    u1_Bs0 = (u4_bs >> 24) & 0xff;
    u1_Bs1 = (u4_bs >> 16) & 0xff;
    u1_Bs2 = (u4_bs >> 8) & 0xff;
    u1_Bs3 = (u4_bs >> 0) & 0xff;
    clip0 = pu1_cliptab[u1_Bs0];
    clip1 = pu1_cliptab[u1_Bs1];
    clip2 = pu1_cliptab[u1_Bs2];
    clip3 = pu1_cliptab[u1_Bs3];

    Alpha_8x16 = _mm_set1_epi16(alpha);
    Beta_8x16 = _mm_set1_epi16(beta);

    /* Each Bs/clip value covers two consecutive rows of this 8-row edge. */
    bs_flag_16x8b = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, u1_Bs3, u1_Bs3, u1_Bs2,
                                 u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs0, u1_Bs0);

    C0_16x8 = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, clip3, clip3, clip2, clip2,
                           clip1, clip1, clip0, clip0);

    /* Mask of rows with Bs != 0 (rows with Bs == 0 are left untouched). */
    bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero);
    bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask
    C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero); /* widen tc0 to 16-bit lanes */

    //Cond1 (ABS(p0 - q0) < alpha)
    /* |p0 - q0| via two saturating subtractions (one of them is zero). */
    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);

    flag1_16x8 = _mm_packs_epi16(temp2, zero);
    flag1_16x8 = _mm_and_si128(flag1_16x8, bs_flag_16x8b);

    //Cond2 (ABS(q1 - q0) < beta)
    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);

    flag2_16x8 = _mm_packs_epi16(temp2, zero);
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    //Cond3 (ABS(p1 - p0) < beta)
    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);

    flag2_16x8 = _mm_packs_epi16(temp2, zero);

    /* flag1 = Bs != 0 AND (ABS(p0 - q0) < alpha)
                       AND (ABS(q1 - q0) < beta)
                       AND (ABS(p1 - p0) < beta)
       i.e. the overall per-row "filter this edge" condition. */
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    // Ap = (ABS(p2 - p0) < beta) : enables p1 filtering and extends tc
    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);

    flag2_16x8 = _mm_packs_epi16(temp2, zero);
    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8); /* flag2: filter p1 */

    /* cmpgt gives 0/-1 per lane; subtracting from zero turns it into 0/+1,
       so C = C0 + (Ap < beta) per the spec's tc derivation. */
    temp2 = _mm_subs_epi16(zero, temp2);

    C_8x16 = _mm_add_epi16(C0_8x16, temp2);

    // Aq = (ABS(q2 - q0) < beta) : enables q1 filtering and extends tc
    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);

    flag3_16x8 = _mm_packs_epi16(temp2, zero);
    flag3_16x8 = _mm_and_si128(flag1_16x8, flag3_16x8); /* flag3: filter q1 */

    temp2 = _mm_subs_epi16(zero, temp2);

    C_8x16 = _mm_add_epi16(C_8x16, temp2); /* tc = C0 + Ap + Aq */

    /* delta = ((q0 - p0) << 2 + (p1 - q1) + 4) >> 3, computed in 16 bits. */
    const_val4_8x16 = _mm_set1_epi16(4);
    temp1 = _mm_subs_epi16(_mm_unpacklo_epi8(q0_16x8, zero),
                           _mm_unpacklo_epi8(p0_16x8, zero));
    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p1_16x8, zero),
                           _mm_unpacklo_epi8(q1_16x8, zero));
    temp1 = _mm_slli_epi16(temp1, 2);
    temp1 = _mm_add_epi16(temp1, temp2);
    temp1 = _mm_add_epi16(temp1, const_val4_8x16);
    in_macro_16x8 = _mm_srai_epi16(temp1, 3);

    /* CLIP3(-tc, tc, delta) via min with +tc then max with -tc. */
    in_macro_16x8 = _mm_min_epi16(C_8x16, in_macro_16x8); //CLIP3
    C_8x16 = _mm_subs_epi16(zero, C_8x16);
    in_macro_16x8 = _mm_max_epi16(C_8x16, in_macro_16x8); //CLIP3

    // p0' = clip(p0 + delta), applied only on rows selected by flag1
    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p0_16x8, zero), in_macro_16x8);

    temp1 = _mm_packus_epi16(temp1, zero);

    p0_16x8_1 = _mm_and_si128(temp1, flag1_16x8);
    p0_16x8_2 = _mm_and_si128(
        p0_16x8, _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));

    p0_16x8_1 = _mm_add_epi8(p0_16x8_1, p0_16x8_2);

    // q0' = clip(q0 - delta), applied only on rows selected by flag1
    temp1 = _mm_sub_epi16(_mm_unpacklo_epi8(q0_16x8, zero), in_macro_16x8);

    temp1 = _mm_packus_epi16(temp1, zero);

    q0_16x8_1 = _mm_and_si128(temp1, flag1_16x8);
    q0_16x8_2 = _mm_and_si128(
        q0_16x8, _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));

    q0_16x8_1 = _mm_add_epi8(q0_16x8_1, q0_16x8_2);

    //if(Ap < Beta) : p1' = p1 + CLIP3(-C0, C0, (p2 + ((p0+q0+1)>>1) - 2*p1) >> 1)
    /* avg_epu16 computes (p0 + q0 + 1) >> 1 exactly as the spec requires. */
    temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
                          _mm_unpacklo_epi8(p0_16x8, zero));
    temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(p1_16x8, zero), 1);
    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p2_16x8, zero), temp2);
    temp2 = _mm_add_epi16(temp1, temp2);
    in_macro_16x8 = _mm_srai_epi16(temp2, 1);

    in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
    C0_8x16 = _mm_subs_epi16(zero, C0_8x16); /* C0_8x16 now holds -C0 */
    in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3

    // p1, selected by flag2 (Ap condition)
    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p1_16x8, zero), in_macro_16x8);

    temp1 = _mm_packus_epi16(temp1, zero);

    p1_16x8_1 = _mm_and_si128(temp1, flag2_16x8);
    p1_16x8 = _mm_and_si128(p1_16x8,
                            _mm_xor_si128(flag2_16x8, _mm_set1_epi16(0xFFFF)));
    p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_1);

    //if(Aq < Beta) : q1' = q1 + CLIP3(-C0, C0, (q2 + ((p0+q0+1)>>1) - 2*q1) >> 1)
    temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
                          _mm_unpacklo_epi8(p0_16x8, zero));
    temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(q1_16x8, zero), 1);
    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(q2_16x8, zero), temp2);
    temp2 = _mm_add_epi16(temp1, temp2);
    in_macro_16x8 = _mm_srai_epi16(temp2, 1);

    /* C0_8x16 still holds -C0 from the p1 clip above, so the max/min order
       is deliberately reversed here: max with -C0 first, negate back to +C0,
       then min. Same CLIP3(-C0, C0, .) result. */
    in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
    C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
    in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3

    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(q1_16x8, zero), in_macro_16x8);

    // q1, selected by flag3 (Aq condition)
    temp1 = _mm_packus_epi16(temp1, zero);

    q1_16x8_1 = _mm_and_si128(temp1, flag3_16x8);
    q1_16x8 = _mm_and_si128(q1_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi16(0xFFFF)));
    q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_1);

    /* Transpose back from column registers (p3..q3, with the filtered
       p0/q0 in p0_16x8_1/q0_16x8_1) to 8-byte rows. */
    temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
    temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8_1);
    temp3 = _mm_unpacklo_epi8(q0_16x8_1, q1_16x8);
    temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);

    line7 = _mm_unpacklo_epi16(temp1, temp2);
    temp1 = _mm_unpackhi_epi16(temp1, temp2);
    line8 = _mm_unpacklo_epi16(temp3, temp4);
    temp2 = _mm_unpackhi_epi16(temp3, temp4);

    line1 = _mm_unpacklo_epi32(line7, line8);
    line2 = _mm_srli_si128(line1, 8);
    line3 = _mm_unpackhi_epi32(line7, line8);
    line4 = _mm_srli_si128(line3, 8);
    line5 = _mm_unpacklo_epi32(temp1, temp2);
    line6 = _mm_srli_si128(line5, 8);
    line7 = _mm_unpackhi_epi32(temp1, temp2);
    line8 = _mm_srli_si128(line7, 8);

    /* Store the 8 filtered rows back in place. */
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);
}