/******************************************************************************
 *
 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/
/**
 *******************************************************************************
 * @file
 *  ihevc_sao_atom_intr.c
 *
 * @brief
 *  Contains function definitions for Sample adaptive offset(SAO) used in-loop
 *  filtering
 *
 * @author
 *  100592
 *
 * @par List of Functions:
 *   - ihevc_sao_band_offset_luma_ssse3()
 *   - ihevc_sao_band_offset_chroma_ssse3()
 *   - ihevc_sao_edge_offset_class0_ssse3()
 *   - ihevc_sao_edge_offset_class0_chroma_ssse3()
 *   - ihevc_sao_edge_offset_class1_ssse3()
 *   - ihevc_sao_edge_offset_class1_chroma_ssse3()
 *   - ihevc_sao_edge_offset_class2_ssse3()
 *   - ihevc_sao_edge_offset_class2_chroma_ssse3()
 *   - ihevc_sao_edge_offset_class3_ssse3()
 *   - ihevc_sao_edge_offset_class3_chroma_ssse3()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include <stdio.h>

#include "ihevc_typedefs.h"
#include "ihevc_platform_macros.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_defs.h"
#include "ihevc_tables_x86_intr.h"
#include "ihevc_common_tables.h"
#include "ihevc_sao.h"

#include <immintrin.h>

#define NUM_BAND_TABLE  32
/**
 *******************************************************************************
 *
 * @brief
 *  Contains two sets of functions, band offset and edge offset, for both luma
 *  and chroma. Edge offset supports the horizontal, vertical, 135-degree and
 *  45-degree classes.
 *
 * @par Description:
 *
 *
 * @param[in-out] pu1_src
 *  Pointer to the source
 *
 * @param[in] src_strd
 *  Source stride
 *
 * @param[in-out] pu1_src_left
 *  Source left boundary
 *
 * @param[in-out] pu1_src_top
 *  Source top boundary
 *
 * @param[in-out] pu1_src_top_left
 *  Source top left boundary
 *
 * @param[in] pu1_src_top_right
 *  Source top right boundary
 *
 * @param[in] pu1_src_bot_left
 *  Source bottom left boundary
 *
 * @param[in] pu1_avail
 *  Boundary availability flags
 *
 * @param[in] pi1_sao_offset_u
 *  Chroma U SAO offset values
 *
 * @param[in] pi1_sao_offset_v
 *  Chroma V SAO offset values
 *
 * @param[in] pi1_sao_offset
 *  Luma SAO offset values
 *
 * @param[in] wd
 *  Width of the source
 *
 * @param[in] ht
 *  Height of the source
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
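/* Illustrative scalar equivalent of ihevc_sao_band_offset_luma_ssse3() below;
 * a minimal sketch for documentation only, not called by the decoder and not
 * part of the original file. Assuming 8-bit pixels, each pixel falls in one of
 * 32 bands (pixel >> 3); the four consecutive bands starting at sao_band_pos
 * (wrapping modulo 32) receive offsets pi1_sao_offset[1..4], and the result is
 * clipped to [0, 255]. The boundary-array updates done by the SIMD version are
 * omitted here. */
static void ihevc_sao_band_offset_luma_scalar_sketch(UWORD8 *pu1_src,
                                                     WORD32 src_strd,
                                                     WORD32 sao_band_pos,
                                                     WORD8 *pi1_sao_offset,
                                                     WORD32 wd,
                                                     WORD32 ht)
{
    WORD32 row, col;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            WORD32 pel = pu1_src[row * src_strd + col];
            //distance of this pixel's band from the start of the offset window
            WORD32 diff = ((pel >> 3) - sao_band_pos) & (NUM_BAND_TABLE - 1);
            if(diff < 4)
            {
                pel += pi1_sao_offset[diff + 1];
                pu1_src[row * src_strd + col] = (UWORD8)((pel < 0) ? 0 : ((pel > 255) ? 255 : pel));
            }
        }
    }
}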
void ihevc_sao_band_offset_luma_ssse3(UWORD8 *pu1_src,
                                      WORD32 src_strd,
                                      UWORD8 *pu1_src_left,
                                      UWORD8 *pu1_src_top,
                                      UWORD8 *pu1_src_top_left,
                                      WORD32 sao_band_pos,
                                      WORD8 *pi1_sao_offset,
                                      WORD32 wd,
                                      WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_cpy;
    WORD32 wd_rem;
    WORD8 offset = 0;

    __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
    __m128i band_table0_8x16b, band_table1_8x16b, band_table2_8x16b, band_table3_8x16b;
    __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
    __m128i band_pos_16x8b;
    __m128i sao_offset;
    __m128i cmp_mask, cmp_store;

    /* Updating left and top-left and top */
    for(row = 0; row < ht; row++)
    {
        pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
    }
    pu1_src_top_left[0] = pu1_src_top[wd - 1];
    for(col = 0; col < wd; col += 8)
    {
        tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset));
        _mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1);
        offset += 8;
    }

    //replicating sao_band_pos as 8 bit value 16 times
    band_pos_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos << 3));
    //value set for sao_offset extraction
    tmp_set_128i_1 = _mm_set_epi8(128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1);
    tmp_set_128i_2 = _mm_set_epi8(128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2);
    tmp_set_128i_3 = _mm_set_epi8(128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3);
    tmp_set_128i_4 = _mm_set_epi8(128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4);

    //loaded sao offset values
    sao_offset = _mm_loadl_epi64((__m128i *)pi1_sao_offset);

    //loading 32 16-bit values of gu2_table_band_idx consecutively in 4 registers
    band_table0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
    band_table1_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
    band_table2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
    band_table3_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));

    //band_position addition
    band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, band_pos_16x8b);
    band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, band_pos_16x8b);
    band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, band_pos_16x8b);
    band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, band_pos_16x8b);
    //sao_offset duplication
    tmp_set_128i_1 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
    tmp_set_128i_2 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
    tmp_set_128i_3 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
    tmp_set_128i_4 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
    //setting for comparison
    cmp_mask = _mm_set1_epi16(16);
    cmp_store = _mm_set1_epi16(0x00ff);

    //sao_offset addition
    band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, tmp_set_128i_1);
    band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, tmp_set_128i_2);
    band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, tmp_set_128i_3);
    band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, tmp_set_128i_4);
    //masking upper 8bit values of each 16 bit band table value
    band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
    band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
    band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
    band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);
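    /* The four-band offset window may wrap around band 31: for example,
     * sao_band_pos = 30 covers bands 30, 31, 0 and 1. The switch below
     * patches the affected quarters of the band table for the wrap
     * positions (and for position 0), using the compare-against-16 mask to
     * pin the wrapped entries, so that the later byte-wise lookup still
     * maps out-of-window pixels to "no offset". Positions 1 to 27 keep the
     * window fully inside the table and need no fix-up (default case). */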
    switch(sao_band_pos)
    {
        case 0:
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
            band_table0_8x16b = _mm_and_si128(band_table0_8x16b, tmp_set_128i_2);
            break;
        case 28:
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
            band_table3_8x16b = _mm_or_si128(band_table3_8x16b, tmp_set_128i_2);
            break;
        case 29:
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
            band_table2_8x16b = _mm_or_si128(band_table2_8x16b, tmp_set_128i_2);
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
            band_table3_8x16b = _mm_and_si128(band_table3_8x16b, tmp_set_128i_2);
            break;
        case 30:
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
            band_table1_8x16b = _mm_or_si128(band_table1_8x16b, tmp_set_128i_2);
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
            band_table2_8x16b = _mm_and_si128(band_table2_8x16b, tmp_set_128i_2);
            break;
        case 31:
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
            band_table0_8x16b = _mm_or_si128(band_table0_8x16b, tmp_set_128i_2);
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
            band_table1_8x16b = _mm_and_si128(band_table1_8x16b, tmp_set_128i_2);
            break;
        default:
            break;
    }
    //sao_offset is reused for zero cmp mask.
    sao_offset = _mm_setzero_si128();
    tmp_set_128i_1 = _mm_set1_epi8(1);
    //tmp_set_128i_2 = _mm_set_epi8(128, 7, 128, 6, 128, 5, 128, 4, 128, 3, 128, 2, 128, 1, 128, 0);
    cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk=dup16(16);

    //masking upper 8bit values of each 16 bit band table value
    band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
    band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
    band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
    band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);

    //band table 8x16 four registers are packed into 16x8 two registers: band_table0_8x16b and band_table2_8x16b
    band_table0_8x16b = _mm_packus_epi16(band_table0_8x16b, band_table1_8x16b);
    band_table2_8x16b = _mm_packus_epi16(band_table2_8x16b, band_table3_8x16b);

    band_table3_8x16b = _mm_slli_epi16(cmp_mask, 1); // to compare if value is greater than 31
    band_pos_16x8b = _mm_packus_epi16(band_pos_16x8b, band_pos_16x8b); //band_pos is now 8 bit aligned
    band_table3_8x16b = _mm_sub_epi8(band_table3_8x16b, tmp_set_128i_1); // to compare if value is greater than 31

    cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1);
    // band_pos_16x8b = _mm_or_si128(band_pos_16x8b, cmp_store);

    for(col = wd; col >= 16; col -= 16)
    {
        pu1_src_cpy = pu1_src;
        for(row = ht; row > 0; row -= 2)
        {
            //row = 0 load 16 pixel values from 15:0 pos. relative to cur. pos.
            src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
            // row = 1
            src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));

            //saturated subtract 8 bit
            tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
            tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
            //if the values less than 0 put ff
            tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
            tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
            //if the values greater than 31 put ff
            tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
            tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);

            //row 0
            //if the values >15 then put ff, cmp_mask = dup16(15)
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
            //values 16 to 31 for row 0 but values <16 ==0
            tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
            // values 0 to 15 for row 0
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
            //values 16 to 31 for row 0 but values <16 masked to ff
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
            tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
            //row 1
            //if the values >15 then put ff, cmp_mask = dup16(15)
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
            //values 16 to 31 for row 1 but values <16 ==0
            tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
            // values 0 to 15 for row 1
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
            //values 16 to 31 for row 1 but values <16 masked to ff
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
            tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);

            //row 0
            //to preserve pixel values in which no offset needs to be added.
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
            src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);

            //row 1
            //to preserve pixel values in which no offset needs to be added.
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
            src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);

            //indexing 0 - 15 band table indexes
            tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
            tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
            tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
            tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
            // combining all offsets results
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
            // combining results with the pixel values
            src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
            src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);

            //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
            _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
            // row = 1
            _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp2_8x16b);

            pu1_src_cpy += (src_strd << 1);
        }
        pu1_src += 16;
    }
    wd_rem = wd & 0xF;
    if(wd_rem)
    {
        pu1_src_cpy = pu1_src;
        for(row = ht; row > 0; row -= 4)
        {
            //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
            src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
            // row = 1
            src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
            // row = 2
            src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
            // row = 3
            src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
            //row0 and row1 packed and row2 and row3 packed
            src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
            src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);

            //saturated subtract 8 bit
            tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
            tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
            //if the values less than 0 put ff
            tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
            tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
            //if the values greater than 31 put ff
            tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
            tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);

            //row 0 and row 1
            //if the values >15 then put ff, cmp_mask = dup16(15)
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
            //values 16 to 31 for row 0 & 1 but values <16 ==0
            tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
            // values 0 to 15 for row 0 & 1
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
            //values 16 to 31 for row 0 & 1 but values <16 masked to ff
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
            tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
            //row 2 and row 3
            //if the values >15 then put ff, cmp_mask = dup16(15)
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
            //values 16 to 31 for row 2 & 3 but values <16 ==0
            tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
            // values 0 to 15 for row 2 & 3
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
            //values 16 to 31 for row 2 & 3 but values <16 masked to ff
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
            tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);

            //row 0 and row 1
            //to preserve pixel values in which no offset needs to be added.
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
            src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);

            //row 2 and row 3
            //to preserve pixel values in which no offset needs to be added.
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
            src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);

            //indexing 0 - 15 band table indexes
            tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
            tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
            tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
            tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
            // combining all offsets results
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
            // combining results with the pixel values
            src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
            src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);

            //Getting row1 separately
            src_temp1_8x16b = _mm_srli_si128(src_temp0_8x16b, 8);
            //Getting row3 separately
            src_temp3_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);

            //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
            _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
            // row = 1
            _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp1_8x16b);
            // row = 2
            _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp2_8x16b);
            // row = 3
            _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp3_8x16b);

            pu1_src_cpy += (src_strd << 2);
        }
        pu1_src += 8;
    }
}
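/* Illustrative scalar equivalent of ihevc_sao_band_offset_chroma_ssse3()
 * below; a minimal sketch for documentation only, not called by the decoder
 * and not part of the original file. The chroma plane is interleaved, U in
 * even bytes and V in odd bytes, and each component is classified against its
 * own band position and offset table (the boundary-array updates done by the
 * SIMD version are omitted). */
static void ihevc_sao_band_offset_chroma_scalar_sketch(UWORD8 *pu1_src,
                                                       WORD32 src_strd,
                                                       WORD32 sao_band_pos_u,
                                                       WORD32 sao_band_pos_v,
                                                       WORD8 *pi1_sao_offset_u,
                                                       WORD8 *pi1_sao_offset_v,
                                                       WORD32 wd,
                                                       WORD32 ht)
{
    WORD32 row, col;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            //even bytes hold U samples, odd bytes hold V samples
            WORD32 band_pos = (col & 1) ? sao_band_pos_v : sao_band_pos_u;
            WORD8 *pi1_sao_offset = (col & 1) ? pi1_sao_offset_v : pi1_sao_offset_u;
            WORD32 pel = pu1_src[row * src_strd + col];
            WORD32 diff = ((pel >> 3) - band_pos) & (NUM_BAND_TABLE - 1);
            if(diff < 4)
            {
                pel += pi1_sao_offset[diff + 1];
                pu1_src[row * src_strd + col] = (UWORD8)((pel < 0) ? 0 : ((pel > 255) ? 255 : pel));
            }
        }
    }
}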
void ihevc_sao_band_offset_chroma_ssse3(UWORD8 *pu1_src,
                                        WORD32 src_strd,
                                        UWORD8 *pu1_src_left,
                                        UWORD8 *pu1_src_top,
                                        UWORD8 *pu1_src_top_left,
                                        WORD32 sao_band_pos_u,
                                        WORD32 sao_band_pos_v,
                                        WORD8 *pi1_sao_offset_u,
                                        WORD8 *pi1_sao_offset_v,
                                        WORD32 wd,
                                        WORD32 ht)
{
    WORD32 row, col;
    WORD8 offset = 0;

    __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
    __m128i cmp_msk2;
    __m128i band_table0_16x8b, band_table1_16x8b, band_table2_16x8b, band_table3_16x8b;
    __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
    __m128i band_pos_u_16x8b, band_pos_v_16x8b;
    __m128i sao_offset;
    __m128i cmp_mask;

    /* Updating left and top and top-left */
    for(row = 0; row < ht; row++)
    {
        pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
        pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
    }
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
    pu1_src_top_left[1] = pu1_src_top[wd - 1];
    for(col = 0; col < wd; col += 8)
    {
        tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset));
        _mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1);
        offset += 8;
    }

    { // band table creation
        __m128i temp0_8x16b, temp1_8x16b, temp2_8x16b, temp3_8x16b;
        // Band table for U component : band_table0_16x8b and band_table2_16x8b
        //replicating sao_band_pos as 8 bit value 16 times
        band_pos_u_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_u << 3));
        //value set for sao_offset extraction
        tmp_set_128i_1 = _mm_set_epi8(128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1);
        tmp_set_128i_2 = _mm_set_epi8(128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2);
        tmp_set_128i_3 = _mm_set_epi8(128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3);
        tmp_set_128i_4 = _mm_set_epi8(128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4);

        //loaded sao offset values
        sao_offset = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);

        //loading 32 16-bit values of gu2_table_band_idx consecutively in 4 registers
        band_table0_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
        band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
        band_table2_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
        band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));

        //band_position addition
        band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, band_pos_u_16x8b);
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_u_16x8b);
        band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, band_pos_u_16x8b);
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_u_16x8b);
        //sao_offset duplication
        temp0_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
        temp1_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
        temp2_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
        temp3_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);

        //sao_offset addition
        band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, temp0_8x16b);
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, temp1_8x16b);
        band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, temp2_8x16b);
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, temp3_8x16b);
        //reuse for clipping
        temp1_8x16b = _mm_set1_epi16(0x00ff);
        //setting for comparison
        cmp_mask = _mm_set1_epi16(16);

        //masking upper 8bit values of each 16 bit band table value
        band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
        band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);

        //temp1_8x16b reuse for compare storage
        switch(sao_band_pos_u)
        {
            case 0:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
                band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp3_8x16b);
                break;
            case 28:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
                band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
                break;
            case 29:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
                band_table2_16x8b = _mm_or_si128(band_table2_16x8b, temp3_8x16b);
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
                band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
                break;
            case 30:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
                band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
                band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp3_8x16b);
                break;
            case 31:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
                band_table0_16x8b = _mm_or_si128(band_table0_16x8b, temp3_8x16b);
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
                band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
                break;
            default:
                break;
        }
        //masking upper 8bit values of each 16 bit band table value
        band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
        band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
        //band table 8x16 four registers are packed into 16x8 two registers: band_table0_16x8b and band_table2_16x8b
        band_table0_16x8b = _mm_packus_epi16(band_table0_16x8b, band_table1_16x8b);
        band_table2_16x8b = _mm_packus_epi16(band_table2_16x8b, band_table3_16x8b);
        // Band table for U component over

        // Band table for V component : band_table1_16x8b and band_table3_16x8b
        //replicating sao_band_pos as 8 bit value 16 times
        band_pos_v_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_v << 3));

        //loaded sao offset values
        sao_offset = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);

        //loading 32 16-bit values of gu2_table_band_idx consecutively in 4 registers
        temp0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
        band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
        temp2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
        band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));

        //band_position addition
        temp0_8x16b = _mm_add_epi16(temp0_8x16b, band_pos_v_16x8b);
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_v_16x8b);
        temp2_8x16b = _mm_add_epi16(temp2_8x16b, band_pos_v_16x8b);
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_v_16x8b);
        //sao_offset duplication
        tmp_set_128i_1 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
        tmp_set_128i_2 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
        tmp_set_128i_3 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
        tmp_set_128i_4 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);

        //sao_offset addition
        temp0_8x16b = _mm_add_epi16(temp0_8x16b, tmp_set_128i_1);
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, tmp_set_128i_2);
        temp2_8x16b = _mm_add_epi16(temp2_8x16b, tmp_set_128i_3);
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, tmp_set_128i_4);

        //masking upper 8bit values of 16 bit band table value
        temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
        temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
        //temp1_8x16b reuse for compare storage

        switch(sao_band_pos_v)
        {
            case 0:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
                temp0_8x16b = _mm_and_si128(temp0_8x16b, temp3_8x16b);
                break;
            case 28:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
                band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
                break;
            case 29:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
                temp2_8x16b = _mm_or_si128(temp2_8x16b, temp3_8x16b);
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
                band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
                break;
            case 30:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
                band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
                temp2_8x16b = _mm_and_si128(temp2_8x16b, temp3_8x16b);
                break;
            case 31:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
                temp0_8x16b = _mm_or_si128(temp0_8x16b, temp3_8x16b);
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
                band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
                break;
            default:
                break;
        }
        //masking upper 8bit values of each 16 bit band table value
        temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
        temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
        //band table 8x16 four registers are packed into 16x8 two registers: band_table1_16x8b and band_table3_16x8b
        band_table1_16x8b = _mm_packus_epi16(temp0_8x16b, band_table1_16x8b);
        band_table3_16x8b = _mm_packus_epi16(temp2_8x16b, band_table3_16x8b);
        //band table for u and v created
    }
    {
        UWORD8 *pu1_src_cpy;
        WORD32 wd_rem;

        //sao_offset is reused for zero cmp mask.
        sao_offset = _mm_setzero_si128();
        tmp_set_128i_1 = _mm_set1_epi8(1);
        //tmp_set_128i_2 = _mm_set_epi8(128, 7, 128, 6, 128, 5, 128, 4, 128, 3, 128, 2, 128, 1, 128, 0);
        cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk=dup16(16);
        //to avoid ffff being saturated to 0; it should saturate to ff

        cmp_msk2 = _mm_slli_epi16(cmp_mask, 1); // to compare if value is greater than 31
        band_pos_u_16x8b = _mm_packus_epi16(band_pos_u_16x8b, band_pos_u_16x8b); //band_pos_u is now 8 bit aligned
        band_pos_v_16x8b = _mm_packus_epi16(band_pos_v_16x8b, band_pos_v_16x8b); //band_pos_v is now 8 bit aligned
        cmp_msk2 = _mm_sub_epi8(cmp_msk2, tmp_set_128i_1); // to compare if value is greater than 31

        cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1);

        for(col = wd; col >= 16; col -= 16)
        {
            pu1_src_cpy = pu1_src;
            for(row = ht; row > 0; row -= 2)
            {
                //row = 0 load 16 pixel values from 15:0 pos. relative to cur. pos.
                src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
                // row = 1
                src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));

                //odd values
                src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
                src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
                //even values
                src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
                src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
                src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
                src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
                //combining odd values
                src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
                //combining even values
                src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);

                //saturated subtract 8 bit
                tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
                tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
                //if the values less than 0 put ff
                tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
                tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
                //if the values greater than 31 put ff
                tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
                tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
                // registers reused to increase performance
                //if the values >15 then put ff, cmp_mask = dup16(15), U
                src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
                //if the values >15 then put ff, cmp_mask = dup16(15), V
                src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);

                //values 16 to 31 for U but values <16 ==0
                tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
                // values 0 to 15 for U
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
                //values 16 to 31 for V but values <16 ==0
                tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
                // values 0 to 15 for V
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);

                //values 16 to 31 for U but values <16 masked to ff
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
                //values 16 to 31 for V but values <16 masked to ff
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
                tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
                tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);

                //to choose which pixel values to preserve in U
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
                //to choose which pixel values to preserve in V
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
                //values of all rows to which no offset needs to be added preserved.
                src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
                src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);

                //indexing 0 - 15 band table indexes
                tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
                tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
                //indexing 16 - 31 band table indexes
                tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
                tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
                // combining all offsets results
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
                // combining results with the pixel values
                src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
                src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
                //reorganising even and odd values
                src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
                src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);

                //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
                // row = 1
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp3_8x16b);

                pu1_src_cpy += (src_strd << 1);
            }
            pu1_src += 16;
        }

        wd_rem = wd & 0xF;
        if(wd_rem)
        {
            pu1_src_cpy = pu1_src;
            for(row = ht; row > 0; row -= 4)
            {
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
                // row = 1
                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
                // row = 3
                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
                //row0 and row1 packed and row2 and row3 packed
                src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
                src_temp3_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);
                //odd values
                src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
                src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
                //even values
                src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
                src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
                src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
                src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
                //combining odd values
                src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
                //combining even values
                src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);

                //saturated subtract 8 bit
                tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
                tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
                //if the values less than 0 put ff
                tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
                tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
                //if the values greater than 31 put ff
                tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
                tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
                // registers reused to increase performance
                //if the values >15 then put ff, cmp_mask = dup16(15), U
                src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
                //if the values >15 then put ff, cmp_mask = dup16(15), V
                src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);

                //values 16 to 31 for U but values <16 ==0
                tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
                // values 0 to 15 for U
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
                //values 16 to 31 for V but values <16 ==0
                tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
                // values 0 to 15 for V
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);

                //values 16 to 31 for U but values <16 masked to ff
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
                //values 16 to 31 for V but values <16 masked to ff
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
                tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
                tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);

                //to choose which pixel values to preserve in U
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
                //to choose which pixel values to preserve in V
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
                //values of all rows to which no offset needs to be added preserved.
                src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
                src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);

                //indexing 0 - 15 band table indexes
                tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
                tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
                //indexing 16 - 31 band table indexes
                tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
                tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
                // combining all offsets results
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
                // combining results with the pixel values
                src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
                src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
                //reorganising even and odd values
                src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
                src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);
                //Getting row1 separately
                src_temp0_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
                //Getting row3 separately
                src_temp2_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);

                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp0_8x16b);
                // row = 2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp3_8x16b);
                // row = 3
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp2_8x16b);

                pu1_src_cpy += (src_strd << 2);
            }
            pu1_src += 16;
        }
    }
}
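/* Illustrative scalar equivalent of the class 0 (horizontal) edge offset
 * below; a minimal sketch for documentation only, not called by the decoder
 * and not part of the original file. Each interior pixel is compared with its
 * left and right neighbours; the raw category 2 + sign(c - left) +
 * sign(c - right) is remapped through gi1_table_edge_idx (the same lookup the
 * pshufb on edge_idx_8x16b performs) and the selected offset is added with a
 * clip to [0, 255]. Boundary columns and the left/top array updates, which
 * the SIMD version handles via pu1_avail and left_store_16x8b, are omitted. */
static void ihevc_sao_edge_offset_class0_scalar_sketch(UWORD8 *pu1_src,
                                                       WORD32 src_strd,
                                                       WORD8 *pi1_sao_offset,
                                                       WORD32 wd,
                                                       WORD32 ht)
{
    WORD32 row, col;
    for(row = 0; row < ht; row++)
    {
        for(col = 1; col < wd - 1; col++)
        {
            WORD32 c = pu1_src[row * src_strd + col];
            WORD32 diff_l = c - pu1_src[row * src_strd + col - 1];
            WORD32 diff_r = c - pu1_src[row * src_strd + col + 1];
            //sign() computed as (x > 0) - (x < 0), i.e. -1, 0 or 1
            WORD32 sign_l = (diff_l > 0) - (diff_l < 0);
            WORD32 sign_r = (diff_r > 0) - (diff_r < 0);
            WORD32 edge_idx = gi1_table_edge_idx[2 + sign_l + sign_r];
            WORD32 pel = c + pi1_sao_offset[edge_idx];
            pu1_src[row * src_strd + col] = (UWORD8)((pel < 0) ? 0 : ((pel > 255) ? 255 : pel));
        }
    }
}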
void ihevc_sao_edge_offset_class0_ssse3(UWORD8 *pu1_src,
                                        WORD32 src_strd,
                                        UWORD8 *pu1_src_left,
                                        UWORD8 *pu1_src_top,
                                        UWORD8 *pu1_src_top_left,
                                        UWORD8 *pu1_src_top_right,
                                        UWORD8 *pu1_src_bot_left,
                                        UWORD8 *pu1_avail,
                                        WORD8 *pi1_sao_offset,
                                        WORD32 wd,
                                        WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp;
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
    UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
    UWORD8 u1_avail0, u1_avail1;
    WORD32 wd_rem;
    WORD32 offset = 0;
    __m128i src_temp0_16x8b, src_temp1_16x8b;
    __m128i left0_16x8b, left1_16x8b;
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b, cmp_gt1_16x8b, cmp_lt1_16x8b;
    __m128i edge0_16x8b, edge1_16x8b;
    __m128i au1_mask8x16b;
    __m128i edge_idx_8x16b, sao_offset_8x16b;
    __m128i const2_16x8b, const0_16x8b;
    __m128i left_store_16x8b;
    UNUSED(pu1_src_top_right);
    UNUSED(pu1_src_bot_left);

    au1_mask8x16b = _mm_set1_epi8(0xff);

    /* Update top and top-left arrays */
    *pu1_src_top_left = pu1_src_top[wd - 1];

    for(col = wd; col >= 16; col -= 16)
    {
        const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd));
        _mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b);
        offset += 16;
    }

    //setting availability mask to ff size MAX_CTB_SIZE
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
    for(row = 0; row < ht; row++)
    {
        au1_src_left_tmp[row] = pu1_src_left[row];
    }
    edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);

    //availability mask creation
    u1_avail0 = pu1_avail[0];
    u1_avail1 = pu1_avail[1];
    au1_mask[0] = u1_avail0;
    au1_mask[wd - 1] = u1_avail1;

    const2_16x8b = _mm_set1_epi8(2);
    const0_16x8b = _mm_setzero_si128();
    pu1_src_left_cpy = au1_src_left_tmp;
    pu1_src_left_str = au1_src_left_tmp1;
    {
        au1_mask_cpy = au1_mask;
        for(col = wd; col >= 16; col -= 16)
        {
            pu1_src_cpy = pu1_src;
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
            //pu1_src_left_cpy = au1_src_left_tmp;
            for(row = ht; row > 0; row -= 2)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
                //row = 0 load 16 pixel values from 15:0 pos. relative to cur. pos.
                src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
                // row = 1
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));

                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 2);
                //row 1 left
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
                //row 0 left
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);

                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
                cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
                //combining the appropriate sign change
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);

                //row = 0 right
                edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1));
                // row = 1 right
                edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
                cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);

                //combining sign_left and sign_right
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //convert to 16 bit then add and then saturated pack
                left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);

                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);

                pu1_src_cpy += (src_strd << 1);
                pu1_src_left_cpy += 2;
                pu1_src_left_str += 2;
            }
            au1_mask_cpy += 16;
            pu1_src += 16;
            pu1_src_left_cpy -= ht;
            pu1_src_left_str -= ht;

            pu1_left_tmp = pu1_src_left_cpy;
            pu1_src_left_cpy = pu1_src_left_str;
            pu1_src_left_str = pu1_left_tmp;
        }

        wd_rem = wd & 0xF;
        if(wd_rem)
        {
            cmp_gt1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd));
            _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt1_16x8b);

            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
            pu1_src_cpy = pu1_src;
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
            //pu1_src_left_cpy = au1_src_left_tmp;
            for(row = ht; row > 0; row -= 4)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
                // row = 1
                cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
                // row = 3
                cmp_gt1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));

                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4);
                //row 3 left
                edge0_16x8b = _mm_slli_si128(cmp_gt1_16x8b, 8);
                cmp_lt1_16x8b = _mm_alignr_epi8(cmp_gt1_16x8b, left_store_16x8b, 15);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
                //row 2 left
                edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
                //row 1 left
                edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8);
                cmp_lt0_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 15);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
                //row 0 left
                edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);

                // packing rows together for 16 SIMD operations
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b);
                src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_gt1_16x8b);
                // packing rows together for 16 SIMD operations
                left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, cmp_lt0_16x8b);
                left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, cmp_lt1_16x8b);

                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
                cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
                //combining the appropriate sign change
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);

                //row = 0 right
                edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 1));
                // row = 1 right
                cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 1));
                // row = 2 right
                edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 1));
                // row = 3 right
                cmp_gt1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 1));
                // packing rows together for 16 SIMD operations
                edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b);
                edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_gt1_16x8b);

                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
                cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);

                //combining sign_left and sign_right
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //convert to 16 bit then add and then saturated pack
                left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
                //separating row 1 and row 3
                cmp_lt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
                cmp_lt1_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);

                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_lt0_16x8b);
                // row = 2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b);
                // row = 3
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt1_16x8b);

                pu1_src_cpy += (src_strd << 2);
                pu1_src_left_cpy += 4;
                pu1_src_left_str += 4;
            }
            pu1_src += wd;
            pu1_src_left_cpy -= ht;
            pu1_src_left_str -= ht;

            pu1_left_tmp = pu1_src_left_cpy;
            pu1_src_left_cpy = pu1_src_left_str;
            pu1_src_left_str = pu1_left_tmp;
        }
        for(row = 0; row < ht; row++)
        {
            pu1_src_left[row] = pu1_src_left_cpy[row];
        }
    }
}
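/* In the chroma variant below, the U and V offset tables are packed into one
 * register: sao_offset_8x16b holds pi1_sao_offset_u in its low 8 bytes and
 * pi1_sao_offset_v in its high 8 bytes. chroma_offset_8x16b, 0x0800 repeated,
 * adds 0 to every even (U) byte and 8 to every odd (V) byte of the edge
 * index, so a single pshufb picks the U offset for U samples and the V offset
 * for V samples of the interleaved row. */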
void ihevc_sao_edge_offset_class0_chroma_ssse3(UWORD8 *pu1_src,
                                               WORD32 src_strd,
                                               UWORD8 *pu1_src_left,
                                               UWORD8 *pu1_src_top,
                                               UWORD8 *pu1_src_top_left,
                                               UWORD8 *pu1_src_top_right,
                                               UWORD8 *pu1_src_bot_left,
                                               UWORD8 *pu1_avail,
                                               WORD8 *pi1_sao_offset_u,
                                               WORD8 *pi1_sao_offset_v,
                                               WORD32 wd,
                                               WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp;
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
    UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
    UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)];
    UWORD8 u1_avail0, u1_avail1;
    WORD32 wd_rem;
    WORD32 offset = 0;

    __m128i src_temp0_16x8b, src_temp1_16x8b;
    __m128i left0_16x8b, left1_16x8b;
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
    __m128i edge0_16x8b, edge1_16x8b;
    __m128i au1_mask8x16b;
    __m128i edge_idx_8x16b, sao_offset_8x16b;
    __m128i const2_16x8b, const0_16x8b;
    __m128i left_store_16x8b;
    __m128i chroma_offset_8x16b;
    UNUSED(pu1_src_top_right);
    UNUSED(pu1_src_bot_left);

    au1_mask8x16b = _mm_set1_epi8(0xff);

    /* Update top and top-left arrays */
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
    pu1_src_top_left[1] = pu1_src_top[wd - 1];

    for(col = wd; col >= 16; col -= 16)
    {
        const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd));
        _mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b);
        offset += 16;
    }
    for(row = 0; row < 2 * ht; row++)
    {
        au1_src_left_tmp[row] = pu1_src_left[row];
    }
    //setting availability mask to ff size MAX_CTB_SIZE
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);

    edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
    //availability mask creation
    u1_avail0 = pu1_avail[0];
    u1_avail1 = pu1_avail[1];
    au1_mask[0] = u1_avail0;
    au1_mask[1] = u1_avail0;
    au1_mask[wd - 1] = u1_avail1;
    au1_mask[wd - 2] = u1_avail1;
    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
    const2_16x8b = _mm_set1_epi8(2);
    const0_16x8b = _mm_setzero_si128();

    {
        pu1_src_left_cpy = au1_src_left_tmp;
        pu1_src_left_str = au1_src_left_tmp1;
        au1_mask_cpy = au1_mask;
        for(col = wd; col >= 16; col -= 16)
        {
            pu1_src_cpy = pu1_src;
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);

            for(row = ht; row > 0; row -= 2)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
                //row = 0 load 16 pixel values from 15:0 pos. relative to cur. pos.
                src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
                // row = 1
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));

                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4);
                //row 1 left
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
                //row 0 left
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);

                //separating +ve and -ve values. row 0 left
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //separating +ve and -ve values. row 1 left
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //row = 0 right
                edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2));
                // row = 1 right
                edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
                //separating +ve and -ve values. row 0 right
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //separating +ve and -ve values. row 1 right
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
1343 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 1344 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 1345 //combining the appropriate sign change 1346 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); 1347 1348 //combining sign-left and sign_right 1349 edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b); 1350 edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b); 1351 //adding constant 2 1352 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); 1353 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); 1354 //shuffle to get sao index 1355 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); 1356 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); 1357 //using availability mask 1358 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); 1359 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b); 1360 //adding chroma offset to access U and V 1361 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); 1362 edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b); 1363 1364 //shuffle to get sao offset 1365 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); 1366 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); 1367 //cnvert to 16 bit then add and then saturated pack 1368 left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); 1369 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); 1370 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b); 1371 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); 1372 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b); 1373 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); 1374 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b); 1375 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); 1376 1377 left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); 1378 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b); 1379 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b); 1380 src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b); 1381 edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b); 1382 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); 1383 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b); 1384 src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b); 1385 1386 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b); 1387 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. 
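        /* A note on the two _mm_shuffle_epi8() lookups a few lines above:
         * pshufb is used as a 16-entry parallel table lookup, first mapping
         * (2 + sign_left + sign_right) through gi1_table_edge_idx and then
         * fetching the per-category offset. sao_offset_8x16b holds the U
         * offsets in bytes 0..7 and the V offsets in bytes 8..15 (from the
         * _mm_unpacklo_epi64() in the setup), and chroma_offset_8x16b is the
         * byte pattern 00,08,00,08,..., so adding it bumps every V lane's
         * index by 8 and steers its lookup into the V half of the table.
         * Roughly, per interleaved pixel pair:
         *
         *     out_u = offsets[idx_u];        // even bytes, U half
         *     out_v = offsets[8 + idx_v];    // odd bytes, V half
         */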
1388 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b); 1389 // row = 1 1390 _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b); 1391 1392 pu1_src_cpy += (src_strd << 1); 1393 pu1_src_left_cpy += 4; 1394 pu1_src_left_str += 4; 1395 } 1396 au1_mask_cpy += 16; 1397 pu1_src += 16; 1398 pu1_src_left_cpy -= 2 * ht; 1399 pu1_src_left_str -= 2 * ht; 1400 1401 pu1_left_tmp = pu1_src_left_cpy; 1402 pu1_src_left_cpy = pu1_src_left_str; 1403 pu1_src_left_str = pu1_left_tmp; 1404 } 1405 1406 wd_rem = wd & 0xF; 1407 if(wd_rem) 1408 { 1409 1410 cmp_gt0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd)); 1411 _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt0_16x8b); 1412 1413 au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); 1414 pu1_src_cpy = pu1_src; 1415 au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b); 1416 1417 for(row = ht; row > 0; row -= 4) 1418 { 1419 left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy)); 1420 //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos. 1421 src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy)); 1422 // row = 1 1423 cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd)); 1424 // row = 2 1425 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd)); 1426 // row = 3 1427 cmp_lt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd)); 1428 1429 1430 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 8); 1431 //row 3 left 1432 edge0_16x8b = _mm_slli_si128(cmp_lt0_16x8b, 8); 1433 left0_16x8b = _mm_alignr_epi8(cmp_lt0_16x8b, left_store_16x8b, 14); 1434 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14); 1435 //row 2 left 1436 edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8); 1437 left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14); 1438 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14); 1439 1440 1441 // packing rows together for 16 SIMD operations 1442 src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_lt0_16x8b); 1443 left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, left0_16x8b); 1444 1445 //row 1 left 1446 edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8); 1447 edge1_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 14); 1448 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14); 1449 //row 0 left 1450 edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8); 1451 left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14); 1452 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14); 1453 // packing rows together for 16 SIMD operations 1454 src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b); 1455 left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, edge1_16x8b); 1456 1457 //separating +ve and and -ve values.for row 2 and row 3 1458 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b); 1459 cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b); 1460 //creating mask 00 for +ve and -ve values and FF for zero. 1461 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 1462 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 1463 //combining the appropriate sign change 1464 left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); 1465 1466 1467 1468 1469 1470 //separating +ve and and -ve values. 
1471 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b); 1472 cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b); 1473 //creating mask 00 for +ve and -ve values and FF for zero. 1474 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 1475 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 1476 //combining the appropriate sign change 1477 left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); 1478 1479 1480 //row = 0 right 1481 edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2)); 1482 // row = 1 right 1483 cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 2)); 1484 // row = 2 right 1485 edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 2)); 1486 // row = 3 right 1487 cmp_lt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 2)); 1488 // packing rows together for 16 SIMD operations 1489 edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b); 1490 edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_lt0_16x8b); 1491 1492 //separating +ve and and -ve values. 1493 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b); 1494 cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b); 1495 //creating mask 00 for +ve and -ve values and FF for zero. 1496 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 1497 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 1498 //combining the appropriate sign change 1499 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); 1500 1501 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b); 1502 cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b); 1503 //creating mask 00 for +ve and -ve values and FF for zero. 1504 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 1505 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 1506 //combining the appropriate sign change 1507 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); 1508 1509 //combining sign-left and sign_right 1510 edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b); 1511 edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b); 1512 //adding constant 2 1513 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); 1514 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); 1515 //shuffle to get sao index 1516 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); 1517 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); 1518 //shuffle to get sao offset 1519 //using availability mask 1520 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b); 1521 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b); 1522 //adding chroma offset to access U and V 1523 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); 1524 edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b); 1525 1526 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); 1527 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); 1528 //cnvert to 16 bit then add and then saturated pack 1529 left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); 1530 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); 1531 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b); 1532 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); 1533 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b); 1534 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); 1535 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b); 1536 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); 1537 1538 
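        /* A scalar sketch of the unpack/add/pack idiom used throughout these
         * functions: offsets are signed 8-bit while samples are unsigned
         * 8-bit, so both sides are widened to 16 bits, added, and saturated
         * back to the 0..255 range:
         *
         *     sign = (off < 0) ? 0xFF : 0x00;        // _mm_cmpgt_epi8(0, off)
         *     w_off = (sign << 8) | (off & 0xFF);    // unpack -> sign-extend
         *     w_pix = pix;                           // unpack with 0 -> zero-extend
         *     pix = CLIP3(w_pix + w_off, 0, 255);    // _mm_packus_epi16
         */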
left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
1539 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
1540 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
1541 src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
1542 edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
1543 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1544 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
1545 src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
1546
1547 //separating row 1 and row 3
1548 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
1549 cmp_lt0_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);
1550
1551 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
1552 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
1553 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1554 // row = 1
1555 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
1556 // row = 2
1557 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b);
1558 // row = 3
1559 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
1560
1561 pu1_src_cpy += (src_strd << 2);
1562 pu1_src_left_cpy += 8;
1563 pu1_src_left_str += 8;
1564 }
1565 pu1_src += wd;
1566 pu1_src_left_cpy -= 2 * ht;
1567 pu1_src_left_str -= 2 * ht;
1568
1569 pu1_left_tmp = pu1_src_left_cpy;
1570 pu1_src_left_cpy = pu1_src_left_str;
1571 pu1_src_left_str = pu1_left_tmp;
1572 }
1573 for(row = 0; row < 2 * ht; row++)
1574 {
1575 pu1_src_left[row] = pu1_src_left_cpy[row];
1576 }
1577 }
1578
1579 }
1580
1581
1582 void ihevc_sao_edge_offset_class1_ssse3(UWORD8 *pu1_src,
1583 WORD32 src_strd,
1584 UWORD8 *pu1_src_left,
1585 UWORD8 *pu1_src_top,
1586 UWORD8 *pu1_src_top_left,
1587 UWORD8 *pu1_src_top_right,
1588 UWORD8 *pu1_src_bot_left,
1589 UWORD8 *pu1_avail,
1590 WORD8 *pi1_sao_offset,
1591 WORD32 wd,
1592 WORD32 ht)
1593 {
1594 WORD32 row, col;
1595 UWORD8 *pu1_src_top_cpy;
1596 UWORD8 *pu1_src_cpy;
1597 WORD32 wd_rem;
1598
1599
1600 __m128i src_top_16x8b, src_bottom_16x8b;
1601 __m128i src_temp0_16x8b, src_temp1_16x8b;
1602 __m128i signup0_16x8b, signdwn1_16x8b;
1603 __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
1604 __m128i edge0_16x8b, edge1_16x8b;
1605 __m128i edge_idx_8x16b, sao_offset_8x16b;
1606 __m128i const2_16x8b, const0_16x8b;
1607
1608 UNUSED(pu1_src_top_right);
1609 UNUSED(pu1_src_bot_left);
1610
1611
1612 /* Updating left and top-left */
1613 for(row = 0; row < ht; row++)
1614 {
1615 pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
1616 }
1617 *pu1_src_top_left = pu1_src_top[wd - 1];
1618
1619
1620
1621 pu1_src_top_cpy = pu1_src_top;
1622 edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
1623 sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
1624
1625 /* Update height and source pointers based on the availability flags */
1626 if(0 == pu1_avail[2])
1627 {
1628 pu1_src_top_cpy = pu1_src;
1629 pu1_src += src_strd;
1630 ht--;
1631 }
1632 if(0 == pu1_avail[3])
1633 {
1634 ht--;
1635 }
1636
1637 const2_16x8b = _mm_set1_epi8(2);
1638 const0_16x8b = _mm_setzero_si128();
1639
1640 {
1641 WORD32 ht_rem;
1642 for(col = wd; col >= 16; col -= 16)
1643 {
1644 pu1_src_cpy = pu1_src;
1645 src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
1646 //row = 0
1647 src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
1648 //separating +ve and -ve values.
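        /* Class 1 (vertical) sketch: each pixel is compared against the
         * samples directly above and below,
         *
         *     edge_idx = 2 + SIGN(cur - above) + SIGN(cur - below);
         *
         * The loop below exploits the fact that SIGN(cur - above) for one
         * row is just the negated SIGN(cur - below) of the row above it, so
         * only one fresh set of differences is computed per row and the
         * other is carried over in signup0_16x8b.
         */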
1649 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b); 1650 cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b); 1651 //creating mask 00 for +ve and -ve values and FF for zero. 1652 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 1653 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 1654 //combining the appropriate sign change 1655 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); 1656 1657 for(row = ht; row >= 2; row -= 2) 1658 { 1659 1660 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. 1661 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); 1662 // row = 2 1663 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd)); 1664 1665 1666 //row 0 -row1 1667 //separating +ve and and -ve values. 1668 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b); 1669 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b); 1670 //creating mask 00 for +ve and -ve values and FF for zero. 1671 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 1672 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 1673 //combining the appropriate sign change 1674 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); 1675 //row1-row0 1676 edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); 1677 1678 //row1 -bottom 1679 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b); 1680 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b); 1681 //creating mask 00 for +ve and -ve values and FF for zero. 1682 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 1683 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 1684 //combining the appropriate sign change 1685 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); 1686 1687 //combining sign-left and sign_right 1688 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); 1689 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b); 1690 1691 //for the next iteration signup0_16x8b = -signdwn1_16x8b 1692 signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); 1693 //adding constant 2 1694 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); 1695 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); 1696 //shuffle to get sao index 1697 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); 1698 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); 1699 //shuffle to get sao offset 1700 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); 1701 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); 1702 //copying the next top 1703 src_top_16x8b = src_temp1_16x8b; 1704 //cnvert to 16 bit then add and then saturated pack 1705 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); 1706 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); 1707 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); 1708 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); 1709 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); 1710 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); 1711 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); 1712 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); 1713 1714 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); 1715 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b); 1716 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b); 1717 src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b); 
1718 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); 1719 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b); 1720 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b); 1721 src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b); 1722 1723 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. 1724 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b); 1725 // row = 1 1726 _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b); 1727 1728 src_temp0_16x8b = src_bottom_16x8b; 1729 pu1_src_cpy += (src_strd << 1); 1730 } 1731 ht_rem = ht & 0x1; 1732 1733 if(ht_rem) 1734 { 1735 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); 1736 //current row -next row 1737 //separating +ve and and -ve values. 1738 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b); 1739 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b); 1740 //creating mask 00 for +ve and -ve values and FF for zero. 1741 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 1742 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 1743 //combining the appropriate sign change 1744 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); 1745 //adding top and botton and constant 2 1746 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); 1747 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); 1748 1749 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); 1750 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); 1751 //copying the next top 1752 src_top_16x8b = src_temp0_16x8b; 1753 //cnvert to 16 bit then add and then saturated pack 1754 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); 1755 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); 1756 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); 1757 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); 1758 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); 1759 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); 1760 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); 1761 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); 1762 1763 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b); 1764 } 1765 if(0 == pu1_avail[3]) 1766 { 1767 src_top_16x8b = src_bottom_16x8b; 1768 } 1769 //updating top flag 1770 _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b); 1771 pu1_src += 16; 1772 } 1773 1774 wd_rem = wd & 0xF; 1775 if(wd_rem) 1776 { 1777 pu1_src_cpy = pu1_src; 1778 src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col)); 1779 //row = 0 1780 src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy)); 1781 //separating +ve and and -ve values. 1782 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b); 1783 cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b); 1784 //creating mask 00 for +ve and -ve values and FF for zero. 1785 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 1786 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 1787 //combining the appropriate sign change 1788 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); 1789 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); 1790 for(row = ht; row >= 4; row -= 4) 1791 { 1792 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. 
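        /* This wd_rem path handles the last 8-pixel column strip: four rows
         * per iteration, with two 8-pixel rows packed into each 128-bit
         * register via _mm_unpacklo_epi64() so the 16-byte sign arithmetic
         * stays fully utilised. _mm_alignr_epi8(hi, lo, 8) stitches the
         * per-row sign vectors into (row, row-1) pairs, and for the packed
         * rows 2..3 the carried terms are subtracted rather than added,
         * using SIGN(cur - above) == -SIGN(above - cur).
         */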
1793 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd)); 1794 // row = 2 1795 src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd)); 1796 1797 //row 0 -row1 1798 //separating +ve and and -ve values. 1799 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b); 1800 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b); 1801 //creating mask 00 for +ve and -ve values and FF for zero. 1802 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 1803 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 1804 //combining the appropriate sign change 1805 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); 1806 1807 //row1-row0 1808 edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); 1809 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1) 1810 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top) 1811 //row1 -row2 1812 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b); 1813 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b); 1814 //creating mask 00 for +ve and -ve values and FF for zero. 1815 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 1816 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 1817 //combining the appropriate sign change 1818 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2) 1819 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1) 1820 //packing row 0 n row 1 1821 src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b); 1822 //row = 3 1823 src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd)); 1824 // row = 4 1825 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd)); 1826 1827 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty 1828 signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //allign left (1-2) 1829 //separating +ve and and -ve values.(2,3) 1830 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b); 1831 cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b); 1832 //creating mask 00 for +ve and -ve values and FF for zero. 1833 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 1834 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 1835 //combining the appropriate sign change 1836 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3) 1837 1838 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3) ,(1-2) (substract with down) 1839 edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); 1840 //separating +ve and and -ve values.(3,4) 1841 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b); 1842 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b); 1843 //creating mask 00 for +ve and -ve values and FF for zero. 
1844 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 1845 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 1846 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4) 1847 //combining sign-left and sign_right 1848 edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3) 1849 1850 edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2) 1851 1852 //packing row 2 n row 3 1853 src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b); 1854 //for the next iteration signup0_16x8b = -signdwn1_16x8b 1855 signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3) 1856 1857 //adding constant 2 1858 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); 1859 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); 1860 //shuffle to get sao index 1861 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); 1862 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); 1863 //shuffle to get sao offset 1864 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); 1865 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); 1866 //the next top already in src_top_16x8b 1867 //src_top_16x8b = src_temp1_16x8b; 1868 //cnvert to 16 bit then add and then saturated pack 1869 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); 1870 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); 1871 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); 1872 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); 1873 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); 1874 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); 1875 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); 1876 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); 1877 1878 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); 1879 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b); 1880 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b); 1881 src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b); 1882 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); 1883 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b); 1884 src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b); 1885 src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b); 1886 1887 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8); 1888 cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8); 1889 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. 1890 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); 1891 // row = 1 1892 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b); 1893 //row = 2 1894 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b); 1895 // row = 3 1896 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b); 1897 1898 src_temp0_16x8b = src_temp1_16x8b; 1899 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); 1900 pu1_src_cpy += (src_strd << 2); 1901 1902 } 1903 ht_rem = ht & 0x2; 1904 if(ht_rem) 1905 { 1906 1907 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. 1908 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd)); 1909 // row = 2 1910 src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd)); 1911 1912 //row 0 -row1 1913 //separating +ve and and -ve values. 
1914 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b); 1915 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b); 1916 //creating mask 00 for +ve and -ve values and FF for zero. 1917 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 1918 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 1919 //combining the appropriate sign change 1920 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); 1921 //row1-row0 1922 edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); 1923 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1) 1924 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top) 1925 //row1 -row2 1926 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b); 1927 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b); 1928 //creating mask 00 for +ve and -ve values and FF for zero. 1929 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 1930 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 1931 //combining the appropriate sign change 1932 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2) 1933 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1) 1934 //adding top and down substraction 1935 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty 1936 //for the next iteration signup0_16x8b = -signdwn1_16x8b 1937 signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next 1938 src_top_16x8b = src_temp1_16x8b; 1939 //adding constant 2 1940 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); 1941 1942 //shuffle to get sao index 1943 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); 1944 1945 //shuffle to get sao offset 1946 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); 1947 1948 //the next top already in src_top_16x8b 1949 //cnvert to 16 bit then add and then saturated pack 1950 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); 1951 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); 1952 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); 1953 src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b); 1954 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); 1955 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); 1956 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b); 1957 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b); 1958 1959 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8); 1960 1961 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. 1962 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); 1963 // row = 1 1964 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b); 1965 src_temp0_16x8b = src_bottom_16x8b; 1966 pu1_src_cpy += (src_strd << 1); 1967 1968 } 1969 ht_rem = ht & 0x1; 1970 if(ht_rem) 1971 { 1972 1973 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. 1974 src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd)); 1975 1976 //row 0 -row1 1977 //separating +ve and and -ve values. 1978 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b); 1979 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b); 1980 //creating mask 00 for +ve and -ve values and FF for zero. 
1981 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 1982 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 1983 //combining the appropriate sign change 1984 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); 1985 //adding top and down substraction 1986 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); 1987 //adding constant 2 1988 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); 1989 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); 1990 edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8); 1991 //shuffle to get sao index 1992 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); 1993 //shuffle to get sao offset 1994 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); 1995 src_top_16x8b = src_temp0_16x8b; 1996 //cnvert to 16 bit then add and then saturated pack 1997 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); 1998 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); 1999 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); 2000 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); 2001 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b); 2002 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. 2003 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); 2004 pu1_src_cpy += (src_strd); 2005 2006 } 2007 if(0 == pu1_avail[3]) 2008 { 2009 src_top_16x8b = src_bottom_16x8b; 2010 } 2011 _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b); 2012 pu1_src += 8; 2013 } 2014 } 2015 } 2016 2017 void ihevc_sao_edge_offset_class1_chroma_ssse3(UWORD8 *pu1_src, 2018 WORD32 src_strd, 2019 UWORD8 *pu1_src_left, 2020 UWORD8 *pu1_src_top, 2021 UWORD8 *pu1_src_top_left, 2022 UWORD8 *pu1_src_top_right, 2023 UWORD8 *pu1_src_bot_left, 2024 UWORD8 *pu1_avail, 2025 WORD8 *pi1_sao_offset_u, 2026 WORD8 *pi1_sao_offset_v, 2027 WORD32 wd, 2028 WORD32 ht) 2029 { 2030 WORD32 row, col; 2031 UWORD8 *pu1_src_top_cpy; 2032 UWORD8 *pu1_src_cpy; 2033 WORD32 wd_rem; 2034 2035 2036 __m128i src_top_16x8b, src_bottom_16x8b; 2037 __m128i src_temp0_16x8b, src_temp1_16x8b; 2038 __m128i signup0_16x8b, signdwn1_16x8b; 2039 __m128i cmp_gt0_16x8b, cmp_lt0_16x8b; 2040 __m128i edge0_16x8b, edge1_16x8b; 2041 __m128i edge_idx_8x16b, sao_offset_8x16b; 2042 __m128i const2_16x8b, const0_16x8b; 2043 __m128i chroma_offset_8x16b; 2044 2045 UNUSED(pu1_src_top_right); 2046 UNUSED(pu1_src_bot_left); 2047 2048 /* Updating left and top and top-left */ 2049 for(row = 0; row < ht; row++) 2050 { 2051 pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)]; 2052 pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)]; 2053 } 2054 pu1_src_top_left[0] = pu1_src_top[wd - 2]; 2055 pu1_src_top_left[1] = pu1_src_top[wd - 1]; 2056 2057 2058 2059 pu1_src_top_cpy = pu1_src_top; 2060 edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx); 2061 sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u); 2062 const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v); 2063 chroma_offset_8x16b = _mm_set1_epi16(0x0800); 2064 /* Update height and source pointers based on the availability flags */ 2065 if(0 == pu1_avail[2]) 2066 { 2067 pu1_src_top_cpy = pu1_src; 2068 pu1_src += src_strd; 2069 ht--; 2070 } 2071 if(0 == pu1_avail[3]) 2072 { 2073 ht--; 2074 } 2075 sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b); 2076 const2_16x8b = _mm_set1_epi8(2); 2077 const0_16x8b = _mm_setzero_si128(); 2078 2079 2080 { 2081 WORD32 ht_rem; 2082 2083 2084 2085 for(col = wd; col >= 16; col -= 
16) 2086 { 2087 pu1_src_cpy = pu1_src; 2088 src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col)); 2089 //row = 0 2090 src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy)); 2091 //separating +ve and and -ve values. 2092 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b); 2093 cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b); 2094 //creating mask 00 for +ve and -ve values and FF for zero. 2095 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 2096 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 2097 //combining the appropriate sign change 2098 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); 2099 2100 for(row = ht; row >= 2; row -= 2) 2101 { 2102 2103 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. 2104 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); 2105 // row = 2 2106 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd)); 2107 2108 2109 //row 0 -row1 2110 //separating +ve and and -ve values. 2111 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b); 2112 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b); 2113 //creating mask 00 for +ve and -ve values and FF for zero. 2114 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 2115 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 2116 //combining the appropriate sign change 2117 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); 2118 //row1-row0 2119 edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); 2120 2121 //row1 -bottom 2122 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b); 2123 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b); 2124 //creating mask 00 for +ve and -ve values and FF for zero. 
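        /* The chroma variant mirrors the luma class 1 loop: both neighbours
         * are purely vertical, so the interleaved U,V,U,V... byte layout
         * passes through the sign arithmetic untouched; only the offset
         * lookup differs, using the 0x0800 lane pattern to select the U or V
         * half of sao_offset_8x16b as in the class 0 chroma function above.
         */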
2125 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 2126 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 2127 //combining the appropriate sign change 2128 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); 2129 2130 //combining sign-left and sign_right 2131 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); 2132 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b); 2133 2134 //for the next iteration signup0_16x8b = -signdwn1_16x8b 2135 signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); 2136 //adding constant 2 2137 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); 2138 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); 2139 //copying the next top 2140 src_top_16x8b = src_temp1_16x8b; 2141 2142 2143 //shuffle to get sao index 2144 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); 2145 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); 2146 //adding chroma offset to access U and V 2147 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); 2148 edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b); 2149 2150 //shuffle to get sao offset 2151 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); 2152 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); 2153 //cnvert to 16 bit then add and then saturated pack 2154 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); 2155 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); 2156 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); 2157 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); 2158 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); 2159 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); 2160 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b); 2161 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); 2162 2163 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); 2164 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b); 2165 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b); 2166 src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b); 2167 edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b); 2168 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); 2169 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b); 2170 src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b); 2171 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. 2172 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b); 2173 // row = 1 2174 _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b); 2175 2176 src_temp0_16x8b = src_bottom_16x8b; 2177 pu1_src_cpy += (src_strd << 1); 2178 } 2179 ht_rem = ht & 0x1; 2180 2181 if(ht_rem) 2182 { 2183 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd)); 2184 //current row -next row 2185 //separating +ve and and -ve values. 2186 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b); 2187 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b); 2188 //creating mask 00 for +ve and -ve values and FF for zero. 
2189 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 2190 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 2191 //combining the appropriate sign change 2192 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); 2193 //adding top and botton and constant 2 2194 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); 2195 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); 2196 //copying the next top 2197 src_top_16x8b = src_temp0_16x8b; 2198 2199 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); 2200 //adding chroma offset to access U and V 2201 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); 2202 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); 2203 2204 //cnvert to 16 bit then add and then saturated pack 2205 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); 2206 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); 2207 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); 2208 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); 2209 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); 2210 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); 2211 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b); 2212 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); 2213 2214 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b); 2215 } 2216 if(0 == pu1_avail[3]) 2217 { 2218 src_top_16x8b = src_bottom_16x8b; 2219 } 2220 //updating top flag 2221 _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b); 2222 pu1_src += 16; 2223 } 2224 2225 wd_rem = wd & 0xF; 2226 if(wd_rem) 2227 { 2228 pu1_src_cpy = pu1_src; 2229 src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col)); 2230 //row = 0 2231 src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy)); 2232 //separating +ve and and -ve values. 2233 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b); 2234 cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b); 2235 //creating mask 00 for +ve and -ve values and FF for zero. 2236 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 2237 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 2238 //combining the appropriate sign change 2239 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); 2240 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); 2241 for(row = ht; row >= 4; row -= 4) 2242 { 2243 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. 2244 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd)); 2245 // row = 2 2246 src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd)); 2247 2248 //row 0 -row1 2249 //separating +ve and and -ve values. 2250 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b); 2251 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b); 2252 //creating mask 00 for +ve and -ve values and FF for zero. 
2253 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 2254 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 2255 //combining the appropriate sign change 2256 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); 2257 2258 //row1-row0 2259 edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); 2260 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1) 2261 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top) 2262 //row1 -row2 2263 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b); 2264 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b); 2265 //creating mask 00 for +ve and -ve values and FF for zero. 2266 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 2267 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 2268 //combining the appropriate sign change 2269 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2) 2270 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1) 2271 //packing row 0 n row 1 2272 src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b); 2273 //row = 3 2274 src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd)); 2275 // row = 4 2276 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd)); 2277 2278 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty 2279 signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //allign left (1-2) 2280 //separating +ve and and -ve values.(2,3) 2281 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b); 2282 cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b); 2283 //creating mask 00 for +ve and -ve values and FF for zero. 2284 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 2285 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 2286 //combining the appropriate sign change 2287 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3) 2288 2289 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3) ,(1-2) (substract with down) 2290 edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); 2291 //separating +ve and and -ve values.(3,4) 2292 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b); 2293 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b); 2294 //creating mask 00 for +ve and -ve values and FF for zero. 
2295 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 2296 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 2297 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4) 2298 //combining sign-left and sign_right 2299 edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3) 2300 2301 edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2) 2302 2303 //packing row 2 n row 3 2304 src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b); 2305 //for the next iteration signup0_16x8b = -signdwn1_16x8b 2306 signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3) 2307 //adding constant 2 2308 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); 2309 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b); 2310 //shuffle to get sao index 2311 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); 2312 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b); 2313 //adding chroma offset to access U and V 2314 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); 2315 edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b); 2316 2317 //shuffle to get sao offset 2318 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); 2319 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b); 2320 //the next top already in src_top_16x8b 2321 //cnvert to 16 bit then add and then saturated pack 2322 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); 2323 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); 2324 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); 2325 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b); 2326 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); 2327 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); 2328 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b); 2329 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b); 2330 2331 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b); 2332 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b); 2333 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b); 2334 src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b); 2335 edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b); 2336 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b); 2337 src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b); 2338 src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b); 2339 2340 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8); 2341 cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8); 2342 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. 2343 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); 2344 // row = 1 2345 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b); 2346 //row = 2 2347 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b); 2348 // row = 3 2349 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b); 2350 2351 src_temp0_16x8b = src_temp1_16x8b; 2352 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); 2353 pu1_src_cpy += (src_strd << 2); 2354 2355 } 2356 ht_rem = ht & 0x2; 2357 if(ht_rem) 2358 { 2359 2360 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. 
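        /* Height remainder: the 4-row loop above can leave up to three rows,
         * which the ht & 0x2 stage here and the ht & 0x1 stage below mop up
         * with the same sign/lookup pipeline on fewer packed rows:
         *
         *     for(row = ht; row >= 4; row -= 4) { ... }  // main 4-row body
         *     if(ht & 0x2) { ... }                       // two leftover rows
         *     if(ht & 0x1) { ... }                       // final row
         */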
2361 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd)); 2362 // row = 2 2363 src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd)); 2364 2365 //row 0 -row1 2366 //separating +ve and and -ve values. 2367 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b); 2368 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b); 2369 //creating mask 00 for +ve and -ve values and FF for zero. 2370 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 2371 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 2372 //combining the appropriate sign change 2373 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); 2374 //row1-row0 2375 edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); 2376 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1) 2377 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top) 2378 //row1 -row2 2379 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b); 2380 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b); 2381 //creating mask 00 for +ve and -ve values and FF for zero. 2382 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 2383 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 2384 //combining the appropriate sign change 2385 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2) 2386 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1) 2387 //adding top and down substraction 2388 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty 2389 //for the next iteration signup0_16x8b = -signdwn1_16x8b 2390 signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next 2391 src_top_16x8b = src_temp1_16x8b; 2392 2393 //adding constant 2 2394 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); 2395 2396 //shuffle to get sao index 2397 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); 2398 2399 //adding chroma offset to access U and V 2400 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); 2401 //shuffle to get sao offset 2402 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); 2403 //the next top already in src_top_16x8b 2404 //cnvert to 16 bit then add and then saturated pack 2405 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); 2406 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); 2407 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); 2408 src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b); 2409 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b); 2410 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); 2411 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b); 2412 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b); 2413 2414 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8); 2415 2416 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. 2417 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); 2418 // row = 1 2419 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b); 2420 src_temp0_16x8b = src_bottom_16x8b; 2421 pu1_src_cpy += (src_strd << 1); 2422 2423 } 2424 ht_rem = ht & 0x1; 2425 if(ht_rem) 2426 { 2427 2428 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos. 2429 src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd)); 2430 2431 //row 0 -row1 2432 //separating +ve and and -ve values. 
2433 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b); 2434 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b); 2435 //creating mask 00 for +ve and -ve values and FF for zero. 2436 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b); 2437 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b); 2438 //combining the appropriate sign change 2439 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); 2440 //adding top and down substraction 2441 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); 2442 //adding constant 2 2443 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b); 2444 src_top_16x8b = src_temp0_16x8b; 2445 2446 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); 2447 edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8); 2448 //shuffle to get sao index 2449 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b); 2450 //adding chroma offset to access U and V 2451 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b); 2452 //shuffle to get sao offset 2453 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b); 2454 2455 //cnvert to 16 bit then add and then saturated pack 2456 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b); 2457 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b); 2458 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b); 2459 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b); 2460 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b); 2461 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos. 2462 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b); 2463 pu1_src_cpy += (src_strd); 2464 2465 } 2466 if(0 == pu1_avail[3]) 2467 { 2468 src_top_16x8b = src_bottom_16x8b; 2469 } 2470 _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b); 2471 pu1_src += 8; 2472 } 2473 } 2474 } 2475 2476 /* 135 degree filtering */ 2477 void ihevc_sao_edge_offset_class2_ssse3(UWORD8 *pu1_src, 2478 WORD32 src_strd, 2479 UWORD8 *pu1_src_left, 2480 UWORD8 *pu1_src_top, 2481 UWORD8 *pu1_src_top_left, 2482 UWORD8 *pu1_src_top_right, 2483 UWORD8 *pu1_src_bot_left, 2484 UWORD8 *pu1_avail, 2485 WORD8 *pi1_sao_offset, 2486 WORD32 wd, 2487 WORD32 ht) 2488 { 2489 WORD32 row, col; 2490 UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2; 2491 UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2; 2492 UWORD8 *pu1_firstleft; 2493 UWORD8 *pu1_src_cpy, *pu1_src_org; 2494 UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy; 2495 UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8]; 2496 UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8]; 2497 WORD32 wd_rem; 2498 UWORD8 u1_pos_0_0_tmp, u1_pos_wd_ht_tmp; 2499 WORD32 ht_tmp, ht_0; 2500 2501 WORD32 bit_depth; 2502 UWORD8 u1_avail0, u1_avail1; 2503 2504 __m128i src_top_16x8b, src_bottom_16x8b; 2505 __m128i src_temp0_16x8b, src_temp1_16x8b; 2506 __m128i signup0_16x8b, signdwn1_16x8b; 2507 __m128i cmp_gt0_16x8b, cmp_lt0_16x8b; 2508 __m128i edge0_16x8b, edge1_16x8b; 2509 __m128i au1_mask8x16b; 2510 __m128i edge_idx_8x16b, sao_offset_8x16b; 2511 __m128i const2_16x8b, const0_16x8b; 2512 __m128i left_store_16x8b; 2513 UNUSED(pu1_src_top_right); 2514 UNUSED(pu1_src_bot_left); 2515 2516 ht_0 = ht; ht_tmp = ht; 2517 au1_mask8x16b = _mm_set1_epi8(0xff); 2518 2519 //setting availability mask to ff size MAX_CTB_SIZE 2520 for(col = 0; col < MAX_CTB_SIZE; col += 16) 2521 _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b); 2522 for(row = 0; row < ht; row++) 2523 { 2524 au1_src_left_tmp[row] = pu1_src_left[row]; 2525 } 2526 
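    /* Class 2 filters along the 135 degree diagonal, i.e. the neighbours of
     * (x, y) are (x - 1, y - 1) and (x + 1, y + 1). The two corner pixels
     * whose diagonal neighbour lies outside the block, (0, 0) and
     * (wd - 1, ht - 1), are classified in plain scalar code below:
     *
     *     edge_idx = 2 + SIGN(cur - up_left) + SIGN(cur - down_right);
     *     edge_idx = gi1_table_edge_idx[edge_idx];   // 0 means "unchanged"
     *     if(edge_idx)
     *         cur = CLIP3(cur + offset[edge_idx], 0, (1 << bit_depth) - 1);
     *
     * The twin left-column buffers (au1_src_left_tmp, au1_src_left_tmp1) are
     * ping-ponged per 16-pixel strip because the diagonal neighbour needs
     * the pre-filter left column of the previous strip, which the in-place
     * stores would otherwise have overwritten.
     */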
bit_depth = BIT_DEPTH_LUMA;
2527 pu1_src_org = pu1_src;
2528 pu1_src_top_cpy = pu1_src_top;
2529 pu1_src_left_cpy2 = au1_src_left_tmp;
2530 pu1_src_left_cpy = au1_src_left_tmp;
2531 pu1_src_left_str2 = au1_src_left_tmp1;
2532 pu1_src_left_str = au1_src_left_tmp1;
2533 edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
2534 sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
2535
2536
2537 /* If top-left is available, process separately */
2538 if(0 != pu1_avail[4])
2539 {
2540 WORD8 edge_idx;
2541
2542 edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
2543 SIGN(pu1_src[0] - pu1_src[1 + src_strd]);
2544
2545 edge_idx = gi1_table_edge_idx[edge_idx];
2546
2547 if(0 != edge_idx)
2548 {
2549 u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
2550 }
2551 else
2552 {
2553 u1_pos_0_0_tmp = pu1_src[0];
2554 }
2555 }
2556 else
2557 {
2558 u1_pos_0_0_tmp = pu1_src[0];
2559 }
2560
2561 /* If bottom-right is available, process separately */
2562 if(0 != pu1_avail[7])
2563 {
2564 WORD8 edge_idx;
2565
2566 edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]) +
2567 SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]);
2568
2569 edge_idx = gi1_table_edge_idx[edge_idx];
2570
2571 if(0 != edge_idx)
2572 {
2573 u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
2574 }
2575 else
2576 {
2577 u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
2578 }
2579 }
2580 else
2581 {
2582 u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
2583 }
2584 pu1_firstleft = pu1_src_top_left;
2585
2586 /* Update height and source pointers based on the availability flags */
2587 if(0 == pu1_avail[2])
2588 {
2589 pu1_firstleft = pu1_src_left_cpy2;
2590 pu1_src_left_cpy2++;
2591 pu1_src_left_str2++;
2592 pu1_src_top_cpy = pu1_src;
2593 pu1_src += src_strd;
2594 ht--;
2595 }
2596 if(0 == pu1_avail[3])
2597 {
2598 ht--;
2599 ht_0--;
2600 }
2601 //storing top left in an xmm register
2602 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft);
2603 const2_16x8b = _mm_set1_epi8(2);
2604 const0_16x8b = _mm_setzero_si128();
2605 left_store_16x8b = _mm_slli_si128(left_store_16x8b, 15);
2606 //update top-left
2607 *pu1_src_top_left = pu1_src_top[wd - 1];
2608 //availability mask creation
2609 u1_avail0 = pu1_avail[0];
2610 u1_avail1 = pu1_avail[1];
2611 au1_mask[0] = u1_avail0;
2612 au1_mask[wd - 1] = u1_avail1;
2613 {
2614 WORD32 ht_rem;
2615
2616
2617 pu1_src_left_cpy = pu1_src_left_cpy2;
2618 pu1_src_left_str = pu1_src_left_str2;
2619 au1_mask_cpy = au1_mask;
2620 for(col = wd; col >= 16; col -= 16)
2621 {
2622 pu1_src_cpy = pu1_src;
2623 src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
2624 //row = 0
2625 src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
2626 src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15);
2627 //loading the mask
2628 au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
2629 //separating +ve and -ve values.
2630 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
2631 cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
2632 //creating mask 00 for +ve and -ve values and FF for zero.
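        /* The "top" row compared here is really the diagonal neighbour:
         * _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15) shifts the
         * top row right by one byte and injects the saved top-left sample,
         * so byte i of src_top_16x8b holds the sample at (x + i - 1, y - 1)
         * rather than (x + i, y - 1).
         */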
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //combining the appropriate sign change
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);


            for(row = ht; row >= 2; row -= 2)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                //row = 1
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 1 right
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
                //to insert left in row 0
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 15);
                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //manipulation for row 1 - row 0
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
                //row1-row0
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                // row = 2 right
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 1));
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)


                //row1 -bottom
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                // row = 2
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //combining sign-left and sign_right
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);

                //storing the row 1 left for next row.
                signup0_16x8b = _mm_slli_si128(left_store_16x8b, 14);

                //combining sign-left and sign_right
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
                //manipulation for bottom - row 1
                signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 15);
                //eliminating old left for row 0 and row 1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
                //bottom - row1
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
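                //(this (bottom - row 1) sign is exactly the sign_up term of
                // the next loop iteration: row 1, shifted through the left
                // store, is the 135-degree top-left neighbour of the bottom
                // row, so the result is kept in signup0_16x8b and reused
                // instead of being recomputed.)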
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //for the next iteration bottom -row1
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row1 getting it right for left of next block
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //row0 getting it right for left of next block
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
                //copying the next top
                src_top_16x8b = src_temp1_16x8b;
                //convert to 16 bit, add and then saturate-pack
                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);

                //store left boundary
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);

                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);
                pu1_src_left_cpy += 2;
                pu1_src_left_str += 2;
            }
            ht_rem = ht & 0x1;

            if(ht_rem)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
                //current row -next row
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
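                //(odd-height tail: the last row is processed alone, reusing
                // the sign_up term left over from the two-row loop above.)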
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and bottom terms and constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                //eliminating old left for row 0 and row 1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);

                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);

                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //row0 getting it right for left of next block
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
                //copying the next top
                src_top_16x8b = src_temp0_16x8b;
                //convert to 16 bit, add and then saturate-pack
                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
                //store left boundary
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);

                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                pu1_src_cpy += (src_strd);
                pu1_src_left_cpy += 1;
                pu1_src_left_str += 1;
            }
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
                pu1_src_left_str[0] = pu1_src_cpy[15];
            }
            if(0 == pu1_avail[2])
            {
                pu1_src_left_str[-ht_0] = pu1_src[15 - src_strd];
            }

            //for the top left of next part of the block
            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
            //updating top flag
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 16;
            au1_mask_cpy += 16;


            pu1_left_tmp = pu1_src_left_cpy2;
            pu1_src_left_cpy2 = pu1_src_left_str2;
            pu1_src_left_str2 = pu1_left_tmp;

            pu1_src_left_cpy = pu1_src_left_cpy2;
            pu1_src_left_str = pu1_src_left_str2;
        }

        wd_rem = wd & 0xF;
        if(wd_rem)
        {
            pu1_src_left_cpy = pu1_src_left_cpy2;
            pu1_src_left_str = pu1_src_left_str2;
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
            //row = 0
            src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
            src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15);
            //loading the availability mask
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
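            //(residual-width path: the last 8 columns are handled here, four
            // rows per iteration, with two 8-pixel rows packed into one
            // 128-bit register before the widen/add/pack step.)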
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //preparing au1_mask
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
            //combining the appropriate sign change
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);

            for(row = ht; row >= 4; row -= 4)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
                //right row1
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
                //manipulation for row 1 -row 0
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 15);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //row 0 left
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row 1 -row0
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //row1-row0
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)

                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //right row2
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
                //packing row 0 n row 1
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
                //row1 -row2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
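                //(each per-row sign vector occupies only 8 bytes here, so
                // pairs of rows are packed into single registers with
                // slli/alignr: edge0 ends up holding (1-2)|(0-1) and signup0
                // (1-0)|(0-top), letting two rows be offset together.)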
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //manipulation for row 2 -row 1
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
                //row 1 left
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
                //row = 3
                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));

                // row = 4
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));

                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty

                //separating +ve and -ve values.(2,1)
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
                //manipulation for row 3 -row 2
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //row 2 left
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
                //combining the appropriate sign change
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)

                //separating +ve and -ve values.(3,2)
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //right row3
                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)

                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2),(2-1)

                //separating +ve and -ve values.(2,3)
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
                //right row 4
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)

                //separating +ve and -ve values.(3,bottom)
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)

                //manipulation for bottom -row 3
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
                //eliminating old left for row 0,1,2,3
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
                //packing row 2 n row 3
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
                //row 3 left
                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15);
                //loading row 3 right into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15);
                //adding bottom and top values of row 2 and row 3
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
                //separating +ve and -ve values.(bottom,3)
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
                //to store right of row 2
                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration

                //storing right of row 2 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
                //to store right of row 0
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                //storing right of row 1 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);

                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);

                //storing right of row 0 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
                //convert to 16 bit, add and then saturate-pack
                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);

                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                //row = 2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
                // row = 3
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);

                src_temp0_16x8b = src_temp1_16x8b;
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
                pu1_src_cpy += (src_strd << 2);
                pu1_src_left_cpy += 4;
                pu1_src_left_str += 4;
            }
            ht_rem = ht & 0x2;
            if(ht_rem)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //row 0 -row 1
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
                //manipulation for row 1 -row 0
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 15);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //manipulation for row 1 - row 0
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //row1-row0
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row 1 -bottom
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);

                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //row1 -bottom
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //manipulation for bottom - row 1
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
                //manipulation for bottom - row 1
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
                //adding top and bottom sign terms
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
                //bottom - row 1
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);

                //eliminating old left for row 0,1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //for the next iteration signup0_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next

                //storing right of row 1 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
                //for storing right of row 0
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);

                src_top_16x8b = src_temp1_16x8b;
                //storing right of row 0 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);

                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);

                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //the next top already in src_top_16x8b
                //convert to 16 bit, add and then saturate-pack
                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);

                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);
                pu1_src_left_cpy += 2;
                pu1_src_left_str += 2;
            }
            ht_rem = ht & 0x1;
            if(ht_rem)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                //left store manipulation 1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
                //row 0 -row1
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and bottom sign terms
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                //for row 0 right to put into left store
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
                //filling the left boundary value
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);

                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                src_top_16x8b = src_temp0_16x8b;
                //convert to 16 bit, add and then saturate-pack
                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);

                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                pu1_src_cpy += (src_strd);
                pu1_src_left_cpy += 1;
                pu1_src_left_str += 1;
            }
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
                pu1_src_left_str[0] = pu1_src_cpy[7];
            }

            if(0 == pu1_avail[2])
            {
                pu1_src_left_str[-ht_0] = pu1_src[7 - src_strd];
            }

            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 8;
            au1_mask_cpy += 16;

            pu1_left_tmp = pu1_src_left_cpy2;
            pu1_src_left_cpy2 = pu1_src_left_str2;
            pu1_src_left_str2 = pu1_left_tmp;

            pu1_src_left_cpy = pu1_src_left_cpy2;
            pu1_src_left_str = pu1_src_left_str2;
        }
        pu1_src_org[0] = u1_pos_0_0_tmp;
        pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp;
        pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 1) : pu1_src_left_cpy;
        for(row = 0; row < ht_tmp; row++)
        {
            pu1_src_left[row] = pu1_src_left_cpy[row];
        }
    }

}

/* 135 degree filtering */
void ihevc_sao_edge_offset_class2_chroma_ssse3(UWORD8 *pu1_src,
                                               WORD32 src_strd,
                                               UWORD8 *pu1_src_left,
                                               UWORD8 *pu1_src_top,
                                               UWORD8 *pu1_src_top_left,
                                               UWORD8 *pu1_src_top_right,
                                               UWORD8 *pu1_src_bot_left,
                                               UWORD8 *pu1_avail,
                                               WORD8 *pi1_sao_offset_u,
                                               WORD8 *pi1_sao_offset_v,
                                               WORD32 wd,
                                               WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
    UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
    UWORD8 *pu1_firstleft;
    UWORD8 *pu1_src_cpy, *pu1_src_org;
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
    UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
    UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)];
    WORD32 wd_rem;
    UWORD8 u1_pos_0_0_tmp_u, u1_pos_0_0_tmp_v, u1_pos_wd_ht_tmp_u, u1_pos_wd_ht_tmp_v;
    WORD32 ht_tmp;
    WORD32 ht_0;

    WORD32 bit_depth;
    UWORD8 u1_avail0, u1_avail1;

    __m128i src_temp0_16x8b, src_temp1_16x8b;
    __m128i signup0_16x8b, signdwn1_16x8b;
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
    __m128i edge0_16x8b, edge1_16x8b;
    __m128i src_top_16x8b, src_bottom_16x8b;
    __m128i au1_mask8x16b;
    __m128i edge_idx_8x16b, sao_offset_8x16b;
    __m128i const2_16x8b, const0_16x8b;
    __m128i left_store_16x8b;
    __m128i chroma_offset_8x16b;

    UNUSED(pu1_src_top_right);
    UNUSED(pu1_src_bot_left);

    ht_0 = ht; ht_tmp = ht;
    au1_mask8x16b = _mm_set1_epi8(0xff);
    /* Updating left and top-left */
    for(row = 0; row < 2 * ht; row++)
    {
        au1_src_left_tmp[row] = pu1_src_left[row];
    }
    //setting availability mask to ff size MAX_CTB_SIZE
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
    bit_depth = BIT_DEPTH_LUMA;
    pu1_src_org = pu1_src;
    pu1_src_top_cpy = pu1_src_top;
    pu1_src_left_cpy2 = au1_src_left_tmp;
    pu1_src_left_cpy = au1_src_left_tmp;
    pu1_src_left_str2 = au1_src_left_tmp1;
    pu1_src_left_str = au1_src_left_tmp1;
    edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
    chroma_offset_8x16b = _mm_set1_epi16(0x0800);

    /* If top-left is available, process separately */
    if(0 != pu1_avail[4])
    {
        WORD32 edge_idx;

        /* U */
        edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
                       SIGN(pu1_src[0] - pu1_src[2 + src_strd]);

        edge_idx = gi1_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_0_0_tmp_u = pu1_src[0];
        }

        /* V */
        edge_idx = 2 + SIGN(pu1_src[1] - pu1_src_top_left[1]) +
                       SIGN(pu1_src[1] - pu1_src[1 + 2 + src_strd]);

        edge_idx = gi1_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_0_0_tmp_v = pu1_src[1];
        }
    }
    else
    {
        u1_pos_0_0_tmp_u = pu1_src[0];
        u1_pos_0_0_tmp_v = pu1_src[1];
    }

    /* If bottom-right is available, process separately */
    if(0 != pu1_avail[7])
    {
        WORD32 edge_idx;

        /* U */
        edge_idx = 2 + SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]) +
                       SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]);

        edge_idx = gi1_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
        }

        /* V */
        edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]) +
                       SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]);

        edge_idx = gi1_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
        }
    }
    else
    {
        u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
        u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
    }
    pu1_firstleft = pu1_src_top_left;

    /* Update height and source pointers based on the availability flags */
    if(0 == pu1_avail[2])
    {
        pu1_firstleft = pu1_src_left_cpy2;
        pu1_src_left_cpy2 += 2;
        pu1_src_left_str2 += 2;
        pu1_src_top_cpy = pu1_src;
        pu1_src += src_strd;
        ht--;
    }
    if(0 == pu1_avail[3])
    {
        ht--;
        ht_0--;
    }
    //storing top left in an xmm register
    left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft);
    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
    const2_16x8b = _mm_set1_epi8(2);
    const0_16x8b = _mm_setzero_si128();
    left_store_16x8b = _mm_slli_si128(left_store_16x8b, 14);

    //availability mask creation
    u1_avail0 = pu1_avail[0];
    u1_avail1 = pu1_avail[1];
    au1_mask[0] = u1_avail0;
    au1_mask[1] = u1_avail0;
    au1_mask[wd - 1] = u1_avail1;
    au1_mask[wd - 2] = u1_avail1;

    /* top-left arrays */
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
    pu1_src_top_left[1] = pu1_src_top[wd - 1];
    {
        WORD32 ht_rem;
        au1_mask_cpy = au1_mask;

        pu1_src_left_cpy = pu1_src_left_cpy2;
        pu1_src_left_str = pu1_src_left_str2;
        for(col = wd; col >= 16; col -= 16)
        {
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
            //row = 0
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
            src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14);
            //loading the mask
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
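            //(chroma is stored interleaved as UVUV..., so the 135-degree
            // neighbour of a sample sits 2 bytes away and all left-store
            // shifts move in 2-byte steps; the same edge computation then
            // runs on the U and V lanes side by side.)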
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //combining the appropriate sign change
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);


            for(row = ht; row >= 2; row -= 2)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                //row = 1
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 1 right
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
                //to insert left in row 0
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //manipulation for row 1 - row 0
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
                //row1-row0
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                // row = 2 right
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 2));
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)


                //row1 -bottom
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                // row = 2
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //combining sign-left and sign_right
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);

                //storing the row 1 left for next row.
                signup0_16x8b = _mm_slli_si128(left_store_16x8b, 12);

                //combining sign-left and sign_right
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
                //manipulation for bottom - row 1
                signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 14);
                //eliminating old left for row 0 and row 1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
                //bottom - row1
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
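                //(sao_offset_8x16b packs the U offsets in its low 8 bytes and
                // the V offsets in its high 8 bytes; chroma_offset_8x16b,
                // 0x0800 per 16-bit lane, adds 8 to every V-lane index so the
                // pshufb lookup picks from the V half of the table.)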
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //for the next iteration bottom -row1
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row1 getting it right for left of next iteration
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
                //copying the next top
                src_top_16x8b = src_temp1_16x8b;
                //row0 getting its right for left of next iteration.
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);


                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);


                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //convert to 16 bit, add and then saturate-pack
                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);

                //store left boundary
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);

                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);
                pu1_src_left_cpy += 4;
                pu1_src_left_str += 4;
            }
            ht_rem = ht & 0x1;

            if(ht_rem)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
                //current row -next row
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and bottom terms and constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);

                //eliminating old left for row 0 and row 1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
                //copying the next top
                src_top_16x8b = src_temp0_16x8b;
                //row0 getting it right for left of next block
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);

                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);

                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //convert to 16 bit, add and then saturate-pack
                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);

                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                pu1_src_cpy += (src_strd);
                pu1_src_left_cpy += 2;
                pu1_src_left_str += 2;
            }
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
                pu1_src_left_str[1] = pu1_src_cpy[15];
                pu1_src_left_str[0] = pu1_src_cpy[14];
            }
            if(0 == pu1_avail[2])
            {
                pu1_src_left_str[-2 * ht_0] = pu1_src[14 - src_strd];
                pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[15 - src_strd];
            }

            //for the top left of next part of the block
            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
            //updating top flag
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 16;
            au1_mask_cpy += 16;

            pu1_left_tmp = pu1_src_left_cpy2;
            pu1_src_left_cpy2 = pu1_src_left_str2;
            pu1_src_left_str2 = pu1_left_tmp;

            pu1_src_left_cpy = pu1_src_left_cpy2;
            pu1_src_left_str = pu1_src_left_str2;
        }
        wd_rem = wd & 0xF;
        if(wd_rem)
        {
            pu1_src_left_cpy = pu1_src_left_cpy2;
            pu1_src_left_str = pu1_src_left_str2;
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
            //row = 0
            src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
            src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14);
            //loading the availability mask
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
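            //(chroma residual-width path: the last 8 bytes hold 4 interleaved
            // UV pairs, processed four rows per iteration with two rows
            // packed per register; the left array advances 2 bytes per row.)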
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //preparing au1_mask
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
            //combining the appropriate sign change
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);

            for(row = ht; row >= 4; row -= 4)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
                //right row1
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
                //row 0 -row1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
                //manipulation for row 1 -row 0
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //row 0 left
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row 1 -row0
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //row1-row0
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)

                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //right row2
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
                //packing row 0 n row 1
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
                //row1 -row2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //manipulation for row 2 -row 1
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
                //row 1 left
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
                //row = 3
                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));

                // row = 4
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));

                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty

                //separating +ve and -ve values.(2,1)
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
                //manipulation for row 3 -row 2
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //row 2 left
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
                //combining the appropriate sign change
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)

                //separating +ve and -ve values.(3,2)
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //right row3
                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)

                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2),(2-1)

                //separating +ve and -ve values.(2,3)
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
                //right row 4
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)

                //separating +ve and -ve values.(3,bottom)
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)

                //manipulation for bottom -row 3
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 8);
                //eliminating old left for row 0,1,2,3
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8);
                //packing row 2 n row 3
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
                //row 3 left
                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14);

                //adding bottom and top values of row 2 and row 3
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
                //separating +ve and -ve values.(bottom,3)
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration

                //to store right of row 2
                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
                //loading row 3 right into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14);
                //storing right of row 2 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
                //to store right of row 0
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                //storing right of row 1 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
                //storing right of row 0 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);

                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);

                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //convert to 16 bit, add and then saturate-pack
                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

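                //(the widen/add/pack idiom above and below implements the
                // final clip: pixels are zero-extended to 16 bit, the 8-bit
                // offsets are sign-extended via the cmpgt mask, added, and
                // _mm_packus_epi16 saturates the sums back to [0, 255] —
                // the scalar equivalent of CLIP3(pix + offset, 0, 255).)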
                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);


                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                //row = 2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
                // row = 3
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);

                src_temp0_16x8b = src_temp1_16x8b;
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
                pu1_src_cpy += (src_strd << 2);
                pu1_src_left_cpy += 8;
                pu1_src_left_str += 8;
            }
            ht_rem = ht & 0x2;
            if(ht_rem)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //row 0 -row 1
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
                //manipulation for row 1 -row 0
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //manipulation for row 1 - row 0
                signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //row1-row0
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row 1 -bottom
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);

                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //row1 -bottom
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //manipulation for bottom - row 1
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
                //eliminating old left for row 0,1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
                //manipulation for bottom - row 1
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
                //adding top and bottom subtraction
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
                //bottom - row 1
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);

                //shifting row 1
                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //for the next iteration signup0_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
                //storing right of row 1 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14); //for storing right of row 0
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                //the next top in src_top_16x8b
                src_top_16x8b = src_temp1_16x8b;
                //storing right of row 0 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);

                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);

                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);

                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                //the next top already in src_top_16x8b
                //convert to 16 bit, add, then saturating pack
                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);

                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);
                pu1_src_left_cpy += 4;
                pu1_src_left_str += 4;
            }
            ht_rem = ht & 0x1;
            if(ht_rem)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));

                //row 0 - row 1
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and bottom subtraction
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);

                //for row 0 right to put into left store
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                //left store manipulation 1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
                src_top_16x8b = src_temp0_16x8b;
                //filling the left boundary value
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);

                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);

                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //convert to 16 bit, add, then saturating pack
                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);

                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                pu1_src_cpy += (src_strd);
                pu1_src_left_cpy += 2;
                pu1_src_left_str += 2;
            }
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
                pu1_src_left_str[1] = pu1_src_cpy[7];
                pu1_src_left_str[0] = pu1_src_cpy[6];
            }

            if(0 == pu1_avail[2])
            {
                pu1_src_left_str[-2 * ht_0] = pu1_src[6 - src_strd];
                pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[7 - src_strd];
            }

            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 8;

            pu1_left_tmp = pu1_src_left_cpy2;
            pu1_src_left_cpy2 = pu1_src_left_str2;
            pu1_src_left_str2 = pu1_left_tmp;

            pu1_src_left_cpy = pu1_src_left_cpy2;
            pu1_src_left_str = pu1_src_left_str2;
        }
        pu1_src_org[0] = u1_pos_0_0_tmp_u;
        pu1_src_org[1] = u1_pos_0_0_tmp_v;
        pu1_src_org[wd - 2 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_u;
        pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_v;
        pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 2) : pu1_src_left_cpy;
        for(row = 0; row < 2 * ht_tmp; row++)
        {
            pu1_src_left[row] = pu1_src_left_cpy[row];
        }
    }

}
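/* For reference, a minimal scalar sketch of the 45-degree (class 3) edge
 * offset that the SSSE3 kernels below implement. Illustrative only and kept
 * out of the build (the helper name is not part of the library API): it
 * covers interior pixels, writes to a separate destination so unfiltered
 * neighbours are read, and omits the boundary, corner and availability
 * handling done by the real functions. */
#if 0
static void sao_edge_offset_class3_ref(const UWORD8 *pu1_src,
                                       UWORD8 *pu1_dst,
                                       WORD32 src_strd,
                                       const WORD8 *pi1_sao_offset,
                                       WORD32 wd,
                                       WORD32 ht)
{
    WORD32 row, col;
    for(row = 1; row < ht - 1; row++)
    {
        for(col = 1; col < wd - 1; col++)
        {
            const UWORD8 *pu1_p = pu1_src + row * src_strd + col;
            /* neighbours along the 45 degree diagonal: top-right, bottom-left */
            WORD32 edge_idx = 2 + SIGN(pu1_p[0] - pu1_p[1 - src_strd])
                                + SIGN(pu1_p[0] - pu1_p[src_strd - 1]);

            edge_idx = gi1_table_edge_idx[edge_idx];
            if(0 != edge_idx)
                pu1_dst[row * src_strd + col] =
                        CLIP3(pu1_p[0] + pi1_sao_offset[edge_idx], 0, 255);
            else
                pu1_dst[row * src_strd + col] = pu1_p[0];
        }
    }
}
#endif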
void ihevc_sao_edge_offset_class3_ssse3(UWORD8 *pu1_src,
                                        WORD32 src_strd,
                                        UWORD8 *pu1_src_left,
                                        UWORD8 *pu1_src_top,
                                        UWORD8 *pu1_src_top_left,
                                        UWORD8 *pu1_src_top_right,
                                        UWORD8 *pu1_src_bot_left,
                                        UWORD8 *pu1_avail,
                                        WORD8 *pi1_sao_offset,
                                        WORD32 wd,
                                        WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
    UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
    UWORD8 *pu1_src_cpy, *pu1_src_org;
    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
    UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
    WORD32 wd_rem;
    UWORD8 u1_pos_wd_0_tmp, u1_pos_0_ht_tmp;
    WORD32 ht_tmp;
    WORD32 bit_depth;
    UWORD8 u1_avail0, u1_avail1;

    __m128i src_top_16x8b, src_bottom_16x8b;
    __m128i src_temp0_16x8b, src_temp1_16x8b;
    __m128i signup0_16x8b, signdwn1_16x8b;
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
    __m128i edge0_16x8b, edge1_16x8b;
    __m128i au1_mask8x16b;
    __m128i edge_idx_8x16b, sao_offset_8x16b;
    __m128i const2_16x8b, const0_16x8b;
    __m128i left_store_16x8b;

    ht_tmp = ht;
    au1_mask8x16b = _mm_set1_epi8(0xff);

    au1_src_left_tmp[0] = pu1_src[(wd - 1)];
    //manipulation for bottom left
    for(row = 1; row < ht; row++)
    {
        au1_src_left_tmp[row] = pu1_src_left[row];
    }
    au1_src_left_tmp[ht] = pu1_src_bot_left[0];

    *pu1_src_top_left = pu1_src_top[wd - 1];
    //setting availability mask to 0xFF for MAX_CTB_SIZE bytes
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
    bit_depth = BIT_DEPTH_LUMA;
    pu1_src_org = pu1_src;
    pu1_src_top_cpy = pu1_src_top;
    pu1_src_left_cpy2 = au1_src_left_tmp;
    pu1_src_left_cpy = au1_src_left_tmp;
    pu1_src_left_str2 = au1_src_left_tmp1;
    pu1_src_left_str = au1_src_left_tmp1;
    edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);

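    /* The (wd-1, 0) pixel depends on the top-right neighbour and the
     * (0, ht-1) pixel on the bottom-left neighbour, neither of which belongs
     * to the current block; both are therefore filtered in scalar code up
     * front and the results patched into the output after the SIMD loops. */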
    /* If top-right is available, process separately */
    if(0 != pu1_avail[5])
    {
        WORD32 edge_idx;

        edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) +
                   SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]);

        edge_idx = gi1_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_wd_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_wd_0_tmp = pu1_src[wd - 1];
        }
    }
    else
    {
        u1_pos_wd_0_tmp = pu1_src[wd - 1];
    }

    /* If bottom-left is available, process separately */
    if(0 != pu1_avail[6])
    {
        WORD32 edge_idx;

        edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]) +
                   SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);

        edge_idx = gi1_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_0_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
        }
    }
    else
    {
        u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
    }

    /* Update height and source pointers based on the availability flags */
    if(0 == pu1_avail[2])
    {
        pu1_src_left_cpy2++;
        pu1_src_left_str2++;
        pu1_src_top_cpy = pu1_src;
        pu1_src += src_strd;
        ht--;
    }
    if(0 == pu1_avail[3])
    {
        ht--;
    }

    const2_16x8b = _mm_set1_epi8(2);
    const0_16x8b = _mm_setzero_si128();

    //availability mask creation
    u1_avail0 = pu1_avail[0];
    u1_avail1 = pu1_avail[1];
    au1_mask[0] = u1_avail0;
    au1_mask[wd - 1] = u1_avail1;
    {
        WORD32 ht_rem;

        pu1_src_left_cpy = pu1_src_left_cpy2;
        pu1_src_left_str = pu1_src_left_str2;
        au1_mask_cpy = au1_mask;
        for(col = wd; col >= 16; col -= 16)
        {
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 1));
            //row = 0
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));

            //loading the mask
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //combining the appropriate sign change
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
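            /* The three statements above are the recurring sign idiom of this
             * file: _mm_subs_epu8 in both directions leaves zero in exactly
             * one of the two results (in both when equal), _mm_cmpeq_epi8
             * against zero turns that into 0x00/0xFF masks, and their
             * byte-wise difference yields sign(a - b) as -1, 0 or +1 per
             * pixel; in effect a branch-free, 16-lane version of the scalar
             * SIGN() macro. */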

            for(row = ht; row >= 2; row -= 2)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
                //row = 1
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                //to insert left in row 1
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
                // row = 0 right
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1));

                //manipulation for row 1 - row 0
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
                //row 0 - row 1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);

                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
                //combining sign-left and sign_right
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);

                //row 1 - row 0
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);

                // row = 2
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
                // row = 1 right
                signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)

                //bottom - row 1
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //for the next iteration bottom - row 1
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //to insert left in row 1
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13);
                //manipulation for row 1 - bottom
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);

                //row 1 - bottom
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //combining sign-left and sign_right
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);

                //eliminating old left for row 0 and row 1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);

                //row 1 getting it right for left of next block
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //row 0 getting it right for left of next block
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
                //copying the next top
                src_top_16x8b = src_temp1_16x8b;
                //convert to 16 bit, add, then saturating pack
                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
                //store left boundary
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);

                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);
                pu1_src_left_cpy += 2;
                pu1_src_left_str += 2;
            }
            ht_rem = ht & 0x1;

            if(ht_rem)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                //to insert left in row 1
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
                //manipulation for row 1 - row 0
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);

                //current row - next row
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and bottom and constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                //eliminating old left for row 0 and row 1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);

                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);

                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //row 0 getting it right for left of next block
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
                //copying the next top
                src_top_16x8b = src_temp0_16x8b;
                //convert to 16 bit, add, then saturating pack
                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
                //store left boundary
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);

                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                pu1_src_cpy += (src_strd);
                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_left_cpy++;
                pu1_src_left_str++;
            }
            { //for bottom right
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
            }
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
            }
            //for the top left of next part of the block
            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
            //updating top flag
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 16;
            au1_mask_cpy += 16;

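            /* au1_src_left_tmp and au1_src_left_tmp1 act as a double buffer
             * for the left-boundary column: the swap below makes the column
             * written during this pass (..._str2) the history read by the
             * next 16-wide column pass (..._cpy2), since the diagonal filter
             * must consume pre-filter values of the previous column. */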
            pu1_left_tmp = pu1_src_left_cpy2;
            pu1_src_left_cpy2 = pu1_src_left_str2;
            pu1_src_left_str2 = pu1_left_tmp;

            pu1_src_left_cpy = pu1_src_left_cpy2;
            pu1_src_left_str = pu1_src_left_str2;
        }

        wd_rem = wd & 0xF;
        if(wd_rem)
        {
            pu1_src_cpy = pu1_src;
            pu1_src_left_cpy = pu1_src_left_cpy2;
            pu1_src_left_str = pu1_src_left_str2;
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 1));
            //row = 0
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
            //loading the availability mask for the remaining (at most 8) columns
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //preparing au1_mask
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
            //combining the appropriate sign change
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);

            for(row = ht; row >= 4; row -= 4)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
                //manipulation for row 0 - row 1
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
                //row 1 left
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
                //row 0 - row 1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //manipulating for row 1 - row 0
                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row 1 - row 0
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //row 1 - row 0
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)

                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //manipulation for row 1 - row 2
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13);
                //row 2 left
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
                //packing row 0 and row 1
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
                //row 1 - row 2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)

                //row 1 right
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
                //row = 3
                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));

                // row = 4
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));

                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty

                //separating +ve and -ve values.(2,1)
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //row 2 right
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
                //combining the appropriate sign change
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)

                //separating +ve and -ve values.(3,2)
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //manipulation for row 2 - row 3
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
                //row 3 left
                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)

                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2),(2-1)

                //separating +ve and -ve values.(2,3)
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);

                //manipulation for row 3 - bottom
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 11);
                //bottom left
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);

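                /* Rows are processed four at a time in this 8-pixel-wide
                 * path: the sign differences for consecutive row pairs are
                 * packed into the two 64-bit halves of edge0/edge1 (the
                 * alignr-by-8 steps above), so one shuffle/offset pass below
                 * covers rows 0-1 and another covers rows 2-3. */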
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)

                //separating +ve and -ve values.(3,bottom)
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)

                //eliminating old left for row 0,1,2,3
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
                //packing row 2 and row 3
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
                //row 3 right
                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1);
                //loading row 3 right into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15);
                //adding bottom and top values of row 2 and row 3
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
                //separating +ve and -ve values.(bottom,3)
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
                //to store right of row 2
                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom - 3) for next iteration

                //storing right of row 2 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
                //to store right of row 0
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                //storing right of row 1 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);

                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);

                //storing right of row 0 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
                //convert to 16 bit, add, then saturating pack
                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);

                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                //row = 2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
                // row = 3
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);

                src_temp0_16x8b = src_temp1_16x8b;
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
                pu1_src_cpy += (src_strd << 2);
                pu1_src_left_cpy += 4;
                pu1_src_left_str += 4;
            }
            ht_rem = ht & 0x2;
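            /* Remainder rows: the loop above handles multiples of four rows;
             * any leftover height is covered by the two-row (ht & 0x2) and
             * one-row (ht & 0x1) tail blocks below, which apply the same
             * sign/offset steps on a reduced register footprint. */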
            if(ht_rem)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //manipulation for row 0 - row 1
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
                //bottom left
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //manipulation for row 1 - row 0
                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //row 1 - row 0
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //manipulation for row 1 - bottom
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13);
                //bottom left
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);

                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //row 1 - bottom
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
                //manipulation for bottom - row 1 (row 1 right)
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
                //adding top and bottom subtraction
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
                //bottom - row 1
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);

                //eliminating old left for row 0,1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //for the next iteration signup0_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next

                //storing right of row 1 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
                //for storing right of row 0
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);

                src_top_16x8b = src_temp1_16x8b;
                //storing right of row 0 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);

                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);

                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //the next top already in src_top_16x8b
                //convert to 16 bit, add, then saturating pack
                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);

                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);
                pu1_src_left_cpy += 2;
                pu1_src_left_str += 2;
            }
            ht_rem = ht & 0x1;
            if(ht_rem)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));

                //manipulation for row 0 - bottom
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
                //bottom left
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and bottom subtraction
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                //for row 0 right to put into left store
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
                //left store manipulation 1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
                //filling the left boundary value
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);

                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                src_top_16x8b = src_temp0_16x8b;
                //convert to 16 bit, add, then saturating pack
                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);

                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                pu1_src_cpy += (src_strd);
                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_left_cpy++;
                pu1_src_left_str++;
            }
            { //for bottom right
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
                src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
            }
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
            }
            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 8;

            pu1_left_tmp = pu1_src_left_cpy2;
            pu1_src_left_cpy2 = pu1_src_left_str2;
            pu1_src_left_str2 = pu1_left_tmp;

            pu1_src_left_cpy = pu1_src_left_cpy2;
            pu1_src_left_str = pu1_src_left_str2;

        }
        pu1_src_org[wd - 1] = u1_pos_wd_0_tmp;
        pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp;
        pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 1) : pu1_src_left_cpy;
        pu1_src_left[0] = au1_src_left_tmp[0];
        for(row = 1; row < ht_tmp; row++)
        {
            pu1_src_left[row] = pu1_src_left_cpy[row];
        }
    }

}
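/* The chroma variant below operates on interleaved UV samples (semi-planar
 * CbCr), so every lateral step moves by 2 bytes instead of 1 and each SIMD
 * register holds 8 U/V pairs. The two 8-entry offset tables are packed into
 * one register and the V edge indices are biased by 8 (chroma_offset_8x16b,
 * alternating 0x00/0x08 bytes) so that a single pshufb fetches U and V
 * offsets at once. As an illustration only (compiled out; the helper name is
 * not part of the library API), the per-sample scalar equivalent: */
#if 0
static void sao_edge_class3_chroma_ref(const UWORD8 *pu1_src,
                                       UWORD8 *pu1_dst,
                                       WORD32 src_strd,
                                       const WORD8 *pi1_sao_offset_u,
                                       const WORD8 *pi1_sao_offset_v,
                                       WORD32 wd,
                                       WORD32 ht)
{
    WORD32 row, col;
    for(row = 1; row < ht - 1; row++)
    {
        for(col = 2; col < wd - 2; col++)
        {
            const UWORD8 *pu1_p = pu1_src + row * src_strd + col;
            /* even byte positions are U, odd are V */
            const WORD8 *pi1_offset =
                    (col & 1) ? pi1_sao_offset_v : pi1_sao_offset_u;
            /* interleaved UV: the diagonal neighbours sit 2 bytes away */
            WORD32 edge_idx = 2 + SIGN(pu1_p[0] - pu1_p[2 - src_strd])
                                + SIGN(pu1_p[0] - pu1_p[src_strd - 2]);

            edge_idx = gi1_table_edge_idx[edge_idx];
            if(0 != edge_idx)
                pu1_dst[row * src_strd + col] =
                        CLIP3(pu1_p[0] + pi1_offset[edge_idx], 0, 255);
            else
                pu1_dst[row * src_strd + col] = pu1_p[0];
        }
    }
}
#endif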
void ihevc_sao_edge_offset_class3_chroma_ssse3(UWORD8 *pu1_src,
                                               WORD32 src_strd,
                                               UWORD8 *pu1_src_left,
                                               UWORD8 *pu1_src_top,
                                               UWORD8 *pu1_src_top_left,
                                               UWORD8 *pu1_src_top_right,
                                               UWORD8 *pu1_src_bot_left,
                                               UWORD8 *pu1_avail,
                                               WORD8 *pi1_sao_offset_u,
                                               WORD8 *pi1_sao_offset_v,
                                               WORD32 wd,
                                               WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
    UWORD8 *pu1_src_cpy, *pu1_src_org;
    UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
    WORD32 wd_rem;
    UWORD8 u1_pos_wd_0_tmp_u, u1_pos_wd_0_tmp_v, u1_pos_0_ht_tmp_u, u1_pos_0_ht_tmp_v;
    WORD32 ht_tmp;
    WORD32 bit_depth;
    UWORD8 u1_avail0, u1_avail1;

    __m128i src_top_16x8b, src_bottom_16x8b;
    __m128i src_temp0_16x8b, src_temp1_16x8b;
    __m128i signup0_16x8b, signdwn1_16x8b;
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
    __m128i edge0_16x8b, edge1_16x8b;
    __m128i au1_mask8x16b;
    __m128i edge_idx_8x16b, sao_offset_8x16b;
    __m128i left_store_16x8b;
    __m128i const0_16x8b, const2_16x8b;
    __m128i chroma_offset_8x16b;

    ht_tmp = ht;
    au1_mask8x16b = _mm_set1_epi8(0xff);

    au1_src_left_tmp[0] = pu1_src[(wd - 2)];
    au1_src_left_tmp[1] = pu1_src[(wd - 1)];
    //manipulation for bottom left
    for(row = 2; row < 2 * ht; row++)
    {
        au1_src_left_tmp[row] = pu1_src_left[row];
    }
    au1_src_left_tmp[2 * ht] = pu1_src_bot_left[0];
    au1_src_left_tmp[2 * ht + 1] = pu1_src_bot_left[1];

    pu1_src_top_left[0] = pu1_src_top[wd - 2];
    pu1_src_top_left[1] = pu1_src_top[wd - 1];
    //setting availability mask to 0xFF for MAX_CTB_SIZE bytes
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
    bit_depth = BIT_DEPTH_LUMA;
    pu1_src_org = pu1_src;
    pu1_src_top_cpy = pu1_src_top;
    pu1_src_left_cpy2 = au1_src_left_tmp;
    pu1_src_left_cpy = au1_src_left_tmp;
    edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
    /* If top-right is available, process separately */
    if(0 != pu1_avail[5])
    {
        WORD32 edge_idx;

        /* U */
        edge_idx = 2 + SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) +
                   SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]);

        edge_idx = gi1_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_wd_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
        }

        /* V */
        edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) +
                   SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]);

        edge_idx = gi1_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_wd_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
        }
    }
    else
    {
        u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
        u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
    }

    /* If bottom-left is available, process separately */
    if(0 != pu1_avail[6])
    {
        WORD32 edge_idx;

        /* U */
        edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd]) +
                   SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);

        edge_idx = gi1_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_0_ht_tmp_u = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
        }

        /* V */
        edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]) +
                   SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]);

        edge_idx = gi1_table_edge_idx[edge_idx];

        if(0 != edge_idx)
        {
            u1_pos_0_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
        }
        else
        {
            u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
        }
    }
    else
    {
        u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
        u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
    }

    /* Update height and source pointers based on the availability flags */
    if(0 == pu1_avail[2])
    {
        pu1_src_left_cpy2 += 2;
        pu1_src_top_cpy = pu1_src;
        pu1_src += src_strd;
        ht--;
    }
    if(0 == pu1_avail[3])
    {
        ht--;
    }

    sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
    const2_16x8b = _mm_set1_epi8(2);
    const0_16x8b = _mm_setzero_si128();

    //availability mask creation
    u1_avail0 = pu1_avail[0];
    u1_avail1 = pu1_avail[1];
    au1_mask[0] = u1_avail0;
    au1_mask[1] = u1_avail0;
    au1_mask[wd - 1] = u1_avail1;
    au1_mask[wd - 2] = u1_avail1;
    {
        WORD32 ht_rem;
        au1_mask_cpy = au1_mask;
        for(col = wd; col >= 16; col -= 16)
        {
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 2));
            //row = 0
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));

            //loading the mask
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //combining the appropriate sign change
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
            pu1_src_left_cpy = pu1_src_left_cpy2;

            for(row = ht; row >= 2; row -= 2)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
                //row = 1
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                //to insert left in row 1
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
                // row = 0 right
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2));

                //manipulation for row 1 - row 0
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
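                /* Same diagonal sign computation as the luma kernel, but all
                 * lateral shifts move by one U/V pair (2 bytes): slli by 12
                 * and alignr by 14 here correspond to the luma shifts of 14
                 * and 15, since the left-history buffer holds two bytes per
                 * row for interleaved chroma. */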
                //row 0 - row 1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);

                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
                //combining sign-left and sign_right
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);

                //row 1 - row 0
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);

                // row = 2
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
                // row = 1 right
                signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)

                //bottom - row 1
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //for the next iteration bottom - row 1
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //to insert left in row 1
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10);
                //manipulation for row 1 - bottom
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);

                //row 1 - bottom
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //combining sign-left and sign_right
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);

                //eliminating old left for row 0 and row 1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
                //row 1 getting it right for left of next block
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
                //row 0 getting it right for left of next block
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
                //copying the next top
                src_top_16x8b = src_temp1_16x8b;

                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);

                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //convert to 16 bit, add, then saturating pack
                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
                //store left boundary
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
                //row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);

                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);
                pu1_src_left_cpy += 4;
            }
            ht_rem = ht & 0x1;

            if(ht_rem)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                //to insert left in row 1
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
                //manipulation for row 1 - row 0
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);

                //current row - next row
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and bottom and constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                //eliminating old left for row 0 and row 1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
                //row 0 getting it right for left of next block
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
                //copying the next top
                src_top_16x8b = src_temp0_16x8b;

                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);

                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);

                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //convert to 16 bit, add, then saturating pack
                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                //store left boundary
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);

                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                pu1_src_cpy += (src_strd);
                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_left_cpy += 2;
            }
            { //for bottom right
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
            }
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
            }
            //for the top left of next part of the block
            left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
            //updating top flag
            _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
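            /* Note: unlike the luma variant, this function writes the updated
             * left-boundary history back through pu1_src_left_cpy in place
             * rather than through a second, swapped string buffer. */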
            pu1_src += 16;
            au1_mask_cpy += 16;
        }
        pu1_src_left_cpy = pu1_src_left_cpy2;
        wd_rem = wd & 0xF;
        if(wd_rem)
        {
            pu1_src_cpy = pu1_src;
            src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 2));
            //row = 0
            src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //mask for the remaining (wd & 0xF) columns
            //separating +ve and -ve values.
            cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
            cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
            //creating mask 00 for +ve and -ve values and FF for zero.
            cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
            cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
            //preparing au1_mask
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
            //combining the appropriate sign change
            signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
            signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
            pu1_src_left_cpy = pu1_src_left_cpy2;
            for(row = ht; row >= 4; row -= 4)
            {
                left_store_16x8b = _mm_loadu_si128((__m128i *)pu1_src_left_cpy);
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
                //manipulation for row 0 - row 1
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
                //row 1 left
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
                //row 0 - row 1
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //manipulating for row 1 - row 0
                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //row 1 - row 0
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //row 1 - row 0
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)

                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
                //manipulation for row 1 - row 2
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10);
                //row 2 left
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
                //packing row 0 and row 1
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
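                /*
                 * In this narrow (wd & 0xF) tail, each row contributes only 8
                 * output bytes, so rows are filtered four at a time and row
                 * pairs are packed into a single XMM register with
                 * _mm_unpacklo_epi64 (rows 0 and 1 above); the 8-byte sign
                 * vectors are carried in register halves the same way.
                 */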
                //row 1 - row 2
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)

                //row 1 right
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
                //row = 3
                src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));

                // row = 4
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));

                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty

                //separating +ve and -ve values. (2,1)
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //row 2 right
                signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
                //combining the appropriate sign change
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)

                //separating +ve and -ve values. (3,2)
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //manipulation for row 2 - row 3
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 8);
                //row 3 left
                signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)

                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2),(2-1)

                //separating +ve and -ve values. (2,3)
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);

                //manipulation for row 3 - bottom
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 6);
                //bottom left
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
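                /*
                 * signup0_16x8b is used as a running register: each pair of
                 * 8-byte sign vectors is shifted left and recombined with
                 * _mm_alignr_epi8, so one register always carries the two
                 * neighbour-difference signs needed by the current row pair.
                 */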
                //separating +ve and -ve values. (3,bottom)
                cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
                edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)

                //eliminating old left for row 0,1,2,3
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8);
                //packing row 2 and row 3
                src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
                //row 3 right
                signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2);
                //loading row 3 right into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14);
                //adding bottom and top values of row 2 and row 3
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
                //separating +ve and -ve values. (bottom,3)
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
                //to store right of row 2
                signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom-3) for next iteration

                //storing right of row 2 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
                //to store right of row 0
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                //storing right of row 1 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
                //storing right of row 0 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);

                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);

                //convert to 16 bit, add, then pack with saturation
                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
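                /*
                 * Note that cmp_gt0_16x8b/cmp_lt0_16x8b are recycled above as
                 * scratch registers for the widened low halves, while the high
                 * halves reuse the source registers themselves, so the
                 * clip-and-pack needs no extra temporaries.
                 */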
                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
                src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
                edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
                src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
                cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                //row = 2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
                // row = 3
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);

                src_temp0_16x8b = src_temp1_16x8b;
                signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
                pu1_src_cpy += (src_strd << 2);
                pu1_src_left_cpy += 8;
            }
            ht_rem = ht & 0x2;
            if(ht_rem)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));

                //manipulation for row 0 - row 1
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
                //bottom left
                signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //manipulation for row 1 - row 0
                signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //row 1 - row 0
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

                //manipulation for row 1 - bottom
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10);
                //bottom left
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);

                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
                signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
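                /*
                 * As in the main loop, the two 8-byte sign vectors for this
                 * row pair, (1-0) and (0-top), are packed into one register
                 * with _mm_alignr_epi8 before the bottom-neighbour signs are
                 * computed below.
                 */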
                //row 1 - bottom
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
                edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)

                //manipulation for bottom - row 1 (row 1 right)
                signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
                //adding top and bottom subtractions
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
                //bottom - row 1
                cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);

                //eliminating old left for row 0,1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
                signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //for the next iteration signup0_16x8b
                signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next

                //storing right of row 1 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
                //for storing right of row 0
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);

                src_top_16x8b = src_temp1_16x8b;
                //storing right of row 0 into left
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);

                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);

                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                //the next top is already in src_top_16x8b
                //convert to 16 bit, add, then pack with saturation
                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);

                cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);

                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_cpy += (src_strd << 1);
                pu1_src_left_cpy += 4;
            }
            ht_rem = ht & 0x1;
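            /*
             * ht & 0x1 tail: a single leftover row, classified against its
             * bottom-left neighbour and the sign_up carried in from above;
             * only the low 8 bytes of edge0_16x8b are kept (the slli/srli
             * pair below), since just one row of indices is valid.
             */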
            if(ht_rem)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));

                //manipulation for row 0 - bottom
                signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
                //bottom left
                signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                //adding top and bottom subtractions
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
                //for row 0 right to put into left store
                signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
                edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
                //left store manipulation 1
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
                //filling the left boundary value
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
                src_top_16x8b = src_temp0_16x8b;

                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                //adding chroma offset to access U and V
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

                //convert to 16 bit, add, then pack with saturation
                signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);

                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
                //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                pu1_src_cpy += (src_strd);
                src_temp0_16x8b = src_bottom_16x8b;
                pu1_src_left_cpy += 2;
            }
            { //for bottom right
                left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
                left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
                src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
                _mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
            }
            if(0 == pu1_avail[3])
            {
                src_top_16x8b = src_bottom_16x8b;
            }

            _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
            pu1_src += 8;
        }
        pu1_src_org[wd - 2] = u1_pos_wd_0_tmp_u;
        pu1_src_org[wd - 1] = u1_pos_wd_0_tmp_v;
        pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp_u;
        pu1_src_org[(ht_tmp - 1) * src_strd + 1] = u1_pos_0_ht_tmp_v;
        for(row = 0; row < 2 * ht_tmp; row++)
        {
            pu1_src_left[row] = au1_src_left_tmp[row];
        }
    }

}
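
/*
 * For reference, a scalar sketch of the class 3 (45 degree) chroma edge
 * offset that the SSSE3 kernels above vectorise. This is an illustrative
 * rewrite, not code from this library: edge_idx_remap stands for the remap
 * table loaded into edge_idx_8x16b, SIGN(x) for ((x) > 0) - ((x) < 0), and
 * CLIP3(x, min, max) is assumed as defined in ihevc_macros.h. With
 * interleaved UV samples, the diagonal neighbours of a sample sit one
 * pixel pair (2 bytes) away:
 *
 *     UWORD8 *pu1_p = pu1_src + row * src_strd + 2 * col + plane; // plane: 0 = U, 1 = V
 *     WORD32 sign_up = SIGN(pu1_p[0] - pu1_p[-src_strd + 2]);     // top-right
 *     WORD32 sign_dn = SIGN(pu1_p[0] - pu1_p[src_strd - 2]);      // bottom-left
 *     WORD32 edge_idx = edge_idx_remap[2 + sign_up + sign_dn];
 *     WORD8 *pi1_offset = plane ? pi1_sao_offset_v : pi1_sao_offset_u;
 *     pu1_p[0] = CLIP3(pu1_p[0] + pi1_offset[edge_idx], 0, 255);
 */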