1 /****************************************************************************** 2 * 3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ******************************************************************************/ 18 /** 19 ******************************************************************************* 20 * @file 21 * ihevc_intra_pred_filters_x86_intr.c 22 * 23 * @brief 24 * Contains function Definition for intra prediction interpolation filters 25 * 26 * 27 * @author 28 * Ittiam 29 * 30 * @par List of Functions: 31 * - ihevc_intra_pred_ref_filtering_sse42() 32 * - ihevc_intra_pred_luma_dc_sse42() 33 * - ihevc_intra_pred_luma_horz_sse42() 34 * - ihevc_intra_pred_luma_ver_sse42() 35 * - ihevc_intra_pred_luma_mode_3_to_9_sse42() 36 * - ihevc_intra_pred_luma_mode_11_to_17_sse42() 37 * - ihevc_intra_pred_luma_mode_19_to_25_sse42() 38 * - ihevc_intra_pred_luma_mode_27_to_33_sse42() 39 * 40 * @remarks 41 * None 42 * 43 ******************************************************************************* 44 */ 45 46 47 /*****************************************************************************/ 48 /* File Includes */ 49 /*****************************************************************************/ 50 #include <stdlib.h> 51 52 #include "ihevc_typedefs.h" 53 #include "ihevc_intra_pred.h" 54 #include "ihevc_macros.h" 55 #include "ihevc_func_selector.h" 56 #include "ihevc_platform_macros.h" 57 
#include "ihevc_common_tables.h"
#include "ihevc_defs.h"
#include "ihevc_tables_x86_intr.h"

#include <immintrin.h>

/****************************************************************************/
/* Constant Macros                                                          */
/****************************************************************************/
#define MAX_CU_SIZE 64
#define BIT_DEPTH 8
#define T32_4NT 128
#define T16_4NT 64


/****************************************************************************/
/* Function Macros                                                          */
/****************************************************************************/
/* Evaluates to non-zero iff bit 'x' of 'y' is set (boolean result, not the
   bit value itself). NOTE(review): 'x' is not parenthesized in the
   expansion — safe for the simple arguments used here, but fragile. */
#define GET_BITS(y,x) ((y) & (1 << x)) && (1 << x)

/* tables to shuffle 8-bit values */

/*****************************************************************************/
/* global tables Definition                                                  */
/*****************************************************************************/



/*****************************************************************************/
/* Function Definition                                                       */
/*****************************************************************************/

/**
*******************************************************************************
*
* @brief
*  Intra prediction interpolation filter for ref_filtering
*
*
* @par Description:
*  Reference DC filtering for neighboring samples dependent on TU size and
*  mode. Refer to section 8.4.4.2.3 in the standard
*
* @param[in] pu1_src
*  UWORD8 pointer to the source (4*nt + 1 reference samples)
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination (may alias pu1_src)
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/


void ihevc_intra_pred_ref_filtering_sse42(UWORD8 *pu1_src,
                                          WORD32 nt,
                                          UWORD8 *pu1_dst,
                                          WORD32 mode,
                                          WORD32 strong_intra_smoothing_enable_flag)
{
    WORD32 filter_flag;
    WORD32 i; /* Generic indexing variable */
    WORD32 four_nt = 4 * nt;
    /* Scratch buffer for the filtered samples; sized for the largest CU */
    UWORD8 au1_flt[(4 * MAX_CU_SIZE) + 1];
    WORD32 bi_linear_int_flag = 0;
    WORD32 abs_cond_left_flag = 0;
    WORD32 abs_cond_top_flag = 0;
    /* Flatness threshold for the strong-smoothing decision (8 for 8-bit) */
    WORD32 dc_val = 1 << (BIT_DEPTH - 5);
    __m128i src_temp1, src_temp2, src_temp3, src_temp7;
    __m128i src_temp4, src_temp5, src_temp6, src_temp8;

    //WORD32 strong_intra_smoothing_enable_flag = 1;



    /* Per-mode table lookup decides whether filtering applies at this TU
       size; bit (CTZ(nt) - 2) selects the entry for nt = 4/8/16/32 */
    filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));
    if(0 == filter_flag)
    {
        /* No filtering: copy the 4*nt + 1 reference samples unchanged.
           (In-place call needs no work at all.) */
        if(pu1_src == pu1_dst)
        {
            return;
        }
        else
        {
            if(nt == 4)
            {
                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
                pu1_dst[four_nt] = pu1_src[four_nt];

            }

            else if(nt == 8)
            {

                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16));

                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);


                pu1_dst[four_nt] = pu1_src[four_nt];
            }
            else if(nt == 16)
            {

                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16));
                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_src + 32));
                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 48));

                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);

                pu1_dst[four_nt] = pu1_src[four_nt];
            }
            else if(nt == 32)
            {

                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src));
                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16));
                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_src + 32));
                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 48));

                src_temp5 = _mm_loadu_si128((__m128i *)(pu1_src + 64));
                src_temp6 = _mm_loadu_si128((__m128i *)(pu1_src + 80));
                src_temp7 = _mm_loadu_si128((__m128i *)(pu1_src + 96));
                src_temp8 = _mm_loadu_si128((__m128i *)(pu1_src + 112));

                _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);

                _mm_storeu_si128((__m128i *)(pu1_dst + 64), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + 80), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + 96), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + 112), src_temp8);

                pu1_dst[four_nt] = pu1_src[four_nt];
            }

        }
    }

    else
    {
        /* If strong intra smoothing is enabled and transform size is 32 */
        if((1 == strong_intra_smoothing_enable_flag) && (32 == nt))
        {
            /* Strong Intra Filtering: enabled only when both the top and
               left reference rows are close to linear (second difference
               at the midpoints below the dc_val threshold) */
            abs_cond_top_flag = (abs(pu1_src[2 * nt] + pu1_src[4 * nt]
                                     - (2 * pu1_src[3 * nt]))) < dc_val;
            abs_cond_left_flag = (abs(pu1_src[2 * nt] + pu1_src[0]
                                      - (2 * pu1_src[nt]))) < dc_val;

            bi_linear_int_flag = ((1 == abs_cond_left_flag)
                            && (1 == abs_cond_top_flag));
        }
        /* Extremities Untouched*/
        au1_flt[0] = pu1_src[0];
        au1_flt[4 * nt] = pu1_src[4 * nt];

        /* Strong filtering of reference samples: pure bi-linear ramp between
           the three corner samples. Only reachable for nt == 32, where
           2*nt == 64, so '+32 >> 6' is the matching rounding/normalisation */
        if(1 == bi_linear_int_flag)
        {
            au1_flt[2 * nt] = pu1_src[2 * nt];

            for(i = 1; i < (2 * nt); i++)
                au1_flt[i] = (((2 * nt) - i) * pu1_src[0] + i * pu1_src[2 * nt] + 32) >> 6;

            for(i = 1; i < (2 * nt); i++)
                au1_flt[i + (2 * nt)] = (((2 * nt) - i) * pu1_src[2 * nt] + i * pu1_src[4 * nt] + 32) >> 6;
        }
        else
        {
            __m128i const_value_8x16;

            const_value_8x16 = _mm_set1_epi16(2);

            au1_flt[0] = pu1_src[0];
            au1_flt[4 * nt] = pu1_src[4 * nt];

            /* Perform bilinear filtering of Reference Samples:
               au1_flt[i + 1] = (src[i] + 2*src[i + 1] + src[i + 2] + 2) >> 2,
               16 outputs per iteration, computed as two 8-wide halves.
               NOTE(review): the second half reads up to pu1_src[i + 8 + 17],
               i.e. a few bytes past 4*nt + 1 on the final iteration —
               presumably the caller's reference buffer is padded; confirm. */
            for(i = 0; i < (four_nt); i += 16)
            {
                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src + i));
                src_temp2 = _mm_srli_si128(src_temp1, 1);
                src_temp3 = _mm_srli_si128(src_temp2, 1);

                src_temp1 = _mm_cvtepu8_epi16(src_temp1);
                src_temp2 = _mm_cvtepu8_epi16(src_temp2);
                src_temp3 = _mm_cvtepu8_epi16(src_temp3);

                src_temp2 = _mm_slli_epi16(src_temp2, 1);

                src_temp1 = _mm_add_epi16(src_temp1, src_temp2);
                src_temp1 = _mm_add_epi16(src_temp1, src_temp3);
                src_temp1 = _mm_add_epi16(src_temp1, const_value_8x16);

                src_temp1 = _mm_srai_epi16(src_temp1, 2);

                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 8 + i));
                src_temp5 = _mm_srli_si128(src_temp4, 1);
                src_temp6 = _mm_srli_si128(src_temp5, 1);

                src_temp4 = _mm_cvtepu8_epi16(src_temp4);
                src_temp5 = _mm_cvtepu8_epi16(src_temp5);
                src_temp6 = _mm_cvtepu8_epi16(src_temp6);

                src_temp5 = _mm_slli_epi16(src_temp5, 1);

                src_temp4 = _mm_add_epi16(src_temp4, src_temp5);
                src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
                src_temp4 = _mm_add_epi16(src_temp4, const_value_8x16);

                src_temp4 = _mm_srai_epi16(src_temp4, 2);

                /* converting 16 bit to 8 bit */
                src_temp1 = _mm_packus_epi16(src_temp1, src_temp4);

                _mm_storeu_si128((__m128i *)(au1_flt + 1 + i), src_temp1);
            }
            au1_flt[4 * nt] = pu1_src[4 * nt];
        }

        /* Copy the 4*nt + 1 filtered samples out to the destination */
        if(nt == 4)
        {
            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
            pu1_dst[four_nt] = au1_flt[four_nt];
        }
        else if(nt == 8)
        {

            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
            src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16));

            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);

            pu1_dst[four_nt] = au1_flt[four_nt];
        }
        else if(nt == 16)
        {

            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
            src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16));
            src_temp3 = _mm_loadu_si128((__m128i *)(au1_flt + 32));
            src_temp4 = _mm_loadu_si128((__m128i *)(au1_flt + 48));

            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
            _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);

            pu1_dst[four_nt] = au1_flt[four_nt];
        }

        else if(nt == 32)
        {

            src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt));
            src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16));
            src_temp3 = _mm_loadu_si128((__m128i *)(au1_flt + 32));
            src_temp4 = _mm_loadu_si128((__m128i *)(au1_flt + 48));

            src_temp5 = _mm_loadu_si128((__m128i *)(au1_flt + 64));
            src_temp6 = _mm_loadu_si128((__m128i *)(au1_flt + 80));
            src_temp7 = _mm_loadu_si128((__m128i *)(au1_flt + 96));
            src_temp8 = _mm_loadu_si128((__m128i *)(au1_flt + 112));

            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3);
            _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4);

            _mm_storeu_si128((__m128i *)(pu1_dst + 64), src_temp5);
            _mm_storeu_si128((__m128i *)(pu1_dst + 80), src_temp6);
            _mm_storeu_si128((__m128i *)(pu1_dst + 96), src_temp7);
            _mm_storeu_si128((__m128i *)(pu1_dst + 112), src_temp8);

            pu1_dst[four_nt] = au1_flt[four_nt];
        }

    }
}



/**
*******************************************************************************
*
* @brief
*  Intra prediction interpolation filter for luma dc
*
* @par Description:
*   Intraprediction for DC mode with reference neighboring samples location
*   pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' Refer
*   to section 8.4.4.2.5 in the standard
*
* @param[in] pu1_src
*
UWORD8 pointer to the source 373 * 374 * @param[out] pu1_dst 375 * UWORD8 pointer to the destination 376 * 377 * @param[in] src_strd 378 * integer source stride 379 * 380 * @param[in] dst_strd 381 * integer destination stride 382 * 383 * @param[in] nt 384 * integer Transform Block size 385 * 386 * @param[in] mode 387 * integer intraprediction mode 388 * 389 * @returns 390 * 391 * @remarks 392 * None 393 * 394 ******************************************************************************* 395 */ 396 397 void ihevc_intra_pred_luma_dc_sse42(UWORD8 *pu1_ref, 398 WORD32 src_strd, 399 UWORD8 *pu1_dst, 400 WORD32 dst_strd, 401 WORD32 nt, 402 WORD32 mode) 403 { 404 405 WORD32 acc_dc; 406 WORD32 dc_val, two_dc_val, three_dc_val; 407 WORD32 row; 408 WORD32 log2nt = 5; 409 WORD32 two_nt, three_nt; 410 __m128i src_temp1, src_temp7, src_temp3, src_temp4, src_temp5, src_temp6; 411 __m128i src_temp8, src_temp9, src_temp10, src_temp2; 412 __m128i m_zero = _mm_set1_epi32(0); 413 __m128i sm = _mm_loadu_si128((__m128i *)&IHEVCE_SHUFFLEMASK5[0]); 414 UNUSED(src_strd); 415 UNUSED(mode); 416 417 418 switch(nt) 419 { 420 case 32: 421 log2nt = 5; 422 break; 423 case 16: 424 log2nt = 4; 425 break; 426 case 8: 427 log2nt = 3; 428 break; 429 case 4: 430 log2nt = 2; 431 break; 432 default: 433 break; 434 } 435 two_nt = 2 * nt; 436 three_nt = 3 * nt; 437 438 acc_dc = 0; 439 /* Calculate DC value for the transform block */ 440 441 442 443 if(nt == 32) 444 { 445 __m128i temp; 446 WORD32 itr_count; 447 448 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt)); 449 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 16)); 450 src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 32)); 451 src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 48)); 452 453 src_temp3 = _mm_sad_epu8(src_temp3, m_zero); 454 src_temp4 = _mm_sad_epu8(src_temp4, m_zero); 455 src_temp7 = _mm_sad_epu8(src_temp7, m_zero); 456 src_temp8 = _mm_sad_epu8(src_temp8, m_zero); 457 458 src_temp4 = _mm_add_epi16(src_temp3, 
src_temp4); 459 src_temp8 = _mm_add_epi16(src_temp7, src_temp8); 460 src_temp4 = _mm_add_epi16(src_temp4, src_temp8); 461 462 src_temp4 = _mm_shuffle_epi8(src_temp4, sm); 463 src_temp4 = _mm_hadd_epi16(src_temp4, m_zero); 464 465 acc_dc = _mm_cvtsi128_si32(src_temp4); 466 467 acc_dc += pu1_ref[three_nt]; 468 acc_dc -= pu1_ref[two_nt]; 469 470 /* computing acc_dc value */ 471 dc_val = (acc_dc + nt) >> (log2nt + 1); 472 473 two_dc_val = 2 * dc_val; 474 three_dc_val = 3 * dc_val; 475 476 temp = _mm_set1_epi8(dc_val); 477 478 for(itr_count = 0; itr_count < 2; itr_count++) 479 { 480 /* pu1_dst[(row * dst_strd) + col] = dc_val;*/ 481 _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), temp); 482 _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), temp); 483 _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), temp); 484 _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), temp); 485 _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), temp); 486 _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), temp); 487 _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), temp); 488 _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), temp); 489 490 _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), temp); 491 _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), temp); 492 _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), temp); 493 _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), temp); 494 _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), temp); 495 _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), temp); 496 _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), temp); 497 _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), temp); 498 499 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((0) * dst_strd)), temp); 500 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((1) * dst_strd)), temp); 501 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((2) * dst_strd)), temp); 502 
_mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((3) * dst_strd)), temp); 503 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((4) * dst_strd)), temp); 504 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((5) * dst_strd)), temp); 505 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((6) * dst_strd)), temp); 506 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((7) * dst_strd)), temp); 507 508 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((8) * dst_strd)), temp); 509 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((9) * dst_strd)), temp); 510 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((10) * dst_strd)), temp); 511 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((11) * dst_strd)), temp); 512 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((12) * dst_strd)), temp); 513 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((13) * dst_strd)), temp); 514 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((14) * dst_strd)), temp); 515 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((15) * dst_strd)), temp); 516 517 pu1_dst += 16 * dst_strd; 518 } 519 } 520 521 else 522 523 { 524 __m128i zero_8x16b; 525 __m128i sm1 = _mm_loadu_si128((__m128i *)&IHEVCE_SHUFFLEMASK4[0]); 526 527 /* DC filtering for the first top row and first left column */ 528 529 zero_8x16b = _mm_set1_epi16(0); 530 531 if(nt == 4) /* nt multiple of 4*/ 532 { 533 WORD32 temp1, temp2, temp3; 534 535 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt)); 536 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); 537 538 src_temp4 = _mm_cvtepu8_epi16(src_temp3); 539 src_temp2 = _mm_cvtepu8_epi16(src_temp2); 540 541 src_temp4 = _mm_hadd_epi16(src_temp4, m_zero); 542 src_temp4 = _mm_hadd_epi16(src_temp4, m_zero); 543 src_temp4 = _mm_hadd_epi16(src_temp4, m_zero); 544 545 acc_dc = _mm_cvtsi128_si32(src_temp4); 546 acc_dc += pu1_ref[three_nt]; 547 acc_dc -= pu1_ref[two_nt]; 548 549 /* computing acc_dc value */ 550 551 dc_val = (acc_dc + nt) >> (log2nt + 1); 552 553 three_dc_val = 3 * dc_val; 554 555 /* loding 8-bit 16 pixel */ 556 src_temp1 = 
_mm_set1_epi16(three_dc_val + 2); 557 two_dc_val = 2 * dc_val; 558 559 /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */ 560 src_temp2 = _mm_add_epi16(src_temp2, src_temp1); 561 562 /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2) >> 2 */ 563 src_temp2 = _mm_srli_epi16(src_temp2, 2); 564 565 src_temp2 = _mm_packus_epi16(src_temp2, zero_8x16b); 566 567 temp1 = _mm_cvtsi128_si32(src_temp2); 568 569 *(WORD32 *)(&pu1_dst[0]) = temp1; 570 571 /* retore first value*/ 572 pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) 573 >> 2); 574 575 for(row = 1; row < nt; row++) 576 pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2) 577 >> 2; 578 579 src_temp2 = _mm_insert_epi8(src_temp2, dc_val, 0); 580 581 src_temp2 = _mm_shuffle_epi8(src_temp2, sm1); 582 src_temp3 = _mm_shuffle_epi8(src_temp2, sm1); 583 src_temp4 = _mm_shuffle_epi8(src_temp2, sm1); 584 585 src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[(1 * dst_strd) + 0], 0); 586 src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[(2 * dst_strd) + 0], 0); 587 src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[(3 * dst_strd) + 0], 0); 588 589 temp1 = _mm_cvtsi128_si32(src_temp2); 590 temp2 = _mm_cvtsi128_si32(src_temp3); 591 temp3 = _mm_cvtsi128_si32(src_temp4); 592 593 *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp1; 594 *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp2; 595 *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp3; 596 597 } 598 else if(nt == 8) /* if nt%8==0*/ 599 { 600 601 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt)); 602 603 src_temp4 = _mm_sad_epu8(src_temp3, m_zero); 604 src_temp4 = _mm_shuffle_epi8(src_temp4, sm); 605 src_temp4 = _mm_hadd_epi16(src_temp4, m_zero); 606 607 acc_dc = _mm_cvtsi128_si32(src_temp4); 608 609 acc_dc += pu1_ref[three_nt]; 610 acc_dc -= pu1_ref[two_nt]; 611 612 /* computing acc_dc value */ 613 614 dc_val = (acc_dc + nt) >> (log2nt + 1); 615 616 three_dc_val = 3 * dc_val; 617 src_temp1 = _mm_set1_epi16(three_dc_val + 2); 618 two_dc_val = 2 * dc_val; 
619 620 /* loding 8-bit 16 pixel */ 621 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); 622 src_temp2 = _mm_cvtepu8_epi16(src_temp2); 623 624 /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */ 625 src_temp2 = _mm_add_epi16(src_temp2, src_temp1); 626 627 /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2)>>2 */ 628 src_temp2 = _mm_srli_epi16(src_temp2, 2); 629 src_temp2 = _mm_packus_epi16(src_temp2, zero_8x16b); 630 631 _mm_storel_epi64((__m128i *)(pu1_dst), src_temp2); 632 633 /* retore first value*/ 634 pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) 635 >> 2); 636 637 for(row = 1; row < nt; row++) 638 pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2) 639 >> 2; 640 641 /* Fill the remaining rows with DC value*/ 642 643 src_temp1 = _mm_set1_epi8(dc_val); 644 src_temp2 = _mm_set1_epi8(dc_val); 645 src_temp3 = _mm_set1_epi8(dc_val); 646 src_temp4 = _mm_set1_epi8(dc_val); 647 src_temp5 = _mm_set1_epi8(dc_val); 648 src_temp6 = _mm_set1_epi8(dc_val); 649 src_temp7 = _mm_set1_epi8(dc_val); 650 651 src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((1) * dst_strd)], 0); 652 src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((2) * dst_strd)], 0); 653 src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((3) * dst_strd)], 0); 654 src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((4) * dst_strd)], 0); 655 src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[((5) * dst_strd)], 0); 656 src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((6) * dst_strd)], 0); 657 src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((7) * dst_strd)], 0); 658 659 _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1); 660 _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2); 661 _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3); 662 _mm_storel_epi64((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4); 663 _mm_storel_epi64((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5); 664 _mm_storel_epi64((__m128i *)(pu1_dst 
+ ((6) * dst_strd)), src_temp6); 665 _mm_storel_epi64((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7); 666 667 } 668 else if(nt == 16) /* if nt%8==0*/ 669 { 670 671 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt)); 672 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 16)); 673 674 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); 675 src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 8)); 676 677 src_temp3 = _mm_sad_epu8(src_temp3, m_zero); 678 src_temp4 = _mm_sad_epu8(src_temp4, m_zero); 679 680 src_temp2 = _mm_cvtepu8_epi16(src_temp2); 681 src_temp10 = _mm_cvtepu8_epi16(src_temp10); 682 683 src_temp4 = _mm_add_epi16(src_temp3, src_temp4); 684 src_temp4 = _mm_shuffle_epi8(src_temp4, sm); 685 src_temp4 = _mm_hadd_epi16(src_temp4, m_zero); 686 687 acc_dc = _mm_cvtsi128_si32(src_temp4); 688 689 acc_dc += pu1_ref[three_nt]; 690 acc_dc -= pu1_ref[two_nt]; 691 692 /* computing acc_dc value */ 693 694 dc_val = (acc_dc + nt) >> (log2nt + 1); 695 696 three_dc_val = 3 * dc_val; 697 src_temp1 = _mm_set1_epi16(three_dc_val + 2); 698 two_dc_val = 2 * dc_val; 699 700 /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */ 701 src_temp2 = _mm_add_epi16(src_temp2, src_temp1); 702 src_temp10 = _mm_add_epi16(src_temp10, src_temp1); 703 /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2)>>2 */ 704 src_temp2 = _mm_srli_epi16(src_temp2, 2); 705 src_temp10 = _mm_srli_epi16(src_temp10, 2); 706 707 src_temp2 = _mm_packus_epi16(src_temp2, src_temp10); 708 709 _mm_storeu_si128((__m128i *)(pu1_dst), src_temp2); 710 711 /* retore first value*/ 712 pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) 713 >> 2); 714 715 for(row = 1; row < nt; row++) 716 pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2) 717 >> 2; 718 /* Fill the remaining rows with DC value*/ 719 src_temp1 = _mm_set1_epi8(dc_val); 720 src_temp2 = _mm_set1_epi8(dc_val); 721 src_temp3 = _mm_set1_epi8(dc_val); 722 src_temp4 = _mm_set1_epi8(dc_val); 723 
src_temp5 = _mm_set1_epi8(dc_val); 724 src_temp6 = _mm_set1_epi8(dc_val); 725 src_temp7 = _mm_set1_epi8(dc_val); 726 727 for(row = 1; row < nt; row += 8) 728 { 729 730 src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((1) * dst_strd)], 0); 731 src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((2) * dst_strd)], 0); 732 src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((3) * dst_strd)], 0); 733 src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((4) * dst_strd)], 0); 734 src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[((5) * dst_strd)], 0); 735 src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((6) * dst_strd)], 0); 736 src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((7) * dst_strd)], 0); 737 738 _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1); 739 _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2); 740 _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3); 741 _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4); 742 _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5); 743 _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6); 744 _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7); 745 746 src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((8) * dst_strd)], 0); 747 src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((9) * dst_strd)], 0); 748 src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((10) * dst_strd)], 0); 749 src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((11) * dst_strd)], 0); 750 src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[((12) * dst_strd)], 0); 751 src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((13) * dst_strd)], 0); 752 src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((14) * dst_strd)], 0); 753 754 _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), src_temp1); 755 _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), src_temp2); 756 _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), src_temp3); 757 758 src_temp1 = 
_mm_insert_epi8(src_temp1, pu1_dst[((15) * dst_strd)], 0); 759 760 _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), src_temp4); 761 _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), src_temp5); 762 _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), src_temp6); 763 _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), src_temp7); 764 765 _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), src_temp1); 766 767 } 768 769 } 770 else if(nt == 32) /* if nt%8==0*/ 771 { 772 773 __m128i src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16, src_temp17; 774 775 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt)); 776 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 16)); 777 src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 32)); 778 src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 48)); 779 780 /* loding 8-bit 16 pixel */ 781 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); 782 src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 8)); 783 src_temp9 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 16)); 784 src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 24)); 785 786 src_temp3 = _mm_sad_epu8(src_temp3, m_zero); 787 src_temp4 = _mm_sad_epu8(src_temp4, m_zero); 788 src_temp7 = _mm_sad_epu8(src_temp7, m_zero); 789 src_temp8 = _mm_sad_epu8(src_temp8, m_zero); 790 791 src_temp2 = _mm_cvtepu8_epi16(src_temp2); 792 src_temp6 = _mm_cvtepu8_epi16(src_temp6); 793 src_temp9 = _mm_cvtepu8_epi16(src_temp9); 794 src_temp10 = _mm_cvtepu8_epi16(src_temp10); 795 796 src_temp4 = _mm_add_epi16(src_temp3, src_temp4); 797 src_temp8 = _mm_add_epi16(src_temp7, src_temp8); 798 src_temp4 = _mm_add_epi16(src_temp4, src_temp8); 799 800 src_temp4 = _mm_shuffle_epi8(src_temp4, sm); 801 src_temp4 = _mm_hadd_epi16(src_temp4, m_zero); 802 803 acc_dc = _mm_cvtsi128_si32(src_temp4); 804 805 acc_dc += pu1_ref[three_nt]; 806 acc_dc -= pu1_ref[two_nt]; 807 808 /* computing acc_dc value */ 
809 810 dc_val = (acc_dc + nt) >> (log2nt + 1); 811 812 three_dc_val = 3 * dc_val; 813 src_temp1 = _mm_set1_epi16(three_dc_val + 2); 814 two_dc_val = 2 * dc_val; 815 816 /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */ 817 src_temp2 = _mm_add_epi16(src_temp2, src_temp1); 818 src_temp2 = _mm_add_epi16(src_temp6, src_temp1); 819 src_temp2 = _mm_add_epi16(src_temp9, src_temp1); 820 src_temp2 = _mm_add_epi16(src_temp10, src_temp1); 821 822 /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2)>>2 */ 823 src_temp2 = _mm_srli_epi16(src_temp2, 2); 824 src_temp6 = _mm_srli_epi16(src_temp6, 2); 825 src_temp9 = _mm_srli_epi16(src_temp9, 2); 826 src_temp10 = _mm_srli_epi16(src_temp10, 2); 827 828 src_temp2 = _mm_packus_epi16(src_temp2, src_temp6); 829 src_temp10 = _mm_packus_epi16(src_temp9, src_temp10); 830 831 _mm_storeu_si128((__m128i *)(pu1_dst), src_temp2); 832 _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp10); 833 834 /* retore first value*/ 835 pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) 836 >> 2); 837 838 for(row = 1; row < nt; row++) 839 pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2) 840 >> 2; 841 /* Fill the remaining rows with DC value*/ 842 src_temp1 = _mm_insert_epi8(src_temp1, dc_val, 0); 843 844 src_temp2 = src_temp1; 845 src_temp3 = src_temp1; 846 src_temp4 = src_temp1; 847 src_temp5 = src_temp1; 848 src_temp6 = src_temp1; 849 src_temp7 = src_temp1; 850 851 src_temp12 = src_temp1; 852 src_temp13 = src_temp1; 853 src_temp14 = src_temp1; 854 src_temp15 = src_temp1; 855 src_temp16 = src_temp1; 856 src_temp17 = src_temp1; 857 src_temp11 = src_temp1; 858 859 for(row = 1; row < nt; row++) 860 { 861 src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((1) * dst_strd)], 0); 862 src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((2) * dst_strd)], 0); 863 src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((3) * dst_strd)], 0); 864 src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((4) * dst_strd)], 0); 865 src_temp5 = 
_mm_insert_epi8(src_temp5, pu1_dst[((5) * dst_strd)], 0); 866 src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((6) * dst_strd)], 0); 867 src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((7) * dst_strd)], 0); 868 869 _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd)), src_temp1); 870 _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd) + 16), src_temp11); 871 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp2); 872 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd) + 16), src_temp12); 873 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp3); 874 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd) + 16), src_temp13); 875 876 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp4); 877 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd) + 16), src_temp14); 878 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp5); 879 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd) + 16), src_temp15); 880 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp6); 881 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd) + 16), src_temp16); 882 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp7); 883 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd) + 16), src_temp17); 884 885 886 } 887 888 } 889 } 890 } 891 892 /** 893 ******************************************************************************* 894 * 895 * @brief 896 * Intra prediction interpolation filter for horizontal luma variable. 
897 * 898 * @par Description: 899 * Horizontal intraprediction(mode 10) with reference samples location 900 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' Refer 901 * to section 8.4.4.2.6 in the standard (Special case) 902 * 903 * @param[in] pu1_src 904 * UWORD8 pointer to the source 905 * 906 * @param[out] pu1_dst 907 * UWORD8 pointer to the destination 908 * 909 * @param[in] src_strd 910 * integer source stride 911 * 912 * @param[in] dst_strd 913 * integer destination stride 914 * 915 * @param[in] nt 916 * integer Transform Block size 917 * 918 * @param[in] mode 919 * integer intraprediction mode 920 * 921 * @returns 922 * 923 * @remarks 924 * None 925 * 926 ******************************************************************************* 927 */ 928 929 void ihevc_intra_pred_luma_horz_sse42(UWORD8 *pu1_ref, 930 WORD32 src_strd, 931 UWORD8 *pu1_dst, 932 WORD32 dst_strd, 933 WORD32 nt, 934 WORD32 mode) 935 { 936 937 WORD32 row; 938 WORD32 two_nt; 939 UNUSED(src_strd); 940 UNUSED(mode); 941 942 two_nt = 2 * nt; 943 944 945 if(nt == 32) 946 { 947 __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8; 948 __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16; 949 __m128i sm = _mm_loadu_si128((__m128i *)&IHEVCE_SHUFFLEMASK4[0]); 950 951 for(row = 0; row < nt; row += 16) 952 { 953 { 954 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 1 - row - 15)); 955 956 src_temp2 = _mm_srli_si128(src_temp1, 1); 957 src_temp3 = _mm_srli_si128(src_temp1, 2); 958 src_temp4 = _mm_srli_si128(src_temp1, 3); 959 src_temp5 = _mm_srli_si128(src_temp1, 4); 960 src_temp6 = _mm_srli_si128(src_temp1, 5); 961 src_temp7 = _mm_srli_si128(src_temp1, 6); 962 src_temp8 = _mm_srli_si128(src_temp1, 7); 963 964 src_temp9 = _mm_srli_si128(src_temp1, 8); 965 src_temp10 = _mm_srli_si128(src_temp1, 9); 966 src_temp11 = _mm_srli_si128(src_temp1, 10); 967 src_temp12 = 
_mm_srli_si128(src_temp1, 11); 968 src_temp13 = _mm_srli_si128(src_temp1, 12); 969 src_temp14 = _mm_srli_si128(src_temp1, 13); 970 src_temp15 = _mm_srli_si128(src_temp1, 14); 971 src_temp16 = _mm_srli_si128(src_temp1, 15); 972 973 src_temp8 = _mm_shuffle_epi8(src_temp8, sm); 974 src_temp7 = _mm_shuffle_epi8(src_temp7, sm); 975 src_temp6 = _mm_shuffle_epi8(src_temp6, sm); 976 src_temp5 = _mm_shuffle_epi8(src_temp5, sm); 977 src_temp4 = _mm_shuffle_epi8(src_temp4, sm); 978 src_temp3 = _mm_shuffle_epi8(src_temp3, sm); 979 src_temp2 = _mm_shuffle_epi8(src_temp2, sm); 980 src_temp1 = _mm_shuffle_epi8(src_temp1, sm); 981 982 src_temp16 = _mm_shuffle_epi8(src_temp16, sm); 983 src_temp15 = _mm_shuffle_epi8(src_temp15, sm); 984 src_temp14 = _mm_shuffle_epi8(src_temp14, sm); 985 src_temp13 = _mm_shuffle_epi8(src_temp13, sm); 986 src_temp12 = _mm_shuffle_epi8(src_temp12, sm); 987 src_temp11 = _mm_shuffle_epi8(src_temp11, sm); 988 src_temp10 = _mm_shuffle_epi8(src_temp10, sm); 989 src_temp9 = _mm_shuffle_epi8(src_temp9, sm); 990 991 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp16); 992 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp15); 993 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp14); 994 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp13); 995 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp12); 996 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp11); 997 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp10); 998 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp9); 999 1000 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 8) * dst_strd)), src_temp8); 1001 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 9) * dst_strd)), src_temp7); 1002 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 10) * dst_strd)), src_temp6); 1003 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 11) * dst_strd)), 
src_temp5); 1004 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 12) * dst_strd)), src_temp4); 1005 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 13) * dst_strd)), src_temp3); 1006 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 14) * dst_strd)), src_temp2); 1007 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 15) * dst_strd)), src_temp1); 1008 1009 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 0) * dst_strd)), src_temp16); 1010 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 1) * dst_strd)), src_temp15); 1011 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 2) * dst_strd)), src_temp14); 1012 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 3) * dst_strd)), src_temp13); 1013 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 4) * dst_strd)), src_temp12); 1014 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 5) * dst_strd)), src_temp11); 1015 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 6) * dst_strd)), src_temp10); 1016 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 7) * dst_strd)), src_temp9); 1017 1018 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 8) * dst_strd)), src_temp8); 1019 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 9) * dst_strd)), src_temp7); 1020 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 10) * dst_strd)), src_temp6); 1021 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 11) * dst_strd)), src_temp5); 1022 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 12) * dst_strd)), src_temp4); 1023 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 13) * dst_strd)), src_temp3); 1024 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 14) * dst_strd)), src_temp2); 1025 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 15) * dst_strd)), src_temp1); 1026 1027 } 1028 1029 } 1030 1031 } 1032 else 1033 1034 { 1035 __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6; 1036 __m128i src_temp10, zero_8x16b, src_temp7; 1037 1038 /* DC filtering for the first top row and first left column */ 
1039 1040 zero_8x16b = _mm_set1_epi16(0); 1041 1042 /*Filtering done for the 1st row */ 1043 1044 src_temp2 = _mm_set1_epi16(pu1_ref[two_nt - 1]); 1045 src_temp10 = _mm_set1_epi16(pu1_ref[two_nt]); 1046 1047 /* loding 8-bit 16 pixels */ 1048 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); 1049 1050 src_temp4 = _mm_cvtepu8_epi16(src_temp4); 1051 1052 /*(pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt])*/ 1053 src_temp3 = _mm_sub_epi16(src_temp4, src_temp10); 1054 1055 /* ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1)*/ 1056 src_temp3 = _mm_srai_epi16(src_temp3, 1); 1057 1058 /* pu1_ref[two_nt - 1]+((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1)*/ 1059 src_temp3 = _mm_add_epi16(src_temp2, src_temp3); 1060 1061 if(nt == 4) 1062 { 1063 int temp1, temp2, temp3; 1064 src_temp3 = _mm_packus_epi16(src_temp3, zero_8x16b); 1065 temp1 = _mm_cvtsi128_si32(src_temp3); 1066 1067 *(WORD32 *)(&pu1_dst[0]) = temp1; 1068 1069 src_temp2 = _mm_set1_epi8(pu1_ref[two_nt - 2]); 1070 src_temp3 = _mm_set1_epi8(pu1_ref[two_nt - 3]); 1071 src_temp4 = _mm_set1_epi8(pu1_ref[two_nt - 4]); 1072 1073 temp1 = _mm_cvtsi128_si32(src_temp2); 1074 temp2 = _mm_cvtsi128_si32(src_temp3); 1075 temp3 = _mm_cvtsi128_si32(src_temp4); 1076 1077 /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/ 1078 *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp1; 1079 *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp2; 1080 *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp3; 1081 1082 } 1083 else if(nt == 8) 1084 { 1085 src_temp10 = _mm_packus_epi16(src_temp3, zero_8x16b); 1086 1087 1088 src_temp1 = _mm_set1_epi8(pu1_ref[two_nt - 2]); 1089 src_temp2 = _mm_set1_epi8(pu1_ref[two_nt - 3]); 1090 src_temp3 = _mm_set1_epi8(pu1_ref[two_nt - 4]); 1091 src_temp4 = _mm_set1_epi8(pu1_ref[two_nt - 5]); 1092 src_temp5 = _mm_set1_epi8(pu1_ref[two_nt - 6]); 1093 src_temp6 = _mm_set1_epi8(pu1_ref[two_nt - 7]); 1094 src_temp7 = _mm_set1_epi8(pu1_ref[two_nt - 8]); 1095 1096 _mm_storel_epi64((__m128i *)(pu1_dst), 
src_temp10); 1097 1098 /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/ 1099 _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1); 1100 _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp2); 1101 _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp3); 1102 _mm_storel_epi64((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp4); 1103 _mm_storel_epi64((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp5); 1104 _mm_storel_epi64((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp6); 1105 _mm_storel_epi64((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp7); 1106 1107 } 1108 else if(nt == 16) 1109 { 1110 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 8)); 1111 src_temp4 = _mm_cvtepu8_epi16(src_temp4); 1112 1113 src_temp10 = _mm_sub_epi16(src_temp4, src_temp10); 1114 src_temp10 = _mm_srai_epi16(src_temp10, 1); 1115 src_temp10 = _mm_add_epi16(src_temp2, src_temp10); 1116 1117 src_temp3 = _mm_packus_epi16(src_temp3, src_temp10); 1118 _mm_storeu_si128((__m128i *)(pu1_dst), src_temp3); 1119 1120 /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/ 1121 src_temp1 = _mm_set1_epi8(pu1_ref[two_nt - 2]); 1122 src_temp2 = _mm_set1_epi8(pu1_ref[two_nt - 3]); 1123 src_temp3 = _mm_set1_epi8(pu1_ref[two_nt - 4]); 1124 src_temp4 = _mm_set1_epi8(pu1_ref[two_nt - 5]); 1125 src_temp5 = _mm_set1_epi8(pu1_ref[two_nt - 6]); 1126 src_temp6 = _mm_set1_epi8(pu1_ref[two_nt - 7]); 1127 src_temp7 = _mm_set1_epi8(pu1_ref[two_nt - 8]); 1128 src_temp10 = _mm_set1_epi8(pu1_ref[two_nt - 9]); 1129 1130 _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1); 1131 _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2); 1132 _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3); 1133 _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4); 1134 _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5); 1135 _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), 
src_temp6); 1136 _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7); 1137 _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), src_temp10); 1138 1139 src_temp1 = _mm_set1_epi8(pu1_ref[two_nt - 10]); 1140 src_temp2 = _mm_set1_epi8(pu1_ref[two_nt - 11]); 1141 src_temp3 = _mm_set1_epi8(pu1_ref[two_nt - 12]); 1142 src_temp4 = _mm_set1_epi8(pu1_ref[two_nt - 13]); 1143 src_temp5 = _mm_set1_epi8(pu1_ref[two_nt - 14]); 1144 src_temp6 = _mm_set1_epi8(pu1_ref[two_nt - 15]); 1145 src_temp7 = _mm_set1_epi8(pu1_ref[two_nt - 16]); 1146 1147 _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), src_temp1); 1148 _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), src_temp2); 1149 _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), src_temp3); 1150 _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), src_temp4); 1151 _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), src_temp5); 1152 _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), src_temp6); 1153 _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), src_temp7); 1154 1155 } 1156 } 1157 } 1158 1159 /** 1160 ******************************************************************************* 1161 * 1162 * @brief 1163 * Intra prediction interpolation filter for vertical luma variable. 
1164 * 1165 * @par Description: 1166 * Horizontal intraprediction with reference neighboring samples location 1167 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' Refer 1168 * to section 8.4.4.2.6 in the standard (Special case) 1169 * 1170 * @param[in] pu1_src 1171 * UWORD8 pointer to the source 1172 * 1173 * @param[out] pu1_dst 1174 * UWORD8 pointer to the destination 1175 * 1176 * @param[in] src_strd 1177 * integer source stride 1178 * 1179 * @param[in] dst_strd 1180 * integer destination stride 1181 * 1182 * @param[in] nt 1183 * integer Transform Block size 1184 * 1185 * @param[in] mode 1186 * integer intraprediction mode 1187 * 1188 * @returns 1189 * 1190 * @remarks 1191 * None 1192 * 1193 ******************************************************************************* 1194 */ 1195 1196 1197 void ihevc_intra_pred_luma_ver_sse42(UWORD8 *pu1_ref, 1198 WORD32 src_strd, 1199 UWORD8 *pu1_dst, 1200 WORD32 dst_strd, 1201 WORD32 nt, 1202 WORD32 mode) 1203 { 1204 WORD32 row; 1205 WORD16 s2_predpixel; 1206 WORD32 two_nt = 2 * nt; 1207 __m128i src_temp0, src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7; 1208 1209 UNUSED(src_strd); 1210 UNUSED(mode); 1211 1212 if(nt == 32) 1213 { 1214 __m128i temp1, temp2; 1215 WORD32 itr_count; 1216 1217 temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); 1218 temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 16)); 1219 1220 for(itr_count = 0; itr_count < 2; itr_count++) 1221 { 1222 /* pu1_dst[(row * dst_strd) + col] = dc_val;*/ 1223 _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), temp1); 1224 _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), temp1); 1225 _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), temp1); 1226 _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), temp1); 1227 _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), temp1); 1228 _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), temp1); 1229 
_mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), temp1); 1230 _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), temp1); 1231 1232 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((0) * dst_strd)), temp2); 1233 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((1) * dst_strd)), temp2); 1234 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((2) * dst_strd)), temp2); 1235 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((3) * dst_strd)), temp2); 1236 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((4) * dst_strd)), temp2); 1237 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((5) * dst_strd)), temp2); 1238 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((6) * dst_strd)), temp2); 1239 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((7) * dst_strd)), temp2); 1240 1241 _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), temp1); 1242 _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), temp1); 1243 _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), temp1); 1244 _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), temp1); 1245 _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), temp1); 1246 _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), temp1); 1247 _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), temp1); 1248 _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), temp1); 1249 1250 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((8) * dst_strd)), temp2); 1251 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((9) * dst_strd)), temp2); 1252 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((10) * dst_strd)), temp2); 1253 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((11) * dst_strd)), temp2); 1254 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((12) * dst_strd)), temp2); 1255 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((13) * dst_strd)), temp2); 1256 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((14) * dst_strd)), temp2); 1257 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((15) * dst_strd)), temp2); 1258 1259 pu1_dst += 16 * dst_strd; 1260 } 1261 } 
1262 1263 else 1264 1265 { 1266 /*Filtering done for the 1st column */ 1267 for(row = nt - 1; row >= 0; row--) 1268 { 1269 s2_predpixel = pu1_ref[two_nt + 1] 1270 + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1); 1271 pu1_dst[row * dst_strd] = CLIP_U8(s2_predpixel); 1272 } 1273 1274 /* Replication to next columns*/ 1275 1276 if(nt == 4) 1277 { 1278 int temp1, temp2, temp3, temp4; 1279 1280 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); 1281 src_temp3 = src_temp2; 1282 src_temp4 = src_temp2; 1283 src_temp5 = src_temp2; 1284 1285 src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[(0 * dst_strd)], 0); 1286 src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[(1 * dst_strd)], 0); 1287 src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[(2 * dst_strd)], 0); 1288 src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[(3 * dst_strd)], 0); 1289 1290 temp1 = _mm_cvtsi128_si32(src_temp2); 1291 temp2 = _mm_cvtsi128_si32(src_temp3); 1292 temp3 = _mm_cvtsi128_si32(src_temp4); 1293 temp4 = _mm_cvtsi128_si32(src_temp5); 1294 1295 /* loding 4-bit 8 pixels values */ 1296 *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp1; 1297 *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp2; 1298 *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp3; 1299 *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp4; 1300 1301 } 1302 else if(nt == 8) 1303 { 1304 1305 src_temp0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); 1306 src_temp1 = src_temp0; 1307 src_temp2 = src_temp0; 1308 src_temp3 = src_temp0; 1309 src_temp4 = src_temp0; 1310 src_temp5 = src_temp0; 1311 src_temp6 = src_temp0; 1312 src_temp7 = src_temp0; 1313 1314 src_temp0 = _mm_insert_epi8(src_temp0, pu1_dst[((0) * dst_strd)], 0); 1315 src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((1) * dst_strd)], 0); 1316 src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((2) * dst_strd)], 0); 1317 src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((3) * dst_strd)], 0); 1318 src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((4) * dst_strd)], 0); 1319 src_temp5 = 
_mm_insert_epi8(src_temp5, pu1_dst[((5) * dst_strd)], 0); 1320 src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((6) * dst_strd)], 0); 1321 src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((7) * dst_strd)], 0); 1322 1323 _mm_storel_epi64((__m128i *)(pu1_dst + ((0) * dst_strd)), src_temp0); 1324 _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1); 1325 _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2); 1326 _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3); 1327 _mm_storel_epi64((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4); 1328 _mm_storel_epi64((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5); 1329 _mm_storel_epi64((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6); 1330 _mm_storel_epi64((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7); 1331 1332 1333 } 1334 else if(nt == 16) 1335 { 1336 for(row = 0; row < nt; row += 8) 1337 { 1338 1339 src_temp0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); 1340 src_temp1 = src_temp0; 1341 src_temp2 = src_temp0; 1342 src_temp3 = src_temp0; 1343 src_temp4 = src_temp0; 1344 src_temp5 = src_temp0; 1345 src_temp6 = src_temp0; 1346 src_temp7 = src_temp0; 1347 1348 src_temp0 = _mm_insert_epi8(src_temp0, pu1_dst[((row + 0) * dst_strd)], 0); 1349 src_temp1 = _mm_insert_epi8(src_temp1, pu1_dst[((row + 1) * dst_strd)], 0); 1350 src_temp2 = _mm_insert_epi8(src_temp2, pu1_dst[((row + 2) * dst_strd)], 0); 1351 src_temp3 = _mm_insert_epi8(src_temp3, pu1_dst[((row + 3) * dst_strd)], 0); 1352 src_temp4 = _mm_insert_epi8(src_temp4, pu1_dst[((row + 4) * dst_strd)], 0); 1353 src_temp5 = _mm_insert_epi8(src_temp5, pu1_dst[((row + 5) * dst_strd)], 0); 1354 src_temp6 = _mm_insert_epi8(src_temp6, pu1_dst[((row + 6) * dst_strd)], 0); 1355 src_temp7 = _mm_insert_epi8(src_temp7, pu1_dst[((row + 7) * dst_strd)], 0); 1356 1357 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp0); 1358 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp1); 1359 
_mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp2); 1360 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp3); 1361 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp4); 1362 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp5); 1363 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp6); 1364 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp7); 1365 1366 } 1367 1368 } 1369 1370 1371 } 1372 } 1373 1374 1375 /** 1376 ******************************************************************************* 1377 * 1378 * @brief 1379 * Intra prediction interpolation filter for luma mode 3 to mode 9 1380 * 1381 * @par Description: 1382 * Intraprediction for mode 3 to 9 (positive angle, horizontal mode ) with 1383 * reference neighboring samples location pointed by 'pu1_ref' to the TU 1384 * block location pointed by 'pu1_dst' 1385 * 1386 * @param[in] pu1_src 1387 * UWORD8 pointer to the source 1388 * 1389 * @param[out] pu1_dst 1390 * UWORD8 pointer to the destination 1391 * 1392 * @param[in] src_strd 1393 * integer source stride 1394 * 1395 * @param[in] dst_strd 1396 * integer destination stride 1397 * 1398 * @param[in] nt 1399 * integer Transform Block size 1400 * 1401 * @param[in] mode 1402 * integer intraprediction mode 1403 * 1404 * @returns 1405 * 1406 * @remarks 1407 * None 1408 * 1409 ******************************************************************************* 1410 */ 1411 1412 1413 void ihevc_intra_pred_luma_mode_3_to_9_sse42(UWORD8 *pu1_ref, 1414 WORD32 src_strd, 1415 UWORD8 *pu1_dst, 1416 WORD32 dst_strd, 1417 WORD32 nt, 1418 WORD32 mode) 1419 { 1420 WORD32 row, col; 1421 WORD32 two_nt = 2 * nt; 1422 WORD32 intra_pred_ang; 1423 1424 1425 __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b; 1426 __m128i fract_4x32b, intra_pred_ang_4x32b; 1427 __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, 
res_temp5_4x32b, sm3; 1428 UNUSED(src_strd); 1429 1430 1431 /* Intra Pred Angle according to the mode */ 1432 intra_pred_ang = gai4_ihevc_ang_table[mode]; 1433 1434 /* For the angles other then 45 degree, interpolation btw 2 neighboring */ 1435 /* samples dependent on distance to obtain destination sample */ 1436 1437 /* For the angles other then 45 degree, interpolation btw 2 neighboring */ 1438 /* samples dependent on distance to obtain destination sample */ 1439 1440 const_temp_4x32b = _mm_set1_epi16(16); 1441 const_temp2_4x32b = _mm_set1_epi32(31); 1442 const_temp3_4x32b = _mm_set1_epi32(32); 1443 const_temp4_4x32b = _mm_set1_epi32(4); 1444 1445 two_nt_4x32b = _mm_set1_epi32(two_nt - nt); 1446 1447 1448 sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]); 1449 1450 /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ 1451 intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang); 1452 1453 row_4x32b = _mm_set_epi32(4, 3, 2, 1); 1454 1455 if(nt == 4) 1456 { 1457 1458 WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4; 1459 int temp11, temp21, temp31, temp41; 1460 // WORD8 ai1_fract_temp_val[16], ai1_row_temp_val[16]; 1461 1462 __m128i fract1_8x16b, fract2_8x16b; 1463 __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b; 1464 1465 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; 1466 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b; //, src_temp8_8x16b; 1467 __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2; 1468 1469 /* pos = ((row + 1) * intra_pred_ang); */ 1470 res_temp5_4x32b = _mm_mullo_epi32(row_4x32b, intra_pred_ang_4x32b); 1471 1472 /* idx = pos >> 5; */ 1473 fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 1474 1475 /* fract = pos & (31); */ 1476 ref_main_idx_4x32b = _mm_sub_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5)); 1477 1478 /*(32 - fract) */ 1479 row_4x32b = _mm_sub_epi32(const_temp3_4x32b, fract_4x32b); 1480 1481 fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8); 
1482 fract2_8x16b = _mm_slli_epi16(row_4x32b, 8); 1483 1484 fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b); 1485 row_4x32b = _mm_or_si128(row_4x32b, fract2_8x16b); /*(32 - fract) */ 1486 1487 fract2_8x16b = _mm_unpackhi_epi8(row_4x32b, fract_4x32b); 1488 fract1_8x16b = _mm_unpacklo_epi8(row_4x32b, fract_4x32b); 1489 1490 temp1_8x16b = _mm_shuffle_epi32(fract1_8x16b, 0x00); 1491 temp2_8x16b = _mm_shuffle_epi32(fract1_8x16b, 0xaa); 1492 temp3_8x16b = _mm_shuffle_epi32(fract2_8x16b, 0x00); 1493 temp4_8x16b = _mm_shuffle_epi32(fract2_8x16b, 0xaa); 1494 1495 ref_main_temp0 = _mm_srli_si128(ref_main_idx_4x32b, 4); /* next 32 bit values */ 1496 ref_main_temp1 = _mm_srli_si128(ref_main_idx_4x32b, 8); /* next 32 bit values */ 1497 ref_main_temp2 = _mm_srli_si128(ref_main_idx_4x32b, 12); /* next 32 bit values */ 1498 ref_main_idx1 = _mm_cvtsi128_si32(ref_main_idx_4x32b); /* col=0*/ 1499 ref_main_idx2 = _mm_cvtsi128_si32(ref_main_temp0); /* col=1*/ 1500 ref_main_idx3 = _mm_cvtsi128_si32(ref_main_temp1); /* col=2*/ 1501 ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* col=3*/ 1502 1503 /* loding 8-bit 16 pixels */ 1504 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1 - 1)); /* col=0*/ 1505 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2 - 1)); /* col=1*/ 1506 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3 - 1)); /* col=2*/ 1507 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4 - 1)); /* col=3*/ 1508 1509 src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/ 1510 src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/ 1511 src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/ 1512 src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/ 1513 1514 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ 1515 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b); 1516 src_temp2_8x16b = 
_mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b); 1517 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b); 1518 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b); 1519 1520 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 1521 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b); 1522 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b); 1523 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b); 1524 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b); 1525 1526 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 1527 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/ 1528 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/ 1529 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/ 1530 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/ 1531 1532 /* converting 16 bit to 8 bit */ 1533 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/ 1534 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/ 1535 1536 1537 src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b); 1538 src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b); 1539 1540 src_temp3_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b); 1541 src_temp2_8x16b = _mm_srli_si128(src_temp3_8x16b, 4); 1542 src_temp1_8x16b = _mm_srli_si128(src_temp3_8x16b, 8); 1543 src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 12); 1544 1545 temp11 = _mm_cvtsi128_si32(src_temp7_8x16b); 1546 temp21 = _mm_cvtsi128_si32(src_temp1_8x16b); 1547 temp31 = _mm_cvtsi128_si32(src_temp2_8x16b); 1548 temp41 = _mm_cvtsi128_si32(src_temp3_8x16b); 1549 1550 /* loding 4-bit 8 pixels values */ 1551 *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11; 1552 *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21; 1553 *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31; 
1554 *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41; 1555 1556 } 1557 1558 else if(nt == 16 || nt == 32) 1559 { 1560 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 1561 row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); 1562 const_temp2_4x32b = _mm_set1_epi16(31); 1563 const_temp4_4x32b = _mm_set1_epi16(8); 1564 const_temp3_4x32b = _mm_set1_epi16(32); 1565 two_nt_4x32b = _mm_set1_epi16(two_nt); 1566 1567 for(col = 0; col < nt; col += 8) 1568 { 1569 WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4; 1570 WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8; 1571 //WORD8 ai1_fract_temp0_val[16], ai1_fract_temp1_val[16]; 1572 1573 __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b; 1574 1575 __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b; 1576 __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b; 1577 1578 /* pos = ((row + 1) * intra_pred_ang); */ 1579 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 1580 1581 /* idx = pos >> 5; */ 1582 fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 1583 1584 /*(32 - fract) */ 1585 fract2_8x16b = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b); 1586 1587 fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8); 1588 fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */ 1589 1590 fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b); 1591 fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */ 1592 1593 1594 fract8_8x16b = _mm_unpackhi_epi8(fract2_8x16b, fract_4x32b); 1595 fract_4x32b = _mm_unpacklo_epi8(fract2_8x16b, fract_4x32b); 1596 1597 temp1_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x00); 1598 temp2_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x55); 1599 temp3_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xaa); 1600 temp4_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xff); 1601 1602 temp11_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x00); 1603 temp12_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x55); 
1604 temp13_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xaa); 1605 temp14_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xff); 1606 1607 /* fract = pos & (31); */ 1608 ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); 1609 1610 row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b); 1611 1612 pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/ 1613 pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/ 1614 pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/ 1615 pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/ 1616 1617 pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=5*/ 1618 pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=6*/ 1619 pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=7*/ 1620 pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=8*/ 1621 1622 for(row = 0; row < nt; row += 8) 1623 { 1624 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; 1625 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b; 1626 1627 1628 __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b; 1629 __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b; 1630 1631 /* loding 8-bit 16 pixels */ 1632 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 1 - (8 + row))); /* col=0*/ 1633 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 1 - (8 + row))); /* col=1*/ 1634 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 1 - (8 + row))); /* col=2*/ 1635 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 1 - (8 + row))); /* col=3*/ 1636 1637 /* loding 8-bit 16 pixels */ 1638 src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - 1 - (8 + row))); /* col=5*/ 1639 src_temp12_8x16b = 
_mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - 1 - (8 + row))); /* col=6*/ 1640 src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - 1 - (8 + row))); /* col=7*/ 1641 src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - 1 - (8 + row))); /* col=8*/ 1642 1643 src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/ 1644 src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/ 1645 src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/ 1646 src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/ 1647 1648 src_temp11_8x16b = _mm_shuffle_epi8(src_temp11_8x16b, sm3); /* col=0*/ 1649 src_temp12_8x16b = _mm_shuffle_epi8(src_temp12_8x16b, sm3); /* col=1*/ 1650 src_temp13_8x16b = _mm_shuffle_epi8(src_temp13_8x16b, sm3); /* col=2*/ 1651 src_temp14_8x16b = _mm_shuffle_epi8(src_temp14_8x16b, sm3); /* col=3*/ 1652 1653 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ 1654 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b); 1655 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b); 1656 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b); 1657 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b); 1658 1659 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ 1660 src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b); 1661 src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b); 1662 src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b); 1663 src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b); 1664 1665 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 1666 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b); 1667 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b); 1668 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b); 1669 src_temp4_8x16b = 
_mm_add_epi16(src_temp4_8x16b, const_temp_4x32b); 1670 1671 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 1672 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/ 1673 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/ 1674 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/ 1675 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/ 1676 1677 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 1678 src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b); 1679 src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b); 1680 src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b); 1681 src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b); 1682 1683 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 1684 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=5*/ 1685 src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=6*/ 1686 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=7*/ 1687 src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=8*/ 1688 1689 /* converting 16 bit to 8 bit */ 1690 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/ 1691 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/ 1692 1693 /* converting 16 bit to 8 bit */ 1694 src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/ 1695 src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/ 1696 1697 src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b); 1698 src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b); 1699 1700 src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b); 1701 src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b); 1702 1703 src_temp7_8x16b = 
_mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b); 1704 src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b); 1705 1706 src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b); 1707 src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b); 1708 1709 src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b); 1710 src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b); 1711 1712 src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8); 1713 src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8); 1714 1715 src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b); 1716 src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b); 1717 1718 src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8); 1719 src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8); 1720 1721 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp1_8x16b); /* row=7*/ 1722 1723 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp5_8x16b); /* row=6*/ 1724 1725 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp2_8x16b); /* row=5*/ 1726 1727 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp6_8x16b); /* row=4*/ 1728 1729 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp3_8x16b); /* row=3*/ 1730 1731 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp7_8x16b); /* row=2*/ 1732 1733 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp4_8x16b); /* row=1*/ 1734 1735 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 0))), src_temp8_8x16b); /* row=0*/ 1736 1737 } 1738 } 1739 } 1740 else 1741 { 1742 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 1743 row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); 1744 const_temp2_4x32b = _mm_set1_epi16(31); 1745 const_temp4_4x32b = _mm_set1_epi16(8); 1746 
const_temp3_4x32b = _mm_set1_epi16(32); 1747 two_nt_4x32b = _mm_set1_epi16(two_nt - nt); 1748 { 1749 WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4; 1750 WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8; 1751 1752 __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b; 1753 1754 __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b; 1755 __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b; 1756 1757 /* pos = ((row + 1) * intra_pred_ang); */ 1758 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 1759 1760 /* idx = pos >> 5; */ 1761 fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 1762 1763 /* fract = pos & (31); */ 1764 ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); 1765 1766 /*(32 - fract) */ 1767 fract2_8x16b = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b); 1768 1769 fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8); 1770 fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */ 1771 1772 fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b); 1773 fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */ 1774 1775 1776 fract8_8x16b = _mm_unpackhi_epi8(fract2_8x16b, fract_4x32b); 1777 fract_4x32b = _mm_unpacklo_epi8(fract2_8x16b, fract_4x32b); 1778 1779 temp1_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x00); 1780 temp2_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x55); 1781 temp3_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xaa); 1782 temp4_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xff); 1783 1784 temp11_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x00); 1785 temp12_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x55); 1786 temp13_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xaa); 1787 temp14_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xff); 1788 1789 pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/ 1790 pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/ 1791 
pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/ 1792 pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/ 1793 1794 pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=5*/ 1795 pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=6*/ 1796 pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=7*/ 1797 pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=8*/ 1798 1799 { 1800 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; 1801 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b; 1802 1803 __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b; 1804 __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b; 1805 1806 /* loding 8-bit 16 pixels */ 1807 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 1)); /* col=0*/ 1808 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 1)); /* col=1*/ 1809 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 1)); /* col=2*/ 1810 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 1)); /* col=3*/ 1811 1812 /* loding 8-bit 16 pixels */ 1813 src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - 1)); /* col=5*/ 1814 src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - 1)); /* col=6*/ 1815 src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - 1)); /* col=7*/ 1816 src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - 1)); /* col=8*/ 1817 1818 src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/ 1819 src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/ 1820 src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/ 1821 src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/ 1822 1823 
src_temp11_8x16b = _mm_shuffle_epi8(src_temp11_8x16b, sm3); /* col=0*/ 1824 src_temp12_8x16b = _mm_shuffle_epi8(src_temp12_8x16b, sm3); /* col=1*/ 1825 src_temp13_8x16b = _mm_shuffle_epi8(src_temp13_8x16b, sm3); /* col=2*/ 1826 src_temp14_8x16b = _mm_shuffle_epi8(src_temp14_8x16b, sm3); /* col=3*/ 1827 1828 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ 1829 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b); 1830 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b); 1831 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b); 1832 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b); 1833 1834 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ 1835 src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b); 1836 src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b); 1837 src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b); 1838 src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b); 1839 1840 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 1841 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b); 1842 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b); 1843 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b); 1844 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b); 1845 1846 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 1847 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/ 1848 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/ 1849 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/ 1850 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/ 1851 1852 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 1853 src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b); 1854 src_temp12_8x16b = 
_mm_add_epi16(src_temp12_8x16b, const_temp_4x32b); 1855 src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b); 1856 src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b); 1857 1858 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 1859 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=5*/ 1860 src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=6*/ 1861 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=7*/ 1862 src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=8*/ 1863 1864 /* converting 16 bit to 8 bit */ 1865 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/ 1866 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/ 1867 1868 /* converting 16 bit to 8 bit */ 1869 src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/ 1870 src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/ 1871 1872 src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b); 1873 src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b); 1874 1875 src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b); 1876 src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b); 1877 1878 src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b); 1879 src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b); 1880 1881 src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b); 1882 src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b); 1883 1884 src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b); 1885 src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b); 1886 1887 src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8); 1888 src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8); 1889 1890 src_temp3_8x16b = 
_mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b); 1891 src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b); 1892 1893 src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8); 1894 src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8); 1895 1896 _mm_storel_epi64((__m128i *)(pu1_dst), src_temp8_8x16b); /* row=0*/ 1897 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 1)), src_temp4_8x16b); /* row=1*/ 1898 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 2)), src_temp7_8x16b); /* row=2*/ 1899 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 3)), src_temp3_8x16b); /* row=3*/ 1900 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 4)), src_temp6_8x16b); /* row=4*/ 1901 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 5)), src_temp2_8x16b); /* row=5*/ 1902 1903 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 6)), src_temp5_8x16b); /* row=6*/ 1904 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 7)), src_temp1_8x16b); /* row=7*/ 1905 1906 } 1907 } 1908 } 1909 1910 } 1911 1912 /** 1913 ******************************************************************************* 1914 * 1915 * @brief 1916 * Intra prediction interpolation filter for luma mode 11 to mode 17 1917 * 1918 * @par Description: 1919 * Intraprediction for mode 11 to 17 (negative angle, horizontal mode ) 1920 * with reference neighboring samples location pointed by 'pu1_ref' to the 1921 * TU block location pointed by 'pu1_dst' 1922 * 1923 * @param[in] pu1_src 1924 * UWORD8 pointer to the source 1925 * 1926 * @param[out] pu1_dst 1927 * UWORD8 pointer to the destination 1928 * 1929 * @param[in] src_strd 1930 * integer source stride 1931 * 1932 * @param[in] dst_strd 1933 * integer destination stride 1934 * 1935 * @param[in] nt 1936 * integer Transform Block size 1937 * 1938 * @param[in] mode 1939 * integer intraprediction mode 1940 * 1941 * @returns 1942 * 1943 * @remarks 1944 * None 1945 * 1946 ******************************************************************************* 1947 */ 
1948 1949 1950 void ihevc_intra_pred_luma_mode_11_to_17_sse42(UWORD8 *pu1_ref, 1951 WORD32 src_strd, 1952 UWORD8 *pu1_dst, 1953 WORD32 dst_strd, 1954 WORD32 nt, 1955 WORD32 mode) 1956 { 1957 1958 /* This function and ihevc_intra_pred_luma_mode_19_to_25 are same except*/ 1959 /* for ref main & side samples assignment,can be combined for */ 1960 /* optimzation*/ 1961 1962 WORD32 row, col, k; 1963 WORD32 two_nt; 1964 WORD32 intra_pred_ang, inv_ang, inv_ang_sum; 1965 WORD32 ref_idx; 1966 1967 __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b; 1968 __m128i fract_4x32b, intra_pred_ang_4x32b; 1969 __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b, sm3; 1970 1971 1972 UWORD8 ref_tmp[2 * MAX_CU_SIZE + 2]; 1973 UWORD8 *ref_main; 1974 UWORD8 *ref_temp; 1975 UNUSED(src_strd); 1976 1977 inv_ang_sum = 128; 1978 two_nt = 2 * nt; 1979 ref_temp = ref_tmp + 1; 1980 ref_main = ref_temp + nt - 1; 1981 intra_pred_ang = gai4_ihevc_ang_table[mode]; 1982 1983 /* For the angles other then 45 degree, interpolation btw 2 neighboring */ 1984 /* samples dependent on distance to obtain destination sample */ 1985 const_temp_4x32b = _mm_set1_epi16(16); 1986 const_temp2_4x32b = _mm_set1_epi32(31); 1987 const_temp3_4x32b = _mm_set1_epi32(32); 1988 const_temp4_4x32b = _mm_set1_epi32(4); 1989 1990 two_nt_4x32b = _mm_set1_epi32(1); 1991 1992 1993 sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]); 1994 1995 /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ 1996 intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang); 1997 1998 row_4x32b = _mm_set_epi32(4, 3, 2, 1); 1999 2000 if(nt == 4) 2001 { 2002 2003 WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4; 2004 int temp11, temp21, temp31, temp41; 2005 // WORD8 ai1_fract_temp_val[16], ai1_row_temp_val[16]; 2006 2007 __m128i fract1_8x16b, fract2_8x16b; 2008 __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b; 2009 2010 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, 
src_temp4_8x16b; 2011 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b; 2012 __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2; 2013 2014 /* Intermediate reference samples for negative angle modes */ 2015 /* This have to be removed during optimization*/ 2016 /* For horizontal modes, (ref main = ref left) (ref side = ref above) */ 2017 inv_ang = gai4_ihevc_inv_ang_table[mode - 11]; 2018 2019 ref_main = ref_temp + nt - 1; 2020 for(k = 0; k < nt + 1; k++) 2021 ref_temp[k + nt - 1] = pu1_ref[two_nt - k]; 2022 2023 ref_main = ref_temp + nt - 1; 2024 ref_idx = (nt * intra_pred_ang) >> 5; 2025 2026 /* SIMD Optimization can be done using look-up table for the loop */ 2027 /* For negative angled derive the main reference samples from side */ 2028 /* reference samples refer to section 8.4.4.2.6 */ 2029 for(k = -1; k > ref_idx; k--) 2030 { 2031 inv_ang_sum += inv_ang; 2032 ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)]; 2033 } 2034 2035 2036 /* pos = ((row + 1) * intra_pred_ang); */ 2037 res_temp5_4x32b = _mm_mullo_epi32(row_4x32b, intra_pred_ang_4x32b); 2038 2039 /* idx = pos >> 5; */ 2040 fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 2041 2042 /* fract = pos & (31); */ 2043 ref_main_idx_4x32b = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5)); 2044 2045 /*(32 - fract) */ 2046 row_4x32b = _mm_sub_epi32(const_temp3_4x32b, fract_4x32b); 2047 2048 fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8); 2049 fract2_8x16b = _mm_slli_epi16(row_4x32b, 8); 2050 2051 fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b); 2052 row_4x32b = _mm_or_si128(row_4x32b, fract2_8x16b); /*(32 - fract) */ 2053 2054 fract2_8x16b = _mm_unpackhi_epi8(fract_4x32b, row_4x32b); 2055 fract1_8x16b = _mm_unpacklo_epi8(fract_4x32b, row_4x32b); 2056 2057 temp1_8x16b = _mm_shuffle_epi32(fract1_8x16b, 0x00); 2058 temp2_8x16b = _mm_shuffle_epi32(fract1_8x16b, 0xaa); 2059 temp3_8x16b = _mm_shuffle_epi32(fract2_8x16b, 0x00); 2060 temp4_8x16b = 
_mm_shuffle_epi32(fract2_8x16b, 0xaa); 2061 2062 ref_main_temp0 = _mm_srli_si128(ref_main_idx_4x32b, 4); /* next 32 bit values */ 2063 ref_main_temp1 = _mm_srli_si128(ref_main_idx_4x32b, 8); /* next 32 bit values */ 2064 ref_main_temp2 = _mm_srli_si128(ref_main_idx_4x32b, 12); /* next 32 bit values */ 2065 ref_main_idx1 = _mm_cvtsi128_si32(ref_main_idx_4x32b); /* col=0*/ 2066 ref_main_idx2 = _mm_cvtsi128_si32(ref_main_temp0); /* col=1*/ 2067 ref_main_idx3 = _mm_cvtsi128_si32(ref_main_temp1); /* col=2*/ 2068 ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* col=3*/ 2069 2070 /* loding 8-bit 16 pixels */ 2071 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1)); /* col=0*/ 2072 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2)); /* col=1*/ 2073 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3)); /* col=2*/ 2074 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4)); /* col=3*/ 2075 2076 src_temp1_8x16b = _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/ 2077 src_temp2_8x16b = _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/ 2078 src_temp3_8x16b = _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/ 2079 src_temp4_8x16b = _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/ 2080 2081 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ 2082 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b); 2083 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b); 2084 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b); 2085 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b); 2086 2087 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 2088 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b); 2089 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b); 2090 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b); 2091 src_temp4_8x16b = 
_mm_add_epi16(src_temp4_8x16b, const_temp_4x32b); 2092 2093 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 2094 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/ 2095 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/ 2096 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/ 2097 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/ 2098 2099 /* converting 16 bit to 8 bit */ 2100 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/ 2101 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/ 2102 2103 2104 src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b); 2105 src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b); 2106 2107 src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b); 2108 src_temp1_8x16b = _mm_srli_si128(src_temp7_8x16b, 4); 2109 src_temp2_8x16b = _mm_srli_si128(src_temp7_8x16b, 8); 2110 src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 12); 2111 2112 temp11 = _mm_cvtsi128_si32(src_temp7_8x16b); 2113 temp21 = _mm_cvtsi128_si32(src_temp1_8x16b); 2114 temp31 = _mm_cvtsi128_si32(src_temp2_8x16b); 2115 temp41 = _mm_cvtsi128_si32(src_temp3_8x16b); 2116 2117 /* loding 8-bit 4 pixels values */ 2118 *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11; 2119 *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21; 2120 *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31; 2121 *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41; 2122 } 2123 2124 else if(nt == 32) 2125 { 2126 2127 2128 __m128i temp1, temp2, temp3, temp11, temp12; 2129 __m128i src_values0, src_values1; 2130 /* Intermediate reference samples for negative angle modes */ 2131 2132 ref_temp[two_nt - 1] = pu1_ref[two_nt - nt]; 2133 temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 1)); 2134 temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 17)); 2135 temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3); 2136 2137 /* For negative 
angled derive the main reference samples from side */ 2138 2139 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/ 2140 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 17)); /*(nt+16)-(two_nt-1)*/ 2141 2142 temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode])); 2143 temp12 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16)); 2144 2145 src_values0 = _mm_shuffle_epi8(src_values0, temp2); 2146 src_values1 = _mm_shuffle_epi8(src_values1, temp2); 2147 src_values0 = _mm_shuffle_epi8(src_values0, temp12); 2148 src_values1 = _mm_shuffle_epi8(src_values1, temp11); 2149 2150 temp1 = _mm_shuffle_epi8(temp1, temp2); 2151 temp3 = _mm_shuffle_epi8(temp3, temp2); 2152 2153 _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp3); 2154 _mm_storeu_si128((__m128i *)(ref_temp + nt - 1 + 16), temp1); 2155 _mm_storeu_si128((__m128i *)(ref_main - 16), src_values0); 2156 _mm_storeu_si128((__m128i *)(ref_main - nt + inv_angle_shuffle[17 - mode][0]), src_values1); 2157 2158 2159 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 2160 row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); 2161 const_temp2_4x32b = _mm_set1_epi16(31); 2162 const_temp4_4x32b = _mm_set1_epi16(8); 2163 const_temp3_4x32b = _mm_set1_epi16(32); 2164 two_nt_4x32b = _mm_set1_epi16(1); 2165 2166 for(col = 0; col < nt; col += 8) 2167 { 2168 WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4; 2169 WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8; 2170 // WORD8 ai1_fract_temp0_val[16], ai1_fract_temp1_val[16]; 2171 2172 __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b; 2173 2174 __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b; 2175 __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b; 2176 2177 /* pos = ((row + 1) * intra_pred_ang); */ 2178 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 2179 2180 /* idx = pos >> 5; */ 2181 fract_4x32b 
= _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 2182 2183 /* fract = pos & (31); */ 2184 ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); 2185 2186 row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b); 2187 /*(32 - fract) */ 2188 fract2_8x16b = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b); 2189 2190 fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8); 2191 fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */ 2192 2193 fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b); 2194 fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */ 2195 2196 2197 fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b); 2198 fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b); 2199 2200 temp1_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x00); 2201 temp2_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x55); 2202 temp3_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xaa); 2203 temp4_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xff); 2204 2205 temp11_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x00); 2206 temp12_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x55); 2207 temp13_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xaa); 2208 temp14_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xff); 2209 2210 pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/ 2211 pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/ 2212 pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/ 2213 pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/ 2214 2215 pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=5*/ 2216 pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=6*/ 2217 pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=7*/ 2218 pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=8*/ 2219 2220 for(row = 0; row < nt; row += 8) 2221 { 2222 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; 2223 
__m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b; 2224 2225 2226 __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b; 2227 __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b; 2228 2229 /* loding 8-bit 16 pixels */ 2230 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row)); /* col=0*/ 2231 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row)); /* col=1*/ 2232 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row)); /* col=2*/ 2233 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row)); /* col=3*/ 2234 2235 src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 1); /* col=0*/ 2236 src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 1); /* col=1*/ 2237 src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 1); /* col=2*/ 2238 src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 1); /* col=3*/ 2239 2240 /* loding 8-bit 16 pixels */ 2241 src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row)); /* col=5*/ 2242 src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row)); /* col=6*/ 2243 src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row)); /* col=7*/ 2244 src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row)); /* col=8*/ 2245 2246 src_temp1_8x16b = _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/ 2247 src_temp2_8x16b = _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/ 2248 src_temp3_8x16b = _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/ 2249 src_temp4_8x16b = _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/ 2250 2251 src_temp11_8x16b = _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=0*/ 2252 src_temp12_8x16b = _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=1*/ 2253 src_temp13_8x16b = _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=2*/ 2254 src_temp14_8x16b = 
_mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=3*/ 2255 2256 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ 2257 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b); 2258 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b); 2259 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b); 2260 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b); 2261 2262 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ 2263 src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b); 2264 src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b); 2265 src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b); 2266 src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b); 2267 2268 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 2269 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b); 2270 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b); 2271 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b); 2272 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b); 2273 2274 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 2275 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/ 2276 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/ 2277 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/ 2278 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/ 2279 2280 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 2281 src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b); 2282 src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b); 2283 src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b); 2284 src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b); 2285 2286 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * 
pu1_ref[ref_main_idx + 1] + 16) >>5*/ 2287 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=5*/ 2288 src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=6*/ 2289 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=7*/ 2290 src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=8*/ 2291 2292 /* converting 16 bit to 8 bit */ 2293 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/ 2294 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/ 2295 2296 /* converting 16 bit to 8 bit */ 2297 src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/ 2298 src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/ 2299 2300 src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b); 2301 src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b); 2302 2303 src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b); 2304 src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b); 2305 2306 src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b); 2307 src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b); 2308 2309 src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b); 2310 src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b); 2311 2312 2313 src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b); 2314 src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b); 2315 2316 src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b); 2317 src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b); 2318 2319 src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8); 2320 src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8); 2321 src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8); 2322 src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8); 2323 2324 
_mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp1_8x16b); /* row=0*/ 2325 2326 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp5_8x16b); /* row=1*/ 2327 2328 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp2_8x16b); /* row=2*/ 2329 2330 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp6_8x16b); /* row=4*/ 2331 2332 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp3_8x16b); /* row=5*/ 2333 2334 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp7_8x16b); /* row=6*/ 2335 2336 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp4_8x16b); /* row=7*/ 2337 2338 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp8_8x16b); /* row=8*/ 2339 2340 } 2341 } 2342 } 2343 else if(nt == 16) 2344 { 2345 2346 __m128i temp1, temp2, temp11, src_values0; 2347 /* Intermediate reference samples for negative angle modes */ 2348 /* For horizontal modes, (ref main = ref above) (ref side = ref left) */ 2349 ref_temp[two_nt - 1] = pu1_ref[two_nt - nt]; 2350 temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 1)); 2351 temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3); 2352 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/ 2353 2354 temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16)); 2355 2356 src_values0 = _mm_shuffle_epi8(src_values0, temp2); 2357 temp1 = _mm_shuffle_epi8(temp1, temp2); 2358 src_values0 = _mm_shuffle_epi8(src_values0, temp11); 2359 2360 _mm_storeu_si128((__m128i *)(ref_main - nt), src_values0); 2361 _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1); 2362 2363 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 2364 row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); 2365 const_temp2_4x32b = _mm_set1_epi16(31); 2366 const_temp4_4x32b = _mm_set1_epi16(8); 2367 const_temp3_4x32b = 
_mm_set1_epi16(32); 2368 two_nt_4x32b = _mm_set1_epi16(1); 2369 2370 for(col = 0; col < nt; col += 8) 2371 { 2372 WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4; 2373 WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8; 2374 // WORD8 ai1_fract_temp0_val[16], ai1_fract_temp1_val[16]; 2375 2376 __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b; 2377 2378 __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b; 2379 __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b; 2380 2381 /* pos = ((row + 1) * intra_pred_ang); */ 2382 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 2383 2384 /* idx = pos >> 5; */ 2385 fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 2386 2387 /* fract = pos & (31); */ 2388 ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); 2389 2390 row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b); 2391 /*(32 - fract) */ 2392 fract2_8x16b = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b); 2393 2394 fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8); 2395 fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */ 2396 2397 fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b); 2398 fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */ 2399 2400 2401 fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b); 2402 fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b); 2403 2404 temp1_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x00); 2405 temp2_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x55); 2406 temp3_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xaa); 2407 temp4_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xff); 2408 2409 temp11_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x00); 2410 temp12_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x55); 2411 temp13_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xaa); 2412 temp14_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xff); 2413 2414 pi2_ref_main_idx1 = 
_mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/ 2415 pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/ 2416 pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/ 2417 pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/ 2418 2419 pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=5*/ 2420 pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=6*/ 2421 pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=7*/ 2422 pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=8*/ 2423 2424 for(row = 0; row < nt; row += 8) 2425 { 2426 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; 2427 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b; 2428 2429 2430 __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b; 2431 __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b; 2432 2433 /* loding 8-bit 16 pixels */ 2434 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row)); /* col=0*/ 2435 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row)); /* col=1*/ 2436 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row)); /* col=2*/ 2437 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row)); /* col=3*/ 2438 2439 src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 1); /* col=0*/ 2440 src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 1); /* col=1*/ 2441 src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 1); /* col=2*/ 2442 src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 1); /* col=3*/ 2443 2444 /* loding 8-bit 16 pixels */ 2445 src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row)); /* col=5*/ 2446 src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row)); /* col=6*/ 2447 src_temp17_8x16b = 
_mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row)); /* col=7*/ 2448 src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row)); /* col=8*/ 2449 2450 src_temp1_8x16b = _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/ 2451 src_temp2_8x16b = _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/ 2452 src_temp3_8x16b = _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/ 2453 src_temp4_8x16b = _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/ 2454 2455 src_temp11_8x16b = _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=0*/ 2456 src_temp12_8x16b = _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=1*/ 2457 src_temp13_8x16b = _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=2*/ 2458 src_temp14_8x16b = _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=3*/ 2459 2460 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ 2461 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b); 2462 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b); 2463 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b); 2464 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b); 2465 2466 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ 2467 src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b); 2468 src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b); 2469 src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b); 2470 src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b); 2471 2472 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 2473 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b); 2474 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b); 2475 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b); 2476 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b); 2477 2478 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) 
>>5*/ 2479 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/ 2480 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/ 2481 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/ 2482 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/ 2483 2484 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 2485 src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b); 2486 src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b); 2487 src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b); 2488 src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b); 2489 2490 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 2491 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=5*/ 2492 src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=6*/ 2493 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=7*/ 2494 src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=8*/ 2495 2496 /* converting 16 bit to 8 bit */ 2497 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/ 2498 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/ 2499 2500 /* converting 16 bit to 8 bit */ 2501 src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/ 2502 src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/ 2503 2504 src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b); 2505 src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b); 2506 2507 src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b); 2508 src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b); 2509 2510 src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b); 2511 src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b); 2512 2513 
src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b); 2514 src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b); 2515 2516 2517 src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b); 2518 src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b); 2519 2520 src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b); 2521 src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b); 2522 2523 src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8); 2524 src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8); 2525 src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8); 2526 src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8); 2527 2528 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp1_8x16b); /* row=0*/ 2529 2530 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp5_8x16b); /* row=1*/ 2531 2532 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp2_8x16b); /* row=2*/ 2533 2534 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp6_8x16b); /* row=4*/ 2535 2536 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp3_8x16b); /* row=5*/ 2537 2538 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp7_8x16b); /* row=6*/ 2539 2540 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp4_8x16b); /* row=7*/ 2541 2542 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp8_8x16b); /* row=8*/ 2543 2544 } 2545 } 2546 } 2547 else 2548 { 2549 2550 2551 __m128i temp1, temp2, temp11, src_values0; 2552 /* Intermediate reference samples for negative angle modes */ 2553 /* For horizontal modes, (ref main = ref above) (ref side = ref left) */ 2554 ref_temp[two_nt - 1] = pu1_ref[nt]; 2555 temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 1)); 2556 2557 /* For negative angled derive the main reference samples 
from side */ 2558 2559 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/ 2560 temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3); 2561 temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16)); 2562 2563 src_values0 = _mm_shuffle_epi8(src_values0, temp2); 2564 temp1 = _mm_shuffle_epi8(temp1, temp2); 2565 src_values0 = _mm_shuffle_epi8(src_values0, temp11); 2566 src_values0 = _mm_srli_si128(src_values0, 8); 2567 2568 _mm_storel_epi64((__m128i *)(ref_temp + nt - 1), temp1); 2569 _mm_storel_epi64((__m128i *)(ref_main - nt), src_values0); 2570 2571 2572 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 2573 row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); 2574 const_temp2_4x32b = _mm_set1_epi16(31); 2575 const_temp4_4x32b = _mm_set1_epi16(8); 2576 const_temp3_4x32b = _mm_set1_epi16(32); 2577 two_nt_4x32b = _mm_set1_epi16(1); 2578 2579 { 2580 WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4; 2581 WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8; 2582 //WORD8 ai1_fract_temp0_val[16], ai1_fract_temp1_val[16]; 2583 2584 __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b; 2585 2586 __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b; 2587 __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b; 2588 2589 /* pos = ((row + 1) * intra_pred_ang); */ 2590 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 2591 2592 /* idx = pos >> 5; */ 2593 fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 2594 2595 /* fract = pos & (31); */ 2596 ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); 2597 2598 /*(32 - fract) */ 2599 fract2_8x16b = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b); 2600 2601 fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8); 2602 fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */ 2603 2604 fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b); 
2605 fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */ 2606 2607 fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b); 2608 fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b); 2609 2610 temp1_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x00); 2611 temp2_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x55); 2612 temp3_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xaa); 2613 temp4_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xff); 2614 2615 temp11_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x00); 2616 temp12_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x55); 2617 temp13_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xaa); 2618 temp14_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xff); 2619 2620 pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/ 2621 pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/ 2622 pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/ 2623 pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/ 2624 2625 pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=5*/ 2626 pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=6*/ 2627 pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=7*/ 2628 pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=8*/ 2629 2630 { 2631 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; 2632 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b; 2633 2634 __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b; 2635 __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b; 2636 2637 /* loding 8-bit 16 pixels */ 2638 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1)); /* col=0*/ 2639 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2)); /* col=1*/ 2640 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3)); /* col=2*/ 2641 
src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4)); /* col=3*/ 2642 2643 /* loding 8-bit 16 pixels */ 2644 src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5)); /* col=5*/ 2645 src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6)); /* col=6*/ 2646 src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7)); /* col=7*/ 2647 src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8)); /* col=8*/ 2648 2649 src_temp1_8x16b = _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/ 2650 src_temp2_8x16b = _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/ 2651 src_temp3_8x16b = _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/ 2652 src_temp4_8x16b = _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/ 2653 2654 src_temp11_8x16b = _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=0*/ 2655 src_temp12_8x16b = _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=1*/ 2656 src_temp13_8x16b = _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=2*/ 2657 src_temp14_8x16b = _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=3*/ 2658 2659 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ 2660 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b); 2661 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b); 2662 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b); 2663 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b); 2664 2665 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ 2666 src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b); 2667 src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b); 2668 src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b); 2669 src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b); 2670 2671 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 2672 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, 
const_temp_4x32b); 2673 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b); 2674 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b); 2675 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b); 2676 2677 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 2678 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* row=0*/ 2679 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* row=1*/ 2680 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* row=2*/ 2681 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* row=3*/ 2682 2683 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 2684 src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b); 2685 src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b); 2686 src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b); 2687 src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b); 2688 2689 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 2690 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=5*/ 2691 src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=6*/ 2692 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=7*/ 2693 src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=8*/ 2694 2695 /* converting 16 bit to 8 bit */ 2696 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/ 2697 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/ 2698 2699 /* converting 16 bit to 8 bit */ 2700 src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=4*/ 2701 src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=5*/ 2702 2703 src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b); 2704 src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b); 2705 2706 
src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b); 2707 src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b); 2708 2709 src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b); 2710 src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b); 2711 2712 src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b); 2713 src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b); 2714 2715 2716 src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b); 2717 src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b); 2718 2719 src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b); 2720 src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b); 2721 2722 src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8); 2723 src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8); 2724 src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8); 2725 src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8); 2726 2727 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 0)), src_temp1_8x16b); /* row=0*/ 2728 2729 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (1))), src_temp5_8x16b); /* row=1*/ 2730 2731 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (2))), src_temp2_8x16b); /* row=2*/ 2732 2733 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (3))), src_temp6_8x16b); /* row=3*/ 2734 2735 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (4))), src_temp3_8x16b); /* row=4*/ 2736 2737 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (5))), src_temp7_8x16b); /* row=5*/ 2738 2739 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (6))), src_temp4_8x16b); /* row=6*/ 2740 2741 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (7))), src_temp8_8x16b); /* row=7*/ 2742 2743 } 2744 } 2745 } 2746 2747 } 2748 2749 2750 2751 /** 2752 ******************************************************************************* 2753 * 2754 * @brief 2755 * Intra 
prediction interpolation filter for luma mode 19 to mode 25 2756 * 2757 * @par Description: 2758 * Intraprediction for mode 19 to 25 (negative angle, vertical mode ) with 2759 * reference neighboring samples location pointed by 'pu1_ref' to the TU 2760 * block location pointed by 'pu1_dst' 2761 * 2762 * @param[in] pu1_src 2763 * UWORD8 pointer to the source 2764 * 2765 * @param[out] pu1_dst 2766 * UWORD8 pointer to the destination 2767 * 2768 * @param[in] src_strd 2769 * integer source stride 2770 * 2771 * @param[in] dst_strd 2772 * integer destination stride 2773 * 2774 * @param[in] nt 2775 * integer Transform Block size 2776 * 2777 * @param[in] mode 2778 * integer intraprediction mode 2779 * 2780 * @returns 2781 * 2782 * @remarks 2783 * None 2784 * 2785 ******************************************************************************* 2786 */ 2787 2788 2789 void ihevc_intra_pred_luma_mode_19_to_25_sse42(UWORD8 *pu1_ref, 2790 WORD32 src_strd, 2791 UWORD8 *pu1_dst, 2792 WORD32 dst_strd, 2793 WORD32 nt, 2794 WORD32 mode) 2795 { 2796 2797 WORD32 row, k; 2798 WORD32 two_nt, intra_pred_ang; 2799 WORD32 inv_ang, inv_ang_sum; 2800 //WORD32 ref_main_idx, pos, fract, idx; 2801 WORD32 ref_idx; 2802 UWORD8 ref_tmp[(2 * MAX_CU_SIZE) + 2]; 2803 UWORD8 *ref_main, *ref_temp; 2804 2805 __m128i /*fract_8x16b,*/ const_temp_8x16b, sm3; 2806 __m128i temp1, temp2, temp3, temp4; 2807 __m128i temp11, temp12, temp13, temp14; 2808 UNUSED(src_strd); 2809 2810 two_nt = 2 * nt; 2811 intra_pred_ang = gai4_ihevc_ang_table[mode]; 2812 inv_ang = gai4_ihevc_inv_ang_table[mode - 12]; 2813 2814 /* Intermediate reference samples for negative angle modes */ 2815 /* This have to be removed during optimization*/ 2816 /* For horizontal modes, (ref main = ref above) (ref side = ref left) */ 2817 ref_temp = ref_tmp + 1; 2818 ref_main = ref_temp + nt - 1; 2819 2820 2821 sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]); 2822 2823 2824 2825 const_temp_8x16b = _mm_set1_epi16(16); 2826 2827 if(nt == 32) 
2828 { 2829 2830 __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b; 2831 __m128i src_values10, src_values11, intra_pred_ang_4x32b; 2832 __m128i row_4x32b, two_nt_4x32b, src_values12; 2833 2834 __m128i src_values0, src_values1, src_values2, src_values3; 2835 __m128i src_values4, src_values5, src_values6, src_values7; 2836 WORD32 col = 0; 2837 2838 /* Intermediate reference samples for negative angle modes */ 2839 /* This have to be removed during optimization*/ 2840 /* For horizontal modes, (ref main = ref above) (ref side = ref left) */ 2841 ref_temp[two_nt - 1] = pu1_ref[two_nt + nt]; 2842 temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt)); 2843 temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 16)); 2844 2845 /* SIMD Optimization can be done using look-up table for the loop */ 2846 /* For negative angled derive the main reference samples from side */ 2847 /* reference samples refer to section 8.4.4.2.6 */ 2848 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - nt)); /*nt-(nt+15)*/ 2849 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 16)); /*(nt+16)-(two_nt-1)*/ 2850 2851 temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19])); 2852 temp12 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16)); 2853 2854 src_values0 = _mm_shuffle_epi8(src_values0, temp11); 2855 src_values1 = _mm_shuffle_epi8(src_values1, temp12); 2856 2857 _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1); 2858 _mm_storeu_si128((__m128i *)(ref_temp + nt - 1 + 16), temp3); 2859 _mm_storeu_si128((__m128i *)(ref_main - 16), src_values1); 2860 _mm_storeu_si128((__m128i *)(ref_main - nt + inv_angle_shuffle[mode - 19][0]), src_values0); 2861 2862 const_temp2_4x32b = _mm_set1_epi16(31); 2863 const_temp3_4x32b = _mm_set1_epi16(32); 2864 const_temp8_4x32b = _mm_set1_epi16(8); 2865 2866 two_nt_4x32b = _mm_set1_epi16(1); 2867 2868 /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ 2869 intra_pred_ang_4x32b = 
_mm_set1_epi16(intra_pred_ang); 2870 2871 row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); 2872 2873 for(row = 0; row < nt; row += 8) 2874 { 2875 2876 WORD16 ref_main_idx[9]; 2877 2878 __m128i res_temp5_4x32b; 2879 __m128i fract1_8x16b, fract2_8x16b; 2880 2881 /* pos = ((row + 1) * intra_pred_ang); */ 2882 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 2883 2884 /* fract = pos & (31); */ 2885 src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); 2886 2887 /* idx = pos >> 5; */ 2888 src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 2889 2890 /*(32 - fract) */ 2891 src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11); 2892 2893 fract1_8x16b = _mm_slli_epi16(src_values11, 8); 2894 fract2_8x16b = _mm_slli_epi16(src_values10, 8); 2895 2896 src_values11 = _mm_or_si128(src_values11, fract1_8x16b); 2897 src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */ 2898 2899 fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10); 2900 fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10); 2901 2902 temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00); 2903 temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55); 2904 temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa); 2905 temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff); 2906 2907 temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00); 2908 temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55); 2909 temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa); 2910 temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff); 2911 2912 row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b); 2913 _mm_storeu_si128((__m128i *)ref_main_idx, src_values12); 2914 for(col = 0; col < nt; col += 16) 2915 { 2916 src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + col)); 2917 src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + col)); 2918 src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + col)); 2919 src_values3 = _mm_loadu_si128((__m128i 
*)(ref_main + ref_main_idx[3] + col)); 2920 src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + 8 + col)); 2921 src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + 8 + col)); 2922 src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + 8 + col)); 2923 src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + 8 + col)); 2924 2925 src_values0 = _mm_shuffle_epi8(src_values0, sm3); 2926 src_values1 = _mm_shuffle_epi8(src_values1, sm3); 2927 src_values2 = _mm_shuffle_epi8(src_values2, sm3); 2928 src_values3 = _mm_shuffle_epi8(src_values3, sm3); 2929 src_values4 = _mm_shuffle_epi8(src_values4, sm3); 2930 src_values5 = _mm_shuffle_epi8(src_values5, sm3); 2931 src_values6 = _mm_shuffle_epi8(src_values6, sm3); 2932 src_values7 = _mm_shuffle_epi8(src_values7, sm3); 2933 2934 2935 src_values0 = _mm_maddubs_epi16(src_values0, temp1); 2936 src_values1 = _mm_maddubs_epi16(src_values1, temp2); 2937 src_values2 = _mm_maddubs_epi16(src_values2, temp3); 2938 src_values3 = _mm_maddubs_epi16(src_values3, temp4); 2939 src_values4 = _mm_maddubs_epi16(src_values4, temp1); 2940 src_values5 = _mm_maddubs_epi16(src_values5, temp2); 2941 src_values6 = _mm_maddubs_epi16(src_values6, temp3); 2942 src_values7 = _mm_maddubs_epi16(src_values7, temp4); 2943 2944 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 2945 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); 2946 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); 2947 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); 2948 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); 2949 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); 2950 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); 2951 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); 2952 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); 2953 2954 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 
1] + 16) >>5*/ 2955 src_values0 = _mm_srai_epi16(src_values0, 5); 2956 src_values1 = _mm_srai_epi16(src_values1, 5); 2957 src_values2 = _mm_srai_epi16(src_values2, 5); 2958 src_values3 = _mm_srai_epi16(src_values3, 5); 2959 src_values4 = _mm_srai_epi16(src_values4, 5); 2960 src_values5 = _mm_srai_epi16(src_values5, 5); 2961 src_values6 = _mm_srai_epi16(src_values6, 5); 2962 src_values7 = _mm_srai_epi16(src_values7, 5); 2963 2964 /* converting 16 bit to 8 bit */ 2965 src_values0 = _mm_packus_epi16(src_values0, src_values4); 2966 src_values1 = _mm_packus_epi16(src_values1, src_values5); 2967 src_values2 = _mm_packus_epi16(src_values2, src_values6); 2968 src_values3 = _mm_packus_epi16(src_values3, src_values7); 2969 2970 /* loading 8-bit 8 pixels values */ 2971 _mm_storeu_si128((__m128i *)(pu1_dst + col + (0) * dst_strd), src_values0); /* row=0*/ 2972 _mm_storeu_si128((__m128i *)(pu1_dst + col + (1) * dst_strd), src_values1); /* row=1*/ 2973 _mm_storeu_si128((__m128i *)(pu1_dst + col + (2) * dst_strd), src_values2); /* row=2*/ 2974 _mm_storeu_si128((__m128i *)(pu1_dst + col + (3) * dst_strd), src_values3); /* row=3*/ 2975 2976 2977 src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + col)); 2978 src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + col)); 2979 src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + col)); 2980 src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + col)); 2981 src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + 8 + col)); 2982 src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + 8 + col)); 2983 src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + 8 + col)); 2984 src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + 8 + col)); 2985 2986 src_values0 = _mm_shuffle_epi8(src_values0, sm3); 2987 src_values1 = _mm_shuffle_epi8(src_values1, sm3); 2988 src_values2 = _mm_shuffle_epi8(src_values2, sm3); 2989 
src_values3 = _mm_shuffle_epi8(src_values3, sm3); 2990 src_values4 = _mm_shuffle_epi8(src_values4, sm3); 2991 src_values5 = _mm_shuffle_epi8(src_values5, sm3); 2992 src_values6 = _mm_shuffle_epi8(src_values6, sm3); 2993 src_values7 = _mm_shuffle_epi8(src_values7, sm3); 2994 2995 2996 src_values0 = _mm_maddubs_epi16(src_values0, temp11); 2997 src_values1 = _mm_maddubs_epi16(src_values1, temp12); 2998 src_values2 = _mm_maddubs_epi16(src_values2, temp13); 2999 src_values3 = _mm_maddubs_epi16(src_values3, temp14); 3000 src_values4 = _mm_maddubs_epi16(src_values4, temp11); 3001 src_values5 = _mm_maddubs_epi16(src_values5, temp12); 3002 src_values6 = _mm_maddubs_epi16(src_values6, temp13); 3003 src_values7 = _mm_maddubs_epi16(src_values7, temp14); 3004 3005 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 3006 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); 3007 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); 3008 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); 3009 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); 3010 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); 3011 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); 3012 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); 3013 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); 3014 3015 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 3016 src_values0 = _mm_srai_epi16(src_values0, 5); 3017 src_values1 = _mm_srai_epi16(src_values1, 5); 3018 src_values2 = _mm_srai_epi16(src_values2, 5); 3019 src_values3 = _mm_srai_epi16(src_values3, 5); 3020 src_values4 = _mm_srai_epi16(src_values4, 5); 3021 src_values5 = _mm_srai_epi16(src_values5, 5); 3022 src_values6 = _mm_srai_epi16(src_values6, 5); 3023 src_values7 = _mm_srai_epi16(src_values7, 5); 3024 3025 /* converting 16 bit to 8 bit */ 3026 src_values0 = _mm_packus_epi16(src_values0, src_values4); 3027 src_values1 = 
_mm_packus_epi16(src_values1, src_values5); 3028 src_values2 = _mm_packus_epi16(src_values2, src_values6); 3029 src_values3 = _mm_packus_epi16(src_values3, src_values7); 3030 3031 /* loading 8-bit 8 pixels values */ 3032 _mm_storeu_si128((__m128i *)(pu1_dst + col + (4) * dst_strd), src_values0); /* row=4*/ 3033 _mm_storeu_si128((__m128i *)(pu1_dst + col + (5) * dst_strd), src_values1); /* row=5*/ 3034 _mm_storeu_si128((__m128i *)(pu1_dst + col + (6) * dst_strd), src_values2); /* row=6*/ 3035 _mm_storeu_si128((__m128i *)(pu1_dst + col + (7) * dst_strd), src_values3); /* row=7*/ 3036 3037 } 3038 pu1_dst += 8 * dst_strd; 3039 } 3040 3041 } 3042 else if(nt == 16) /* for nt = 16 case */ 3043 { 3044 3045 __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b; 3046 __m128i src_values10, src_values11, intra_pred_ang_4x32b; 3047 __m128i row_4x32b, two_nt_4x32b, src_values12; 3048 __m128i src_values0, src_values1, src_values2, src_values3; 3049 __m128i src_values4, src_values5, src_values6, src_values7; 3050 3051 3052 /* Intermediate reference samples for negative angle modes */ 3053 /* For horizontal modes, (ref main = ref above) (ref side = ref left) */ 3054 ref_temp[two_nt - 1] = pu1_ref[two_nt + nt]; 3055 temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt)); 3056 3057 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - nt)); /*nt-(nt+15)*/ 3058 3059 temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16)); 3060 3061 src_values0 = _mm_shuffle_epi8(src_values0, temp11); 3062 3063 _mm_storeu_si128((__m128i *)(ref_main - nt), src_values0); 3064 _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1); 3065 3066 const_temp2_4x32b = _mm_set1_epi16(31); 3067 const_temp3_4x32b = _mm_set1_epi16(32); 3068 const_temp8_4x32b = _mm_set1_epi16(8); 3069 3070 two_nt_4x32b = _mm_set1_epi16(1); 3071 3072 /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ 3073 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 3074 3075 row_4x32b = _mm_set_epi16(8, 7, 
6, 5, 4, 3, 2, 1); 3076 3077 for(row = 0; row < nt; row += 8) 3078 { 3079 3080 WORD16 ref_main_idx[9]; 3081 3082 __m128i res_temp5_4x32b; 3083 __m128i fract1_8x16b, fract2_8x16b; 3084 3085 /* pos = ((row + 1) * intra_pred_ang); */ 3086 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 3087 3088 /* fract = pos & (31); */ 3089 src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); 3090 3091 /* idx = pos >> 5; */ 3092 src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 3093 3094 /*(32 - fract) */ 3095 src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11); 3096 3097 fract1_8x16b = _mm_slli_epi16(src_values11, 8); 3098 fract2_8x16b = _mm_slli_epi16(src_values10, 8); 3099 3100 src_values11 = _mm_or_si128(src_values11, fract1_8x16b); 3101 src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */ 3102 3103 fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10); 3104 fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10); 3105 3106 temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00); 3107 temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55); 3108 temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa); 3109 temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff); 3110 3111 temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00); 3112 temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55); 3113 temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa); 3114 temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff); 3115 3116 row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b); 3117 _mm_storeu_si128((__m128i *)ref_main_idx, src_values12); 3118 3119 { 3120 src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0])); 3121 src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1])); 3122 src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2])); 3123 src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3])); 3124 src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + 8)); 3125 src_values5 = 
_mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + 8)); 3126 src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + 8)); 3127 src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + 8)); 3128 3129 src_values0 = _mm_shuffle_epi8(src_values0, sm3); 3130 src_values1 = _mm_shuffle_epi8(src_values1, sm3); 3131 src_values2 = _mm_shuffle_epi8(src_values2, sm3); 3132 src_values3 = _mm_shuffle_epi8(src_values3, sm3); 3133 src_values4 = _mm_shuffle_epi8(src_values4, sm3); 3134 src_values5 = _mm_shuffle_epi8(src_values5, sm3); 3135 src_values6 = _mm_shuffle_epi8(src_values6, sm3); 3136 src_values7 = _mm_shuffle_epi8(src_values7, sm3); 3137 3138 3139 src_values0 = _mm_maddubs_epi16(src_values0, temp1); 3140 src_values1 = _mm_maddubs_epi16(src_values1, temp2); 3141 src_values2 = _mm_maddubs_epi16(src_values2, temp3); 3142 src_values3 = _mm_maddubs_epi16(src_values3, temp4); 3143 src_values4 = _mm_maddubs_epi16(src_values4, temp1); 3144 src_values5 = _mm_maddubs_epi16(src_values5, temp2); 3145 src_values6 = _mm_maddubs_epi16(src_values6, temp3); 3146 src_values7 = _mm_maddubs_epi16(src_values7, temp4); 3147 3148 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 3149 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); 3150 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); 3151 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); 3152 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); 3153 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); 3154 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); 3155 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); 3156 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); 3157 3158 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 3159 src_values0 = _mm_srai_epi16(src_values0, 5); 3160 src_values1 = _mm_srai_epi16(src_values1, 5); 3161 src_values2 = _mm_srai_epi16(src_values2, 
5); 3162 src_values3 = _mm_srai_epi16(src_values3, 5); 3163 src_values4 = _mm_srai_epi16(src_values4, 5); 3164 src_values5 = _mm_srai_epi16(src_values5, 5); 3165 src_values6 = _mm_srai_epi16(src_values6, 5); 3166 src_values7 = _mm_srai_epi16(src_values7, 5); 3167 3168 /* converting 16 bit to 8 bit */ 3169 src_values0 = _mm_packus_epi16(src_values0, src_values4); 3170 src_values1 = _mm_packus_epi16(src_values1, src_values5); 3171 src_values2 = _mm_packus_epi16(src_values2, src_values6); 3172 src_values3 = _mm_packus_epi16(src_values3, src_values7); 3173 3174 /* loading 8-bit 8 pixels values */ 3175 _mm_storeu_si128((__m128i *)(pu1_dst + (0) * dst_strd), src_values0); /* row=0*/ 3176 _mm_storeu_si128((__m128i *)(pu1_dst + (1) * dst_strd), src_values1); /* row=1*/ 3177 _mm_storeu_si128((__m128i *)(pu1_dst + (2) * dst_strd), src_values2); /* row=2*/ 3178 _mm_storeu_si128((__m128i *)(pu1_dst + (3) * dst_strd), src_values3); /* row=3*/ 3179 3180 3181 src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4])); 3182 src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5])); 3183 src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6])); 3184 src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7])); 3185 src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + 8)); 3186 src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + 8)); 3187 src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + 8)); 3188 src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + 8)); 3189 3190 src_values0 = _mm_shuffle_epi8(src_values0, sm3); 3191 src_values1 = _mm_shuffle_epi8(src_values1, sm3); 3192 src_values2 = _mm_shuffle_epi8(src_values2, sm3); 3193 src_values3 = _mm_shuffle_epi8(src_values3, sm3); 3194 src_values4 = _mm_shuffle_epi8(src_values4, sm3); 3195 src_values5 = _mm_shuffle_epi8(src_values5, sm3); 3196 src_values6 = _mm_shuffle_epi8(src_values6, sm3); 3197 src_values7 = 
_mm_shuffle_epi8(src_values7, sm3); 3198 3199 3200 src_values0 = _mm_maddubs_epi16(src_values0, temp11); 3201 src_values1 = _mm_maddubs_epi16(src_values1, temp12); 3202 src_values2 = _mm_maddubs_epi16(src_values2, temp13); 3203 src_values3 = _mm_maddubs_epi16(src_values3, temp14); 3204 src_values4 = _mm_maddubs_epi16(src_values4, temp11); 3205 src_values5 = _mm_maddubs_epi16(src_values5, temp12); 3206 src_values6 = _mm_maddubs_epi16(src_values6, temp13); 3207 src_values7 = _mm_maddubs_epi16(src_values7, temp14); 3208 3209 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 3210 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); 3211 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); 3212 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); 3213 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); 3214 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); 3215 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); 3216 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); 3217 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); 3218 3219 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 3220 src_values0 = _mm_srai_epi16(src_values0, 5); 3221 src_values1 = _mm_srai_epi16(src_values1, 5); 3222 src_values2 = _mm_srai_epi16(src_values2, 5); 3223 src_values3 = _mm_srai_epi16(src_values3, 5); 3224 src_values4 = _mm_srai_epi16(src_values4, 5); 3225 src_values5 = _mm_srai_epi16(src_values5, 5); 3226 src_values6 = _mm_srai_epi16(src_values6, 5); 3227 src_values7 = _mm_srai_epi16(src_values7, 5); 3228 3229 /* converting 16 bit to 8 bit */ 3230 src_values0 = _mm_packus_epi16(src_values0, src_values4); 3231 src_values1 = _mm_packus_epi16(src_values1, src_values5); 3232 src_values2 = _mm_packus_epi16(src_values2, src_values6); 3233 src_values3 = _mm_packus_epi16(src_values3, src_values7); 3234 3235 /* loading 8-bit 8 pixels values */ 3236 
_mm_storeu_si128((__m128i *)(pu1_dst + (4) * dst_strd), src_values0); /* row=4*/ 3237 _mm_storeu_si128((__m128i *)(pu1_dst + (5) * dst_strd), src_values1); /* row=5*/ 3238 _mm_storeu_si128((__m128i *)(pu1_dst + (6) * dst_strd), src_values2); /* row=6*/ 3239 _mm_storeu_si128((__m128i *)(pu1_dst + (7) * dst_strd), src_values3); /* row=7*/ 3240 3241 } 3242 pu1_dst += 8 * dst_strd; 3243 } 3244 } 3245 else if(nt == 8) 3246 { 3247 3248 3249 __m128i const_temp2_4x32b, const_temp3_4x32b; 3250 __m128i src_values10, src_values11, intra_pred_ang_4x32b; 3251 3252 __m128i row_4x32b, two_nt_4x32b, src_values12; 3253 __m128i src_values0, src_values1, src_values2, src_values3; 3254 __m128i src_values4, src_values5, src_values6, src_values7; 3255 3256 3257 /* Intermediate reference samples for negative angle modes */ 3258 /* For horizontal modes, (ref main = ref above) (ref side = ref left) */ 3259 ref_temp[two_nt - 1] = pu1_ref[two_nt + nt]; 3260 temp1 = _mm_loadl_epi64((__m128i *)(pu1_ref + two_nt)); 3261 3262 /* For negative angled derive the main reference samples from side */ 3263 3264 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref)); /*nt-(nt+15)*/ 3265 3266 temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16)); 3267 3268 src_values0 = _mm_shuffle_epi8(src_values0, temp11); 3269 src_values0 = _mm_srli_si128(src_values0, 8); 3270 _mm_storel_epi64((__m128i *)(ref_temp + nt - 1), temp1); 3271 _mm_storel_epi64((__m128i *)(ref_main - nt), src_values0); 3272 3273 3274 3275 const_temp2_4x32b = _mm_set1_epi16(31); 3276 const_temp3_4x32b = _mm_set1_epi16(32); 3277 3278 3279 two_nt_4x32b = _mm_set1_epi16(1); 3280 3281 3282 /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ 3283 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 3284 3285 row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); 3286 3287 { 3288 3289 WORD16 ref_main_idx[9]; 3290 3291 __m128i res_temp5_4x32b; 3292 __m128i fract1_8x16b, fract2_8x16b; 3293 3294 /* pos = ((row + 1) * intra_pred_ang); */ 
3295 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 3296 3297 /* fract = pos & (31); */ 3298 src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); 3299 3300 /* idx = pos >> 5; */ 3301 src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 3302 3303 /*(32 - fract) */ 3304 src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11); 3305 3306 fract1_8x16b = _mm_slli_epi16(src_values11, 8); 3307 fract2_8x16b = _mm_slli_epi16(src_values10, 8); 3308 3309 src_values11 = _mm_or_si128(src_values11, fract1_8x16b); 3310 src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */ 3311 3312 fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10); 3313 fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10); 3314 3315 temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00); 3316 temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55); 3317 temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa); 3318 temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff); 3319 3320 temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00); 3321 temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55); 3322 temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa); 3323 temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff); 3324 3325 _mm_storeu_si128((__m128i *)ref_main_idx, src_values12); 3326 3327 src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0])); /* col = 0-7 */ 3328 src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1])); /* col = 8-15 */ 3329 src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2])); /* col = 16-23 */ 3330 src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3])); /* col = 24-31 */ 3331 src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4])); /* col = 32-39 */ 3332 src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5])); /* col = 40-47 */ 3333 src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6])); /* col = 48-55 */ 3334 src_values7 = _mm_loadu_si128((__m128i 
*)(ref_main + ref_main_idx[7])); /* col = 56-63*/ 3335 3336 src_values0 = _mm_shuffle_epi8(src_values0, sm3); 3337 src_values1 = _mm_shuffle_epi8(src_values1, sm3); 3338 src_values2 = _mm_shuffle_epi8(src_values2, sm3); 3339 src_values3 = _mm_shuffle_epi8(src_values3, sm3); 3340 src_values4 = _mm_shuffle_epi8(src_values4, sm3); 3341 src_values5 = _mm_shuffle_epi8(src_values5, sm3); 3342 src_values6 = _mm_shuffle_epi8(src_values6, sm3); 3343 src_values7 = _mm_shuffle_epi8(src_values7, sm3); 3344 3345 3346 src_values0 = _mm_maddubs_epi16(src_values0, temp1); 3347 src_values1 = _mm_maddubs_epi16(src_values1, temp2); 3348 src_values2 = _mm_maddubs_epi16(src_values2, temp3); 3349 src_values3 = _mm_maddubs_epi16(src_values3, temp4); 3350 src_values4 = _mm_maddubs_epi16(src_values4, temp11); 3351 src_values5 = _mm_maddubs_epi16(src_values5, temp12); 3352 src_values6 = _mm_maddubs_epi16(src_values6, temp13); 3353 src_values7 = _mm_maddubs_epi16(src_values7, temp14); 3354 3355 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 3356 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); 3357 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); 3358 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); 3359 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); 3360 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); 3361 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); 3362 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); 3363 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); 3364 3365 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 3366 src_values0 = _mm_srai_epi16(src_values0, 5); 3367 src_values1 = _mm_srai_epi16(src_values1, 5); 3368 src_values2 = _mm_srai_epi16(src_values2, 5); 3369 src_values3 = _mm_srai_epi16(src_values3, 5); 3370 src_values4 = _mm_srai_epi16(src_values4, 5); 3371 src_values5 = _mm_srai_epi16(src_values5, 5); 3372 
src_values6 = _mm_srai_epi16(src_values6, 5); 3373 src_values7 = _mm_srai_epi16(src_values7, 5); 3374 3375 /* converting 16 bit to 8 bit */ 3376 src_values0 = _mm_packus_epi16(src_values0, src_values1); 3377 src_values2 = _mm_packus_epi16(src_values2, src_values3); 3378 src_values1 = _mm_srli_si128(src_values0, 8); 3379 src_values3 = _mm_srli_si128(src_values2, 8); 3380 src_values4 = _mm_packus_epi16(src_values4, src_values5); 3381 src_values6 = _mm_packus_epi16(src_values6, src_values7); 3382 src_values5 = _mm_srli_si128(src_values4, 8); 3383 src_values7 = _mm_srli_si128(src_values6, 8); 3384 3385 /* loading 8-bit 8 pixels values */ 3386 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_values0); /* row=0*/ 3387 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_values1); /* row=1*/ 3388 _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_values2); /* row=2*/ 3389 _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_values3); /* row=3*/ 3390 _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), src_values4); /* row=4*/ 3391 _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), src_values5); /* row=5*/ 3392 _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), src_values6); /* row=6*/ 3393 _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), src_values7); /* row=7*/ 3394 } 3395 } 3396 else /* if nt =4*/ 3397 { 3398 3399 __m128i const_temp2_4x32b, const_temp3_4x32b; 3400 __m128i src_values10, src_values11, intra_pred_ang_4x32b; 3401 3402 __m128i row_4x32b, two_nt_4x32b, src_values12; 3403 3404 3405 for(k = 0; k < (nt + 1); k++) 3406 ref_temp[k + nt - 1] = pu1_ref[two_nt + k]; 3407 ref_idx = (nt * intra_pred_ang) >> 5; 3408 inv_ang_sum = 128; 3409 3410 for(k = -1; k > ref_idx; k--) 3411 { 3412 inv_ang_sum += inv_ang; 3413 ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)]; 3414 } 3415 3416 3417 const_temp2_4x32b = _mm_set1_epi32(31); 3418 const_temp3_4x32b = _mm_set1_epi32(32); 3419 3420 two_nt_4x32b = _mm_set1_epi32(1); 3421 3422 3423 /* 
intra_pred_ang = gai4_ihevc_ang_table[mode]; */ 3424 intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang); 3425 3426 row_4x32b = _mm_set_epi32(4, 3, 2, 1); 3427 { 3428 WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4; 3429 int temp11, temp21, temp31, temp41; 3430 3431 3432 __m128i fract1_8x16b, fract2_8x16b, res_temp5_4x32b; 3433 __m128i src_values0, src_values1, src_values2, src_values3; 3434 __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2; 3435 3436 /* pos = ((row + 1) * intra_pred_ang); */ 3437 res_temp5_4x32b = _mm_mullo_epi32(row_4x32b, intra_pred_ang_4x32b); 3438 3439 /* fract = pos & (31); */ 3440 src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5)); 3441 3442 ref_main_temp0 = _mm_srli_si128(src_values12, 4); /* next 32 bit values */ 3443 ref_main_temp1 = _mm_srli_si128(src_values12, 8); /* next 32 bit values */ 3444 ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */ 3445 ref_main_idx1 = _mm_cvtsi128_si32(src_values12); /* row=0*/ 3446 ref_main_idx2 = _mm_cvtsi128_si32(ref_main_temp0); /* row=1*/ 3447 ref_main_idx3 = _mm_cvtsi128_si32(ref_main_temp1); /* row=2*/ 3448 ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* row=3*/ 3449 3450 /* idx = pos >> 5; */ 3451 src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 3452 3453 /*(32 - fract) */ 3454 src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11); 3455 3456 fract1_8x16b = _mm_slli_epi16(src_values11, 8); 3457 fract2_8x16b = _mm_slli_epi16(src_values10, 8); 3458 3459 src_values11 = _mm_or_si128(src_values11, fract1_8x16b); 3460 src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */ 3461 3462 fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10); 3463 fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10); 3464 3465 temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00); 3466 temp2 = _mm_shuffle_epi32(fract1_8x16b, 0xaa); 3467 temp3 = _mm_shuffle_epi32(fract2_8x16b, 0x00); 3468 temp4 = 
_mm_shuffle_epi32(fract2_8x16b, 0xaa); 3469 3470 src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1)); /* col = 0-7 */ 3471 src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2)); /* col = 8-15 */ 3472 src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3)); /* col = 16-23 */ 3473 src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4)); /* col = 24-31 */ 3474 3475 src_values0 = _mm_shuffle_epi8(src_values0, sm3); 3476 src_values1 = _mm_shuffle_epi8(src_values1, sm3); 3477 src_values2 = _mm_shuffle_epi8(src_values2, sm3); 3478 src_values3 = _mm_shuffle_epi8(src_values3, sm3); 3479 3480 3481 src_values0 = _mm_maddubs_epi16(src_values0, temp1); 3482 src_values1 = _mm_maddubs_epi16(src_values1, temp2); 3483 src_values2 = _mm_maddubs_epi16(src_values2, temp3); 3484 src_values3 = _mm_maddubs_epi16(src_values3, temp4); 3485 3486 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 3487 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); 3488 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); 3489 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); 3490 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); 3491 3492 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 3493 src_values0 = _mm_srai_epi16(src_values0, 5); 3494 src_values1 = _mm_srai_epi16(src_values1, 5); 3495 src_values2 = _mm_srai_epi16(src_values2, 5); 3496 src_values3 = _mm_srai_epi16(src_values3, 5); 3497 3498 /* converting 16 bit to 8 bit */ 3499 src_values0 = _mm_packus_epi16(src_values0, src_values1); 3500 src_values2 = _mm_packus_epi16(src_values2, src_values3); 3501 src_values1 = _mm_srli_si128(src_values0, 8); 3502 src_values3 = _mm_srli_si128(src_values2, 8); 3503 3504 temp11 = _mm_cvtsi128_si32(src_values0); 3505 temp21 = _mm_cvtsi128_si32(src_values1); 3506 temp31 = _mm_cvtsi128_si32(src_values2); 3507 temp41 = _mm_cvtsi128_si32(src_values3); 3508 
3509 /* loding 4-bit 8 pixels values */ 3510 *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11; 3511 *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21; 3512 *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31; 3513 *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41; 3514 3515 } 3516 } 3517 } 3518 3519 3520 /** 3521 ******************************************************************************* 3522 * 3523 * @brief 3524 * Intra prediction interpolation filter for luma mode 27 to mode 33 3525 * 3526 * @par Description: 3527 * Intraprediction for mode 27 to 33 (positive angle, vertical mode ) with 3528 * reference neighboring samples location pointed by 'pu1_ref' to the TU 3529 * block location pointed by 'pu1_dst' 3530 * 3531 * @param[in] pu1_src 3532 * UWORD8 pointer to the source 3533 * 3534 * @param[out] pu1_dst 3535 * UWORD8 pointer to the destination 3536 * 3537 * @param[in] src_strd 3538 * integer source stride 3539 * 3540 * @param[in] dst_strd 3541 * integer destination stride 3542 * 3543 * @param[in] nt 3544 * integer Transform Block size 3545 * 3546 * @param[in] mode 3547 * integer intraprediction mode 3548 * 3549 * @returns 3550 * 3551 * @remarks 3552 * None 3553 * 3554 ******************************************************************************* 3555 */ 3556 3557 3558 void ihevc_intra_pred_luma_mode_27_to_33_sse42(UWORD8 *pu1_ref, 3559 WORD32 src_strd, 3560 UWORD8 *pu1_dst, 3561 WORD32 dst_strd, 3562 WORD32 nt, 3563 WORD32 mode) 3564 { 3565 WORD32 row; 3566 WORD32 two_nt; 3567 WORD32 intra_pred_ang; 3568 3569 __m128i temp11, temp12, temp13, temp14; 3570 3571 __m128i const_temp_8x16b; 3572 __m128i temp1, temp2, temp3, temp4, sm3; 3573 UNUSED(src_strd); 3574 3575 two_nt = 2 * nt; 3576 intra_pred_ang = gai4_ihevc_ang_table[mode]; 3577 3578 const_temp_8x16b = _mm_set1_epi16(16); 3579 sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]); 3580 if(nt == 32) 3581 { 3582 3583 __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b; 3584 __m128i src_values10, 
src_values11, intra_pred_ang_4x32b; 3585 __m128i row_4x32b, two_nt_4x32b, src_values12; 3586 int col = 0; 3587 3588 const_temp2_4x32b = _mm_set1_epi16(31); 3589 const_temp3_4x32b = _mm_set1_epi16(32); 3590 const_temp8_4x32b = _mm_set1_epi16(8); 3591 3592 two_nt_4x32b = _mm_set1_epi16(two_nt + 1); 3593 3594 /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ 3595 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 3596 3597 row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); 3598 3599 for(row = 0; row < nt; row += 8) 3600 { 3601 3602 WORD16 ref_main_idx[9]; 3603 3604 __m128i res_temp5_4x32b; 3605 __m128i fract1_8x16b, fract2_8x16b; 3606 __m128i src_values0, src_values1, src_values2, src_values3; 3607 __m128i src_values4, src_values5, src_values6, src_values7; 3608 3609 /* pos = ((row + 1) * intra_pred_ang); */ 3610 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 3611 3612 /* fract = pos & (31); */ 3613 src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); 3614 3615 /* idx = pos >> 5; */ 3616 src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 3617 3618 /*(32 - fract) */ 3619 src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11); 3620 3621 fract1_8x16b = _mm_slli_epi16(src_values11, 8); 3622 fract2_8x16b = _mm_slli_epi16(src_values10, 8); 3623 3624 src_values11 = _mm_or_si128(src_values11, fract1_8x16b); 3625 src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */ 3626 3627 fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10); 3628 fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10); 3629 3630 temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00); 3631 temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55); 3632 temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa); 3633 temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff); 3634 3635 temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00); 3636 temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55); 3637 temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa); 3638 
temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff); 3639 3640 row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b); 3641 _mm_storeu_si128((__m128i *)ref_main_idx, src_values12); 3642 for(col = 0; col < nt; col += 16) 3643 { 3644 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + col)); 3645 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + col)); 3646 src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + col)); 3647 src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + col)); 3648 src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + 8 + col)); 3649 src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + 8 + col)); 3650 src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + 8 + col)); 3651 src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + 8 + col)); 3652 3653 src_values0 = _mm_shuffle_epi8(src_values0, sm3); 3654 src_values1 = _mm_shuffle_epi8(src_values1, sm3); 3655 src_values2 = _mm_shuffle_epi8(src_values2, sm3); 3656 src_values3 = _mm_shuffle_epi8(src_values3, sm3); 3657 src_values4 = _mm_shuffle_epi8(src_values4, sm3); 3658 src_values5 = _mm_shuffle_epi8(src_values5, sm3); 3659 src_values6 = _mm_shuffle_epi8(src_values6, sm3); 3660 src_values7 = _mm_shuffle_epi8(src_values7, sm3); 3661 3662 3663 src_values0 = _mm_maddubs_epi16(src_values0, temp1); 3664 src_values1 = _mm_maddubs_epi16(src_values1, temp2); 3665 src_values2 = _mm_maddubs_epi16(src_values2, temp3); 3666 src_values3 = _mm_maddubs_epi16(src_values3, temp4); 3667 src_values4 = _mm_maddubs_epi16(src_values4, temp1); 3668 src_values5 = _mm_maddubs_epi16(src_values5, temp2); 3669 src_values6 = _mm_maddubs_epi16(src_values6, temp3); 3670 src_values7 = _mm_maddubs_epi16(src_values7, temp4); 3671 3672 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 3673 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); 3674 src_values1 = 
_mm_add_epi16(src_values1, const_temp_8x16b); 3675 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); 3676 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); 3677 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); 3678 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); 3679 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); 3680 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); 3681 3682 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 3683 src_values0 = _mm_srai_epi16(src_values0, 5); 3684 src_values1 = _mm_srai_epi16(src_values1, 5); 3685 src_values2 = _mm_srai_epi16(src_values2, 5); 3686 src_values3 = _mm_srai_epi16(src_values3, 5); 3687 src_values4 = _mm_srai_epi16(src_values4, 5); 3688 src_values5 = _mm_srai_epi16(src_values5, 5); 3689 src_values6 = _mm_srai_epi16(src_values6, 5); 3690 src_values7 = _mm_srai_epi16(src_values7, 5); 3691 3692 /* converting 16 bit to 8 bit */ 3693 src_values0 = _mm_packus_epi16(src_values0, src_values4); 3694 src_values1 = _mm_packus_epi16(src_values1, src_values5); 3695 src_values2 = _mm_packus_epi16(src_values2, src_values6); 3696 src_values3 = _mm_packus_epi16(src_values3, src_values7); 3697 3698 /* loading 8-bit 8 pixels values */ 3699 _mm_storeu_si128((__m128i *)(pu1_dst + col + (0) * dst_strd), src_values0); /* row=0*/ 3700 _mm_storeu_si128((__m128i *)(pu1_dst + col + (1) * dst_strd), src_values1); /* row=1*/ 3701 _mm_storeu_si128((__m128i *)(pu1_dst + col + (2) * dst_strd), src_values2); /* row=2*/ 3702 _mm_storeu_si128((__m128i *)(pu1_dst + col + (3) * dst_strd), src_values3); /* row=3*/ 3703 3704 3705 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + col)); 3706 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + col)); 3707 src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + col)); 3708 src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + col)); 3709 
src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + 8 + col)); 3710 src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + 8 + col)); 3711 src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + 8 + col)); 3712 src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + 8 + col)); 3713 3714 src_values0 = _mm_shuffle_epi8(src_values0, sm3); 3715 src_values1 = _mm_shuffle_epi8(src_values1, sm3); 3716 src_values2 = _mm_shuffle_epi8(src_values2, sm3); 3717 src_values3 = _mm_shuffle_epi8(src_values3, sm3); 3718 src_values4 = _mm_shuffle_epi8(src_values4, sm3); 3719 src_values5 = _mm_shuffle_epi8(src_values5, sm3); 3720 src_values6 = _mm_shuffle_epi8(src_values6, sm3); 3721 src_values7 = _mm_shuffle_epi8(src_values7, sm3); 3722 3723 3724 src_values0 = _mm_maddubs_epi16(src_values0, temp11); 3725 src_values1 = _mm_maddubs_epi16(src_values1, temp12); 3726 src_values2 = _mm_maddubs_epi16(src_values2, temp13); 3727 src_values3 = _mm_maddubs_epi16(src_values3, temp14); 3728 src_values4 = _mm_maddubs_epi16(src_values4, temp11); 3729 src_values5 = _mm_maddubs_epi16(src_values5, temp12); 3730 src_values6 = _mm_maddubs_epi16(src_values6, temp13); 3731 src_values7 = _mm_maddubs_epi16(src_values7, temp14); 3732 3733 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 3734 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); 3735 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); 3736 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); 3737 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); 3738 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); 3739 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); 3740 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); 3741 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); 3742 3743 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 3744 src_values0 = 
_mm_srai_epi16(src_values0, 5); 3745 src_values1 = _mm_srai_epi16(src_values1, 5); 3746 src_values2 = _mm_srai_epi16(src_values2, 5); 3747 src_values3 = _mm_srai_epi16(src_values3, 5); 3748 src_values4 = _mm_srai_epi16(src_values4, 5); 3749 src_values5 = _mm_srai_epi16(src_values5, 5); 3750 src_values6 = _mm_srai_epi16(src_values6, 5); 3751 src_values7 = _mm_srai_epi16(src_values7, 5); 3752 3753 /* converting 16 bit to 8 bit */ 3754 src_values0 = _mm_packus_epi16(src_values0, src_values4); 3755 src_values1 = _mm_packus_epi16(src_values1, src_values5); 3756 src_values2 = _mm_packus_epi16(src_values2, src_values6); 3757 src_values3 = _mm_packus_epi16(src_values3, src_values7); 3758 3759 /* loading 8-bit 8 pixels values */ 3760 _mm_storeu_si128((__m128i *)(pu1_dst + col + (4) * dst_strd), src_values0); /* row=4*/ 3761 _mm_storeu_si128((__m128i *)(pu1_dst + col + (5) * dst_strd), src_values1); /* row=5*/ 3762 _mm_storeu_si128((__m128i *)(pu1_dst + col + (6) * dst_strd), src_values2); /* row=6*/ 3763 _mm_storeu_si128((__m128i *)(pu1_dst + col + (7) * dst_strd), src_values3); /* row=7*/ 3764 3765 } 3766 pu1_dst += 8 * dst_strd; 3767 } 3768 3769 } 3770 else if(nt == 16) /* for nt = 16 case */ 3771 { 3772 3773 __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b; 3774 __m128i src_values10, src_values11, intra_pred_ang_4x32b; 3775 __m128i row_4x32b, two_nt_4x32b, src_values12; 3776 3777 3778 const_temp2_4x32b = _mm_set1_epi16(31); 3779 const_temp3_4x32b = _mm_set1_epi16(32); 3780 const_temp8_4x32b = _mm_set1_epi16(8); 3781 3782 two_nt_4x32b = _mm_set1_epi16(two_nt + 1); 3783 3784 /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ 3785 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 3786 3787 row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); 3788 3789 for(row = 0; row < nt; row += 8) 3790 { 3791 3792 WORD16 ref_main_idx[9]; 3793 3794 __m128i res_temp5_4x32b; 3795 __m128i fract1_8x16b, fract2_8x16b; 3796 __m128i src_values0, src_values1, src_values2, 
src_values3; 3797 __m128i src_values4, src_values5, src_values6, src_values7; 3798 3799 /* pos = ((row + 1) * intra_pred_ang); */ 3800 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 3801 3802 /* fract = pos & (31); */ 3803 src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); 3804 3805 /* idx = pos >> 5; */ 3806 src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 3807 3808 /*(32 - fract) */ 3809 src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11); 3810 3811 fract1_8x16b = _mm_slli_epi16(src_values11, 8); 3812 fract2_8x16b = _mm_slli_epi16(src_values10, 8); 3813 3814 src_values11 = _mm_or_si128(src_values11, fract1_8x16b); 3815 src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */ 3816 3817 fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10); 3818 fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10); 3819 3820 temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00); 3821 temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55); 3822 temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa); 3823 temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff); 3824 3825 temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00); 3826 temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55); 3827 temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa); 3828 temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff); 3829 3830 row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b); 3831 _mm_storeu_si128((__m128i *)ref_main_idx, src_values12); 3832 3833 { 3834 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0])); 3835 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1])); 3836 src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2])); 3837 src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3])); 3838 src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + 8)); 3839 src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + 8)); 3840 src_values6 = _mm_loadu_si128((__m128i 
*)(pu1_ref + ref_main_idx[2] + 8)); 3841 src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + 8)); 3842 3843 src_values0 = _mm_shuffle_epi8(src_values0, sm3); 3844 src_values1 = _mm_shuffle_epi8(src_values1, sm3); 3845 src_values2 = _mm_shuffle_epi8(src_values2, sm3); 3846 src_values3 = _mm_shuffle_epi8(src_values3, sm3); 3847 src_values4 = _mm_shuffle_epi8(src_values4, sm3); 3848 src_values5 = _mm_shuffle_epi8(src_values5, sm3); 3849 src_values6 = _mm_shuffle_epi8(src_values6, sm3); 3850 src_values7 = _mm_shuffle_epi8(src_values7, sm3); 3851 3852 3853 src_values0 = _mm_maddubs_epi16(src_values0, temp1); 3854 src_values1 = _mm_maddubs_epi16(src_values1, temp2); 3855 src_values2 = _mm_maddubs_epi16(src_values2, temp3); 3856 src_values3 = _mm_maddubs_epi16(src_values3, temp4); 3857 src_values4 = _mm_maddubs_epi16(src_values4, temp1); 3858 src_values5 = _mm_maddubs_epi16(src_values5, temp2); 3859 src_values6 = _mm_maddubs_epi16(src_values6, temp3); 3860 src_values7 = _mm_maddubs_epi16(src_values7, temp4); 3861 3862 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 3863 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); 3864 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); 3865 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); 3866 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); 3867 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); 3868 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); 3869 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); 3870 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); 3871 3872 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 3873 src_values0 = _mm_srai_epi16(src_values0, 5); 3874 src_values1 = _mm_srai_epi16(src_values1, 5); 3875 src_values2 = _mm_srai_epi16(src_values2, 5); 3876 src_values3 = _mm_srai_epi16(src_values3, 5); 3877 src_values4 = _mm_srai_epi16(src_values4, 5); 
3878 src_values5 = _mm_srai_epi16(src_values5, 5); 3879 src_values6 = _mm_srai_epi16(src_values6, 5); 3880 src_values7 = _mm_srai_epi16(src_values7, 5); 3881 3882 /* converting 16 bit to 8 bit */ 3883 src_values0 = _mm_packus_epi16(src_values0, src_values4); 3884 src_values1 = _mm_packus_epi16(src_values1, src_values5); 3885 src_values2 = _mm_packus_epi16(src_values2, src_values6); 3886 src_values3 = _mm_packus_epi16(src_values3, src_values7); 3887 3888 /* loading 8-bit 8 pixels values */ 3889 _mm_storeu_si128((__m128i *)(pu1_dst + (0) * dst_strd), src_values0); /* row=0*/ 3890 _mm_storeu_si128((__m128i *)(pu1_dst + (1) * dst_strd), src_values1); /* row=1*/ 3891 _mm_storeu_si128((__m128i *)(pu1_dst + (2) * dst_strd), src_values2); /* row=2*/ 3892 _mm_storeu_si128((__m128i *)(pu1_dst + (3) * dst_strd), src_values3); /* row=3*/ 3893 3894 3895 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4])); 3896 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5])); 3897 src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6])); 3898 src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7])); 3899 src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + 8)); 3900 src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + 8)); 3901 src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + 8)); 3902 src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + 8)); 3903 3904 src_values0 = _mm_shuffle_epi8(src_values0, sm3); 3905 src_values1 = _mm_shuffle_epi8(src_values1, sm3); 3906 src_values2 = _mm_shuffle_epi8(src_values2, sm3); 3907 src_values3 = _mm_shuffle_epi8(src_values3, sm3); 3908 src_values4 = _mm_shuffle_epi8(src_values4, sm3); 3909 src_values5 = _mm_shuffle_epi8(src_values5, sm3); 3910 src_values6 = _mm_shuffle_epi8(src_values6, sm3); 3911 src_values7 = _mm_shuffle_epi8(src_values7, sm3); 3912 3913 3914 src_values0 = _mm_maddubs_epi16(src_values0, temp11); 3915 
src_values1 = _mm_maddubs_epi16(src_values1, temp12); 3916 src_values2 = _mm_maddubs_epi16(src_values2, temp13); 3917 src_values3 = _mm_maddubs_epi16(src_values3, temp14); 3918 src_values4 = _mm_maddubs_epi16(src_values4, temp11); 3919 src_values5 = _mm_maddubs_epi16(src_values5, temp12); 3920 src_values6 = _mm_maddubs_epi16(src_values6, temp13); 3921 src_values7 = _mm_maddubs_epi16(src_values7, temp14); 3922 3923 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 3924 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); 3925 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); 3926 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); 3927 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); 3928 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); 3929 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); 3930 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); 3931 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); 3932 3933 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 3934 src_values0 = _mm_srai_epi16(src_values0, 5); 3935 src_values1 = _mm_srai_epi16(src_values1, 5); 3936 src_values2 = _mm_srai_epi16(src_values2, 5); 3937 src_values3 = _mm_srai_epi16(src_values3, 5); 3938 src_values4 = _mm_srai_epi16(src_values4, 5); 3939 src_values5 = _mm_srai_epi16(src_values5, 5); 3940 src_values6 = _mm_srai_epi16(src_values6, 5); 3941 src_values7 = _mm_srai_epi16(src_values7, 5); 3942 3943 /* converting 16 bit to 8 bit */ 3944 src_values0 = _mm_packus_epi16(src_values0, src_values4); 3945 src_values1 = _mm_packus_epi16(src_values1, src_values5); 3946 src_values2 = _mm_packus_epi16(src_values2, src_values6); 3947 src_values3 = _mm_packus_epi16(src_values3, src_values7); 3948 3949 /* loading 8-bit 8 pixels values */ 3950 _mm_storeu_si128((__m128i *)(pu1_dst + (4) * dst_strd), src_values0); /* row=4*/ 3951 _mm_storeu_si128((__m128i *)(pu1_dst + (5) 
* dst_strd), src_values1); /* row=5*/ 3952 _mm_storeu_si128((__m128i *)(pu1_dst + (6) * dst_strd), src_values2); /* row=6*/ 3953 _mm_storeu_si128((__m128i *)(pu1_dst + (7) * dst_strd), src_values3); /* row=7*/ 3954 3955 } 3956 pu1_dst += 8 * dst_strd; 3957 } 3958 3959 } 3960 else if(nt == 8) 3961 { 3962 3963 __m128i const_temp2_4x32b, const_temp3_4x32b; 3964 __m128i src_values10, src_values11, intra_pred_ang_4x32b; 3965 __m128i row_4x32b, two_nt_4x32b, src_values12; 3966 3967 3968 const_temp2_4x32b = _mm_set1_epi16(31); 3969 const_temp3_4x32b = _mm_set1_epi16(32); 3970 3971 two_nt_4x32b = _mm_set1_epi16(two_nt + 1); 3972 3973 3974 /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ 3975 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 3976 3977 row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); 3978 3979 //for(row = 0; row < nt; row +=4) 3980 { 3981 3982 WORD16 ref_main_idx[9]; 3983 3984 __m128i res_temp5_4x32b; 3985 __m128i fract1_8x16b, fract2_8x16b; 3986 __m128i src_values0, src_values1, src_values2, src_values3; 3987 __m128i src_values4, src_values5, src_values6, src_values7; 3988 3989 /* pos = ((row + 1) * intra_pred_ang); */ 3990 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 3991 3992 /* fract = pos & (31); */ 3993 src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); 3994 3995 /* idx = pos >> 5; */ 3996 src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 3997 3998 /*(32 - fract) */ 3999 src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11); 4000 4001 fract1_8x16b = _mm_slli_epi16(src_values11, 8); 4002 fract2_8x16b = _mm_slli_epi16(src_values10, 8); 4003 4004 src_values11 = _mm_or_si128(src_values11, fract1_8x16b); 4005 src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */ 4006 4007 fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10); 4008 fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10); 4009 4010 temp1 = _mm_shuffle_epi32(fract1_8x16b, 
0x00); 4011 temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55); 4012 temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa); 4013 temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff); 4014 4015 temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00); 4016 temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55); 4017 temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa); 4018 temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff); 4019 4020 _mm_storeu_si128((__m128i *)ref_main_idx, src_values12); 4021 4022 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0])); /* col = 0-7 */ 4023 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1])); /* col = 8-15 */ 4024 src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2])); /* col = 16-23 */ 4025 src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3])); /* col = 24-31 */ 4026 src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4])); /* col = 32-39 */ 4027 src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5])); /* col = 40-47 */ 4028 src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6])); /* col = 48-55 */ 4029 src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7])); /* col = 56-63*/ 4030 4031 src_values0 = _mm_shuffle_epi8(src_values0, sm3); 4032 src_values1 = _mm_shuffle_epi8(src_values1, sm3); 4033 src_values2 = _mm_shuffle_epi8(src_values2, sm3); 4034 src_values3 = _mm_shuffle_epi8(src_values3, sm3); 4035 src_values4 = _mm_shuffle_epi8(src_values4, sm3); 4036 src_values5 = _mm_shuffle_epi8(src_values5, sm3); 4037 src_values6 = _mm_shuffle_epi8(src_values6, sm3); 4038 src_values7 = _mm_shuffle_epi8(src_values7, sm3); 4039 4040 4041 src_values0 = _mm_maddubs_epi16(src_values0, temp1); 4042 src_values1 = _mm_maddubs_epi16(src_values1, temp2); 4043 src_values2 = _mm_maddubs_epi16(src_values2, temp3); 4044 src_values3 = _mm_maddubs_epi16(src_values3, temp4); 4045 src_values4 = _mm_maddubs_epi16(src_values4, temp11); 4046 src_values5 = _mm_maddubs_epi16(src_values5, 
temp12); 4047 src_values6 = _mm_maddubs_epi16(src_values6, temp13); 4048 src_values7 = _mm_maddubs_epi16(src_values7, temp14); 4049 4050 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 4051 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); 4052 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); 4053 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); 4054 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); 4055 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); 4056 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); 4057 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); 4058 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); 4059 4060 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 4061 src_values0 = _mm_srai_epi16(src_values0, 5); 4062 src_values1 = _mm_srai_epi16(src_values1, 5); 4063 src_values2 = _mm_srai_epi16(src_values2, 5); 4064 src_values3 = _mm_srai_epi16(src_values3, 5); 4065 src_values4 = _mm_srai_epi16(src_values4, 5); 4066 src_values5 = _mm_srai_epi16(src_values5, 5); 4067 src_values6 = _mm_srai_epi16(src_values6, 5); 4068 src_values7 = _mm_srai_epi16(src_values7, 5); 4069 4070 /* converting 16 bit to 8 bit */ 4071 src_values0 = _mm_packus_epi16(src_values0, src_values1); 4072 src_values2 = _mm_packus_epi16(src_values2, src_values3); 4073 src_values1 = _mm_srli_si128(src_values0, 8); 4074 src_values3 = _mm_srli_si128(src_values2, 8); 4075 src_values4 = _mm_packus_epi16(src_values4, src_values5); 4076 src_values6 = _mm_packus_epi16(src_values6, src_values7); 4077 src_values5 = _mm_srli_si128(src_values4, 8); 4078 src_values7 = _mm_srli_si128(src_values6, 8); 4079 4080 /* loading 8-bit 8 pixels values */ 4081 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_values0); /* row=0*/ 4082 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_values1); /* row=1*/ 4083 _mm_storel_epi64((__m128i *)(pu1_dst + 
2 * dst_strd), src_values2); /* row=2*/ 4084 _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_values3); /* row=3*/ 4085 _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), src_values4); /* row=4*/ 4086 _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), src_values5); /* row=5*/ 4087 _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), src_values6); /* row=6*/ 4088 _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), src_values7); /* row=7*/ 4089 } 4090 4091 } 4092 else /* if nt =4*/ 4093 { 4094 4095 __m128i const_temp2_4x32b, const_temp3_4x32b; 4096 __m128i src_values10, src_values11, intra_pred_ang_4x32b; 4097 4098 __m128i row_4x32b, two_nt_4x32b, src_values12; 4099 4100 4101 const_temp2_4x32b = _mm_set1_epi32(31); 4102 const_temp3_4x32b = _mm_set1_epi32(32); 4103 4104 two_nt_4x32b = _mm_set1_epi32(two_nt + 1); 4105 4106 4107 /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ 4108 intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang); 4109 4110 row_4x32b = _mm_set_epi32(4, 3, 2, 1); 4111 { 4112 int temp11, temp21, temp31, temp41; 4113 4114 WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4; 4115 4116 __m128i fract1_8x16b, fract2_8x16b, res_temp5_4x32b; 4117 __m128i src_values0, src_values1, src_values2, src_values3; 4118 __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2; 4119 4120 /* pos = ((row + 1) * intra_pred_ang); */ 4121 res_temp5_4x32b = _mm_mullo_epi32(row_4x32b, intra_pred_ang_4x32b); 4122 4123 /* fract = pos & (31); */ 4124 src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5)); 4125 4126 ref_main_temp0 = _mm_srli_si128(src_values12, 4); /* next 32 bit values */ 4127 ref_main_temp1 = _mm_srli_si128(src_values12, 8); /* next 32 bit values */ 4128 ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */ 4129 ref_main_idx1 = _mm_cvtsi128_si32(src_values12); /* row=0*/ 4130 ref_main_idx2 = _mm_cvtsi128_si32(ref_main_temp0); /* row=1*/ 4131 ref_main_idx3 = 
_mm_cvtsi128_si32(ref_main_temp1); /* row=2*/ 4132 ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* row=3*/ 4133 4134 /* idx = pos >> 5; */ 4135 src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 4136 4137 /*(32 - fract) */ 4138 src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11); 4139 4140 fract1_8x16b = _mm_slli_epi16(src_values11, 8); 4141 fract2_8x16b = _mm_slli_epi16(src_values10, 8); 4142 4143 src_values11 = _mm_or_si128(src_values11, fract1_8x16b); 4144 src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */ 4145 4146 fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10); 4147 fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10); 4148 4149 temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00); 4150 temp2 = _mm_shuffle_epi32(fract1_8x16b, 0xaa); 4151 temp3 = _mm_shuffle_epi32(fract2_8x16b, 0x00); 4152 temp4 = _mm_shuffle_epi32(fract2_8x16b, 0xaa); 4153 4154 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1)); /* col = 0-7 */ 4155 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2)); /* col = 8-15 */ 4156 src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3)); /* col = 16-23 */ 4157 src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4)); /* col = 24-31 */ 4158 4159 src_values0 = _mm_shuffle_epi8(src_values0, sm3); 4160 src_values1 = _mm_shuffle_epi8(src_values1, sm3); 4161 src_values2 = _mm_shuffle_epi8(src_values2, sm3); 4162 src_values3 = _mm_shuffle_epi8(src_values3, sm3); 4163 4164 src_values0 = _mm_maddubs_epi16(src_values0, temp1); 4165 src_values1 = _mm_maddubs_epi16(src_values1, temp2); 4166 src_values2 = _mm_maddubs_epi16(src_values2, temp3); 4167 src_values3 = _mm_maddubs_epi16(src_values3, temp4); 4168 4169 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 4170 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); 4171 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); 4172 src_values2 = 
_mm_add_epi16(src_values2, const_temp_8x16b); 4173 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); 4174 4175 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 4176 src_values0 = _mm_srai_epi16(src_values0, 5); 4177 src_values1 = _mm_srai_epi16(src_values1, 5); 4178 src_values2 = _mm_srai_epi16(src_values2, 5); 4179 src_values3 = _mm_srai_epi16(src_values3, 5); 4180 4181 /* converting 16 bit to 8 bit */ 4182 src_values0 = _mm_packus_epi16(src_values0, src_values1); 4183 src_values2 = _mm_packus_epi16(src_values2, src_values3); 4184 src_values1 = _mm_srli_si128(src_values0, 8); 4185 src_values3 = _mm_srli_si128(src_values2, 8); 4186 4187 temp11 = _mm_cvtsi128_si32(src_values0); 4188 temp21 = _mm_cvtsi128_si32(src_values1); 4189 temp31 = _mm_cvtsi128_si32(src_values2); 4190 temp41 = _mm_cvtsi128_si32(src_values3); 4191 4192 /* loding 4-bit 8 pixels values */ 4193 *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11; 4194 *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21; 4195 *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31; 4196 *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41; 4197 4198 } 4199 } 4200 } 4201 4202