1 /****************************************************************************** 2 * 3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ******************************************************************************/ 18 /** 19 ******************************************************************************* 20 * @file 21 * ihevc_intra_pred_filters_atom_intr.c 22 * 23 * @brief 24 * Contains function Definition for intra prediction interpolation filters 25 * 26 * 27 * @author 28 * Ittiam 29 * 30 * @par List of Functions: 31 * - ihevc_intra_pred_luma_planar_ssse3() 32 * - ihevc_intra_pred_luma_dc_ssse3() 33 * - ihevc_intra_pred_luma_horz_ssse3() 34 * - ihevc_intra_pred_luma_ver_ssse3() 35 * - ihevc_intra_pred_luma_mode2_ssse3() 36 * - ihevc_intra_pred_luma_mode_18_34_ssse3() 37 * - ihevc_intra_pred_luma_mode_3_to_9_ssse3() 38 * - ihevc_intra_pred_luma_mode_11_to_17_ssse3() 39 * - ihevc_intra_pred_luma_mode_19_to_25_ssse3() 40 * - ihevc_intra_pred_luma_mode_27_to_33_ssse3() 41 * - ihevc_intra_pred_luma_ref_substitution_ssse3() 42 * 43 * @remarks 44 * None 45 * 46 ******************************************************************************* 47 */ 48 49 50 /*****************************************************************************/ 51 /* File Includes */ 52 /*****************************************************************************/ 53 #include <stdlib.h> 54 55 #include "ihevc_typedefs.h" 
56 #include "ihevc_intra_pred.h" 57 #include "ihevc_platform_macros.h" 58 #include "ihevc_macros.h" 59 #include "ihevc_func_selector.h" 60 #include "ihevc_common_tables.h" 61 #include "ihevc_defs.h" 62 #include "ihevc_tables_x86_intr.h" 63 64 #include <immintrin.h> 65 66 /****************************************************************************/ 67 /* Constant Macros */ 68 /****************************************************************************/ 69 #define MAX_CU_SIZE 64 70 #define BIT_DEPTH 8 71 #define T32_4NT 128 72 #define T16_4NT 64 73 74 75 /****************************************************************************/ 76 /* Function Macros */ 77 /****************************************************************************/ 78 #define GET_BITS(y,x) ((y) & (1 << x)) && (1 << x) 79 80 /* tables to shuffle 8-bit values */ 81 82 83 /*****************************************************************************/ 84 /* global tables Definition */ 85 /*****************************************************************************/ 86 87 88 /*****************************************************************************/ 89 /* Function Definition */ 90 /*****************************************************************************/ 91 92 93 /** 94 ******************************************************************************* 95 * 96 * @brief 97 * Intra prediction interpolation filter for pu1_ref substitution 98 * 99 * 100 * @par Description: 101 * Reference substitution process for samples unavailable for prediction 102 * Refer to section 8.4.4.2.2 103 * 104 * @param[in] pu1_top_left 105 * UWORD8 pointer to the top-left 106 * 107 * @param[in] pu1_top 108 * UWORD8 pointer to the top 109 * 110 * @param[in] pu1_left 111 * UWORD8 pointer to the left 112 * 113 * @param[in] src_strd 114 * WORD32 Source stride 115 * 116 * @param[in] nbr_flags 117 * WORD32 neighbor availability flags 118 * 119 * @param[in] nt 120 * WORD32 transform Block size 121 * 122 * @param[in] 
dst_strd 123 * WORD32 Destination stride 124 * 125 * @returns 126 * 127 * @remarks 128 * None 129 * 130 ******************************************************************************* 131 */ 132 133 void ihevc_intra_pred_luma_ref_substitution_ssse3(UWORD8 *pu1_top_left, 134 UWORD8 *pu1_top, 135 UWORD8 *pu1_left, 136 WORD32 src_strd, 137 WORD32 nt, 138 WORD32 nbr_flags, 139 UWORD8 *pu1_dst, 140 WORD32 dst_strd) 141 { 142 UWORD8 pu1_ref; 143 WORD32 dc_val, i; 144 WORD32 total_samples = (4 * nt) + 1; 145 WORD32 two_nt = 2 * nt; 146 147 WORD32 three_nt = 3 * nt; 148 WORD32 get_bits; 149 WORD32 next; 150 WORD32 bot_left, left, top, tp_right, tp_left; 151 152 WORD32 idx, nbr_id_from_bl, frwd_nbr_flag; 153 UNUSED(dst_strd); 154 155 dc_val = 1 << (BIT_DEPTH - 1); 156 157 158 /* Neighbor Flag Structure*/ 159 /* MSB ---> LSB */ 160 /* Top-Left | Top-Right | Top | Left | Bottom-Left 161 1 4 4 4 4 162 */ 163 /* If no neighbor flags are present, fill the neighbor samples with DC value */ 164 if(nbr_flags == 0) 165 { 166 for(i = 0; i < total_samples; i++) 167 { 168 pu1_dst[i] = dc_val; 169 } 170 } 171 else 172 { 173 /* Else fill the corresponding samples */ 174 pu1_dst[two_nt] = *pu1_top_left; 175 for(i = 0; i < two_nt; i++) 176 pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd]; 177 for(i = 0; i < two_nt; i++) 178 pu1_dst[two_nt + 1 + i] = pu1_top[i]; 179 180 if(nt <= 8) 181 { 182 /* 1 bit extraction for all the neighboring blocks */ 183 tp_left = (nbr_flags & 0x10000) >> 16; 184 bot_left = (nbr_flags & 0x8) >> 3; 185 left = (nbr_flags & 0x80) >> 7; 186 top = (nbr_flags & 0x100) >> 8; 187 tp_right = (nbr_flags & 0x1000) >> 12; 188 189 next = 1; 190 191 /* If bottom -left is not available, reverse substitution process*/ 192 if(bot_left == 0) 193 { 194 WORD32 a_nbr_flag[5] = { bot_left, left, tp_left, top, tp_right }; 195 196 /* Check for the 1st available sample from bottom-left*/ 197 while(!a_nbr_flag[next]) 198 next++; 199 200 /* If Left, top-left are available*/ 201 if(next <= 
2) 202 { 203 idx = nt * next; 204 pu1_ref = pu1_dst[idx]; 205 for(i = 0; i < idx; i++) 206 pu1_dst[i] = pu1_ref; 207 } 208 else /* If top, top-right are available */ 209 { 210 /* Idx is changed to copy 1 pixel value for top-left ,if top-left is not available*/ 211 idx = (nt * (next - 1)) + 1; 212 pu1_ref = pu1_dst[idx]; 213 for(i = 0; i < idx; i++) 214 pu1_dst[i] = pu1_ref; 215 } 216 } 217 218 /* Forward Substitution Process */ 219 /* If left is Unavailable, copy the last bottom-left value */ 220 if(left == 0) 221 { 222 for(i = 0; i < nt; i++) 223 pu1_dst[nt + i] = pu1_dst[nt - 1]; 224 } 225 /* If top-left is Unavailable, copy the last left value */ 226 if(tp_left == 0) 227 pu1_dst[two_nt] = pu1_dst[two_nt - 1]; 228 /* If top is Unavailable, copy the last top-left value */ 229 if(top == 0) 230 { 231 for(i = 0; i < nt; i++) 232 pu1_dst[two_nt + 1 + i] = pu1_dst[two_nt]; 233 } 234 /* If to right is Unavailable, copy the last top value */ 235 if(tp_right == 0) 236 { 237 for(i = 0; i < nt; i++) 238 pu1_dst[three_nt + 1 + i] = pu1_dst[three_nt]; 239 } 240 } 241 242 if(nt == 16) 243 { 244 WORD32 nbr_flags_temp = 0; 245 nbr_flags_temp = ((nbr_flags & 0xC) >> 2) + ((nbr_flags & 0xC0) >> 4) 246 + ((nbr_flags & 0x300) >> 4) 247 + ((nbr_flags & 0x3000) >> 6) 248 + ((nbr_flags & 0x10000) >> 8); 249 250 /* compute trailing zeors based on nbr_flag for substitution process of below left see section .*/ 251 /* as each bit in nbr flags corresponds to 8 pels for bot_left, left, top and topright but 1 pel for topleft */ 252 { 253 nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 8; /* for below left and left */ 254 255 if(nbr_id_from_bl == 64) 256 nbr_id_from_bl = 32; 257 258 if(nbr_id_from_bl == 32) 259 { 260 /* for top left : 1 pel per nbr bit */ 261 if(!((nbr_flags_temp >> 8) & 0x1)) 262 { 263 nbr_id_from_bl++; 264 nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 8; /* top and top right; 8 pels per nbr bit */ 265 //nbr_id_from_bl += idx * 8; 
266 } 267 } 268 /* Reverse Substitution Process*/ 269 if(nbr_id_from_bl) 270 { 271 /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */ 272 pu1_ref = pu1_dst[nbr_id_from_bl]; 273 for(i = (nbr_id_from_bl - 1); i >= 0; i--) 274 { 275 pu1_dst[i] = pu1_ref; 276 } 277 } 278 } 279 280 /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */ 281 while(nbr_id_from_bl < ((T16_4NT) + 1)) 282 { 283 /* To Obtain the next unavailable idx flag after reverse neighbor substitution */ 284 /* Devide by 8 to obtain the original index */ 285 frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/ 286 287 /* The Top-left flag is at the last bit location of nbr_flags*/ 288 if(nbr_id_from_bl == (T16_4NT / 2)) 289 { 290 get_bits = GET_BITS(nbr_flags_temp, 8); 291 292 /* only pel substitution for TL */ 293 if(!get_bits) 294 pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1]; 295 } 296 else 297 { 298 get_bits = GET_BITS(nbr_flags_temp, frwd_nbr_flag); 299 if(!get_bits) 300 { 301 /* 8 pel substitution (other than TL) */ 302 pu1_ref = pu1_dst[nbr_id_from_bl - 1]; 303 for(i = 0; i < 8; i++) 304 pu1_dst[nbr_id_from_bl + i] = pu1_ref; 305 } 306 307 } 308 nbr_id_from_bl += (nbr_id_from_bl == (T16_4NT / 2)) ? 
1 : 8; 309 } 310 311 312 } 313 314 if(nt == 32) 315 { 316 /* compute trailing ones based on mbr_flag for substitution process of below left see section .*/ 317 /* as each bit in nbr flags corresponds to 8 pels for bot_left, left, top and topright but 1 pel for topleft */ 318 { 319 nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 8; /* for below left and left */ 320 321 if(nbr_id_from_bl == 64) 322 { 323 /* for top left : 1 pel per nbr bit */ 324 if(!((nbr_flags >> 16) & 0x1)) 325 { 326 /* top left not available */ 327 nbr_id_from_bl++; 328 /* top and top right; 8 pels per nbr bit */ 329 nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 8; 330 } 331 } 332 /* Reverse Substitution Process*/ 333 if(nbr_id_from_bl) 334 { 335 /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */ 336 pu1_ref = pu1_dst[nbr_id_from_bl]; 337 for(i = (nbr_id_from_bl - 1); i >= 0; i--) 338 pu1_dst[i] = pu1_ref; 339 } 340 } 341 342 /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */ 343 while(nbr_id_from_bl < ((T32_4NT) + 1)) 344 { 345 /* To Obtain the next unavailable idx flag after reverse neighbor substitution */ 346 /* Devide by 8 to obtain the original index */ 347 frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/ 348 349 /* The Top-left flag is at the last bit location of nbr_flags*/ 350 if(nbr_id_from_bl == (T32_4NT / 2)) 351 { 352 get_bits = GET_BITS(nbr_flags, 16); 353 /* only pel substitution for TL */ 354 if(!get_bits) 355 pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1]; 356 } 357 else 358 { 359 get_bits = GET_BITS(nbr_flags, frwd_nbr_flag); 360 if(!get_bits) 361 { 362 /* 8 pel substitution (other than TL) */ 363 pu1_ref = pu1_dst[nbr_id_from_bl - 1]; 364 for(i = 0; i < 8; i++) 365 pu1_dst[nbr_id_from_bl + i] = pu1_ref; 366 } 367 368 } 369 nbr_id_from_bl += (nbr_id_from_bl == (T32_4NT / 2)) ? 
1 : 8; 370 } 371 } 372 373 } 374 } 375 376 /** 377 ******************************************************************************* 378 * 379 * @brief 380 * Intra prediction interpolation filter for ref_filtering 381 * 382 * 383 * @par Description: 384 * Reference DC filtering for neighboring samples dependent on TU size and 385 * mode Refer to section 8.4.4.2.3 in the standard 386 * 387 * @param[in] pu1_src 388 * UWORD8 pointer to the source 389 * 390 * @param[out] pu1_dst 391 * UWORD8 pointer to the destination 392 * 393 * @param[in] nt 394 * integer Transform Block size 395 * 396 * @param[in] mode 397 * integer intraprediction mode 398 * 399 * @returns 400 * 401 * @remarks 402 * None 403 * 404 ******************************************************************************* 405 */ 406 407 void ihevc_intra_pred_ref_filtering_ssse3(UWORD8 *pu1_src, 408 WORD32 nt, 409 UWORD8 *pu1_dst, 410 WORD32 mode, 411 WORD32 strong_intra_smoothing_enable_flag) 412 { 413 WORD32 filter_flag; 414 WORD32 i; /* Generic indexing variable */ 415 WORD32 four_nt = 4 * nt; 416 UWORD8 au1_flt[(4 * MAX_CU_SIZE) + 1]; 417 WORD32 bi_linear_int_flag = 0; 418 WORD32 abs_cond_left_flag = 0; 419 WORD32 abs_cond_top_flag = 0; 420 WORD32 dc_val = 1 << (BIT_DEPTH - 5); 421 __m128i src_temp1, src_temp2, src_temp3, src_temp7; 422 __m128i src_temp4, src_temp5, src_temp6, src_temp8; 423 424 //WORD32 strong_intra_smoothing_enable_flag = 1; 425 426 filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2)); 427 if(0 == filter_flag) 428 { 429 if(pu1_src == pu1_dst) 430 { 431 return; 432 } 433 else 434 { 435 if(nt == 4) 436 { 437 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src)); 438 _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1); 439 pu1_dst[four_nt] = pu1_src[four_nt]; 440 441 } 442 443 else if(nt == 8) 444 { 445 446 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src)); 447 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16)); 448 449 _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1); 450 
_mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2); 451 452 453 pu1_dst[four_nt] = pu1_src[four_nt]; 454 } 455 else if(nt == 16) 456 { 457 458 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src)); 459 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16)); 460 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_src + 32)); 461 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 48)); 462 463 _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1); 464 _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2); 465 _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3); 466 _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4); 467 468 pu1_dst[four_nt] = pu1_src[four_nt]; 469 } 470 else if(nt == 32) 471 { 472 473 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src)); 474 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_src + 16)); 475 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_src + 32)); 476 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 48)); 477 478 src_temp5 = _mm_loadu_si128((__m128i *)(pu1_src + 64)); 479 src_temp6 = _mm_loadu_si128((__m128i *)(pu1_src + 80)); 480 src_temp7 = _mm_loadu_si128((__m128i *)(pu1_src + 96)); 481 src_temp8 = _mm_loadu_si128((__m128i *)(pu1_src + 112)); 482 483 _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1); 484 _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2); 485 _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3); 486 _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4); 487 488 _mm_storeu_si128((__m128i *)(pu1_dst + 64), src_temp5); 489 _mm_storeu_si128((__m128i *)(pu1_dst + 80), src_temp6); 490 _mm_storeu_si128((__m128i *)(pu1_dst + 96), src_temp7); 491 _mm_storeu_si128((__m128i *)(pu1_dst + 112), src_temp8); 492 493 pu1_dst[four_nt] = pu1_src[four_nt]; 494 } 495 496 } 497 } 498 499 else 500 { 501 /* If strong intra smoothin is enabled and transform size is 32 */ 502 if((1 == strong_intra_smoothing_enable_flag) && (32 == nt)) 503 { 504 /* Strong Intra Filtering */ 505 abs_cond_top_flag = (abs(pu1_src[2 * nt] + pu1_src[4 * nt] 506 - (2 * 
pu1_src[3 * nt]))) < dc_val; 507 abs_cond_left_flag = (abs(pu1_src[2 * nt] + pu1_src[0] 508 - (2 * pu1_src[nt]))) < dc_val; 509 510 bi_linear_int_flag = ((1 == abs_cond_left_flag) 511 && (1 == abs_cond_top_flag)); 512 } 513 /* Extremities Untouched*/ 514 au1_flt[0] = pu1_src[0]; 515 au1_flt[4 * nt] = pu1_src[4 * nt]; 516 517 /* Strong filtering of reference samples */ 518 if(1 == bi_linear_int_flag) 519 { 520 au1_flt[2 * nt] = pu1_src[2 * nt]; 521 522 for(i = 1; i < (2 * nt); i++) 523 au1_flt[i] = (((2 * nt) - i) * pu1_src[0] + i * pu1_src[2 * nt] + 32) >> 6; 524 525 for(i = 1; i < (2 * nt); i++) 526 au1_flt[i + (2 * nt)] = (((2 * nt) - i) * pu1_src[2 * nt] + i * pu1_src[4 * nt] + 32) >> 6; 527 } 528 else 529 { 530 __m128i const_value_8x16, zero_8x16b; 531 532 const_value_8x16 = _mm_set1_epi16(2); 533 534 au1_flt[0] = pu1_src[0]; 535 au1_flt[4 * nt] = pu1_src[4 * nt]; 536 537 zero_8x16b = _mm_setzero_si128(); 538 539 /* Perform bilinear filtering of Reference Samples */ 540 for(i = 0; i < (four_nt); i += 16) 541 { 542 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_src + i)); 543 src_temp2 = _mm_srli_si128(src_temp1, 1); 544 src_temp3 = _mm_srli_si128(src_temp2, 1); 545 546 src_temp1 = _mm_unpacklo_epi8(src_temp1, zero_8x16b); 547 src_temp2 = _mm_unpacklo_epi8(src_temp2, zero_8x16b); 548 src_temp3 = _mm_unpacklo_epi8(src_temp3, zero_8x16b); 549 550 src_temp2 = _mm_slli_epi16(src_temp2, 1); 551 552 src_temp1 = _mm_add_epi16(src_temp1, src_temp2); 553 src_temp1 = _mm_add_epi16(src_temp1, src_temp3); 554 src_temp1 = _mm_add_epi16(src_temp1, const_value_8x16); 555 556 src_temp1 = _mm_srai_epi16(src_temp1, 2); 557 558 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_src + 8 + i)); 559 src_temp5 = _mm_srli_si128(src_temp4, 1); 560 src_temp6 = _mm_srli_si128(src_temp5, 1); 561 562 src_temp4 = _mm_unpacklo_epi8(src_temp4, zero_8x16b); 563 src_temp5 = _mm_unpacklo_epi8(src_temp5, zero_8x16b); 564 src_temp6 = _mm_unpacklo_epi8(src_temp6, zero_8x16b); 565 566 src_temp5 = 
_mm_slli_epi16(src_temp5, 1); 567 568 src_temp4 = _mm_add_epi16(src_temp4, src_temp5); 569 src_temp4 = _mm_add_epi16(src_temp4, src_temp6); 570 src_temp4 = _mm_add_epi16(src_temp4, const_value_8x16); 571 572 src_temp4 = _mm_srai_epi16(src_temp4, 2); 573 574 /* converting 16 bit to 8 bit */ 575 src_temp1 = _mm_packus_epi16(src_temp1, src_temp4); 576 577 _mm_storeu_si128((__m128i *)(au1_flt + 1 + i), src_temp1); 578 } 579 au1_flt[4 * nt] = pu1_src[4 * nt]; 580 } 581 582 if(nt == 4) 583 { 584 src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt)); 585 _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1); 586 pu1_dst[four_nt] = au1_flt[four_nt]; 587 } 588 else if(nt == 8) 589 { 590 591 src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt)); 592 src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16)); 593 594 _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1); 595 _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2); 596 597 pu1_dst[four_nt] = au1_flt[four_nt]; 598 } 599 else if(nt == 16) 600 { 601 602 src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt)); 603 src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16)); 604 src_temp3 = _mm_loadu_si128((__m128i *)(au1_flt + 32)); 605 src_temp4 = _mm_loadu_si128((__m128i *)(au1_flt + 48)); 606 607 _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1); 608 _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2); 609 _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3); 610 _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4); 611 612 pu1_dst[four_nt] = au1_flt[four_nt]; 613 } 614 615 else if(nt == 32) 616 { 617 618 src_temp1 = _mm_loadu_si128((__m128i *)(au1_flt)); 619 src_temp2 = _mm_loadu_si128((__m128i *)(au1_flt + 16)); 620 src_temp3 = _mm_loadu_si128((__m128i *)(au1_flt + 32)); 621 src_temp4 = _mm_loadu_si128((__m128i *)(au1_flt + 48)); 622 623 src_temp5 = _mm_loadu_si128((__m128i *)(au1_flt + 64)); 624 src_temp6 = _mm_loadu_si128((__m128i *)(au1_flt + 80)); 625 src_temp7 = _mm_loadu_si128((__m128i *)(au1_flt + 96)); 626 src_temp8 = 
_mm_loadu_si128((__m128i *)(au1_flt + 112)); 627 628 _mm_storeu_si128((__m128i *)(pu1_dst), src_temp1); 629 _mm_storeu_si128((__m128i *)(pu1_dst + 16), src_temp2); 630 _mm_storeu_si128((__m128i *)(pu1_dst + 32), src_temp3); 631 _mm_storeu_si128((__m128i *)(pu1_dst + 48), src_temp4); 632 633 _mm_storeu_si128((__m128i *)(pu1_dst + 64), src_temp5); 634 _mm_storeu_si128((__m128i *)(pu1_dst + 80), src_temp6); 635 _mm_storeu_si128((__m128i *)(pu1_dst + 96), src_temp7); 636 _mm_storeu_si128((__m128i *)(pu1_dst + 112), src_temp8); 637 638 pu1_dst[four_nt] = au1_flt[four_nt]; 639 } 640 641 } 642 } 643 644 /** 645 ******************************************************************************* 646 * 647 * @brief 648 * Intra prediction interpolation filter for luma planar 649 * 650 * @par Description: 651 * Planar Intraprediction with reference neighboring samples location 652 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' Refer 653 * to section 8.4.4.2.4 in the standard 654 * 655 * @param[in] pu1_src 656 * UWORD8 pointer to the source 657 * 658 * @param[out] pu1_dst 659 * UWORD8 pointer to the destination 660 * 661 * @param[in] src_strd 662 * integer source stride 663 * 664 * @param[in] dst_strd 665 * integer destination stride 666 * 667 * @param[in] nt 668 * integer Transform Block size 669 * 670 * @param[in] mode 671 * integer intraprediction mode 672 * 673 * @returns 674 * 675 * @remarks 676 * None 677 * 678 ******************************************************************************* 679 */ 680 681 682 void ihevc_intra_pred_luma_planar_ssse3(UWORD8 *pu1_ref, 683 WORD32 src_strd, 684 UWORD8 *pu1_dst, 685 WORD32 dst_strd, 686 WORD32 nt, 687 WORD32 mode) 688 { 689 690 691 WORD32 row, col; 692 WORD32 two_nt, three_nt; 693 UWORD16 temp; 694 695 __m128i pu1_ref_16x8b, const_temp_4x32b, const_temp1_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b; 696 __m128i col_8x16b, const_temp5_4x32b, const_temp6_4x32b, zero_8x16b, const_temp7_4x32b, 
const_temp8_4x32b; 697 __m128i nt_row_16x8b, nt_row1_16x8b, nt_row2_16x8b, nt_row3_16x8b; //nt-1-row 698 __m128i row_16x8b, row1_16x8b, row2_16x8b, row3_16x8b; //row+1 699 UNUSED(src_strd); 700 UNUSED(mode); 701 702 two_nt = 2 * nt; 703 three_nt = 3 * nt; 704 705 /* Planar filtering */ 706 temp = pu1_ref[nt - 1]; 707 temp = (temp << 8) | ((UWORD16)pu1_ref[three_nt + 1]); 708 /* setting vallues in registera*/ 709 pu1_ref_16x8b = _mm_set1_epi16(temp); 710 const_temp6_4x32b = _mm_set1_epi16(nt); 711 712 713 714 if(nt == 32) /* for nt multiple of 8*/ 715 { 716 717 718 const_temp4_4x32b = _mm_set1_epi16(0x0400); 719 const_temp1_4x32b = _mm_set1_epi16(0x0100); 720 const_temp8_4x32b = _mm_set1_epi16(0x0008); 721 //(nt-1-y) (nt-1-x) ; x= 0..15 , y = row 722 //const_temp5_4x32b = _mm_set_epi8(nt_row, 0,nt_row, 1,nt_row, 2,nt_row, 3,nt_row, 4,nt_row, 5,nt_row, 6,nt_row, 7); 723 nt_row_16x8b = _mm_set_epi16(0x1f18, 0x1f19, 0x1f1a, 0x1f1b, 0x1f1c, 0x1f1d, 0x1f1e, 0x1f1f); 724 //(y+1) (x+1) ; x= 0..15 , y = row 725 //const_temp3_4x32b = _mm_set_epi16(row1,8,row1, 7,row1, 6, row1, 5,row1, 4, row1, 3, row1, 2, row1, 1); 726 row_16x8b = _mm_set_epi16(0x0108, 0x0107, 0x0106, 0x0105, 0x0104, 0x0103, 0x0102, 0x0101); 727 728 for(row = 0; row < nt; row += 1) 729 { 730 __m128i res_temp_8x16b, res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b; 731 __m128i res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b, res_temp7_8x16b; 732 733 __m128i src_temp_8x16b, src_temp1_8x16b; 734 735 736 res_temp1_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 1 - row]); 737 738 nt_row1_16x8b = _mm_sub_epi16(nt_row_16x8b, const_temp8_4x32b); 739 row1_16x8b = _mm_add_epi16(row_16x8b, const_temp8_4x32b); 740 nt_row2_16x8b = _mm_sub_epi16(nt_row1_16x8b, const_temp8_4x32b); 741 row2_16x8b = _mm_add_epi16(row1_16x8b, const_temp8_4x32b); 742 nt_row3_16x8b = _mm_sub_epi16(nt_row2_16x8b, const_temp8_4x32b); 743 row3_16x8b = _mm_add_epi16(row2_16x8b, const_temp8_4x32b); 744 /* loding 8bit 16 pixles*/ 745 src_temp_8x16b = 
_mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); 746 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 17)); 747 748 res_temp4_8x16b = _mm_unpacklo_epi8(res_temp1_8x16b, src_temp_8x16b); /* row=0*/ 749 res_temp5_8x16b = _mm_unpackhi_epi8(res_temp1_8x16b, src_temp_8x16b); /* row=1*/ 750 res_temp6_8x16b = _mm_unpacklo_epi8(res_temp1_8x16b, src_temp1_8x16b); /* row=2*/ 751 res_temp7_8x16b = _mm_unpackhi_epi8(res_temp1_8x16b, src_temp1_8x16b); /* row=3*/ 752 753 /*(row + 1) * pu1_ref[nt - 1] + (col + 1) * pu1_ref[three_nt + 1] */ 754 res_temp_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row_16x8b); 755 res_temp1_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row1_16x8b); 756 res_temp2_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row2_16x8b); 757 res_temp3_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row3_16x8b); 758 /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] + (nt - 1 - col)* pu1_ref[two_nt - 1 - row] */ 759 res_temp4_8x16b = _mm_maddubs_epi16(res_temp4_8x16b, nt_row_16x8b); 760 res_temp5_8x16b = _mm_maddubs_epi16(res_temp5_8x16b, nt_row1_16x8b); 761 res_temp6_8x16b = _mm_maddubs_epi16(res_temp6_8x16b, nt_row2_16x8b); 762 res_temp7_8x16b = _mm_maddubs_epi16(res_temp7_8x16b, nt_row3_16x8b); 763 764 res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, res_temp4_8x16b); 765 res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp5_8x16b); 766 res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, res_temp6_8x16b); 767 res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp7_8x16b); 768 /*res_temp + nt)*/ 769 res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b); 770 res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, const_temp6_4x32b); 771 res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, const_temp6_4x32b); 772 res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, const_temp6_4x32b); 773 774 res_temp_8x16b = _mm_srli_epi16(res_temp_8x16b, 6); //log2(32)+1 775 res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, 6); 776 res_temp2_8x16b = _mm_srli_epi16(res_temp2_8x16b, 6); 777 
res_temp3_8x16b = _mm_srli_epi16(res_temp3_8x16b, 6); 778 779 res_temp_8x16b = _mm_packus_epi16(res_temp_8x16b, res_temp1_8x16b); 780 res_temp1_8x16b = _mm_packus_epi16(res_temp2_8x16b, res_temp3_8x16b); 781 782 783 _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd)), res_temp_8x16b); 784 _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd) + 16), res_temp1_8x16b); 785 786 787 nt_row_16x8b = _mm_sub_epi16(nt_row_16x8b, const_temp1_4x32b); 788 row_16x8b = _mm_add_epi16(row_16x8b, const_temp1_4x32b); 789 } 790 } 791 else if(nt == 16) /* for nt multiple of 8*/ 792 { 793 794 const_temp4_4x32b = _mm_set1_epi16(0x0400); 795 const_temp1_4x32b = _mm_set1_epi16(0x0100); 796 const_temp8_4x32b = _mm_set1_epi16(0x0008); 797 //(nt-1-y) (nt-1-x) ; x= 0..15 , y = row 798 //const_temp5_4x32b = _mm_set_epi8(nt_row, 0,nt_row, 1,nt_row, 2,nt_row, 3,nt_row, 4,nt_row, 5,nt_row, 6,nt_row, 7); 799 nt_row_16x8b = _mm_set_epi16(0x0f08, 0x0f09, 0x0f0a, 0x0f0b, 0x0f0c, 0x0f0d, 0x0f0e, 0x0f0f); 800 //(y+1) (x+1) ; x= 0..15 , y = row 801 //const_temp3_4x32b = _mm_set_epi16(row1,8,row1, 7,row1, 6, row1, 5,row1, 4, row1, 3, row1, 2, row1, 1); 802 row_16x8b = _mm_set_epi16(0x0108, 0x0107, 0x0106, 0x0105, 0x0104, 0x0103, 0x0102, 0x0101); 803 804 for(row = 0; row < nt; row += 2) 805 { 806 __m128i res_temp_8x16b, res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b; 807 __m128i res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b, res_temp7_8x16b; 808 809 __m128i src_temp_8x16b; 810 811 812 res_temp1_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 1 - row]); 813 res_temp2_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 2 - row]); 814 815 816 nt_row1_16x8b = _mm_sub_epi16(nt_row_16x8b, const_temp1_4x32b); 817 row1_16x8b = _mm_add_epi16(row_16x8b, const_temp1_4x32b); 818 nt_row2_16x8b = _mm_sub_epi16(nt_row_16x8b, const_temp8_4x32b); 819 row2_16x8b = _mm_add_epi16(row_16x8b, const_temp8_4x32b); 820 nt_row3_16x8b = _mm_sub_epi16(nt_row1_16x8b, const_temp8_4x32b); 821 row3_16x8b = _mm_add_epi16(row1_16x8b, 
const_temp8_4x32b); 822 /* loding 8bit 16 pixles*/ 823 src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); 824 825 826 res_temp4_8x16b = _mm_unpacklo_epi8(res_temp1_8x16b, src_temp_8x16b); /* row=0*/ 827 res_temp5_8x16b = _mm_unpacklo_epi8(res_temp2_8x16b, src_temp_8x16b); /* row=1*/ 828 res_temp6_8x16b = _mm_unpackhi_epi8(res_temp1_8x16b, src_temp_8x16b); /* row=2*/ 829 res_temp7_8x16b = _mm_unpackhi_epi8(res_temp2_8x16b, src_temp_8x16b); /* row=3*/ 830 831 /*(row + 1) * pu1_ref[nt - 1] + (col + 1) * pu1_ref[three_nt + 1] */ 832 res_temp_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row_16x8b); 833 res_temp1_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row1_16x8b); 834 res_temp2_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row2_16x8b); 835 res_temp3_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row3_16x8b); 836 /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] + (nt - 1 - col)* pu1_ref[two_nt - 1 - row] */ 837 res_temp4_8x16b = _mm_maddubs_epi16(res_temp4_8x16b, nt_row_16x8b); 838 res_temp5_8x16b = _mm_maddubs_epi16(res_temp5_8x16b, nt_row1_16x8b); 839 res_temp6_8x16b = _mm_maddubs_epi16(res_temp6_8x16b, nt_row2_16x8b); 840 res_temp7_8x16b = _mm_maddubs_epi16(res_temp7_8x16b, nt_row3_16x8b); 841 842 res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, res_temp4_8x16b); 843 res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp5_8x16b); 844 res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, res_temp6_8x16b); 845 res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp7_8x16b); 846 /*res_temp + nt)*/ 847 res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b); 848 res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, const_temp6_4x32b); 849 res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, const_temp6_4x32b); 850 res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, const_temp6_4x32b); 851 852 res_temp_8x16b = _mm_srli_epi16(res_temp_8x16b, 5); //log2(16)+1 853 res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, 5); 854 res_temp2_8x16b = _mm_srli_epi16(res_temp2_8x16b, 5); 855 
res_temp3_8x16b = _mm_srli_epi16(res_temp3_8x16b, 5); 856 857 res_temp_8x16b = _mm_packus_epi16(res_temp_8x16b, res_temp2_8x16b); 858 res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, res_temp3_8x16b); 859 860 _mm_storeu_si128((__m128i *)(pu1_dst + (row * dst_strd)), res_temp_8x16b); 861 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), res_temp1_8x16b); 862 863 nt_row_16x8b = _mm_sub_epi16(nt_row1_16x8b, const_temp1_4x32b); 864 row_16x8b = _mm_add_epi16(row1_16x8b, const_temp1_4x32b); 865 } 866 } 867 else if(nt == 8) 868 { 869 870 871 const_temp4_4x32b = _mm_set1_epi16(0x0400); 872 const_temp1_4x32b = _mm_set1_epi16(0x0100); 873 zero_8x16b = _mm_set1_epi32(0); 874 875 //(nt-1-y) (nt-1-x) ; x= 0..7 , y = row 876 //const_temp5_4x32b = _mm_set_epi8(nt_row, 0,nt_row, 1,nt_row, 2,nt_row, 3,nt_row, 4,nt_row, 5,nt_row, 6,nt_row, 7); 877 nt_row_16x8b = _mm_set_epi16(0x0700, 0x0701, 0x0702, 0x0703, 0x0704, 0x0705, 0x0706, 0x0707); 878 //(y+1) (x+1) ; x= 0..7 , y = row 879 //const_temp3_4x32b = _mm_set_epi16(row1,8,row1, 7,row1, 6, row1, 5,row1, 4, row1, 3, row1, 2, row1, 1); 880 row_16x8b = _mm_set_epi16(0x0108, 0x0107, 0x0106, 0x0105, 0x0104, 0x0103, 0x0102, 0x0101); 881 882 for(row = 0; row < nt; row += 4) 883 { 884 __m128i res_temp_8x16b, res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b; 885 __m128i res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b, res_temp7_8x16b; 886 887 __m128i src_temp_8x16b; 888 889 890 res_temp4_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 1 - row]); 891 res_temp5_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 2 - row]); 892 res_temp6_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 3 - row]); 893 res_temp7_8x16b = _mm_set1_epi8(pu1_ref[two_nt - 4 - row]); 894 895 nt_row1_16x8b = _mm_sub_epi16(nt_row_16x8b, const_temp1_4x32b); 896 row1_16x8b = _mm_add_epi16(row_16x8b, const_temp1_4x32b); 897 nt_row2_16x8b = _mm_sub_epi16(nt_row1_16x8b, const_temp1_4x32b); 898 row2_16x8b = _mm_add_epi16(row1_16x8b, const_temp1_4x32b); 899 nt_row3_16x8b = 
_mm_sub_epi16(nt_row2_16x8b, const_temp1_4x32b); 900 row3_16x8b = _mm_add_epi16(row2_16x8b, const_temp1_4x32b); 901 /* loding 8bit 16 pixles*/ 902 src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); 903 904 res_temp4_8x16b = _mm_unpacklo_epi8(res_temp4_8x16b, src_temp_8x16b); /* row=0*/ 905 res_temp5_8x16b = _mm_unpacklo_epi8(res_temp5_8x16b, src_temp_8x16b); /* row=1*/ 906 res_temp6_8x16b = _mm_unpacklo_epi8(res_temp6_8x16b, src_temp_8x16b); /* row=2*/ 907 res_temp7_8x16b = _mm_unpacklo_epi8(res_temp7_8x16b, src_temp_8x16b); /* row=3*/ 908 909 /*(row + 1) * pu1_ref[nt - 1] + (col + 1) * pu1_ref[three_nt + 1] */ 910 res_temp_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row_16x8b); 911 res_temp1_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row1_16x8b); 912 res_temp2_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row2_16x8b); 913 res_temp3_8x16b = _mm_maddubs_epi16(pu1_ref_16x8b, row3_16x8b); 914 /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] + (nt - 1 - col)* pu1_ref[two_nt - 1 - row] */ 915 res_temp4_8x16b = _mm_maddubs_epi16(res_temp4_8x16b, nt_row_16x8b); 916 res_temp5_8x16b = _mm_maddubs_epi16(res_temp5_8x16b, nt_row1_16x8b); 917 res_temp6_8x16b = _mm_maddubs_epi16(res_temp6_8x16b, nt_row2_16x8b); 918 res_temp7_8x16b = _mm_maddubs_epi16(res_temp7_8x16b, nt_row3_16x8b); 919 920 res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, res_temp4_8x16b); 921 res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp5_8x16b); 922 res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, res_temp6_8x16b); 923 res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp7_8x16b); 924 /*res_temp + nt)*/ 925 res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b); 926 res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, const_temp6_4x32b); 927 res_temp2_8x16b = _mm_add_epi16(res_temp2_8x16b, const_temp6_4x32b); 928 res_temp3_8x16b = _mm_add_epi16(res_temp3_8x16b, const_temp6_4x32b); 929 930 res_temp_8x16b = _mm_srli_epi16(res_temp_8x16b, 4); //log2(16)+1 931 res_temp1_8x16b = 
_mm_srli_epi16(res_temp1_8x16b, 4); 932 res_temp2_8x16b = _mm_srli_epi16(res_temp2_8x16b, 4); 933 res_temp3_8x16b = _mm_srli_epi16(res_temp3_8x16b, 4); 934 935 res_temp_8x16b = _mm_packus_epi16(res_temp_8x16b, zero_8x16b); 936 res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, zero_8x16b); 937 res_temp2_8x16b = _mm_packus_epi16(res_temp2_8x16b, zero_8x16b); 938 res_temp3_8x16b = _mm_packus_epi16(res_temp3_8x16b, zero_8x16b); 939 940 _mm_storel_epi64((__m128i *)(pu1_dst + (row * dst_strd)), res_temp_8x16b); 941 _mm_storel_epi64((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), res_temp1_8x16b); 942 _mm_storel_epi64((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), res_temp2_8x16b); 943 _mm_storel_epi64((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), res_temp3_8x16b); 944 945 nt_row_16x8b = _mm_sub_epi16(nt_row3_16x8b, const_temp1_4x32b); 946 row_16x8b = _mm_add_epi16(row3_16x8b, const_temp1_4x32b); 947 } 948 } 949 else 950 { 951 952 /* for nt multiple of 4*/ 953 const_temp7_4x32b = _mm_set1_epi16(4); 954 const_temp4_4x32b = _mm_set1_epi16(nt - 1); 955 const_temp_4x32b = _mm_set1_epi16(pu1_ref[three_nt + 1]); 956 const_temp1_4x32b = _mm_set1_epi16(pu1_ref[nt - 1]); 957 zero_8x16b = _mm_set1_epi32(0); 958 959 for(row = 0; row < nt; row++) 960 { 961 __m128i res_temp_8x16b, row_8x16b, res_temp1_8x16b, res_temp2_8x16b; 962 __m128i res_temp3_8x16b; 963 964 const_temp2_4x32b = _mm_set1_epi16(pu1_ref[two_nt - 1 - row]); 965 const_temp3_4x32b = _mm_set1_epi16((row + 1)); 966 967 968 row_8x16b = _mm_set1_epi16((nt - 1 - row)); 969 970 const_temp5_4x32b = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); 971 col_8x16b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); 972 973 const_temp5_4x32b = _mm_sub_epi16(const_temp4_4x32b, const_temp5_4x32b); 974 975 /*(row + 1) * pu1_ref[nt - 1]*/ 976 res_temp_8x16b = _mm_mullo_epi16(const_temp3_4x32b, const_temp1_4x32b); 977 978 /*(row + 1) * pu1_ref[nt - 1] + nt)*/ 979 res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b); 980 981 for(col = 0; col < 
nt; col += 4) 982 { 983 __m128i src_temp_8x16b; 984 int temp1; 985 986 /* loding 8bit 16 pixles*/ 987 src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + col)); 988 989 src_temp_8x16b = _mm_unpacklo_epi8(src_temp_8x16b, zero_8x16b); /* row=0*/ 990 991 /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] */ 992 res_temp1_8x16b = _mm_mullo_epi16(src_temp_8x16b, row_8x16b); 993 994 /*(col + 1) * pu1_ref[three_nt + 1]*/ 995 res_temp2_8x16b = _mm_mullo_epi16(const_temp_4x32b, col_8x16b); 996 997 /*(nt - 1 - col)* pu1_ref[two_nt - 1 - row]*/ 998 res_temp3_8x16b = _mm_mullo_epi16(const_temp2_4x32b, const_temp5_4x32b); 999 1000 res_temp1_8x16b = _mm_add_epi16(res_temp_8x16b, res_temp1_8x16b); 1001 res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b); 1002 res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp3_8x16b); 1003 1004 res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, 3); //log2(16)+1 1005 res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, zero_8x16b); 1006 1007 temp1 = _mm_cvtsi128_si32(res_temp1_8x16b); 1008 1009 *(WORD32 *)(&pu1_dst[(row * dst_strd) + col]) = temp1; 1010 1011 const_temp5_4x32b = _mm_sub_epi16(const_temp5_4x32b, const_temp7_4x32b); 1012 col_8x16b = _mm_add_epi16(col_8x16b, const_temp7_4x32b); 1013 } /* inner loop ends here */ 1014 } 1015 } 1016 1017 1018 } 1019 1020 /** 1021 ******************************************************************************* 1022 * 1023 * @brief 1024 * Intra prediction interpolation filter for luma dc 1025 * 1026 * @par Description: 1027 * Intraprediction for DC mode with reference neighboring samples location 1028 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' Refer 1029 * to section 8.4.4.2.5 in the standard 1030 * 1031 * @param[in] pu1_src 1032 * UWORD8 pointer to the source 1033 * 1034 * @param[out] pu1_dst 1035 * UWORD8 pointer to the destination 1036 * 1037 * @param[in] src_strd 1038 * integer source stride 1039 * 1040 * @param[in] dst_strd 1041 * integer 
destination stride 1042 * 1043 * @param[in] nt 1044 * integer Transform Block size 1045 * 1046 * @param[in] mode 1047 * integer intraprediction mode 1048 * 1049 * @returns 1050 * 1051 * @remarks 1052 * None 1053 * 1054 ******************************************************************************* 1055 */ 1056 1057 void ihevc_intra_pred_luma_dc_ssse3(UWORD8 *pu1_ref, 1058 WORD32 src_strd, 1059 UWORD8 *pu1_dst, 1060 WORD32 dst_strd, 1061 WORD32 nt, 1062 WORD32 mode) 1063 { 1064 1065 WORD32 acc_dc; 1066 WORD32 dc_val, two_dc_val, three_dc_val; 1067 WORD32 row; 1068 WORD32 log2nt = 5; 1069 WORD32 two_nt, three_nt; 1070 __m128i src_temp1, src_temp7, src_temp3, src_temp4, src_temp5, src_temp6; 1071 __m128i src_temp8, src_temp10, src_temp2; 1072 __m128i m_zero = _mm_setzero_si128(); 1073 __m128i sm = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASK5[0]); 1074 UNUSED(src_strd); 1075 UNUSED(mode); 1076 1077 1078 switch(nt) 1079 { 1080 case 32: 1081 log2nt = 5; 1082 break; 1083 case 16: 1084 log2nt = 4; 1085 break; 1086 case 8: 1087 log2nt = 3; 1088 break; 1089 case 4: 1090 log2nt = 2; 1091 break; 1092 default: 1093 break; 1094 } 1095 two_nt = 2 * nt; 1096 three_nt = 3 * nt; 1097 1098 acc_dc = 0; 1099 /* Calculate DC value for the transform block */ 1100 1101 1102 1103 if(nt == 32) 1104 { 1105 __m128i temp; 1106 WORD32 itr_count; 1107 1108 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt)); 1109 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 16)); 1110 src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 32)); 1111 src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 48)); 1112 1113 src_temp3 = _mm_sad_epu8(src_temp3, m_zero); 1114 src_temp4 = _mm_sad_epu8(src_temp4, m_zero); 1115 src_temp7 = _mm_sad_epu8(src_temp7, m_zero); 1116 src_temp8 = _mm_sad_epu8(src_temp8, m_zero); 1117 1118 src_temp4 = _mm_add_epi16(src_temp3, src_temp4); 1119 src_temp8 = _mm_add_epi16(src_temp7, src_temp8); 1120 src_temp4 = _mm_add_epi16(src_temp4, src_temp8); 1121 1122 src_temp4 = 
_mm_shuffle_epi8(src_temp4, sm);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

        /* horizontal reduction of the SAD partial sums into a scalar */
        acc_dc = _mm_cvtsi128_si32(src_temp4);

        /* the row sum above included pu1_ref[two_nt] (top-left corner) but not
           pu1_ref[three_nt]; swap them so exactly nt left + nt top samples count */
        acc_dc += pu1_ref[three_nt];
        acc_dc -= pu1_ref[two_nt];

        /* computing acc_dc value : dc = (sum + nt) >> (log2nt + 1) */
        dc_val = (acc_dc + nt) >> (log2nt + 1);

        two_dc_val = 2 * dc_val;
        three_dc_val = 3 * dc_val;

        /* nt == 32 applies no boundary filtering: every output pixel is dc_val */
        temp = _mm_set1_epi8(dc_val);

        for(itr_count = 0; itr_count < 2; itr_count++)
        {
            /* pu1_dst[(row * dst_strd) + col] = dc_val; 16 rows x 32 cols per pass */
            _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), temp);

            _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), temp);

            /* right half (columns 16..31) of the same 16 rows */
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((0) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((1) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((2) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((3) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((4) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((5) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((6) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((7) * dst_strd)), temp);

            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((8) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((9) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((10) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((11) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((12) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((13) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((14) * dst_strd)), temp);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((15) * dst_strd)), temp);

            pu1_dst += 16 * dst_strd;
        }
    }
    else

    {
        __m128i sm1 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASK4[0]);

        /* DC filtering for the first top row and first left column
           (applied for nt < 32, per section 8.4.4.2.5) */

        if(nt == 4) /* nt multiple of 4*/
        {
            WORD32 temp1, temp2, temp3;

            /* sum the nt left + nt top neighbours via widen + 3x horizontal add */
            src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt));
            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));

            src_temp4 = _mm_unpacklo_epi8(src_temp3, m_zero);
            src_temp2 = _mm_unpacklo_epi8(src_temp2, m_zero);

            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

            acc_dc = _mm_cvtsi128_si32(src_temp4);
            /* swap corner sample for pu1_ref[three_nt], as in the nt == 32 path */
            acc_dc += pu1_ref[three_nt];
            acc_dc -= pu1_ref[two_nt];

            /* computing acc_dc value */

            dc_val = (acc_dc + nt) >> (log2nt + 1);

            three_dc_val = 3 * dc_val;

            /* broadcast (3*dc + 2) for the first-row filter */
            src_temp1 = _mm_set1_epi16(three_dc_val + 2);
            two_dc_val = 2 * dc_val;

            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */
            src_temp2 = _mm_add_epi16(src_temp2, src_temp1);

            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2) >> 2 */
            src_temp2 = _mm_srli_epi16(src_temp2, 2);

            src_temp2 = _mm_packus_epi16(src_temp2, m_zero);

            temp1 = _mm_cvtsi128_si32(src_temp2);

            /* filtered first row */
            *(WORD32 *)(&pu1_dst[0]) = temp1;

            /* put dc_val into lane 0, then splat it across a 4-byte row */
            src_temp2 = _mm_insert_epi16(src_temp2, dc_val, 0);

            src_temp2 = _mm_shuffle_epi8(src_temp2, sm1);
            src_temp3 = _mm_shuffle_epi8(src_temp2, sm1);
            src_temp4 = _mm_shuffle_epi8(src_temp2, sm1);

            temp1 = _mm_cvtsi128_si32(src_temp2);
            temp2 = _mm_cvtsi128_si32(src_temp3);
            temp3 = _mm_cvtsi128_si32(src_temp4);

            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp1;
            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp2;
            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp3;

            /* restore first value: corner pixel uses the 2-tap filter */
            pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
                            >> 2);

            /* filtered first column (scalar; only nt-1 pixels) */
            for(row = 1; row < nt; row++)
                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
                                >> 2;

        }
        else if(nt == 8) /* if nt%8==0*/
        {

            /* 16 neighbours summed in one SAD against zero */
            src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt));

            src_temp4 = _mm_sad_epu8(src_temp3, m_zero);
            src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

            acc_dc = _mm_cvtsi128_si32(src_temp4);

            acc_dc += pu1_ref[three_nt];
            acc_dc -= pu1_ref[two_nt];

            /* computing acc_dc value */

            dc_val = (acc_dc + nt) >> (log2nt + 1);

            three_dc_val = 3 * dc_val;
            src_temp1 = _mm_set1_epi16(three_dc_val + 2);
            two_dc_val = 2 * dc_val;

            /* loading 8-bit 16 pixels */
            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
            src_temp2 = _mm_unpacklo_epi8(src_temp2, m_zero);

            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */
            src_temp2 = _mm_add_epi16(src_temp2, src_temp1);

            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2)>>2 */
            src_temp2 = _mm_srli_epi16(src_temp2, 2);
            src_temp2 = _mm_packus_epi16(src_temp2, m_zero);

            /* filtered first row (8 bytes) */
            _mm_storel_epi64((__m128i *)(pu1_dst), src_temp2);

            /* Fill the remaining rows with DC value*/

            src_temp1 = _mm_set1_epi8(dc_val);
            src_temp2 = _mm_set1_epi8(dc_val);
            src_temp3 = _mm_set1_epi8(dc_val);
            src_temp4 = _mm_set1_epi8(dc_val);
            src_temp5 = _mm_set1_epi8(dc_val);
            src_temp6 = _mm_set1_epi8(dc_val);
            src_temp7 = _mm_set1_epi8(dc_val);

            _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);

            /* restore first value (corner pixel, 2-tap filter) */
            pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2)
                            >> 2);

            /* filtered first column */
            for(row = 1; row < nt; row++)
                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2)
                                >> 2;

        }
        else /* if nt == 16*/
        {

            /* 32 neighbours: two SADs, then combine */
            src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt));
            src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 16));

            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));
            src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 8));

            src_temp3 = _mm_sad_epu8(src_temp3, m_zero);
            src_temp4 = _mm_sad_epu8(src_temp4, m_zero);

            src_temp2 = _mm_unpacklo_epi8(src_temp2, m_zero);
            src_temp10 = _mm_unpacklo_epi8(src_temp10, m_zero);

            src_temp4 = _mm_add_epi16(src_temp3, src_temp4);
            src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
            src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

            acc_dc = _mm_cvtsi128_si32(src_temp4);

            acc_dc += pu1_ref[three_nt];
            acc_dc -= pu1_ref[two_nt];

            /* computing acc_dc value */

            dc_val = (acc_dc + nt) >> (log2nt + 1);

            three_dc_val = 3 * dc_val;
            src_temp1 = _mm_set1_epi16(three_dc_val + 2);
            two_dc_val = 2 * dc_val;

            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2 */
            src_temp2 = _mm_add_epi16(src_temp2, src_temp1);
            src_temp10 = _mm_add_epi16(src_temp10, src_temp1);
            /*(pu1_ref[two_nt + 1 + col] + three_dc_val + 2)>>2 */
            src_temp2 = _mm_srli_epi16(src_temp2, 2);
            src_temp10 = _mm_srli_epi16(src_temp10, 2);

            src_temp2 = _mm_packus_epi16(src_temp2, src_temp10);

            /* filtered first row (16 bytes) */
            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp2);

            /* Fill the remaining rows with DC value*/
            src_temp1 = _mm_set1_epi8(dc_val);
            src_temp2 = _mm_set1_epi8(dc_val);
            src_temp3 = _mm_set1_epi8(dc_val);
            src_temp4 = _mm_set1_epi8(dc_val);
            src_temp5 = _mm_set1_epi8(dc_val);
            src_temp6 = _mm_set1_epi8(dc_val);
            src_temp7 = _mm_set1_epi8(dc_val);

            /* NOTE(review): the loop body below stores rows 1..15 unconditionally,
               so the second row += 8 iteration repeats identical stores — redundant
               but harmless; confirm before restructuring */
            for(row = 1; row < nt; row += 8)
            {

                _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);

                _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), src_temp3);
_mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), src_temp4); 1382 _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), src_temp5); 1383 _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), src_temp6); 1384 _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), src_temp7); 1385 1386 _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), src_temp1); 1387 1388 } 1389 1390 /* retore first value*/ 1391 pu1_dst[0] = ((pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) 1392 >> 2); 1393 1394 for(row = 1; row < nt; row++) 1395 pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val + 2) 1396 >> 2; 1397 1398 } 1399 } 1400 } 1401 1402 /** 1403 ******************************************************************************* 1404 * 1405 * @brief 1406 * Intra prediction interpolation filter for horizontal luma variable. 1407 * 1408 * @par Description: 1409 * Horizontal intraprediction(mode 10) with reference samples location 1410 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' Refer 1411 * to section 8.4.4.2.6 in the standard (Special case) 1412 * 1413 * @param[in] pu1_src 1414 * UWORD8 pointer to the source 1415 * 1416 * @param[out] pu1_dst 1417 * UWORD8 pointer to the destination 1418 * 1419 * @param[in] src_strd 1420 * integer source stride 1421 * 1422 * @param[in] dst_strd 1423 * integer destination stride 1424 * 1425 * @param[in] nt 1426 * integer Transform Block size 1427 * 1428 * @param[in] mode 1429 * integer intraprediction mode 1430 * 1431 * @returns 1432 * 1433 * @remarks 1434 * None 1435 * 1436 ******************************************************************************* 1437 */ 1438 1439 void ihevc_intra_pred_luma_horz_ssse3(UWORD8 *pu1_ref, 1440 WORD32 src_strd, 1441 UWORD8 *pu1_dst, 1442 WORD32 dst_strd, 1443 WORD32 nt, 1444 WORD32 mode) 1445 { 1446 1447 WORD32 row; 1448 WORD32 two_nt; 1449 UNUSED(src_strd); 1450 UNUSED(mode); 1451 1452 two_nt = 2 * nt; 1453 1454 1455 if(nt == 32) 1456 { 1457 
__m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8;
        __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
        __m128i sm = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASK4[0]);

        /* nt == 32: no edge filtering; each row r is pu1_ref[two_nt - 1 - r]
           broadcast across 32 columns. 16 rows are produced per iteration from
           one 16-byte load of the left neighbours. */
        for(row = 0; row < nt; row += 16)
        {
            {
                /* left neighbours for rows row..row+15, lowest byte = row+15 */
                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 1 - row - 15));

                /* byte-shift so each register's byte 0 holds one row's pixel */
                src_temp2 = _mm_srli_si128(src_temp1, 1);
                src_temp3 = _mm_srli_si128(src_temp1, 2);
                src_temp4 = _mm_srli_si128(src_temp1, 3);
                src_temp5 = _mm_srli_si128(src_temp1, 4);
                src_temp6 = _mm_srli_si128(src_temp1, 5);
                src_temp7 = _mm_srli_si128(src_temp1, 6);
                src_temp8 = _mm_srli_si128(src_temp1, 7);

                src_temp9 = _mm_srli_si128(src_temp1, 8);
                src_temp10 = _mm_srli_si128(src_temp1, 9);
                src_temp11 = _mm_srli_si128(src_temp1, 10);
                src_temp12 = _mm_srli_si128(src_temp1, 11);
                src_temp13 = _mm_srli_si128(src_temp1, 12);
                src_temp14 = _mm_srli_si128(src_temp1, 13);
                src_temp15 = _mm_srli_si128(src_temp1, 14);
                src_temp16 = _mm_srli_si128(src_temp1, 15);

                /* broadcast byte 0 across all 16 lanes of each register */
                src_temp8 = _mm_shuffle_epi8(src_temp8, sm);
                src_temp7 = _mm_shuffle_epi8(src_temp7, sm);
                src_temp6 = _mm_shuffle_epi8(src_temp6, sm);
                src_temp5 = _mm_shuffle_epi8(src_temp5, sm);
                src_temp4 = _mm_shuffle_epi8(src_temp4, sm);
                src_temp3 = _mm_shuffle_epi8(src_temp3, sm);
                src_temp2 = _mm_shuffle_epi8(src_temp2, sm);
                src_temp1 = _mm_shuffle_epi8(src_temp1, sm);

                src_temp16 = _mm_shuffle_epi8(src_temp16, sm);
                src_temp15 = _mm_shuffle_epi8(src_temp15, sm);
                src_temp14 = _mm_shuffle_epi8(src_temp14, sm);
                src_temp13 = _mm_shuffle_epi8(src_temp13, sm);
                src_temp12 = _mm_shuffle_epi8(src_temp12, sm);
                src_temp11 = _mm_shuffle_epi8(src_temp11, sm);
                src_temp10 = _mm_shuffle_epi8(src_temp10, sm);
                src_temp9 = _mm_shuffle_epi8(src_temp9, sm);

                /* rows descend through the reference, so stores run temp16..temp1 */
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp16);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp15);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp14);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp13);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp12);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp11);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp10);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp9);

                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 8) * dst_strd)), src_temp8);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 9) * dst_strd)), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 10) * dst_strd)), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 11) * dst_strd)), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 12) * dst_strd)), src_temp4);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 13) * dst_strd)), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 14) * dst_strd)), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 15) * dst_strd)), src_temp1);

                /* right half (columns 16..31) of the same rows */
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 0) * dst_strd)), src_temp16);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 1) * dst_strd)), src_temp15);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 2) * dst_strd)), src_temp14);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 3) * dst_strd)), src_temp13);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 4) * dst_strd)), src_temp12);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 5) * dst_strd)), src_temp11);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 6) * dst_strd)), src_temp10);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 7) * dst_strd)), src_temp9);

                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 8) * dst_strd)), src_temp8);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 9) * dst_strd)), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 10) * dst_strd)), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 11) * dst_strd)), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 12) * dst_strd)), src_temp4);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 13) * dst_strd)), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 14) * dst_strd)), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((row + 15) * dst_strd)), src_temp1);

            }

        }

    }
    else

    {
        __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6;
        __m128i src_temp10, zero_8x16b, src_temp7;

        /* edge filtering for the first predicted row (nt < 32 only) */

        zero_8x16b = _mm_set1_epi16(0);

        /*Filtering done for the 1st row */

        src_temp2 = _mm_set1_epi16(pu1_ref[two_nt - 1]);
        src_temp10 = _mm_set1_epi16(pu1_ref[two_nt]);

        /* loading 8-bit 16 pixels */
        src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));

        src_temp4 = _mm_unpacklo_epi8(src_temp4, zero_8x16b);

        /*(pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt])*/
        src_temp3 = _mm_sub_epi16(src_temp4, src_temp10);

        /* ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1)
           arithmetic shift: the difference may be negative */
        src_temp3 = _mm_srai_epi16(src_temp3, 1);

        /* pu1_ref[two_nt - 1]+((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1)*/
        src_temp3 = _mm_add_epi16(src_temp2, src_temp3);

        if(nt == 4)
        {
            int temp1, temp2, temp3;
            /* pack clamps the filtered first row to 0..255 */
            src_temp3 = _mm_packus_epi16(src_temp3, zero_8x16b);
            temp1 = _mm_cvtsi128_si32(src_temp3);

            *(WORD32 *)(&pu1_dst[0]) = temp1;

            /* remaining rows: broadcast the left neighbour of each row */
            src_temp2 = _mm_set1_epi8(pu1_ref[two_nt - 2]);
            src_temp3 = _mm_set1_epi8(pu1_ref[two_nt - 3]);
            src_temp4 = _mm_set1_epi8(pu1_ref[two_nt - 4]);

            temp1 = _mm_cvtsi128_si32(src_temp2);
            temp2 = _mm_cvtsi128_si32(src_temp3);
            temp3 = _mm_cvtsi128_si32(src_temp4);

            /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp1;
            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp2;
            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp3;

        }
        else if(nt == 8)
        {
            src_temp10 = _mm_packus_epi16(src_temp3, zero_8x16b);


            src_temp1 = _mm_set1_epi8(pu1_ref[two_nt - 2]);
            src_temp2 = _mm_set1_epi8(pu1_ref[two_nt - 3]);
            src_temp3 = _mm_set1_epi8(pu1_ref[two_nt - 4]);
            src_temp4 = _mm_set1_epi8(pu1_ref[two_nt - 5]);
            src_temp5 = _mm_set1_epi8(pu1_ref[two_nt - 6]);
            src_temp6 = _mm_set1_epi8(pu1_ref[two_nt - 7]);
            src_temp7 = _mm_set1_epi8(pu1_ref[two_nt - 8]);

            /* filtered first row */
            _mm_storel_epi64((__m128i *)(pu1_dst), src_temp10);

            /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
            _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
            _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp2);
            _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp3);
            _mm_storel_epi64((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp4);
            _mm_storel_epi64((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp5);
            _mm_storel_epi64((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp6);
            _mm_storel_epi64((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp7);

        }
        else if(nt == 16)
        {
            /* filter the upper 8 columns of the first row as well */
            src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 8));
            src_temp4 = _mm_unpacklo_epi8(src_temp4, zero_8x16b);
            //src_temp4 = _mm_cvtepu8_epi16 (src_temp4);

            src_temp10 = _mm_sub_epi16(src_temp4, src_temp10);
            src_temp10 = _mm_srai_epi16(src_temp10, 1);
            src_temp10 = _mm_add_epi16(src_temp2, src_temp10);

            src_temp3 = _mm_packus_epi16(src_temp3,
src_temp10);
            _mm_storeu_si128((__m128i *)(pu1_dst), src_temp3);

            /*pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];*/
            src_temp1 = _mm_set1_epi8(pu1_ref[two_nt - 2]);
            src_temp2 = _mm_set1_epi8(pu1_ref[two_nt - 3]);
            src_temp3 = _mm_set1_epi8(pu1_ref[two_nt - 4]);
            src_temp4 = _mm_set1_epi8(pu1_ref[two_nt - 5]);
            src_temp5 = _mm_set1_epi8(pu1_ref[two_nt - 6]);
            src_temp6 = _mm_set1_epi8(pu1_ref[two_nt - 7]);
            src_temp7 = _mm_set1_epi8(pu1_ref[two_nt - 8]);
            src_temp10 = _mm_set1_epi8(pu1_ref[two_nt - 9]);

            _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp3);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp4);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp5);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp6);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp7);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), src_temp10);

            /* registers reused for rows 9..15 */
            src_temp1 = _mm_set1_epi8(pu1_ref[two_nt - 10]);
            src_temp2 = _mm_set1_epi8(pu1_ref[two_nt - 11]);
            src_temp3 = _mm_set1_epi8(pu1_ref[two_nt - 12]);
            src_temp4 = _mm_set1_epi8(pu1_ref[two_nt - 13]);
            src_temp5 = _mm_set1_epi8(pu1_ref[two_nt - 14]);
            src_temp6 = _mm_set1_epi8(pu1_ref[two_nt - 15]);
            src_temp7 = _mm_set1_epi8(pu1_ref[two_nt - 16]);

            _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), src_temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), src_temp3);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), src_temp4);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), src_temp5);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((14) *
dst_strd)), src_temp6); 1664 _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), src_temp7); 1665 1666 } 1667 } 1668 } 1669 1670 1671 /** 1672 ******************************************************************************* 1673 * 1674 * @brief 1675 * Intra prediction interpolation filter for vertical luma variable. 1676 * 1677 * @par Description: 1678 * Horizontal intraprediction with reference neighboring samples location 1679 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' Refer 1680 * to section 8.4.4.2.6 in the standard (Special case) 1681 * 1682 * @param[in] pu1_src 1683 * UWORD8 pointer to the source 1684 * 1685 * @param[out] pu1_dst 1686 * UWORD8 pointer to the destination 1687 * 1688 * @param[in] src_strd 1689 * integer source stride 1690 * 1691 * @param[in] dst_strd 1692 * integer destination stride 1693 * 1694 * @param[in] nt 1695 * integer Transform Block size 1696 * 1697 * @param[in] mode 1698 * integer intraprediction mode 1699 * 1700 * @returns 1701 * 1702 * @remarks 1703 * None 1704 * 1705 ******************************************************************************* 1706 */ 1707 1708 1709 void ihevc_intra_pred_luma_ver_ssse3(UWORD8 *pu1_ref, 1710 WORD32 src_strd, 1711 UWORD8 *pu1_dst, 1712 WORD32 dst_strd, 1713 WORD32 nt, 1714 WORD32 mode) 1715 { 1716 WORD32 row; 1717 WORD16 s2_predpixel; 1718 WORD32 two_nt = 2 * nt; 1719 __m128i src_temp0, src_temp2; 1720 UNUSED(src_strd); 1721 UNUSED(mode); 1722 1723 1724 if(nt == 32) 1725 { 1726 __m128i temp1, temp2; 1727 WORD32 itr_count; 1728 1729 temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); 1730 temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1 + 16)); 1731 1732 for(itr_count = 0; itr_count < 2; itr_count++) 1733 { 1734 /* pu1_dst[(row * dst_strd) + col] = dc_val;*/ 1735 _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), temp1); 1736 _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), temp1); 1737 _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * 
dst_strd)), temp1);
            /* nt == 32: left half (cols 0..15) of rows 3..7 is the top row temp1 */
            _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), temp1);

            /* right half (cols 16..31) of rows 0..7 is temp2 */
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((0) * dst_strd)), temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((1) * dst_strd)), temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((2) * dst_strd)), temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((3) * dst_strd)), temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((4) * dst_strd)), temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((5) * dst_strd)), temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((6) * dst_strd)), temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((7) * dst_strd)), temp2);

            /* rows 8..15, left then right halves */
            _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), temp1);

            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((8) * dst_strd)), temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((9) * dst_strd)), temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((10) * dst_strd)), temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((11) * dst_strd)), temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((12) * dst_strd)), tem2 == 0 ? temp2 : temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((13) * dst_strd)), temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((14) * dst_strd)), temp2);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((15) * dst_strd)), temp2);

            pu1_dst += 16 * dst_strd;
        }
    }
    else
    {
        /* Replication to next columns*/

        if(nt == 4)
        {
            int temp1;

            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));

            temp1 = _mm_cvtsi128_si32(src_temp2);

            /* copy the 4 top neighbours into every row */
            *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp1;
            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp1;
            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp1;
            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp1;

        }
        else if(nt == 8)
        {

            src_temp0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));

            /* copy the 8 top neighbours into every row */
            _mm_storel_epi64((__m128i *)(pu1_dst + ((0) * dst_strd)), src_temp0);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp0);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp0);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp0);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp0);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp0);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp0);
            _mm_storel_epi64((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp0);


        }
        else if(nt == 16)
        {
            /* copy the 16 top neighbours into every row, 8 rows per pass */
            for(row = 0; row < nt; row += 8)
            {

                src_temp0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1));

                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp0);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp0);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp0);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp0);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)),
src_temp0);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp0);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp0);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp0);

            }

        }

        /* Filtering done for the 1st column: each pixel in column 0 is
           adjusted by half the difference between the co-located left
           reference sample and the top-left reference sample, then clipped
           to the 8-bit range */
        for(row = nt - 1; row >= 0; row--)
        {
            s2_predpixel = pu1_ref[two_nt + 1]
                            + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1);
            pu1_dst[row * dst_strd] = CLIP_U8(s2_predpixel);
        }


    }
}

/**
 *******************************************************************************
 *
 * @brief
 *    Intra prediction interpolation filter for luma mode2.
 *
 * @par Description:
 *    Intraprediction for mode 2 (sw angle) with reference neighboring samples
 *    location pointed by 'pu1_ref' to the TU block location pointed by
 *    'pu1_dst'  Refer to section 8.4.4.2.6 in the standard
 *
 * @param[in] pu1_ref
 *  UWORD8 pointer to the reference (neighboring) samples
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride (unused here)
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] nt
 *  integer Transform Block size
 *
 * @param[in] mode
 *  integer intraprediction mode (unused here; always mode 2)
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

void ihevc_intra_pred_luma_mode2_ssse3(UWORD8 *pu1_ref,
                                       WORD32 src_strd,
                                       UWORD8 *pu1_dst,
                                       WORD32 dst_strd,
                                       WORD32 nt,
                                       WORD32 mode)
{
    WORD32 row, col;
    WORD32 two_nt = 2 * nt;

    __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8;
    __m128i sm1, sm2, sm3;
    UNUSED(src_strd);
    UNUSED(mode);


    /* Shuffle masks used to reverse the byte order of the loaded reference
       samples (the mode-2 diagonal walks the reference array backwards) */
    sm1 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY1[0]);
    sm2 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY2[0]);
    sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY3[0]);

    /* For the angle 45, replication is done from the corresponding angle */
    /* intra_pred_ang = tan(angle) in q5 format */

    if(nt == 4)
    {
        int temp1, temp2, temp3, temp4;

        /* Each row r reads pu1_ref[two_nt - row - (col+1) - 1]; one 16-byte
           load plus byte shifts produces the four shifted views */
        src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 8));
        src_temp2 = _mm_srli_si128(src_temp1, 1);
        src_temp3 = _mm_srli_si128(src_temp1, 2);
        src_temp4 = _mm_srli_si128(src_temp1, 3);

        /* Reverse byte order so samples come out in raster order */
        src_temp4 = _mm_shuffle_epi8(src_temp4, sm1);
        src_temp3 = _mm_shuffle_epi8(src_temp3, sm1);
        src_temp2 = _mm_shuffle_epi8(src_temp2, sm1);
        src_temp1 = _mm_shuffle_epi8(src_temp1, sm1);

        temp1 = _mm_cvtsi128_si32(src_temp4);
        temp2 = _mm_cvtsi128_si32(src_temp3);
        temp3 = _mm_cvtsi128_si32(src_temp2);
        temp4 = _mm_cvtsi128_si32(src_temp1);

        /* pu1_dst[(row * dst_strd) + col] = pu1_ref[two_nt - 1 - row];
           four pixels per row stored as one 32-bit write */
        *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp1;
        *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp2;
        *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp3;
        *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp4;


    }
    else if(nt == 8)
    {
        /* pu1_ref[two_nt - row - (col+1) - 1]: one load, 7 byte-shifts give
           the 8 shifted views needed for the 8 rows */
        src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 16));
        src_temp2 = _mm_srli_si128(src_temp1, 1);
        src_temp3 = _mm_srli_si128(src_temp1, 2);
        src_temp4 = _mm_srli_si128(src_temp1, 3);
        src_temp5 = _mm_srli_si128(src_temp1, 4);
        src_temp6 = _mm_srli_si128(src_temp1, 5);
        src_temp7 = _mm_srli_si128(src_temp1, 6);
        src_temp8 = _mm_srli_si128(src_temp1, 7);

        /* Byte-reverse each view */
        src_temp1 = _mm_shuffle_epi8(src_temp1, sm2);
        src_temp2 = _mm_shuffle_epi8(src_temp2, sm2);
        src_temp3 = _mm_shuffle_epi8(src_temp3, sm2);
        src_temp4 = _mm_shuffle_epi8(src_temp4, sm2);

        src_temp5 = _mm_shuffle_epi8(src_temp5, sm2);
        src_temp6 = _mm_shuffle_epi8(src_temp6, sm2);
        src_temp7 = _mm_shuffle_epi8(src_temp7, sm2);
        src_temp8 = _mm_shuffle_epi8(src_temp8, sm2);

        /* Rows are written in opposite order of the shifts (largest shift
           corresponds to row 0) */
        _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp8);
        _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp7);
        _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp6);
        _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp5);
        _mm_storel_epi64((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp4);
        _mm_storel_epi64((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp3);
        _mm_storel_epi64((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp2);
        _mm_storel_epi64((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);

    }
    else
    {
        /* nt == 16 / 32: process 8 rows x 16 columns per iteration, with a
           fresh (unaligned) load per row since each row starts one sample
           earlier in the reference array */
        for(row = 0; row < nt; row += 8)
        {
            for(col = 0; col < nt; col += 16)
            {   /*pu1_ref[two_nt - row - (col+1) - 1]*/

                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 0) - (col + 16) - 1));
                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 1) - (col + 16) - 1));
                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 2) - (col + 16) - 1));
                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 3) - (col + 16) - 1));
                src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 4) - (col + 16) - 1));
                src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 5) - (col + 16) - 1));
                src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 6) - (col + 16) - 1));
                src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 7) - (col + 16) - 1));

                /* Byte-reverse each 16-sample row */
                src_temp1 = _mm_shuffle_epi8(src_temp1, sm3);
                src_temp2 = _mm_shuffle_epi8(src_temp2, sm3);
                src_temp3 = _mm_shuffle_epi8(src_temp3, sm3);
                src_temp4 = _mm_shuffle_epi8(src_temp4, sm3);
                src_temp5 = _mm_shuffle_epi8(src_temp5, sm3);
                src_temp6 = _mm_shuffle_epi8(src_temp6, sm3);
                src_temp7 = _mm_shuffle_epi8(src_temp7, sm3);
                src_temp8 = _mm_shuffle_epi8(src_temp8, sm3);

                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 0) * dst_strd)), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 1) * dst_strd)), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 2) * dst_strd)), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 3) * dst_strd)), src_temp4);
                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 4) * dst_strd)), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 5) * dst_strd)), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 6) * dst_strd)), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 7) * dst_strd)), src_temp8);
            }
        }
    }

}

/**
 *******************************************************************************
 *
 * @brief
 *    Intra prediction interpolation filter for luma mode 18 & mode 34.
 *
 * @par Description:
 *    Intraprediction for mode 34 (ne angle) and mode 18 (nw angle) with
 *    reference neighboring samples location pointed by 'pu1_ref' to the TU
 *    block location pointed by 'pu1_dst'
 *
 * @param[in] pu1_ref
 *  UWORD8 pointer to the reference (neighboring) samples
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride (unused here)
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] nt
 *  integer Transform Block size
 *
 * @param[in] mode
 *  integer intraprediction mode (18 or 34)
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

void ihevc_intra_pred_luma_mode_18_34_ssse3(UWORD8 *pu1_ref,
                                            WORD32 src_strd,
                                            UWORD8 *pu1_dst,
                                            WORD32 dst_strd,
                                            WORD32 nt,
                                            WORD32 mode)
{
    WORD32 row;
    WORD32 two_nt = 2 * nt;
    __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8;
    UNUSED(src_strd);
    if(mode == 34)
    {
        /* Mode 34: row r is a straight copy of the reference starting at
           pu1_ref[two_nt + r + 2] (diagonal advances one sample per row) */
        if(nt == 4)
        {

            int temp1, temp2, temp3, temp4;

            /*pu1_ref[two_nt + col + idx + 1]*/
            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 2));
            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 3));
            src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 4));
            src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 5));

            temp1 = _mm_cvtsi128_si32(src_temp1);
            temp2 = _mm_cvtsi128_si32(src_temp2);
            temp3 = _mm_cvtsi128_si32(src_temp3);
            temp4 = _mm_cvtsi128_si32(src_temp4);

            /* pu1_dst[(row * dst_strd) + col]: 4 pixels per row stored as
               one 32-bit write */
            *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp1;
            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp2;
            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp3;
            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp4;

        }
        else if(nt == 8)
        {
            /*pu1_ref[two_nt + col + idx + 1]*/
            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 2));
            src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 3));
            src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 4));
            src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 5));
            src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 6));
            src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 7));
            src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 8));
            src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 9));

            _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
            _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2);
            _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3);
            _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4);
            _mm_storel_epi64((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp5);
            _mm_storel_epi64((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp6);
            _mm_storel_epi64((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp7);
            _mm_storel_epi64((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp8);

        }
        else if(nt == 16)
        {
            for(row = 0; row < nt; row += 8)
            {
                /*pu1_ref[two_nt + col + idx + 1]*/
                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 0) + 2));
                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 1) + 2));
                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 2) + 2));
                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 3) + 2));
                src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 4) + 2));
                src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 5) + 2));
                src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 6) + 2));
                src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (row + 7) + 2));

                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp4);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp8);


            }
        }
        else
        {
            /* nt == 32: two 16-byte lanes per row; the ref/dst base pointers
               are advanced at the end of every 8-row batch */
            __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
            for(row = 0; row < nt; row += 8)
            {
                /*pu1_ref[two_nt + col + idx + 1]*/
                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (0 + 0) + 2));
                src_temp9 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (0 + 16) + 2));
                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (1 + 0) + 2));
                src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (1 + 16) + 2));
                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (2 + 0) + 2));
                src_temp11 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (2 + 16) + 2));
                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (3 + 0) + 2));
                src_temp12 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (3 + 16) + 2));

                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (0 * dst_strd)), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp9);
                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (1 * dst_strd)), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp10);
                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (2 * dst_strd)), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp11);
                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (3 * dst_strd)), src_temp4);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp12);

                src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (4 + 0) + 2));
                src_temp13 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (4 + 16) + 2));
                src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (5 + 0) + 2));
                src_temp14 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (5 + 16) + 2));
                src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (6 + 0) + 2));
                src_temp15 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (6 + 16) + 2));
                src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (7 + 0) + 2));
                src_temp16 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + (7 + 16) + 2));

                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (4 * dst_strd)), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp13);
                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (5 * dst_strd)), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp14);
                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (6 * dst_strd)), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp15);
                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (7 * dst_strd)), src_temp8);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp16);

                /* Move to the next 8-row batch: diagonal advances 8 samples */
                pu1_ref += 8;
                pu1_dst += 8 * dst_strd;
            }
        }
    }
    else
    {
        /* Mode 18: row r is a copy of the reference starting at
           pu1_ref[two_nt - r] (diagonal walks one sample backwards per row) */
        if(nt == 4)
        {
            int temp1, temp2, temp3, temp4;

            /* Single load covers all four rows; byte shifts select each
               row's starting offset */
            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 3));
            src_temp2 = _mm_srli_si128(src_temp1, 1);
            src_temp3 = _mm_srli_si128(src_temp1, 2);
            src_temp4 = _mm_srli_si128(src_temp1, 3);

            temp1 = _mm_cvtsi128_si32(src_temp4);
            temp2 = _mm_cvtsi128_si32(src_temp3);
            temp3 = _mm_cvtsi128_si32(src_temp2);
            temp4 = _mm_cvtsi128_si32(src_temp1);

            /* pu1_dst[(row * dst_strd) + col]: 4 pixels per row stored as
               one 32-bit write */
            *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp1;
            *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp2;
            *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp3;
            *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp4;

        }
        else if(nt == 8)
        {
            /* Single load plus byte shifts; rows stored in reverse order of
               the shifts (largest shift is row 0) */
            src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 7));
            src_temp2 = _mm_srli_si128(src_temp1, 1);
            src_temp3 = _mm_srli_si128(src_temp1, 2);
            src_temp4 = _mm_srli_si128(src_temp1, 3);
            src_temp5 = _mm_srli_si128(src_temp1, 4);
            src_temp6 = _mm_srli_si128(src_temp1, 5);
            src_temp7 = _mm_srli_si128(src_temp1, 6);
            src_temp8 = _mm_srli_si128(src_temp1, 7);

            _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp8);
            _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp7);
            _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp6);
            _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp5);
            _mm_storel_epi64((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp4);
            _mm_storel_epi64((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp3);
            _mm_storel_epi64((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp2);
            _mm_storel_epi64((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);


        }
        else if(nt == 16)
        {
            for(row = 0; row < nt; row += 8)
            {
                /* One unaligned load per row, each starting one sample
                   earlier in the reference array */
                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 0)));
                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 1)));
                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 2)));
                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 3)));
                src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 4)));
                src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 5)));
                src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 6)));
                src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - (row + 7)));

                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), src_temp4);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 4) * dst_strd)), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 5) * dst_strd)), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 6) * dst_strd)), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 7) * dst_strd)), src_temp8);

            }

        }
        else
        {
            /* nt == 32: two 16-byte lanes per row; base pointers advance
               (ref backwards, dst forwards) after each 8-row batch */
            __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16;
            for(row = 0; row < nt; row += 8)
            {
                /*pu1_ref[two_nt + col + idx + 1]*/
                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 0 + 0));
                src_temp9 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 0 + 16));
                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 1 + 0));
                src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 1 + 16));
                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 2 + 0));
                src_temp11 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 2 + 16));
                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 3 + 0));
                src_temp12 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 3 + 16));

                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (0 * dst_strd)), src_temp1);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp9);
                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (1 * dst_strd)), src_temp2);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp10);
                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (2 * dst_strd)), src_temp3);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp11);
                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (3 * dst_strd)), src_temp4);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp12);

                src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 4 + 0));
                src_temp13 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 4 + 16));
                src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 5 + 0));
                src_temp14 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 5 + 16));
                src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 6 + 0));
                src_temp15 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 6 + 16));
                src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 7 + 0));
                src_temp16 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 7 + 16));

                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (4 * dst_strd)), src_temp5);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp13);
                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (5 * dst_strd)), src_temp6);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp14);
                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (6 * dst_strd)), src_temp7);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp15);
                _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (7 * dst_strd)), src_temp8);
                _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp16);

                pu1_ref -= 8;
                pu1_dst += 8 * dst_strd;
            }
        }
    }
}


/**
 *******************************************************************************
 *
 * @brief
 *    Intra prediction interpolation filter for luma mode 3 to mode 9
 *
 * @par Description:
 *    Intraprediction for mode 3 to 9  (positive angle, horizontal mode ) with
 *    reference neighboring samples location pointed by 'pu1_ref' to the TU
 *    block location pointed by 'pu1_dst'
* 2301 * @param[in] pu1_src 2302 * UWORD8 pointer to the source 2303 * 2304 * @param[out] pu1_dst 2305 * UWORD8 pointer to the destination 2306 * 2307 * @param[in] src_strd 2308 * integer source stride 2309 * 2310 * @param[in] dst_strd 2311 * integer destination stride 2312 * 2313 * @param[in] nt 2314 * integer Transform Block size 2315 * 2316 * @param[in] mode 2317 * integer intraprediction mode 2318 * 2319 * @returns 2320 * 2321 * @remarks 2322 * None 2323 * 2324 ******************************************************************************* 2325 */ 2326 2327 void ihevc_intra_pred_luma_mode_3_to_9_ssse3(UWORD8 *pu1_ref, 2328 WORD32 src_strd, 2329 UWORD8 *pu1_dst, 2330 WORD32 dst_strd, 2331 WORD32 nt, 2332 WORD32 mode) 2333 { 2334 WORD32 row, col; 2335 WORD32 two_nt = 2 * nt; 2336 WORD32 intra_pred_ang; 2337 2338 2339 __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b, zero_8x16b; 2340 __m128i fract_4x32b, intra_pred_ang_4x32b; 2341 __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b, sm3; 2342 UNUSED(src_strd); 2343 2344 /* Intra Pred Angle according to the mode */ 2345 intra_pred_ang = gai4_ihevc_ang_table[mode]; 2346 2347 /* For the angles other then 45 degree, interpolation btw 2 neighboring */ 2348 /* samples dependent on distance to obtain destination sample */ 2349 2350 /* For the angles other then 45 degree, interpolation btw 2 neighboring */ 2351 /* samples dependent on distance to obtain destination sample */ 2352 2353 const_temp_4x32b = _mm_set1_epi16(16); 2354 const_temp2_4x32b = _mm_set1_epi32(31); 2355 const_temp3_4x32b = _mm_set1_epi32(32); 2356 const_temp4_4x32b = _mm_set1_epi32(4); 2357 2358 two_nt_4x32b = _mm_set1_epi32(two_nt - nt); 2359 2360 2361 sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]); 2362 2363 /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ 2364 intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang); 2365 2366 row_4x32b = _mm_set_epi32(4, 3, 2, 1); 2367 2368 if(nt == 4) 
2369 { 2370 2371 WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4; 2372 int temp11, temp21, temp31, temp41; 2373 // WORD8 ai1_fract_temp_val[16], ai1_row_temp_val[16]; 2374 2375 __m128i fract1_8x16b, fract2_8x16b, sign_8x16b; 2376 __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b; 2377 2378 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; 2379 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b; //, src_temp8_8x16b; 2380 __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2; 2381 2382 row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1); 2383 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 2384 2385 /* pos = ((row + 1) * intra_pred_ang); */ 2386 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 2387 zero_8x16b = _mm_setzero_si128(); 2388 sign_8x16b = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b); 2389 res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b); 2390 2391 /* idx = pos >> 5; */ 2392 fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 2393 2394 /* fract = pos & (31); */ 2395 ref_main_idx_4x32b = _mm_sub_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5)); 2396 2397 /*(32 - fract) */ 2398 row_4x32b = _mm_sub_epi32(const_temp3_4x32b, fract_4x32b); 2399 2400 fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8); 2401 fract2_8x16b = _mm_slli_epi16(row_4x32b, 8); 2402 2403 fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b); 2404 row_4x32b = _mm_or_si128(row_4x32b, fract2_8x16b); /*(32 - fract) */ 2405 2406 fract2_8x16b = _mm_unpackhi_epi8(row_4x32b, fract_4x32b); 2407 fract1_8x16b = _mm_unpacklo_epi8(row_4x32b, fract_4x32b); 2408 2409 temp1_8x16b = _mm_shuffle_epi32(fract1_8x16b, 0x00); 2410 temp2_8x16b = _mm_shuffle_epi32(fract1_8x16b, 0xaa); 2411 temp3_8x16b = _mm_shuffle_epi32(fract2_8x16b, 0x00); 2412 temp4_8x16b = _mm_shuffle_epi32(fract2_8x16b, 0xaa); 2413 2414 ref_main_temp0 = _mm_srli_si128(ref_main_idx_4x32b, 4); /* next 32 bit values */ 2415 
ref_main_temp1 = _mm_srli_si128(ref_main_idx_4x32b, 8); /* next 32 bit values */ 2416 ref_main_temp2 = _mm_srli_si128(ref_main_idx_4x32b, 12); /* next 32 bit values */ 2417 ref_main_idx1 = _mm_cvtsi128_si32(ref_main_idx_4x32b); /* col=0*/ 2418 ref_main_idx2 = _mm_cvtsi128_si32(ref_main_temp0); /* col=1*/ 2419 ref_main_idx3 = _mm_cvtsi128_si32(ref_main_temp1); /* col=2*/ 2420 ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* col=3*/ 2421 2422 /* loding 8-bit 16 pixels */ 2423 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1 - 1)); /* col=0*/ 2424 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2 - 1)); /* col=1*/ 2425 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3 - 1)); /* col=2*/ 2426 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4 - 1)); /* col=3*/ 2427 2428 src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/ 2429 src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/ 2430 src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/ 2431 src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/ 2432 2433 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ 2434 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b); 2435 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b); 2436 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b); 2437 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b); 2438 2439 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 2440 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b); 2441 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b); 2442 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b); 2443 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b); 2444 2445 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 2446 
src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/ 2447 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/ 2448 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/ 2449 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/ 2450 2451 /* converting 16 bit to 8 bit */ 2452 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/ 2453 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/ 2454 2455 2456 src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b); 2457 src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b); 2458 2459 src_temp3_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b); 2460 src_temp2_8x16b = _mm_srli_si128(src_temp3_8x16b, 4); 2461 src_temp1_8x16b = _mm_srli_si128(src_temp3_8x16b, 8); 2462 src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 12); 2463 2464 temp11 = _mm_cvtsi128_si32(src_temp7_8x16b); 2465 temp21 = _mm_cvtsi128_si32(src_temp1_8x16b); 2466 temp31 = _mm_cvtsi128_si32(src_temp2_8x16b); 2467 temp41 = _mm_cvtsi128_si32(src_temp3_8x16b); 2468 2469 /* loding 4-bit 8 pixels values */ 2470 *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11; 2471 *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21; 2472 *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31; 2473 *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41; 2474 2475 } 2476 2477 else if(nt == 16 || nt == 32) 2478 { 2479 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 2480 row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); 2481 const_temp2_4x32b = _mm_set1_epi16(31); 2482 const_temp4_4x32b = _mm_set1_epi16(8); 2483 const_temp3_4x32b = _mm_set1_epi16(32); 2484 two_nt_4x32b = _mm_set1_epi16(two_nt); 2485 2486 for(col = 0; col < nt; col += 8) 2487 { 2488 WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4; 2489 WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8; 2490 //WORD8 ai1_fract_temp0_val[16], 
ai1_fract_temp1_val[16]; 2491 2492 __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b; 2493 2494 __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b; 2495 __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b; 2496 2497 /* pos = ((row + 1) * intra_pred_ang); */ 2498 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 2499 2500 /* idx = pos >> 5; */ 2501 fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 2502 2503 /*(32 - fract) */ 2504 fract2_8x16b = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b); 2505 2506 fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8); 2507 fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */ 2508 2509 fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b); 2510 fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */ 2511 2512 2513 fract8_8x16b = _mm_unpackhi_epi8(fract2_8x16b, fract_4x32b); 2514 fract_4x32b = _mm_unpacklo_epi8(fract2_8x16b, fract_4x32b); 2515 2516 temp1_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x00); 2517 temp2_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x55); 2518 temp3_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xaa); 2519 temp4_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xff); 2520 2521 temp11_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x00); 2522 temp12_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x55); 2523 temp13_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xaa); 2524 temp14_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xff); 2525 2526 /* fract = pos & (31); */ 2527 ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); 2528 2529 row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b); 2530 2531 pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/ 2532 pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/ 2533 pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/ 2534 pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/ 2535 2536 pi2_ref_main_idx5 = 
_mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=5*/ 2537 pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=6*/ 2538 pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=7*/ 2539 pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=8*/ 2540 2541 for(row = 0; row < nt; row += 8) 2542 { 2543 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; 2544 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b; 2545 2546 2547 __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b; 2548 __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b; 2549 2550 /* loding 8-bit 16 pixels */ 2551 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 1 - (8 + row))); /* col=0*/ 2552 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 1 - (8 + row))); /* col=1*/ 2553 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 1 - (8 + row))); /* col=2*/ 2554 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 1 - (8 + row))); /* col=3*/ 2555 2556 /* loding 8-bit 16 pixels */ 2557 src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - 1 - (8 + row))); /* col=5*/ 2558 src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - 1 - (8 + row))); /* col=6*/ 2559 src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - 1 - (8 + row))); /* col=7*/ 2560 src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - 1 - (8 + row))); /* col=8*/ 2561 2562 src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/ 2563 src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/ 2564 src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/ 2565 src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/ 2566 2567 src_temp11_8x16b = _mm_shuffle_epi8(src_temp11_8x16b, 
sm3); /* col=0*/ 2568 src_temp12_8x16b = _mm_shuffle_epi8(src_temp12_8x16b, sm3); /* col=1*/ 2569 src_temp13_8x16b = _mm_shuffle_epi8(src_temp13_8x16b, sm3); /* col=2*/ 2570 src_temp14_8x16b = _mm_shuffle_epi8(src_temp14_8x16b, sm3); /* col=3*/ 2571 2572 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ 2573 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b); 2574 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b); 2575 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b); 2576 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b); 2577 2578 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ 2579 src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b); 2580 src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b); 2581 src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b); 2582 src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b); 2583 2584 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 2585 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b); 2586 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b); 2587 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b); 2588 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b); 2589 2590 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 2591 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/ 2592 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/ 2593 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/ 2594 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/ 2595 2596 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 2597 src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b); 2598 src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b); 2599 
src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b); 2600 src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b); 2601 2602 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 2603 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=5*/ 2604 src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=6*/ 2605 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=7*/ 2606 src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=8*/ 2607 2608 /* converting 16 bit to 8 bit */ 2609 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/ 2610 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/ 2611 2612 /* converting 16 bit to 8 bit */ 2613 src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/ 2614 src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/ 2615 2616 src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b); 2617 src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b); 2618 2619 src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b); 2620 src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b); 2621 2622 src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b); 2623 src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b); 2624 2625 src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b); 2626 src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b); 2627 2628 src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b); 2629 src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b); 2630 2631 src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8); 2632 src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8); 2633 2634 src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b); 2635 src_temp4_8x16b = 
_mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b); 2636 2637 src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8); 2638 src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8); 2639 2640 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp1_8x16b); /* row=7*/ 2641 2642 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp5_8x16b); /* row=6*/ 2643 2644 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp2_8x16b); /* row=5*/ 2645 2646 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp6_8x16b); /* row=4*/ 2647 2648 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp3_8x16b); /* row=3*/ 2649 2650 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp7_8x16b); /* row=2*/ 2651 2652 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp4_8x16b); /* row=1*/ 2653 2654 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 0))), src_temp8_8x16b); /* row=0*/ 2655 2656 } 2657 } 2658 } 2659 else 2660 { 2661 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 2662 row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); 2663 const_temp2_4x32b = _mm_set1_epi16(31); 2664 const_temp4_4x32b = _mm_set1_epi16(8); 2665 const_temp3_4x32b = _mm_set1_epi16(32); 2666 two_nt_4x32b = _mm_set1_epi16(two_nt - nt); 2667 { 2668 WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4; 2669 WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8; 2670 2671 __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b; 2672 2673 __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b; 2674 __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b; 2675 2676 /* pos = ((row + 1) * intra_pred_ang); */ 2677 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 2678 2679 /* idx = pos >> 5; */ 2680 fract_4x32b = _mm_and_si128(res_temp5_4x32b, 
const_temp2_4x32b); 2681 2682 /* fract = pos & (31); */ 2683 ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); 2684 2685 /*(32 - fract) */ 2686 fract2_8x16b = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b); 2687 2688 fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8); 2689 fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */ 2690 2691 fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b); 2692 fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */ 2693 2694 2695 fract8_8x16b = _mm_unpackhi_epi8(fract2_8x16b, fract_4x32b); 2696 fract_4x32b = _mm_unpacklo_epi8(fract2_8x16b, fract_4x32b); 2697 2698 temp1_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x00); 2699 temp2_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x55); 2700 temp3_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xaa); 2701 temp4_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xff); 2702 2703 temp11_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x00); 2704 temp12_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x55); 2705 temp13_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xaa); 2706 temp14_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xff); 2707 2708 pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/ 2709 pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/ 2710 pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/ 2711 pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/ 2712 2713 pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=5*/ 2714 pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=6*/ 2715 pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=7*/ 2716 pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=8*/ 2717 2718 { 2719 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; 2720 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b; 2721 2722 __m128i src_temp11_8x16b, src_temp12_8x16b, 
src_temp13_8x16b, src_temp14_8x16b; 2723 __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b; 2724 2725 /* loding 8-bit 16 pixels */ 2726 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 1)); /* col=0*/ 2727 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 1)); /* col=1*/ 2728 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 1)); /* col=2*/ 2729 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 1)); /* col=3*/ 2730 2731 /* loding 8-bit 16 pixels */ 2732 src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - 1)); /* col=5*/ 2733 src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - 1)); /* col=6*/ 2734 src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - 1)); /* col=7*/ 2735 src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - 1)); /* col=8*/ 2736 2737 src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm3); /* col=0*/ 2738 src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm3); /* col=1*/ 2739 src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm3); /* col=2*/ 2740 src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm3); /* col=3*/ 2741 2742 src_temp11_8x16b = _mm_shuffle_epi8(src_temp11_8x16b, sm3); /* col=0*/ 2743 src_temp12_8x16b = _mm_shuffle_epi8(src_temp12_8x16b, sm3); /* col=1*/ 2744 src_temp13_8x16b = _mm_shuffle_epi8(src_temp13_8x16b, sm3); /* col=2*/ 2745 src_temp14_8x16b = _mm_shuffle_epi8(src_temp14_8x16b, sm3); /* col=3*/ 2746 2747 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ 2748 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b); 2749 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b); 2750 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b); 2751 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b); 2752 2753 /* fract*(pu1_ref[ref_main_idx + 1]- 
pu1_ref[ref_main_idx]) */ 2754 src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b); 2755 src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b); 2756 src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b); 2757 src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b); 2758 2759 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 2760 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b); 2761 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b); 2762 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b); 2763 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b); 2764 2765 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 2766 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/ 2767 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/ 2768 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/ 2769 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/ 2770 2771 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 2772 src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b); 2773 src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b); 2774 src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b); 2775 src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b); 2776 2777 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 2778 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=5*/ 2779 src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=6*/ 2780 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=7*/ 2781 src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=8*/ 2782 2783 /* converting 16 bit to 8 bit */ 2784 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/ 2785 
src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/ 2786 2787 /* converting 16 bit to 8 bit */ 2788 src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/ 2789 src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/ 2790 2791 src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b); 2792 src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b); 2793 2794 src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b); 2795 src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b); 2796 2797 src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b); 2798 src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b); 2799 2800 src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b); 2801 src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b); 2802 2803 src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b); 2804 src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b); 2805 2806 src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8); 2807 src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8); 2808 2809 src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b); 2810 src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b); 2811 2812 src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8); 2813 src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8); 2814 2815 _mm_storel_epi64((__m128i *)(pu1_dst), src_temp8_8x16b); /* row=0*/ 2816 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 1)), src_temp4_8x16b); /* row=1*/ 2817 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 2)), src_temp7_8x16b); /* row=2*/ 2818 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 3)), src_temp3_8x16b); /* row=3*/ 2819 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 4)), src_temp6_8x16b); /* row=4*/ 2820 _mm_storel_epi64((__m128i *)(pu1_dst + 
(dst_strd * 5)), src_temp2_8x16b); /* row=5*/ 2821 2822 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 6)), src_temp5_8x16b); /* row=6*/ 2823 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 7)), src_temp1_8x16b); /* row=7*/ 2824 2825 } 2826 } 2827 } 2828 2829 } 2830 2831 2832 2833 /** 2834 ******************************************************************************* 2835 * 2836 * @brief 2837 * Intra prediction interpolation filter for luma mode 11 to mode 17 2838 * 2839 * @par Description: 2840 * Intraprediction for mode 11 to 17 (negative angle, horizontal mode ) 2841 * with reference neighboring samples location pointed by 'pu1_ref' to the 2842 * TU block location pointed by 'pu1_dst' 2843 * 2844 * @param[in] pu1_src 2845 * UWORD8 pointer to the source 2846 * 2847 * @param[out] pu1_dst 2848 * UWORD8 pointer to the destination 2849 * 2850 * @param[in] src_strd 2851 * integer source stride 2852 * 2853 * @param[in] dst_strd 2854 * integer destination stride 2855 * 2856 * @param[in] nt 2857 * integer Transform Block size 2858 * 2859 * @param[in] mode 2860 * integer intraprediction mode 2861 * 2862 * @returns 2863 * 2864 * @remarks 2865 * None 2866 * 2867 ******************************************************************************* 2868 */ 2869 2870 2871 void ihevc_intra_pred_luma_mode_11_to_17_ssse3(UWORD8 *pu1_ref, 2872 WORD32 src_strd, 2873 UWORD8 *pu1_dst, 2874 WORD32 dst_strd, 2875 WORD32 nt, 2876 WORD32 mode) 2877 { 2878 2879 /* This function and ihevc_intra_pred_luma_mode_19_to_25 are same except*/ 2880 /* for ref main & side samples assignment,can be combined for */ 2881 /* optimzation*/ 2882 2883 WORD32 row, col, k; 2884 WORD32 two_nt; 2885 WORD32 intra_pred_ang, inv_ang, inv_ang_sum; 2886 WORD32 ref_idx; 2887 2888 __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b; 2889 __m128i fract_4x32b, intra_pred_ang_4x32b; 2890 __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b, sm3; 2891 2892 2893 UWORD8 
ref_tmp[2 * MAX_CU_SIZE + 2]; 2894 UWORD8 *ref_main; 2895 UWORD8 *ref_temp; 2896 UNUSED(src_strd); 2897 inv_ang_sum = 128; 2898 two_nt = 2 * nt; 2899 ref_temp = ref_tmp + 1; 2900 ref_main = ref_temp + nt - 1; 2901 intra_pred_ang = gai4_ihevc_ang_table[mode]; 2902 2903 /* For the angles other then 45 degree, interpolation btw 2 neighboring */ 2904 /* samples dependent on distance to obtain destination sample */ 2905 const_temp_4x32b = _mm_set1_epi16(16); 2906 const_temp2_4x32b = _mm_set1_epi32(31); 2907 const_temp3_4x32b = _mm_set1_epi32(32); 2908 const_temp4_4x32b = _mm_set1_epi32(4); 2909 2910 two_nt_4x32b = _mm_set1_epi32(1); 2911 2912 2913 sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]); 2914 2915 /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ 2916 intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang); 2917 2918 row_4x32b = _mm_set_epi32(4, 3, 2, 1); 2919 2920 if(nt == 4) 2921 { 2922 2923 WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4; 2924 int temp11, temp21, temp31, temp41; 2925 // WORD8 ai1_fract_temp_val[16], ai1_row_temp_val[16]; 2926 2927 __m128i fract1_8x16b, fract2_8x16b; 2928 __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b; 2929 2930 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; 2931 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b; 2932 __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2, zero_8x16b, sign_8x16b; 2933 2934 /* Intermediate reference samples for negative angle modes */ 2935 /* This have to be removed during optimization*/ 2936 /* For horizontal modes, (ref main = ref left) (ref side = ref above) */ 2937 inv_ang = gai4_ihevc_inv_ang_table[mode - 11]; 2938 2939 ref_main = ref_temp + nt - 1; 2940 for(k = 0; k < nt + 1; k++) 2941 ref_temp[k + nt - 1] = pu1_ref[two_nt - k]; 2942 2943 ref_main = ref_temp + nt - 1; 2944 ref_idx = (nt * intra_pred_ang) >> 5; 2945 zero_8x16b = _mm_setzero_si128(); 2946 2947 row_4x32b = _mm_set_epi16(4, 3, 2, 1, 
4, 3, 2, 1); 2948 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 2949 /* SIMD Optimization can be done using look-up table for the loop */ 2950 /* For negative angled derive the main reference samples from side */ 2951 /* reference samples refer to section 8.4.4.2.6 */ 2952 for(k = -1; k > ref_idx; k--) 2953 { 2954 inv_ang_sum += inv_ang; 2955 ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)]; 2956 } 2957 2958 2959 /* pos = ((row + 1) * intra_pred_ang); */ 2960 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 2961 2962 sign_8x16b = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b); 2963 res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b); 2964 2965 /* idx = pos >> 5; */ 2966 fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 2967 2968 /* fract = pos & (31); */ 2969 ref_main_idx_4x32b = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5)); 2970 2971 /*(32 - fract) */ 2972 row_4x32b = _mm_sub_epi32(const_temp3_4x32b, fract_4x32b); 2973 2974 fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8); 2975 fract2_8x16b = _mm_slli_epi16(row_4x32b, 8); 2976 2977 fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b); 2978 row_4x32b = _mm_or_si128(row_4x32b, fract2_8x16b); /*(32 - fract) */ 2979 2980 fract2_8x16b = _mm_unpackhi_epi8(fract_4x32b, row_4x32b); 2981 fract1_8x16b = _mm_unpacklo_epi8(fract_4x32b, row_4x32b); 2982 2983 temp1_8x16b = _mm_shuffle_epi32(fract1_8x16b, 0x00); 2984 temp2_8x16b = _mm_shuffle_epi32(fract1_8x16b, 0xaa); 2985 temp3_8x16b = _mm_shuffle_epi32(fract2_8x16b, 0x00); 2986 temp4_8x16b = _mm_shuffle_epi32(fract2_8x16b, 0xaa); 2987 2988 ref_main_temp0 = _mm_srli_si128(ref_main_idx_4x32b, 4); /* next 32 bit values */ 2989 ref_main_temp1 = _mm_srli_si128(ref_main_idx_4x32b, 8); /* next 32 bit values */ 2990 ref_main_temp2 = _mm_srli_si128(ref_main_idx_4x32b, 12); /* next 32 bit values */ 2991 ref_main_idx1 = _mm_cvtsi128_si32(ref_main_idx_4x32b); /* col=0*/ 2992 ref_main_idx2 = 
_mm_cvtsi128_si32(ref_main_temp0); /* col=1*/ 2993 ref_main_idx3 = _mm_cvtsi128_si32(ref_main_temp1); /* col=2*/ 2994 ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* col=3*/ 2995 2996 /* loding 8-bit 16 pixels */ 2997 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1)); /* col=0*/ 2998 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2)); /* col=1*/ 2999 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3)); /* col=2*/ 3000 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4)); /* col=3*/ 3001 3002 src_temp1_8x16b = _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/ 3003 src_temp2_8x16b = _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/ 3004 src_temp3_8x16b = _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/ 3005 src_temp4_8x16b = _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/ 3006 3007 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ 3008 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b); 3009 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b); 3010 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b); 3011 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b); 3012 3013 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 3014 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b); 3015 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b); 3016 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b); 3017 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b); 3018 3019 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 3020 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/ 3021 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/ 3022 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/ 3023 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* 
col=3*/ 3024 3025 /* converting 16 bit to 8 bit */ 3026 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/ 3027 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/ 3028 3029 3030 src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b); 3031 src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b); 3032 3033 src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b); 3034 src_temp1_8x16b = _mm_srli_si128(src_temp7_8x16b, 4); 3035 src_temp2_8x16b = _mm_srli_si128(src_temp7_8x16b, 8); 3036 src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 12); 3037 3038 temp11 = _mm_cvtsi128_si32(src_temp7_8x16b); 3039 temp21 = _mm_cvtsi128_si32(src_temp1_8x16b); 3040 temp31 = _mm_cvtsi128_si32(src_temp2_8x16b); 3041 temp41 = _mm_cvtsi128_si32(src_temp3_8x16b); 3042 3043 /* loding 8-bit 4 pixels values */ 3044 *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11; 3045 *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21; 3046 *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31; 3047 *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41; 3048 } 3049 3050 else if(nt == 32) 3051 { 3052 3053 3054 __m128i temp1, temp2, temp3, temp11, temp12; 3055 __m128i src_values0, src_values1; 3056 /* Intermediate reference samples for negative angle modes */ 3057 3058 ref_temp[two_nt - 1] = pu1_ref[two_nt - nt]; 3059 temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 1)); 3060 temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 17)); 3061 temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3); 3062 3063 /* For negative angled derive the main reference samples from side */ 3064 3065 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/ 3066 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 17)); /*(nt+16)-(two_nt-1)*/ 3067 3068 temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode])); 3069 temp12 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16)); 3070 3071 
src_values0 = _mm_shuffle_epi8(src_values0, temp2); 3072 src_values1 = _mm_shuffle_epi8(src_values1, temp2); 3073 src_values0 = _mm_shuffle_epi8(src_values0, temp12); 3074 src_values1 = _mm_shuffle_epi8(src_values1, temp11); 3075 3076 temp1 = _mm_shuffle_epi8(temp1, temp2); 3077 temp3 = _mm_shuffle_epi8(temp3, temp2); 3078 3079 _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp3); 3080 _mm_storeu_si128((__m128i *)(ref_temp + nt - 1 + 16), temp1); 3081 _mm_storeu_si128((__m128i *)(ref_main - 16), src_values0); 3082 _mm_storeu_si128((__m128i *)(ref_main - nt + inv_angle_shuffle[17 - mode][0]), src_values1); 3083 3084 3085 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 3086 row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); 3087 const_temp2_4x32b = _mm_set1_epi16(31); 3088 const_temp4_4x32b = _mm_set1_epi16(8); 3089 const_temp3_4x32b = _mm_set1_epi16(32); 3090 two_nt_4x32b = _mm_set1_epi16(1); 3091 3092 for(col = 0; col < nt; col += 8) 3093 { 3094 WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4; 3095 WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8; 3096 // WORD8 ai1_fract_temp0_val[16], ai1_fract_temp1_val[16]; 3097 3098 __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b; 3099 3100 __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b; 3101 __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b; 3102 3103 /* pos = ((row + 1) * intra_pred_ang); */ 3104 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 3105 3106 /* idx = pos >> 5; */ 3107 fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 3108 3109 /* fract = pos & (31); */ 3110 ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); 3111 3112 row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b); 3113 /*(32 - fract) */ 3114 fract2_8x16b = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b); 3115 3116 fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8); 3117 
fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */ 3118 3119 fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b); 3120 fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */ 3121 3122 3123 fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b); 3124 fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b); 3125 3126 temp1_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x00); 3127 temp2_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x55); 3128 temp3_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xaa); 3129 temp4_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xff); 3130 3131 temp11_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x00); 3132 temp12_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x55); 3133 temp13_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xaa); 3134 temp14_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xff); 3135 3136 pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/ 3137 pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/ 3138 pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/ 3139 pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/ 3140 3141 pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=5*/ 3142 pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=6*/ 3143 pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=7*/ 3144 pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=8*/ 3145 3146 for(row = 0; row < nt; row += 8) 3147 { 3148 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; 3149 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b; 3150 3151 3152 __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b; 3153 __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b; 3154 3155 /* loding 8-bit 16 pixels */ 3156 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row)); /* col=0*/ 3157 
src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row)); /* col=1*/ 3158 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row)); /* col=2*/ 3159 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row)); /* col=3*/ 3160 3161 src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 1); /* col=0*/ 3162 src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 1); /* col=1*/ 3163 src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 1); /* col=2*/ 3164 src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 1); /* col=3*/ 3165 3166 /* loding 8-bit 16 pixels */ 3167 src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row)); /* col=5*/ 3168 src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row)); /* col=6*/ 3169 src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row)); /* col=7*/ 3170 src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row)); /* col=8*/ 3171 3172 src_temp1_8x16b = _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/ 3173 src_temp2_8x16b = _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/ 3174 src_temp3_8x16b = _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/ 3175 src_temp4_8x16b = _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/ 3176 3177 src_temp11_8x16b = _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=0*/ 3178 src_temp12_8x16b = _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=1*/ 3179 src_temp13_8x16b = _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=2*/ 3180 src_temp14_8x16b = _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=3*/ 3181 3182 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ 3183 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b); 3184 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b); 3185 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b); 3186 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b); 3187 
3188 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ 3189 src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b); 3190 src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b); 3191 src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b); 3192 src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b); 3193 3194 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 3195 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b); 3196 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b); 3197 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b); 3198 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b); 3199 3200 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 3201 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/ 3202 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/ 3203 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/ 3204 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/ 3205 3206 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 3207 src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b); 3208 src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b); 3209 src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b); 3210 src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b); 3211 3212 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 3213 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=5*/ 3214 src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=6*/ 3215 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=7*/ 3216 src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=8*/ 3217 3218 /* converting 16 bit to 8 bit */ 3219 src_temp1_8x16b = 
_mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/ 3220 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/ 3221 3222 /* converting 16 bit to 8 bit */ 3223 src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/ 3224 src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/ 3225 3226 src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b); 3227 src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b); 3228 3229 src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b); 3230 src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b); 3231 3232 src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b); 3233 src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b); 3234 3235 src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b); 3236 src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b); 3237 3238 3239 src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b); 3240 src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b); 3241 3242 src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b); 3243 src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b); 3244 3245 src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8); 3246 src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8); 3247 src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8); 3248 src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8); 3249 3250 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp1_8x16b); /* row=0*/ 3251 3252 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp5_8x16b); /* row=1*/ 3253 3254 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp2_8x16b); /* row=2*/ 3255 3256 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), 
src_temp6_8x16b); /* row=4*/ 3257 3258 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp3_8x16b); /* row=5*/ 3259 3260 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp7_8x16b); /* row=6*/ 3261 3262 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp4_8x16b); /* row=7*/ 3263 3264 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp8_8x16b); /* row=8*/ 3265 3266 } 3267 } 3268 } 3269 else if(nt == 16) 3270 { 3271 3272 __m128i temp1, temp2, temp11, src_values0; 3273 /* Intermediate reference samples for negative angle modes */ 3274 /* For horizontal modes, (ref main = ref above) (ref side = ref left) */ 3275 ref_temp[two_nt - 1] = pu1_ref[two_nt - nt]; 3276 temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + nt + 1)); 3277 temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3); 3278 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/ 3279 3280 temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16)); 3281 3282 src_values0 = _mm_shuffle_epi8(src_values0, temp2); 3283 temp1 = _mm_shuffle_epi8(temp1, temp2); 3284 src_values0 = _mm_shuffle_epi8(src_values0, temp11); 3285 3286 _mm_storeu_si128((__m128i *)(ref_main - nt), src_values0); 3287 _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1); 3288 3289 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 3290 row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); 3291 const_temp2_4x32b = _mm_set1_epi16(31); 3292 const_temp4_4x32b = _mm_set1_epi16(8); 3293 const_temp3_4x32b = _mm_set1_epi16(32); 3294 two_nt_4x32b = _mm_set1_epi16(1); 3295 3296 for(col = 0; col < nt; col += 8) 3297 { 3298 WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4; 3299 WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8; 3300 // WORD8 ai1_fract_temp0_val[16], ai1_fract_temp1_val[16]; 3301 3302 __m128i fract1_8x16b, fract2_8x16b, 
fract3_8x16b, fract8_8x16b; 3303 3304 __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b; 3305 __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b; 3306 3307 /* pos = ((row + 1) * intra_pred_ang); */ 3308 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 3309 3310 /* idx = pos >> 5; */ 3311 fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 3312 3313 /* fract = pos & (31); */ 3314 ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); 3315 3316 row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b); 3317 /*(32 - fract) */ 3318 fract2_8x16b = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b); 3319 3320 fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8); 3321 fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */ 3322 3323 fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b); 3324 fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */ 3325 3326 3327 fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b); 3328 fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b); 3329 3330 temp1_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x00); 3331 temp2_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x55); 3332 temp3_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xaa); 3333 temp4_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xff); 3334 3335 temp11_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x00); 3336 temp12_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x55); 3337 temp13_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xaa); 3338 temp14_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xff); 3339 3340 pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/ 3341 pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/ 3342 pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/ 3343 pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/ 3344 3345 pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=5*/ 3346 pi2_ref_main_idx6 = 
_mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=6*/ 3347 pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=7*/ 3348 pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=8*/ 3349 3350 for(row = 0; row < nt; row += 8) 3351 { 3352 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; 3353 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b; 3354 3355 3356 __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b; 3357 __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b; 3358 3359 /* loding 8-bit 16 pixels */ 3360 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row)); /* col=0*/ 3361 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row)); /* col=1*/ 3362 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row)); /* col=2*/ 3363 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row)); /* col=3*/ 3364 3365 src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 1); /* col=0*/ 3366 src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 1); /* col=1*/ 3367 src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 1); /* col=2*/ 3368 src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 1); /* col=3*/ 3369 3370 /* loding 8-bit 16 pixels */ 3371 src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row)); /* col=5*/ 3372 src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row)); /* col=6*/ 3373 src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row)); /* col=7*/ 3374 src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row)); /* col=8*/ 3375 3376 src_temp1_8x16b = _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/ 3377 src_temp2_8x16b = _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/ 3378 src_temp3_8x16b = _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/ 3379 
src_temp4_8x16b = _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/ 3380 3381 src_temp11_8x16b = _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=0*/ 3382 src_temp12_8x16b = _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=1*/ 3383 src_temp13_8x16b = _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=2*/ 3384 src_temp14_8x16b = _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=3*/ 3385 3386 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ 3387 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b); 3388 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b); 3389 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b); 3390 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b); 3391 3392 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ 3393 src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b); 3394 src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b); 3395 src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b); 3396 src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b); 3397 3398 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 3399 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b); 3400 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b); 3401 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b); 3402 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b); 3403 3404 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 3405 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/ 3406 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/ 3407 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/ 3408 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/ 3409 3410 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 3411 src_temp11_8x16b = 
_mm_add_epi16(src_temp11_8x16b, const_temp_4x32b); 3412 src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b); 3413 src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b); 3414 src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b); 3415 3416 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 3417 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=5*/ 3418 src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=6*/ 3419 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=7*/ 3420 src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=8*/ 3421 3422 /* converting 16 bit to 8 bit */ 3423 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/ 3424 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/ 3425 3426 /* converting 16 bit to 8 bit */ 3427 src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=5*/ 3428 src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=6*/ 3429 3430 src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b); 3431 src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b); 3432 3433 src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b); 3434 src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b); 3435 3436 src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b); 3437 src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b); 3438 3439 src_temp17_8x16b = _mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b); 3440 src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b); 3441 3442 3443 src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b); 3444 src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b); 3445 3446 src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b); 3447 
src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b); 3448 3449 src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8); 3450 src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8); 3451 src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8); 3452 src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8); 3453 3454 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp1_8x16b); /* row=0*/ 3455 3456 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp5_8x16b); /* row=1*/ 3457 3458 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp2_8x16b); /* row=2*/ 3459 3460 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp6_8x16b); /* row=4*/ 3461 3462 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 4))), src_temp3_8x16b); /* row=5*/ 3463 3464 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 5))), src_temp7_8x16b); /* row=6*/ 3465 3466 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 6))), src_temp4_8x16b); /* row=7*/ 3467 3468 _mm_storel_epi64((__m128i *)(pu1_dst + col + (dst_strd * (row + 7))), src_temp8_8x16b); /* row=8*/ 3469 3470 } 3471 } 3472 } 3473 else 3474 { 3475 3476 3477 __m128i temp1, temp2, temp11, src_values0; 3478 /* Intermediate reference samples for negative angle modes */ 3479 /* For horizontal modes, (ref main = ref above) (ref side = ref left) */ 3480 ref_temp[two_nt - 1] = pu1_ref[nt]; 3481 temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 1)); 3482 3483 /* For negative angled derive the main reference samples from side */ 3484 3485 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 1)); /*nt-(nt+15)*/ 3486 temp2 = _mm_loadu_si128((__m128i *)IHEVCE_SHUFFLEMASKY3); 3487 temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[17 - mode] + 16)); 3488 3489 src_values0 = _mm_shuffle_epi8(src_values0, temp2); 3490 temp1 = _mm_shuffle_epi8(temp1, temp2); 3491 src_values0 = _mm_shuffle_epi8(src_values0, 
temp11); 3492 src_values0 = _mm_srli_si128(src_values0, 8); 3493 3494 _mm_storel_epi64((__m128i *)(ref_temp + nt - 1), temp1); 3495 _mm_storel_epi64((__m128i *)(ref_main - nt), src_values0); 3496 3497 3498 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 3499 row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); 3500 const_temp2_4x32b = _mm_set1_epi16(31); 3501 const_temp4_4x32b = _mm_set1_epi16(8); 3502 const_temp3_4x32b = _mm_set1_epi16(32); 3503 two_nt_4x32b = _mm_set1_epi16(1); 3504 3505 { 3506 WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4; 3507 WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8; 3508 //WORD8 ai1_fract_temp0_val[16], ai1_fract_temp1_val[16]; 3509 3510 __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract8_8x16b; 3511 3512 __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b; 3513 __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b; 3514 3515 /* pos = ((row + 1) * intra_pred_ang); */ 3516 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 3517 3518 /* idx = pos >> 5; */ 3519 fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 3520 3521 /* fract = pos & (31); */ 3522 ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); 3523 3524 /*(32 - fract) */ 3525 fract2_8x16b = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b); 3526 3527 fract1_8x16b = _mm_slli_epi16(fract_4x32b, 8); 3528 fract3_8x16b = _mm_slli_epi16(fract2_8x16b, 8); /*(32 - fract) */ 3529 3530 fract_4x32b = _mm_or_si128(fract_4x32b, fract1_8x16b); 3531 fract2_8x16b = _mm_or_si128(fract2_8x16b, fract3_8x16b); /*(32 - fract) */ 3532 3533 fract8_8x16b = _mm_unpackhi_epi8(fract_4x32b, fract2_8x16b); 3534 fract_4x32b = _mm_unpacklo_epi8(fract_4x32b, fract2_8x16b); 3535 3536 temp1_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x00); 3537 temp2_8x16b = _mm_shuffle_epi32(fract_4x32b, 0x55); 3538 temp3_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xaa); 3539 
temp4_8x16b = _mm_shuffle_epi32(fract_4x32b, 0xff); 3540 3541 temp11_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x00); 3542 temp12_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0x55); 3543 temp13_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xaa); 3544 temp14_8x16b = _mm_shuffle_epi32(fract8_8x16b, 0xff); 3545 3546 pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/ 3547 pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/ 3548 pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/ 3549 pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/ 3550 3551 pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=5*/ 3552 pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=6*/ 3553 pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=7*/ 3554 pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=8*/ 3555 3556 { 3557 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; 3558 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b; 3559 3560 __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b; 3561 __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b; 3562 3563 /* loding 8-bit 16 pixels */ 3564 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1)); /* col=0*/ 3565 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2)); /* col=1*/ 3566 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3)); /* col=2*/ 3567 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4)); /* col=3*/ 3568 3569 /* loding 8-bit 16 pixels */ 3570 src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5)); /* col=5*/ 3571 src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6)); /* col=6*/ 3572 src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + 
pi2_ref_main_idx7)); /* col=7*/ 3573 src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8)); /* col=8*/ 3574 3575 src_temp1_8x16b = _mm_shuffle_epi8(src_temp5_8x16b, sm3); /* col=0*/ 3576 src_temp2_8x16b = _mm_shuffle_epi8(src_temp6_8x16b, sm3); /* col=1*/ 3577 src_temp3_8x16b = _mm_shuffle_epi8(src_temp7_8x16b, sm3); /* col=2*/ 3578 src_temp4_8x16b = _mm_shuffle_epi8(src_temp8_8x16b, sm3); /* col=3*/ 3579 3580 src_temp11_8x16b = _mm_shuffle_epi8(src_temp15_8x16b, sm3); /* col=0*/ 3581 src_temp12_8x16b = _mm_shuffle_epi8(src_temp16_8x16b, sm3); /* col=1*/ 3582 src_temp13_8x16b = _mm_shuffle_epi8(src_temp17_8x16b, sm3); /* col=2*/ 3583 src_temp14_8x16b = _mm_shuffle_epi8(src_temp18_8x16b, sm3); /* col=3*/ 3584 3585 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ 3586 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b); 3587 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b); 3588 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b); 3589 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b); 3590 3591 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ 3592 src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b); 3593 src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b); 3594 src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b); 3595 src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b); 3596 3597 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 3598 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b); 3599 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b); 3600 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b); 3601 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b); 3602 3603 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 3604 src_temp1_8x16b = 
_mm_srai_epi16(src_temp1_8x16b, 5); /* row=0*/ 3605 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* row=1*/ 3606 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* row=2*/ 3607 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* row=3*/ 3608 3609 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 3610 src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b); 3611 src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b); 3612 src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b); 3613 src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b); 3614 3615 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 3616 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=5*/ 3617 src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=6*/ 3618 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=7*/ 3619 src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=8*/ 3620 3621 /* converting 16 bit to 8 bit */ 3622 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp3_8x16b); /* col=0*/ 3623 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, src_temp4_8x16b); /* col=1*/ 3624 3625 /* converting 16 bit to 8 bit */ 3626 src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp13_8x16b); /* col=4*/ 3627 src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, src_temp14_8x16b); /* col=5*/ 3628 3629 src_temp5_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp2_8x16b); 3630 src_temp6_8x16b = _mm_unpackhi_epi8(src_temp1_8x16b, src_temp2_8x16b); 3631 3632 src_temp15_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp12_8x16b); 3633 src_temp16_8x16b = _mm_unpackhi_epi8(src_temp11_8x16b, src_temp12_8x16b); 3634 3635 src_temp7_8x16b = _mm_unpacklo_epi16(src_temp5_8x16b, src_temp6_8x16b); 3636 src_temp8_8x16b = _mm_unpackhi_epi16(src_temp5_8x16b, src_temp6_8x16b); 3637 3638 src_temp17_8x16b = 
_mm_unpacklo_epi16(src_temp15_8x16b, src_temp16_8x16b); 3639 src_temp18_8x16b = _mm_unpackhi_epi16(src_temp15_8x16b, src_temp16_8x16b); 3640 3641 3642 src_temp1_8x16b = _mm_unpacklo_epi32(src_temp7_8x16b, src_temp17_8x16b); 3643 src_temp2_8x16b = _mm_unpackhi_epi32(src_temp7_8x16b, src_temp17_8x16b); 3644 3645 src_temp3_8x16b = _mm_unpacklo_epi32(src_temp8_8x16b, src_temp18_8x16b); 3646 src_temp4_8x16b = _mm_unpackhi_epi32(src_temp8_8x16b, src_temp18_8x16b); 3647 3648 src_temp5_8x16b = _mm_srli_si128(src_temp1_8x16b, 8); 3649 src_temp6_8x16b = _mm_srli_si128(src_temp2_8x16b, 8); 3650 src_temp7_8x16b = _mm_srli_si128(src_temp3_8x16b, 8); 3651 src_temp8_8x16b = _mm_srli_si128(src_temp4_8x16b, 8); 3652 3653 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 0)), src_temp1_8x16b); /* row=0*/ 3654 3655 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (1))), src_temp5_8x16b); /* row=1*/ 3656 3657 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (2))), src_temp2_8x16b); /* row=2*/ 3658 3659 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (3))), src_temp6_8x16b); /* row=3*/ 3660 3661 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (4))), src_temp3_8x16b); /* row=4*/ 3662 3663 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (5))), src_temp7_8x16b); /* row=5*/ 3664 3665 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (6))), src_temp4_8x16b); /* row=6*/ 3666 3667 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (7))), src_temp8_8x16b); /* row=7*/ 3668 3669 } 3670 } 3671 } 3672 3673 } 3674 3675 3676 /** 3677 ******************************************************************************* 3678 * 3679 * @brief 3680 * Intra prediction interpolation filter for luma mode 19 to mode 25 3681 * 3682 * @par Description: 3683 * Intraprediction for mode 19 to 25 (negative angle, vertical mode ) with 3684 * reference neighboring samples location pointed by 'pu1_ref' to the TU 3685 * block location pointed by 'pu1_dst' 3686 * 3687 * @param[in] pu1_src 3688 * UWORD8 pointer 
to the source 3689 * 3690 * @param[out] pu1_dst 3691 * UWORD8 pointer to the destination 3692 * 3693 * @param[in] src_strd 3694 * integer source stride 3695 * 3696 * @param[in] dst_strd 3697 * integer destination stride 3698 * 3699 * @param[in] nt 3700 * integer Transform Block size 3701 * 3702 * @param[in] mode 3703 * integer intraprediction mode 3704 * 3705 * @returns 3706 * 3707 * @remarks 3708 * None 3709 * 3710 ******************************************************************************* 3711 */ 3712 3713 void ihevc_intra_pred_luma_mode_19_to_25_ssse3(UWORD8 *pu1_ref, 3714 WORD32 src_strd, 3715 UWORD8 *pu1_dst, 3716 WORD32 dst_strd, 3717 WORD32 nt, 3718 WORD32 mode) 3719 { 3720 3721 WORD32 row, k; 3722 WORD32 two_nt, intra_pred_ang; 3723 WORD32 inv_ang, inv_ang_sum; 3724 //WORD32 ref_main_idx, pos, fract, idx; 3725 WORD32 ref_idx; 3726 UWORD8 ref_tmp[(2 * MAX_CU_SIZE) + 2]; 3727 UWORD8 *ref_main, *ref_temp; 3728 3729 __m128i /*fract_8x16b,*/ const_temp_8x16b, sm3; 3730 __m128i temp1, temp2, temp3, temp4; 3731 __m128i temp11, temp12, temp13, temp14; 3732 UNUSED(src_strd); 3733 two_nt = 2 * nt; 3734 intra_pred_ang = gai4_ihevc_ang_table[mode]; 3735 inv_ang = gai4_ihevc_inv_ang_table[mode - 12]; 3736 3737 /* Intermediate reference samples for negative angle modes */ 3738 /* This have to be removed during optimization*/ 3739 /* For horizontal modes, (ref main = ref above) (ref side = ref left) */ 3740 ref_temp = ref_tmp + 1; 3741 ref_main = ref_temp + nt - 1; 3742 3743 3744 sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]); 3745 3746 3747 3748 const_temp_8x16b = _mm_set1_epi16(16); 3749 3750 if(nt == 32) 3751 { 3752 3753 __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b; 3754 __m128i src_values10, src_values11, intra_pred_ang_4x32b; 3755 __m128i row_4x32b, two_nt_4x32b, src_values12; 3756 3757 __m128i src_values0, src_values1, src_values2, src_values3; 3758 __m128i src_values4, src_values5, src_values6, src_values7; 3759 WORD32 col = 0; 
3760 3761 /* Intermediate reference samples for negative angle modes */ 3762 /* This have to be removed during optimization*/ 3763 /* For horizontal modes, (ref main = ref above) (ref side = ref left) */ 3764 ref_temp[two_nt - 1] = pu1_ref[two_nt + nt]; 3765 temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt)); 3766 temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt + 16)); 3767 3768 /* SIMD Optimization can be done using look-up table for the loop */ 3769 /* For negative angled derive the main reference samples from side */ 3770 /* reference samples refer to section 8.4.4.2.6 */ 3771 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - nt)); /*nt-(nt+15)*/ 3772 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - 16)); /*(nt+16)-(two_nt-1)*/ 3773 3774 temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19])); 3775 temp12 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16)); 3776 3777 src_values0 = _mm_shuffle_epi8(src_values0, temp11); 3778 src_values1 = _mm_shuffle_epi8(src_values1, temp12); 3779 3780 _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1); 3781 _mm_storeu_si128((__m128i *)(ref_temp + nt - 1 + 16), temp3); 3782 _mm_storeu_si128((__m128i *)(ref_main - 16), src_values1); 3783 _mm_storeu_si128((__m128i *)(ref_main - nt + inv_angle_shuffle[mode - 19][0]), src_values0); 3784 3785 const_temp2_4x32b = _mm_set1_epi16(31); 3786 const_temp3_4x32b = _mm_set1_epi16(32); 3787 const_temp8_4x32b = _mm_set1_epi16(8); 3788 3789 two_nt_4x32b = _mm_set1_epi16(1); 3790 3791 /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ 3792 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 3793 3794 row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); 3795 3796 for(row = 0; row < nt; row += 8) 3797 { 3798 3799 WORD16 ref_main_idx[9]; 3800 3801 __m128i res_temp5_4x32b; 3802 __m128i fract1_8x16b, fract2_8x16b; 3803 3804 /* pos = ((row + 1) * intra_pred_ang); */ 3805 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 
3806 3807 /* fract = pos & (31); */ 3808 src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); 3809 3810 /* idx = pos >> 5; */ 3811 src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 3812 3813 /*(32 - fract) */ 3814 src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11); 3815 3816 fract1_8x16b = _mm_slli_epi16(src_values11, 8); 3817 fract2_8x16b = _mm_slli_epi16(src_values10, 8); 3818 3819 src_values11 = _mm_or_si128(src_values11, fract1_8x16b); 3820 src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */ 3821 3822 fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10); 3823 fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10); 3824 3825 temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00); 3826 temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55); 3827 temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa); 3828 temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff); 3829 3830 temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00); 3831 temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55); 3832 temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa); 3833 temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff); 3834 3835 row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b); 3836 _mm_storeu_si128((__m128i *)ref_main_idx, src_values12); 3837 for(col = 0; col < nt; col += 16) 3838 { 3839 src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + col)); 3840 src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + col)); 3841 src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + col)); 3842 src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + col)); 3843 src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + 8 + col)); 3844 src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + 8 + col)); 3845 src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + 8 + col)); 3846 src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + 8 + 
col)); 3847 3848 src_values0 = _mm_shuffle_epi8(src_values0, sm3); 3849 src_values1 = _mm_shuffle_epi8(src_values1, sm3); 3850 src_values2 = _mm_shuffle_epi8(src_values2, sm3); 3851 src_values3 = _mm_shuffle_epi8(src_values3, sm3); 3852 src_values4 = _mm_shuffle_epi8(src_values4, sm3); 3853 src_values5 = _mm_shuffle_epi8(src_values5, sm3); 3854 src_values6 = _mm_shuffle_epi8(src_values6, sm3); 3855 src_values7 = _mm_shuffle_epi8(src_values7, sm3); 3856 3857 3858 src_values0 = _mm_maddubs_epi16(src_values0, temp1); 3859 src_values1 = _mm_maddubs_epi16(src_values1, temp2); 3860 src_values2 = _mm_maddubs_epi16(src_values2, temp3); 3861 src_values3 = _mm_maddubs_epi16(src_values3, temp4); 3862 src_values4 = _mm_maddubs_epi16(src_values4, temp1); 3863 src_values5 = _mm_maddubs_epi16(src_values5, temp2); 3864 src_values6 = _mm_maddubs_epi16(src_values6, temp3); 3865 src_values7 = _mm_maddubs_epi16(src_values7, temp4); 3866 3867 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 3868 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); 3869 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); 3870 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); 3871 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); 3872 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); 3873 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); 3874 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); 3875 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); 3876 3877 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 3878 src_values0 = _mm_srai_epi16(src_values0, 5); 3879 src_values1 = _mm_srai_epi16(src_values1, 5); 3880 src_values2 = _mm_srai_epi16(src_values2, 5); 3881 src_values3 = _mm_srai_epi16(src_values3, 5); 3882 src_values4 = _mm_srai_epi16(src_values4, 5); 3883 src_values5 = _mm_srai_epi16(src_values5, 5); 3884 src_values6 = _mm_srai_epi16(src_values6, 5); 3885 
src_values7 = _mm_srai_epi16(src_values7, 5); 3886 3887 /* converting 16 bit to 8 bit */ 3888 src_values0 = _mm_packus_epi16(src_values0, src_values4); 3889 src_values1 = _mm_packus_epi16(src_values1, src_values5); 3890 src_values2 = _mm_packus_epi16(src_values2, src_values6); 3891 src_values3 = _mm_packus_epi16(src_values3, src_values7); 3892 3893 /* loading 8-bit 8 pixels values */ 3894 _mm_storeu_si128((__m128i *)(pu1_dst + col + (0) * dst_strd), src_values0); /* row=0*/ 3895 _mm_storeu_si128((__m128i *)(pu1_dst + col + (1) * dst_strd), src_values1); /* row=1*/ 3896 _mm_storeu_si128((__m128i *)(pu1_dst + col + (2) * dst_strd), src_values2); /* row=2*/ 3897 _mm_storeu_si128((__m128i *)(pu1_dst + col + (3) * dst_strd), src_values3); /* row=3*/ 3898 3899 3900 src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + col)); 3901 src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + col)); 3902 src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + col)); 3903 src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + col)); 3904 src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + 8 + col)); 3905 src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + 8 + col)); 3906 src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + 8 + col)); 3907 src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + 8 + col)); 3908 3909 src_values0 = _mm_shuffle_epi8(src_values0, sm3); 3910 src_values1 = _mm_shuffle_epi8(src_values1, sm3); 3911 src_values2 = _mm_shuffle_epi8(src_values2, sm3); 3912 src_values3 = _mm_shuffle_epi8(src_values3, sm3); 3913 src_values4 = _mm_shuffle_epi8(src_values4, sm3); 3914 src_values5 = _mm_shuffle_epi8(src_values5, sm3); 3915 src_values6 = _mm_shuffle_epi8(src_values6, sm3); 3916 src_values7 = _mm_shuffle_epi8(src_values7, sm3); 3917 3918 3919 src_values0 = _mm_maddubs_epi16(src_values0, temp11); 3920 src_values1 = 
_mm_maddubs_epi16(src_values1, temp12); 3921 src_values2 = _mm_maddubs_epi16(src_values2, temp13); 3922 src_values3 = _mm_maddubs_epi16(src_values3, temp14); 3923 src_values4 = _mm_maddubs_epi16(src_values4, temp11); 3924 src_values5 = _mm_maddubs_epi16(src_values5, temp12); 3925 src_values6 = _mm_maddubs_epi16(src_values6, temp13); 3926 src_values7 = _mm_maddubs_epi16(src_values7, temp14); 3927 3928 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 3929 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); 3930 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); 3931 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); 3932 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); 3933 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); 3934 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); 3935 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); 3936 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); 3937 3938 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 3939 src_values0 = _mm_srai_epi16(src_values0, 5); 3940 src_values1 = _mm_srai_epi16(src_values1, 5); 3941 src_values2 = _mm_srai_epi16(src_values2, 5); 3942 src_values3 = _mm_srai_epi16(src_values3, 5); 3943 src_values4 = _mm_srai_epi16(src_values4, 5); 3944 src_values5 = _mm_srai_epi16(src_values5, 5); 3945 src_values6 = _mm_srai_epi16(src_values6, 5); 3946 src_values7 = _mm_srai_epi16(src_values7, 5); 3947 3948 /* converting 16 bit to 8 bit */ 3949 src_values0 = _mm_packus_epi16(src_values0, src_values4); 3950 src_values1 = _mm_packus_epi16(src_values1, src_values5); 3951 src_values2 = _mm_packus_epi16(src_values2, src_values6); 3952 src_values3 = _mm_packus_epi16(src_values3, src_values7); 3953 3954 /* loading 8-bit 8 pixels values */ 3955 _mm_storeu_si128((__m128i *)(pu1_dst + col + (4) * dst_strd), src_values0); /* row=4*/ 3956 _mm_storeu_si128((__m128i *)(pu1_dst + col + (5) * 
dst_strd), src_values1); /* row=5*/ 3957 _mm_storeu_si128((__m128i *)(pu1_dst + col + (6) * dst_strd), src_values2); /* row=6*/ 3958 _mm_storeu_si128((__m128i *)(pu1_dst + col + (7) * dst_strd), src_values3); /* row=7*/ 3959 3960 } 3961 pu1_dst += 8 * dst_strd; 3962 } 3963 3964 } 3965 else if(nt == 16) /* for nt = 16 case */ 3966 { 3967 3968 __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b; 3969 __m128i src_values10, src_values11, intra_pred_ang_4x32b; 3970 __m128i row_4x32b, two_nt_4x32b, src_values12; 3971 __m128i src_values0, src_values1, src_values2, src_values3; 3972 __m128i src_values4, src_values5, src_values6, src_values7; 3973 3974 3975 /* Intermediate reference samples for negative angle modes */ 3976 /* For horizontal modes, (ref main = ref above) (ref side = ref left) */ 3977 ref_temp[two_nt - 1] = pu1_ref[two_nt + nt]; 3978 temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt)); 3979 3980 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + two_nt - nt)); /*nt-(nt+15)*/ 3981 3982 temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16)); 3983 3984 src_values0 = _mm_shuffle_epi8(src_values0, temp11); 3985 3986 _mm_storeu_si128((__m128i *)(ref_main - nt), src_values0); 3987 _mm_storeu_si128((__m128i *)(ref_temp + nt - 1), temp1); 3988 3989 const_temp2_4x32b = _mm_set1_epi16(31); 3990 const_temp3_4x32b = _mm_set1_epi16(32); 3991 const_temp8_4x32b = _mm_set1_epi16(8); 3992 3993 two_nt_4x32b = _mm_set1_epi16(1); 3994 3995 /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ 3996 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 3997 3998 row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); 3999 4000 for(row = 0; row < nt; row += 8) 4001 { 4002 4003 WORD16 ref_main_idx[9]; 4004 4005 __m128i res_temp5_4x32b; 4006 __m128i fract1_8x16b, fract2_8x16b; 4007 4008 /* pos = ((row + 1) * intra_pred_ang); */ 4009 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 4010 4011 /* fract = pos & (31); */ 4012 src_values12 = 
_mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); 4013 4014 /* idx = pos >> 5; */ 4015 src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 4016 4017 /*(32 - fract) */ 4018 src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11); 4019 4020 fract1_8x16b = _mm_slli_epi16(src_values11, 8); 4021 fract2_8x16b = _mm_slli_epi16(src_values10, 8); 4022 4023 src_values11 = _mm_or_si128(src_values11, fract1_8x16b); 4024 src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */ 4025 4026 fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10); 4027 fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10); 4028 4029 temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00); 4030 temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55); 4031 temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa); 4032 temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff); 4033 4034 temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00); 4035 temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55); 4036 temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa); 4037 temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff); 4038 4039 row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b); 4040 _mm_storeu_si128((__m128i *)ref_main_idx, src_values12); 4041 4042 { 4043 src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0])); 4044 src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1])); 4045 src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2])); 4046 src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3])); 4047 src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0] + 8)); 4048 src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1] + 8)); 4049 src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2] + 8)); 4050 src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3] + 8)); 4051 4052 src_values0 = _mm_shuffle_epi8(src_values0, sm3); 4053 src_values1 = _mm_shuffle_epi8(src_values1, sm3); 4054 src_values2 = 
_mm_shuffle_epi8(src_values2, sm3); 4055 src_values3 = _mm_shuffle_epi8(src_values3, sm3); 4056 src_values4 = _mm_shuffle_epi8(src_values4, sm3); 4057 src_values5 = _mm_shuffle_epi8(src_values5, sm3); 4058 src_values6 = _mm_shuffle_epi8(src_values6, sm3); 4059 src_values7 = _mm_shuffle_epi8(src_values7, sm3); 4060 4061 4062 src_values0 = _mm_maddubs_epi16(src_values0, temp1); 4063 src_values1 = _mm_maddubs_epi16(src_values1, temp2); 4064 src_values2 = _mm_maddubs_epi16(src_values2, temp3); 4065 src_values3 = _mm_maddubs_epi16(src_values3, temp4); 4066 src_values4 = _mm_maddubs_epi16(src_values4, temp1); 4067 src_values5 = _mm_maddubs_epi16(src_values5, temp2); 4068 src_values6 = _mm_maddubs_epi16(src_values6, temp3); 4069 src_values7 = _mm_maddubs_epi16(src_values7, temp4); 4070 4071 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 4072 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); 4073 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); 4074 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); 4075 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); 4076 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); 4077 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); 4078 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); 4079 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); 4080 4081 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 4082 src_values0 = _mm_srai_epi16(src_values0, 5); 4083 src_values1 = _mm_srai_epi16(src_values1, 5); 4084 src_values2 = _mm_srai_epi16(src_values2, 5); 4085 src_values3 = _mm_srai_epi16(src_values3, 5); 4086 src_values4 = _mm_srai_epi16(src_values4, 5); 4087 src_values5 = _mm_srai_epi16(src_values5, 5); 4088 src_values6 = _mm_srai_epi16(src_values6, 5); 4089 src_values7 = _mm_srai_epi16(src_values7, 5); 4090 4091 /* converting 16 bit to 8 bit */ 4092 src_values0 = _mm_packus_epi16(src_values0, 
src_values4); 4093 src_values1 = _mm_packus_epi16(src_values1, src_values5); 4094 src_values2 = _mm_packus_epi16(src_values2, src_values6); 4095 src_values3 = _mm_packus_epi16(src_values3, src_values7); 4096 4097 /* loading 8-bit 8 pixels values */ 4098 _mm_storeu_si128((__m128i *)(pu1_dst + (0) * dst_strd), src_values0); /* row=0*/ 4099 _mm_storeu_si128((__m128i *)(pu1_dst + (1) * dst_strd), src_values1); /* row=1*/ 4100 _mm_storeu_si128((__m128i *)(pu1_dst + (2) * dst_strd), src_values2); /* row=2*/ 4101 _mm_storeu_si128((__m128i *)(pu1_dst + (3) * dst_strd), src_values3); /* row=3*/ 4102 4103 4104 src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4])); 4105 src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5])); 4106 src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6])); 4107 src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7])); 4108 src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4] + 8)); 4109 src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5] + 8)); 4110 src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6] + 8)); 4111 src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7] + 8)); 4112 4113 src_values0 = _mm_shuffle_epi8(src_values0, sm3); 4114 src_values1 = _mm_shuffle_epi8(src_values1, sm3); 4115 src_values2 = _mm_shuffle_epi8(src_values2, sm3); 4116 src_values3 = _mm_shuffle_epi8(src_values3, sm3); 4117 src_values4 = _mm_shuffle_epi8(src_values4, sm3); 4118 src_values5 = _mm_shuffle_epi8(src_values5, sm3); 4119 src_values6 = _mm_shuffle_epi8(src_values6, sm3); 4120 src_values7 = _mm_shuffle_epi8(src_values7, sm3); 4121 4122 4123 src_values0 = _mm_maddubs_epi16(src_values0, temp11); 4124 src_values1 = _mm_maddubs_epi16(src_values1, temp12); 4125 src_values2 = _mm_maddubs_epi16(src_values2, temp13); 4126 src_values3 = _mm_maddubs_epi16(src_values3, temp14); 4127 src_values4 = _mm_maddubs_epi16(src_values4, temp11); 4128 
src_values5 = _mm_maddubs_epi16(src_values5, temp12); 4129 src_values6 = _mm_maddubs_epi16(src_values6, temp13); 4130 src_values7 = _mm_maddubs_epi16(src_values7, temp14); 4131 4132 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 4133 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); 4134 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); 4135 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); 4136 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); 4137 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); 4138 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); 4139 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); 4140 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); 4141 4142 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 4143 src_values0 = _mm_srai_epi16(src_values0, 5); 4144 src_values1 = _mm_srai_epi16(src_values1, 5); 4145 src_values2 = _mm_srai_epi16(src_values2, 5); 4146 src_values3 = _mm_srai_epi16(src_values3, 5); 4147 src_values4 = _mm_srai_epi16(src_values4, 5); 4148 src_values5 = _mm_srai_epi16(src_values5, 5); 4149 src_values6 = _mm_srai_epi16(src_values6, 5); 4150 src_values7 = _mm_srai_epi16(src_values7, 5); 4151 4152 /* converting 16 bit to 8 bit */ 4153 src_values0 = _mm_packus_epi16(src_values0, src_values4); 4154 src_values1 = _mm_packus_epi16(src_values1, src_values5); 4155 src_values2 = _mm_packus_epi16(src_values2, src_values6); 4156 src_values3 = _mm_packus_epi16(src_values3, src_values7); 4157 4158 /* loading 8-bit 8 pixels values */ 4159 _mm_storeu_si128((__m128i *)(pu1_dst + (4) * dst_strd), src_values0); /* row=4*/ 4160 _mm_storeu_si128((__m128i *)(pu1_dst + (5) * dst_strd), src_values1); /* row=5*/ 4161 _mm_storeu_si128((__m128i *)(pu1_dst + (6) * dst_strd), src_values2); /* row=6*/ 4162 _mm_storeu_si128((__m128i *)(pu1_dst + (7) * dst_strd), src_values3); /* row=7*/ 4163 4164 } 4165 pu1_dst 
+= 8 * dst_strd; 4166 } 4167 } 4168 else if(nt == 8) 4169 { 4170 4171 4172 __m128i const_temp2_4x32b, const_temp3_4x32b; 4173 __m128i src_values10, src_values11, intra_pred_ang_4x32b; 4174 4175 __m128i row_4x32b, two_nt_4x32b, src_values12; 4176 __m128i src_values0, src_values1, src_values2, src_values3; 4177 __m128i src_values4, src_values5, src_values6, src_values7; 4178 4179 4180 /* Intermediate reference samples for negative angle modes */ 4181 /* For horizontal modes, (ref main = ref above) (ref side = ref left) */ 4182 ref_temp[two_nt - 1] = pu1_ref[two_nt + nt]; 4183 temp1 = _mm_loadl_epi64((__m128i *)(pu1_ref + two_nt)); 4184 4185 /* For negative angled derive the main reference samples from side */ 4186 4187 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref)); /*nt-(nt+15)*/ 4188 4189 temp11 = _mm_loadu_si128((__m128i *)(inv_angle_shuffle[mode - 19] + 16)); 4190 4191 src_values0 = _mm_shuffle_epi8(src_values0, temp11); 4192 src_values0 = _mm_srli_si128(src_values0, 8); 4193 _mm_storel_epi64((__m128i *)(ref_temp + nt - 1), temp1); 4194 _mm_storel_epi64((__m128i *)(ref_main - nt), src_values0); 4195 4196 4197 4198 const_temp2_4x32b = _mm_set1_epi16(31); 4199 const_temp3_4x32b = _mm_set1_epi16(32); 4200 4201 4202 two_nt_4x32b = _mm_set1_epi16(1); 4203 4204 4205 /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ 4206 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 4207 4208 row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); 4209 4210 { 4211 4212 WORD16 ref_main_idx[9]; 4213 4214 __m128i res_temp5_4x32b; 4215 __m128i fract1_8x16b, fract2_8x16b; 4216 4217 /* pos = ((row + 1) * intra_pred_ang); */ 4218 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 4219 4220 /* fract = pos & (31); */ 4221 src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); 4222 4223 /* idx = pos >> 5; */ 4224 src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 4225 4226 /*(32 - fract) */ 4227 src_values10 = 
_mm_sub_epi16(const_temp3_4x32b, src_values11); 4228 4229 fract1_8x16b = _mm_slli_epi16(src_values11, 8); 4230 fract2_8x16b = _mm_slli_epi16(src_values10, 8); 4231 4232 src_values11 = _mm_or_si128(src_values11, fract1_8x16b); 4233 src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */ 4234 4235 fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10); 4236 fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10); 4237 4238 temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00); 4239 temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55); 4240 temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa); 4241 temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff); 4242 4243 temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00); 4244 temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55); 4245 temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa); 4246 temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff); 4247 4248 _mm_storeu_si128((__m128i *)ref_main_idx, src_values12); 4249 4250 src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[0])); /* col = 0-7 */ 4251 src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[1])); /* col = 8-15 */ 4252 src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[2])); /* col = 16-23 */ 4253 src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[3])); /* col = 24-31 */ 4254 src_values4 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[4])); /* col = 32-39 */ 4255 src_values5 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[5])); /* col = 40-47 */ 4256 src_values6 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[6])); /* col = 48-55 */ 4257 src_values7 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx[7])); /* col = 56-63*/ 4258 4259 src_values0 = _mm_shuffle_epi8(src_values0, sm3); 4260 src_values1 = _mm_shuffle_epi8(src_values1, sm3); 4261 src_values2 = _mm_shuffle_epi8(src_values2, sm3); 4262 src_values3 = _mm_shuffle_epi8(src_values3, sm3); 4263 src_values4 = _mm_shuffle_epi8(src_values4, sm3); 4264 src_values5 
= _mm_shuffle_epi8(src_values5, sm3); 4265 src_values6 = _mm_shuffle_epi8(src_values6, sm3); 4266 src_values7 = _mm_shuffle_epi8(src_values7, sm3); 4267 4268 4269 src_values0 = _mm_maddubs_epi16(src_values0, temp1); 4270 src_values1 = _mm_maddubs_epi16(src_values1, temp2); 4271 src_values2 = _mm_maddubs_epi16(src_values2, temp3); 4272 src_values3 = _mm_maddubs_epi16(src_values3, temp4); 4273 src_values4 = _mm_maddubs_epi16(src_values4, temp11); 4274 src_values5 = _mm_maddubs_epi16(src_values5, temp12); 4275 src_values6 = _mm_maddubs_epi16(src_values6, temp13); 4276 src_values7 = _mm_maddubs_epi16(src_values7, temp14); 4277 4278 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 4279 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); 4280 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); 4281 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); 4282 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); 4283 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); 4284 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); 4285 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); 4286 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); 4287 4288 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 4289 src_values0 = _mm_srai_epi16(src_values0, 5); 4290 src_values1 = _mm_srai_epi16(src_values1, 5); 4291 src_values2 = _mm_srai_epi16(src_values2, 5); 4292 src_values3 = _mm_srai_epi16(src_values3, 5); 4293 src_values4 = _mm_srai_epi16(src_values4, 5); 4294 src_values5 = _mm_srai_epi16(src_values5, 5); 4295 src_values6 = _mm_srai_epi16(src_values6, 5); 4296 src_values7 = _mm_srai_epi16(src_values7, 5); 4297 4298 /* converting 16 bit to 8 bit */ 4299 src_values0 = _mm_packus_epi16(src_values0, src_values1); 4300 src_values2 = _mm_packus_epi16(src_values2, src_values3); 4301 src_values1 = _mm_srli_si128(src_values0, 8); 4302 src_values3 = 
_mm_srli_si128(src_values2, 8); 4303 src_values4 = _mm_packus_epi16(src_values4, src_values5); 4304 src_values6 = _mm_packus_epi16(src_values6, src_values7); 4305 src_values5 = _mm_srli_si128(src_values4, 8); 4306 src_values7 = _mm_srli_si128(src_values6, 8); 4307 4308 /* loading 8-bit 8 pixels values */ 4309 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_values0); /* row=0*/ 4310 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_values1); /* row=1*/ 4311 _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_values2); /* row=2*/ 4312 _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_values3); /* row=3*/ 4313 _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), src_values4); /* row=4*/ 4314 _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), src_values5); /* row=5*/ 4315 _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), src_values6); /* row=6*/ 4316 _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), src_values7); /* row=7*/ 4317 } 4318 } 4319 else /* if nt =4*/ 4320 { 4321 4322 __m128i const_temp2_4x32b, const_temp3_4x32b, zero_8x16b; 4323 __m128i src_values10, src_values11, intra_pred_ang_4x32b, sign_8x16b; 4324 4325 __m128i row_4x32b, two_nt_4x32b, src_values12; 4326 4327 4328 for(k = 0; k < (nt + 1); k++) 4329 ref_temp[k + nt - 1] = pu1_ref[two_nt + k]; 4330 ref_idx = (nt * intra_pred_ang) >> 5; 4331 inv_ang_sum = 128; 4332 4333 for(k = -1; k > ref_idx; k--) 4334 { 4335 inv_ang_sum += inv_ang; 4336 ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)]; 4337 } 4338 4339 4340 const_temp2_4x32b = _mm_set1_epi32(31); 4341 const_temp3_4x32b = _mm_set1_epi32(32); 4342 zero_8x16b = _mm_setzero_si128(); 4343 two_nt_4x32b = _mm_set1_epi32(1); 4344 4345 4346 /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ 4347 row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1); 4348 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 4349 4350 { 4351 WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4; 4352 int temp11, temp21, 
temp31, temp41; 4353 4354 4355 __m128i fract1_8x16b, fract2_8x16b, res_temp5_4x32b; 4356 __m128i src_values0, src_values1, src_values2, src_values3; 4357 __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2; 4358 4359 /* pos = ((row + 1) * intra_pred_ang); */ 4360 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 4361 sign_8x16b = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b); 4362 res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b); 4363 4364 /* fract = pos & (31); */ 4365 src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5)); 4366 4367 ref_main_temp0 = _mm_srli_si128(src_values12, 4); /* next 32 bit values */ 4368 ref_main_temp1 = _mm_srli_si128(src_values12, 8); /* next 32 bit values */ 4369 ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */ 4370 ref_main_idx1 = _mm_cvtsi128_si32(src_values12); /* row=0*/ 4371 ref_main_idx2 = _mm_cvtsi128_si32(ref_main_temp0); /* row=1*/ 4372 ref_main_idx3 = _mm_cvtsi128_si32(ref_main_temp1); /* row=2*/ 4373 ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* row=3*/ 4374 4375 /* idx = pos >> 5; */ 4376 src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 4377 4378 /*(32 - fract) */ 4379 src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11); 4380 4381 fract1_8x16b = _mm_slli_epi16(src_values11, 8); 4382 fract2_8x16b = _mm_slli_epi16(src_values10, 8); 4383 4384 src_values11 = _mm_or_si128(src_values11, fract1_8x16b); 4385 src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */ 4386 4387 fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10); 4388 fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10); 4389 4390 temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00); 4391 temp2 = _mm_shuffle_epi32(fract1_8x16b, 0xaa); 4392 temp3 = _mm_shuffle_epi32(fract2_8x16b, 0x00); 4393 temp4 = _mm_shuffle_epi32(fract2_8x16b, 0xaa); 4394 4395 src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1)); /* col 
= 0-7 */ 4396 src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2)); /* col = 8-15 */ 4397 src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3)); /* col = 16-23 */ 4398 src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4)); /* col = 24-31 */ 4399 4400 src_values0 = _mm_shuffle_epi8(src_values0, sm3); 4401 src_values1 = _mm_shuffle_epi8(src_values1, sm3); 4402 src_values2 = _mm_shuffle_epi8(src_values2, sm3); 4403 src_values3 = _mm_shuffle_epi8(src_values3, sm3); 4404 4405 4406 src_values0 = _mm_maddubs_epi16(src_values0, temp1); 4407 src_values1 = _mm_maddubs_epi16(src_values1, temp2); 4408 src_values2 = _mm_maddubs_epi16(src_values2, temp3); 4409 src_values3 = _mm_maddubs_epi16(src_values3, temp4); 4410 4411 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 4412 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); 4413 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); 4414 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); 4415 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); 4416 4417 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 4418 src_values0 = _mm_srai_epi16(src_values0, 5); 4419 src_values1 = _mm_srai_epi16(src_values1, 5); 4420 src_values2 = _mm_srai_epi16(src_values2, 5); 4421 src_values3 = _mm_srai_epi16(src_values3, 5); 4422 4423 /* converting 16 bit to 8 bit */ 4424 src_values0 = _mm_packus_epi16(src_values0, src_values1); 4425 src_values2 = _mm_packus_epi16(src_values2, src_values3); 4426 src_values1 = _mm_srli_si128(src_values0, 8); 4427 src_values3 = _mm_srli_si128(src_values2, 8); 4428 4429 temp11 = _mm_cvtsi128_si32(src_values0); 4430 temp21 = _mm_cvtsi128_si32(src_values1); 4431 temp31 = _mm_cvtsi128_si32(src_values2); 4432 temp41 = _mm_cvtsi128_si32(src_values3); 4433 4434 /* loding 4-bit 8 pixels values */ 4435 *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11; 4436 *(WORD32 *)(&pu1_dst[(1 * 
dst_strd)]) = temp21; 4437                 *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31; 4438                 *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41; 4439 4440             } 4441         } 4442 } 4443 4444 4445 4446 4447 /** 4448 ******************************************************************************* 4449 * 4450 * @brief 4451 *    Intra prediction interpolation filter for luma mode 27 to mode 33 4452 * 4453 * @par Description: 4454 *    Intraprediction for mode 27 to 33  (positive angle, vertical mode ) with 4455 *    reference neighboring samples location pointed by 'pu1_ref' to the  TU 4456 *    block location pointed by 'pu1_dst' 4457 * 4458 * @param[in] pu1_ref 4459 *  UWORD8 pointer to the source 4460 * 4461 * @param[out] pu1_dst 4462 *  UWORD8 pointer to the destination 4463 * 4464 * @param[in] src_strd 4465 *  integer source stride 4466 * 4467 * @param[in] dst_strd 4468 *  integer destination stride 4469 * 4470 * @param[in] nt 4471 *  integer Transform Block size 4472 * 4473 * @param[in] mode 4474 *  integer intraprediction mode 4475 * 4476 * @returns 4477 * 4478 * @remarks 4479 *  None 4480 * 4481 ******************************************************************************* 4482 */ 4483 4484 4485 void ihevc_intra_pred_luma_mode_27_to_33_ssse3(UWORD8 *pu1_ref, 4486                                                WORD32 src_strd, 4487                                                UWORD8 *pu1_dst, 4488                                                WORD32 dst_strd, 4489                                                WORD32 nt, 4490                                                WORD32 mode) 4491 { 4492     WORD32 row; 4493     WORD32 two_nt; 4494     WORD32 intra_pred_ang; 4495 4496     __m128i temp11, temp12, temp13, temp14; 4497 4498     __m128i     const_temp_8x16b; 4499     __m128i temp1, temp2, temp3, temp4, sm3; 4500     UNUSED(src_strd); 4501     two_nt = 2 * nt; 4502     intra_pred_ang = gai4_ihevc_ang_table[mode]; 4503 4504     const_temp_8x16b = _mm_set1_epi16(16); 4505     sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY11[0]); 4506     if(nt == 32) 4507     { 4508 4509         __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b; 4510         __m128i src_values10, src_values11, intra_pred_ang_4x32b; 4511         __m128i row_4x32b, two_nt_4x32b, src_values12; 4512         int col = 0; 4513 4514
const_temp2_4x32b = _mm_set1_epi16(31); 4515 const_temp3_4x32b = _mm_set1_epi16(32); 4516 const_temp8_4x32b = _mm_set1_epi16(8); 4517 4518 two_nt_4x32b = _mm_set1_epi16(two_nt + 1); 4519 4520 /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ 4521 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 4522 4523 row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); 4524 4525 for(row = 0; row < nt; row += 8) 4526 { 4527 4528 WORD16 ref_main_idx[9]; 4529 4530 __m128i res_temp5_4x32b; 4531 __m128i fract1_8x16b, fract2_8x16b; 4532 __m128i src_values0, src_values1, src_values2, src_values3; 4533 __m128i src_values4, src_values5, src_values6, src_values7; 4534 4535 /* pos = ((row + 1) * intra_pred_ang); */ 4536 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 4537 4538 /* fract = pos & (31); */ 4539 src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); 4540 4541 /* idx = pos >> 5; */ 4542 src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 4543 4544 /*(32 - fract) */ 4545 src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11); 4546 4547 fract1_8x16b = _mm_slli_epi16(src_values11, 8); 4548 fract2_8x16b = _mm_slli_epi16(src_values10, 8); 4549 4550 src_values11 = _mm_or_si128(src_values11, fract1_8x16b); 4551 src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */ 4552 4553 fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10); 4554 fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10); 4555 4556 temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00); 4557 temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55); 4558 temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa); 4559 temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff); 4560 4561 temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00); 4562 temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55); 4563 temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa); 4564 temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff); 4565 4566 row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b); 4567 
_mm_storeu_si128((__m128i *)ref_main_idx, src_values12); 4568 for(col = 0; col < nt; col += 16) 4569 { 4570 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + col)); 4571 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + col)); 4572 src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + col)); 4573 src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + col)); 4574 src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + 8 + col)); 4575 src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + 8 + col)); 4576 src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + 8 + col)); 4577 src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + 8 + col)); 4578 4579 src_values0 = _mm_shuffle_epi8(src_values0, sm3); 4580 src_values1 = _mm_shuffle_epi8(src_values1, sm3); 4581 src_values2 = _mm_shuffle_epi8(src_values2, sm3); 4582 src_values3 = _mm_shuffle_epi8(src_values3, sm3); 4583 src_values4 = _mm_shuffle_epi8(src_values4, sm3); 4584 src_values5 = _mm_shuffle_epi8(src_values5, sm3); 4585 src_values6 = _mm_shuffle_epi8(src_values6, sm3); 4586 src_values7 = _mm_shuffle_epi8(src_values7, sm3); 4587 4588 4589 src_values0 = _mm_maddubs_epi16(src_values0, temp1); 4590 src_values1 = _mm_maddubs_epi16(src_values1, temp2); 4591 src_values2 = _mm_maddubs_epi16(src_values2, temp3); 4592 src_values3 = _mm_maddubs_epi16(src_values3, temp4); 4593 src_values4 = _mm_maddubs_epi16(src_values4, temp1); 4594 src_values5 = _mm_maddubs_epi16(src_values5, temp2); 4595 src_values6 = _mm_maddubs_epi16(src_values6, temp3); 4596 src_values7 = _mm_maddubs_epi16(src_values7, temp4); 4597 4598 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 4599 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); 4600 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); 4601 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); 4602 src_values3 = 
_mm_add_epi16(src_values3, const_temp_8x16b); 4603 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); 4604 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); 4605 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); 4606 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); 4607 4608 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 4609 src_values0 = _mm_srai_epi16(src_values0, 5); 4610 src_values1 = _mm_srai_epi16(src_values1, 5); 4611 src_values2 = _mm_srai_epi16(src_values2, 5); 4612 src_values3 = _mm_srai_epi16(src_values3, 5); 4613 src_values4 = _mm_srai_epi16(src_values4, 5); 4614 src_values5 = _mm_srai_epi16(src_values5, 5); 4615 src_values6 = _mm_srai_epi16(src_values6, 5); 4616 src_values7 = _mm_srai_epi16(src_values7, 5); 4617 4618 /* converting 16 bit to 8 bit */ 4619 src_values0 = _mm_packus_epi16(src_values0, src_values4); 4620 src_values1 = _mm_packus_epi16(src_values1, src_values5); 4621 src_values2 = _mm_packus_epi16(src_values2, src_values6); 4622 src_values3 = _mm_packus_epi16(src_values3, src_values7); 4623 4624 /* loading 8-bit 8 pixels values */ 4625 _mm_storeu_si128((__m128i *)(pu1_dst + col + (0) * dst_strd), src_values0); /* row=0*/ 4626 _mm_storeu_si128((__m128i *)(pu1_dst + col + (1) * dst_strd), src_values1); /* row=1*/ 4627 _mm_storeu_si128((__m128i *)(pu1_dst + col + (2) * dst_strd), src_values2); /* row=2*/ 4628 _mm_storeu_si128((__m128i *)(pu1_dst + col + (3) * dst_strd), src_values3); /* row=3*/ 4629 4630 4631 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + col)); 4632 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + col)); 4633 src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + col)); 4634 src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + col)); 4635 src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + 8 + col)); 4636 src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + 
ref_main_idx[5] + 8 + col)); 4637 src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + 8 + col)); 4638 src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + 8 + col)); 4639 4640 src_values0 = _mm_shuffle_epi8(src_values0, sm3); 4641 src_values1 = _mm_shuffle_epi8(src_values1, sm3); 4642 src_values2 = _mm_shuffle_epi8(src_values2, sm3); 4643 src_values3 = _mm_shuffle_epi8(src_values3, sm3); 4644 src_values4 = _mm_shuffle_epi8(src_values4, sm3); 4645 src_values5 = _mm_shuffle_epi8(src_values5, sm3); 4646 src_values6 = _mm_shuffle_epi8(src_values6, sm3); 4647 src_values7 = _mm_shuffle_epi8(src_values7, sm3); 4648 4649 4650 src_values0 = _mm_maddubs_epi16(src_values0, temp11); 4651 src_values1 = _mm_maddubs_epi16(src_values1, temp12); 4652 src_values2 = _mm_maddubs_epi16(src_values2, temp13); 4653 src_values3 = _mm_maddubs_epi16(src_values3, temp14); 4654 src_values4 = _mm_maddubs_epi16(src_values4, temp11); 4655 src_values5 = _mm_maddubs_epi16(src_values5, temp12); 4656 src_values6 = _mm_maddubs_epi16(src_values6, temp13); 4657 src_values7 = _mm_maddubs_epi16(src_values7, temp14); 4658 4659 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 4660 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); 4661 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); 4662 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); 4663 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); 4664 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); 4665 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); 4666 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); 4667 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); 4668 4669 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 4670 src_values0 = _mm_srai_epi16(src_values0, 5); 4671 src_values1 = _mm_srai_epi16(src_values1, 5); 4672 src_values2 = _mm_srai_epi16(src_values2, 5); 4673 
src_values3 = _mm_srai_epi16(src_values3, 5); 4674 src_values4 = _mm_srai_epi16(src_values4, 5); 4675 src_values5 = _mm_srai_epi16(src_values5, 5); 4676 src_values6 = _mm_srai_epi16(src_values6, 5); 4677 src_values7 = _mm_srai_epi16(src_values7, 5); 4678 4679 /* converting 16 bit to 8 bit */ 4680 src_values0 = _mm_packus_epi16(src_values0, src_values4); 4681 src_values1 = _mm_packus_epi16(src_values1, src_values5); 4682 src_values2 = _mm_packus_epi16(src_values2, src_values6); 4683 src_values3 = _mm_packus_epi16(src_values3, src_values7); 4684 4685 /* loading 8-bit 8 pixels values */ 4686 _mm_storeu_si128((__m128i *)(pu1_dst + col + (4) * dst_strd), src_values0); /* row=4*/ 4687 _mm_storeu_si128((__m128i *)(pu1_dst + col + (5) * dst_strd), src_values1); /* row=5*/ 4688 _mm_storeu_si128((__m128i *)(pu1_dst + col + (6) * dst_strd), src_values2); /* row=6*/ 4689 _mm_storeu_si128((__m128i *)(pu1_dst + col + (7) * dst_strd), src_values3); /* row=7*/ 4690 4691 } 4692 pu1_dst += 8 * dst_strd; 4693 } 4694 4695 } 4696 else if(nt == 16) /* for nt = 16 case */ 4697 { 4698 4699 __m128i const_temp2_4x32b, const_temp3_4x32b, const_temp8_4x32b; 4700 __m128i src_values10, src_values11, intra_pred_ang_4x32b; 4701 __m128i row_4x32b, two_nt_4x32b, src_values12; 4702 4703 4704 const_temp2_4x32b = _mm_set1_epi16(31); 4705 const_temp3_4x32b = _mm_set1_epi16(32); 4706 const_temp8_4x32b = _mm_set1_epi16(8); 4707 4708 two_nt_4x32b = _mm_set1_epi16(two_nt + 1); 4709 4710 /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ 4711 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 4712 4713 row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); 4714 4715 for(row = 0; row < nt; row += 8) 4716 { 4717 4718 WORD16 ref_main_idx[9]; 4719 4720 __m128i res_temp5_4x32b; 4721 __m128i fract1_8x16b, fract2_8x16b; 4722 __m128i src_values0, src_values1, src_values2, src_values3; 4723 __m128i src_values4, src_values5, src_values6, src_values7; 4724 4725 /* pos = ((row + 1) * intra_pred_ang); */ 4726 
res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 4727 4728 /* fract = pos & (31); */ 4729 src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); 4730 4731 /* idx = pos >> 5; */ 4732 src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 4733 4734 /*(32 - fract) */ 4735 src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11); 4736 4737 fract1_8x16b = _mm_slli_epi16(src_values11, 8); 4738 fract2_8x16b = _mm_slli_epi16(src_values10, 8); 4739 4740 src_values11 = _mm_or_si128(src_values11, fract1_8x16b); 4741 src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */ 4742 4743 fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10); 4744 fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10); 4745 4746 temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00); 4747 temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55); 4748 temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa); 4749 temp4 = _mm_shuffle_epi32(fract1_8x16b, 0xff); 4750 4751 temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00); 4752 temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55); 4753 temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa); 4754 temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff); 4755 4756 row_4x32b = _mm_add_epi16(row_4x32b, const_temp8_4x32b); 4757 _mm_storeu_si128((__m128i *)ref_main_idx, src_values12); 4758 4759 { 4760 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0])); 4761 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1])); 4762 src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2])); 4763 src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3])); 4764 src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0] + 8)); 4765 src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1] + 8)); 4766 src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2] + 8)); 4767 src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3] + 8)); 4768 4769 
src_values0 = _mm_shuffle_epi8(src_values0, sm3); 4770 src_values1 = _mm_shuffle_epi8(src_values1, sm3); 4771 src_values2 = _mm_shuffle_epi8(src_values2, sm3); 4772 src_values3 = _mm_shuffle_epi8(src_values3, sm3); 4773 src_values4 = _mm_shuffle_epi8(src_values4, sm3); 4774 src_values5 = _mm_shuffle_epi8(src_values5, sm3); 4775 src_values6 = _mm_shuffle_epi8(src_values6, sm3); 4776 src_values7 = _mm_shuffle_epi8(src_values7, sm3); 4777 4778 4779 src_values0 = _mm_maddubs_epi16(src_values0, temp1); 4780 src_values1 = _mm_maddubs_epi16(src_values1, temp2); 4781 src_values2 = _mm_maddubs_epi16(src_values2, temp3); 4782 src_values3 = _mm_maddubs_epi16(src_values3, temp4); 4783 src_values4 = _mm_maddubs_epi16(src_values4, temp1); 4784 src_values5 = _mm_maddubs_epi16(src_values5, temp2); 4785 src_values6 = _mm_maddubs_epi16(src_values6, temp3); 4786 src_values7 = _mm_maddubs_epi16(src_values7, temp4); 4787 4788 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 4789 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); 4790 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); 4791 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); 4792 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); 4793 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); 4794 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); 4795 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); 4796 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); 4797 4798 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 4799 src_values0 = _mm_srai_epi16(src_values0, 5); 4800 src_values1 = _mm_srai_epi16(src_values1, 5); 4801 src_values2 = _mm_srai_epi16(src_values2, 5); 4802 src_values3 = _mm_srai_epi16(src_values3, 5); 4803 src_values4 = _mm_srai_epi16(src_values4, 5); 4804 src_values5 = _mm_srai_epi16(src_values5, 5); 4805 src_values6 = _mm_srai_epi16(src_values6, 5); 4806 src_values7 = 
_mm_srai_epi16(src_values7, 5); 4807 4808 /* converting 16 bit to 8 bit */ 4809 src_values0 = _mm_packus_epi16(src_values0, src_values4); 4810 src_values1 = _mm_packus_epi16(src_values1, src_values5); 4811 src_values2 = _mm_packus_epi16(src_values2, src_values6); 4812 src_values3 = _mm_packus_epi16(src_values3, src_values7); 4813 4814 /* loading 8-bit 8 pixels values */ 4815 _mm_storeu_si128((__m128i *)(pu1_dst + (0) * dst_strd), src_values0); /* row=0*/ 4816 _mm_storeu_si128((__m128i *)(pu1_dst + (1) * dst_strd), src_values1); /* row=1*/ 4817 _mm_storeu_si128((__m128i *)(pu1_dst + (2) * dst_strd), src_values2); /* row=2*/ 4818 _mm_storeu_si128((__m128i *)(pu1_dst + (3) * dst_strd), src_values3); /* row=3*/ 4819 4820 4821 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4])); 4822 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5])); 4823 src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6])); 4824 src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7])); 4825 src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4] + 8)); 4826 src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5] + 8)); 4827 src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6] + 8)); 4828 src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7] + 8)); 4829 4830 src_values0 = _mm_shuffle_epi8(src_values0, sm3); 4831 src_values1 = _mm_shuffle_epi8(src_values1, sm3); 4832 src_values2 = _mm_shuffle_epi8(src_values2, sm3); 4833 src_values3 = _mm_shuffle_epi8(src_values3, sm3); 4834 src_values4 = _mm_shuffle_epi8(src_values4, sm3); 4835 src_values5 = _mm_shuffle_epi8(src_values5, sm3); 4836 src_values6 = _mm_shuffle_epi8(src_values6, sm3); 4837 src_values7 = _mm_shuffle_epi8(src_values7, sm3); 4838 4839 4840 src_values0 = _mm_maddubs_epi16(src_values0, temp11); 4841 src_values1 = _mm_maddubs_epi16(src_values1, temp12); 4842 src_values2 = _mm_maddubs_epi16(src_values2, temp13); 4843 src_values3 
= _mm_maddubs_epi16(src_values3, temp14); 4844 src_values4 = _mm_maddubs_epi16(src_values4, temp11); 4845 src_values5 = _mm_maddubs_epi16(src_values5, temp12); 4846 src_values6 = _mm_maddubs_epi16(src_values6, temp13); 4847 src_values7 = _mm_maddubs_epi16(src_values7, temp14); 4848 4849 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 4850 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); 4851 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); 4852 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); 4853 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); 4854 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); 4855 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); 4856 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); 4857 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); 4858 4859 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 4860 src_values0 = _mm_srai_epi16(src_values0, 5); 4861 src_values1 = _mm_srai_epi16(src_values1, 5); 4862 src_values2 = _mm_srai_epi16(src_values2, 5); 4863 src_values3 = _mm_srai_epi16(src_values3, 5); 4864 src_values4 = _mm_srai_epi16(src_values4, 5); 4865 src_values5 = _mm_srai_epi16(src_values5, 5); 4866 src_values6 = _mm_srai_epi16(src_values6, 5); 4867 src_values7 = _mm_srai_epi16(src_values7, 5); 4868 4869 /* converting 16 bit to 8 bit */ 4870 src_values0 = _mm_packus_epi16(src_values0, src_values4); 4871 src_values1 = _mm_packus_epi16(src_values1, src_values5); 4872 src_values2 = _mm_packus_epi16(src_values2, src_values6); 4873 src_values3 = _mm_packus_epi16(src_values3, src_values7); 4874 4875 /* loading 8-bit 8 pixels values */ 4876 _mm_storeu_si128((__m128i *)(pu1_dst + (4) * dst_strd), src_values0); /* row=4*/ 4877 _mm_storeu_si128((__m128i *)(pu1_dst + (5) * dst_strd), src_values1); /* row=5*/ 4878 _mm_storeu_si128((__m128i *)(pu1_dst + (6) * dst_strd), src_values2); /* row=6*/ 4879 
_mm_storeu_si128((__m128i *)(pu1_dst + (7) * dst_strd), src_values3); /* row=7*/ 4880 4881 } 4882 pu1_dst += 8 * dst_strd; 4883 } 4884 4885 } 4886 else if(nt == 8) 4887 { 4888 4889 __m128i const_temp2_4x32b, const_temp3_4x32b; 4890 __m128i src_values10, src_values11, intra_pred_ang_4x32b; 4891 __m128i row_4x32b, two_nt_4x32b, src_values12; 4892 4893 4894 const_temp2_4x32b = _mm_set1_epi16(31); 4895 const_temp3_4x32b = _mm_set1_epi16(32); 4896 4897 two_nt_4x32b = _mm_set1_epi16(two_nt + 1); 4898 4899 4900 /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ 4901 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 4902 4903 row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); 4904 4905 //for(row = 0; row < nt; row +=4) 4906 { 4907 4908 WORD16 ref_main_idx[9]; 4909 4910 __m128i res_temp5_4x32b; 4911 __m128i fract1_8x16b, fract2_8x16b; 4912 __m128i src_values0, src_values1, src_values2, src_values3; 4913 __m128i src_values4, src_values5, src_values6, src_values7; 4914 4915 /* pos = ((row + 1) * intra_pred_ang); */ 4916 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 4917 4918 /* fract = pos & (31); */ 4919 src_values12 = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); 4920 4921 /* idx = pos >> 5; */ 4922 src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 4923 4924 /*(32 - fract) */ 4925 src_values10 = _mm_sub_epi16(const_temp3_4x32b, src_values11); 4926 4927 fract1_8x16b = _mm_slli_epi16(src_values11, 8); 4928 fract2_8x16b = _mm_slli_epi16(src_values10, 8); 4929 4930 src_values11 = _mm_or_si128(src_values11, fract1_8x16b); 4931 src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */ 4932 4933 fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10); 4934 fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10); 4935 4936 temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00); 4937 temp2 = _mm_shuffle_epi32(fract1_8x16b, 0x55); 4938 temp3 = _mm_shuffle_epi32(fract1_8x16b, 0xaa); 4939 temp4 = 
_mm_shuffle_epi32(fract1_8x16b, 0xff); 4940 4941 temp11 = _mm_shuffle_epi32(fract2_8x16b, 0x00); 4942 temp12 = _mm_shuffle_epi32(fract2_8x16b, 0x55); 4943 temp13 = _mm_shuffle_epi32(fract2_8x16b, 0xaa); 4944 temp14 = _mm_shuffle_epi32(fract2_8x16b, 0xff); 4945 4946 _mm_storeu_si128((__m128i *)ref_main_idx, src_values12); 4947 4948 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[0])); /* col = 0-7 */ 4949 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[1])); /* col = 8-15 */ 4950 src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[2])); /* col = 16-23 */ 4951 src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[3])); /* col = 24-31 */ 4952 src_values4 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[4])); /* col = 32-39 */ 4953 src_values5 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[5])); /* col = 40-47 */ 4954 src_values6 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[6])); /* col = 48-55 */ 4955 src_values7 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx[7])); /* col = 56-63*/ 4956 4957 src_values0 = _mm_shuffle_epi8(src_values0, sm3); 4958 src_values1 = _mm_shuffle_epi8(src_values1, sm3); 4959 src_values2 = _mm_shuffle_epi8(src_values2, sm3); 4960 src_values3 = _mm_shuffle_epi8(src_values3, sm3); 4961 src_values4 = _mm_shuffle_epi8(src_values4, sm3); 4962 src_values5 = _mm_shuffle_epi8(src_values5, sm3); 4963 src_values6 = _mm_shuffle_epi8(src_values6, sm3); 4964 src_values7 = _mm_shuffle_epi8(src_values7, sm3); 4965 4966 4967 src_values0 = _mm_maddubs_epi16(src_values0, temp1); 4968 src_values1 = _mm_maddubs_epi16(src_values1, temp2); 4969 src_values2 = _mm_maddubs_epi16(src_values2, temp3); 4970 src_values3 = _mm_maddubs_epi16(src_values3, temp4); 4971 src_values4 = _mm_maddubs_epi16(src_values4, temp11); 4972 src_values5 = _mm_maddubs_epi16(src_values5, temp12); 4973 src_values6 = _mm_maddubs_epi16(src_values6, temp13); 4974 src_values7 = _mm_maddubs_epi16(src_values7, temp14); 
4975 4976 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 4977 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); 4978 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); 4979 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); 4980 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); 4981 src_values4 = _mm_add_epi16(src_values4, const_temp_8x16b); 4982 src_values5 = _mm_add_epi16(src_values5, const_temp_8x16b); 4983 src_values6 = _mm_add_epi16(src_values6, const_temp_8x16b); 4984 src_values7 = _mm_add_epi16(src_values7, const_temp_8x16b); 4985 4986 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 4987 src_values0 = _mm_srai_epi16(src_values0, 5); 4988 src_values1 = _mm_srai_epi16(src_values1, 5); 4989 src_values2 = _mm_srai_epi16(src_values2, 5); 4990 src_values3 = _mm_srai_epi16(src_values3, 5); 4991 src_values4 = _mm_srai_epi16(src_values4, 5); 4992 src_values5 = _mm_srai_epi16(src_values5, 5); 4993 src_values6 = _mm_srai_epi16(src_values6, 5); 4994 src_values7 = _mm_srai_epi16(src_values7, 5); 4995 4996 /* converting 16 bit to 8 bit */ 4997 src_values0 = _mm_packus_epi16(src_values0, src_values1); 4998 src_values2 = _mm_packus_epi16(src_values2, src_values3); 4999 src_values1 = _mm_srli_si128(src_values0, 8); 5000 src_values3 = _mm_srli_si128(src_values2, 8); 5001 src_values4 = _mm_packus_epi16(src_values4, src_values5); 5002 src_values6 = _mm_packus_epi16(src_values6, src_values7); 5003 src_values5 = _mm_srli_si128(src_values4, 8); 5004 src_values7 = _mm_srli_si128(src_values6, 8); 5005 5006 /* loading 8-bit 8 pixels values */ 5007 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_values0); /* row=0*/ 5008 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_values1); /* row=1*/ 5009 _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_values2); /* row=2*/ 5010 _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_values3); /* row=3*/ 
5011 _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), src_values4); /* row=4*/ 5012 _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), src_values5); /* row=5*/ 5013 _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), src_values6); /* row=6*/ 5014 _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), src_values7); /* row=7*/ 5015 } 5016 5017 } 5018 else /* if nt =4*/ 5019 { 5020 5021 __m128i const_temp2_4x32b, const_temp3_4x32b, zero_8x16b; 5022 __m128i src_values10, src_values11, intra_pred_ang_4x32b; 5023 5024 __m128i row_4x32b, two_nt_4x32b, src_values12, sign_8x16b; 5025 5026 5027 const_temp2_4x32b = _mm_set1_epi32(31); 5028 const_temp3_4x32b = _mm_set1_epi32(32); 5029 zero_8x16b = _mm_setzero_si128(); 5030 two_nt_4x32b = _mm_set1_epi32(two_nt + 1); 5031 5032 5033 /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ 5034 row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1); 5035 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 5036 { 5037 int temp11, temp21, temp31, temp41; 5038 5039 WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4; 5040 5041 __m128i fract1_8x16b, fract2_8x16b, res_temp5_4x32b; 5042 __m128i src_values0, src_values1, src_values2, src_values3; 5043 __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2; 5044 5045 /* pos = ((row + 1) * intra_pred_ang); */ 5046 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 5047 sign_8x16b = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b); 5048 res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b); 5049 5050 /* fract = pos & (31); */ 5051 src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5)); 5052 5053 ref_main_temp0 = _mm_srli_si128(src_values12, 4); /* next 32 bit values */ 5054 ref_main_temp1 = _mm_srli_si128(src_values12, 8); /* next 32 bit values */ 5055 ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */ 5056 ref_main_idx1 = _mm_cvtsi128_si32(src_values12); /* row=0*/ 5057 ref_main_idx2 = 
_mm_cvtsi128_si32(ref_main_temp0); /* row=1*/ 5058 ref_main_idx3 = _mm_cvtsi128_si32(ref_main_temp1); /* row=2*/ 5059 ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* row=3*/ 5060 5061 /* idx = pos >> 5; */ 5062 src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 5063 5064 /*(32 - fract) */ 5065 src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11); 5066 5067 fract1_8x16b = _mm_slli_epi16(src_values11, 8); 5068 fract2_8x16b = _mm_slli_epi16(src_values10, 8); 5069 5070 src_values11 = _mm_or_si128(src_values11, fract1_8x16b); 5071 src_values10 = _mm_or_si128(src_values10, fract2_8x16b); /*(32 - fract) */ 5072 5073 fract2_8x16b = _mm_unpackhi_epi8(src_values11, src_values10); 5074 fract1_8x16b = _mm_unpacklo_epi8(src_values11, src_values10); 5075 5076 temp1 = _mm_shuffle_epi32(fract1_8x16b, 0x00); 5077 temp2 = _mm_shuffle_epi32(fract1_8x16b, 0xaa); 5078 temp3 = _mm_shuffle_epi32(fract2_8x16b, 0x00); 5079 temp4 = _mm_shuffle_epi32(fract2_8x16b, 0xaa); 5080 5081 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1)); /* col = 0-7 */ 5082 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2)); /* col = 8-15 */ 5083 src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3)); /* col = 16-23 */ 5084 src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4)); /* col = 24-31 */ 5085 5086 src_values0 = _mm_shuffle_epi8(src_values0, sm3); 5087 src_values1 = _mm_shuffle_epi8(src_values1, sm3); 5088 src_values2 = _mm_shuffle_epi8(src_values2, sm3); 5089 src_values3 = _mm_shuffle_epi8(src_values3, sm3); 5090 5091 src_values0 = _mm_maddubs_epi16(src_values0, temp1); 5092 src_values1 = _mm_maddubs_epi16(src_values1, temp2); 5093 src_values2 = _mm_maddubs_epi16(src_values2, temp3); 5094 src_values3 = _mm_maddubs_epi16(src_values3, temp4); 5095 5096 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 5097 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); 5098 src_values1 = 
_mm_add_epi16(src_values1, const_temp_8x16b); 5099 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); 5100 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); 5101 5102 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 5103 src_values0 = _mm_srai_epi16(src_values0, 5); 5104 src_values1 = _mm_srai_epi16(src_values1, 5); 5105 src_values2 = _mm_srai_epi16(src_values2, 5); 5106 src_values3 = _mm_srai_epi16(src_values3, 5); 5107 5108 /* converting 16 bit to 8 bit */ 5109 src_values0 = _mm_packus_epi16(src_values0, src_values1); 5110 src_values2 = _mm_packus_epi16(src_values2, src_values3); 5111 src_values1 = _mm_srli_si128(src_values0, 8); 5112 src_values3 = _mm_srli_si128(src_values2, 8); 5113 5114 temp11 = _mm_cvtsi128_si32(src_values0); 5115 temp21 = _mm_cvtsi128_si32(src_values1); 5116 temp31 = _mm_cvtsi128_si32(src_values2); 5117 temp41 = _mm_cvtsi128_si32(src_values3); 5118 5119 /* loding 4-bit 8 pixels values */ 5120 *(WORD32 *)(&pu1_dst[(0 * dst_strd)]) = temp11; 5121 *(WORD32 *)(&pu1_dst[(1 * dst_strd)]) = temp21; 5122 *(WORD32 *)(&pu1_dst[(2 * dst_strd)]) = temp31; 5123 *(WORD32 *)(&pu1_dst[(3 * dst_strd)]) = temp41; 5124 5125 } 5126 } 5127 } 5128