1 /****************************************************************************** 2 * 3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ******************************************************************************/ 18 /** 19 ******************************************************************************* 20 * @file 21 * ihevc_chroma_intra_pred_filters_atom_intr.c 22 * 23 * @brief 24 * Contains function Definition for intra prediction interpolation filters 25 * 26 * 27 * @author 28 * Ittiam 29 * 30 * @par List of Functions: 31 * ihevc_intra_pred_chroma_planar_ssse3() 32 * 33 * ihevc_intra_pred_chroma_dc_ssse3() 34 * 35 * ihevc_intra_pred_chroma_horz_ssse3() 36 * 37 * ihevc_intra_pred_chroma_ver_ssse3() 38 * 39 * ihevc_intra_pred_chroma_mode2_ssse3() 40 * 41 * ihevc_intra_pred_chroma_mode_18_34_ssse3() 42 * 43 * ihevc_intra_pred_chroma_mode_3_to_9_ssse3() 44 * 45 * ihevc_intra_pred_chroma_mode_11_to_17_ssse3() 46 * 47 * ihevc_intra_pred_chroma_mode_19_to_25_ssse3() 48 * 49 * ihevc_intra_pred_chroma_mode_27_to_33_ssse3() 50 * 51 * 52 * 53 * @remarks 54 * None 55 * 56 ******************************************************************************* 57 */ 58 59 60 /*****************************************************************************/ 61 /* File Includes */ 62 /*****************************************************************************/ 63 64 #include "ihevc_typedefs.h" 65 #include 
"ihevc_platform_macros.h" 66 #include "ihevc_macros.h" 67 #include "ihevc_func_selector.h" 68 #include "ihevc_intra_pred.h" 69 70 #include "ihevc_chroma_intra_pred.h" 71 #include "ihevc_common_tables.h" 72 #include "ihevc_tables_x86_intr.h" 73 74 #include <mmintrin.h> 75 #include <xmmintrin.h> 76 #include <emmintrin.h> 77 78 #include <immintrin.h> 79 80 81 /****************************************************************************/ 82 /* Constant Macros */ 83 /****************************************************************************/ 84 #define MAX_CU_SIZE 64 85 #define BIT_DEPTH 8 86 #define T32_4NT 128 87 #define T16_4NT 64 88 #define T16C_4NT 64 89 #define T8C_4NT 32 90 /****************************************************************************/ 91 /* Function Macros */ 92 /****************************************************************************/ 93 94 #define GET_BIT(y,x) ((y) & (1 << x)) && (1 << x) 95 96 /* tables to shuffle 8-bit values */ 97 98 /*****************************************************************************/ 99 /* Function Definition */ 100 /*****************************************************************************/ 101 102 103 104 /** 105 ******************************************************************************* 106 * 107 * @brief 108 * Planar Intraprediction with reference neighboring samples location 109 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' Refer 110 * to section 8.4.4.2.4 in the standard 111 * 112 * @par Description: 113 * 114 * 115 * @param[in] pu1_src 116 * UWORD8 pointer to the source 117 * 118 * @param[in] pu1_dst 119 * UWORD8 pointer to the destination 120 * 121 * @param[in] src_strd 122 * integer source stride 123 * 124 * @param[in] dst_strd 125 * integer destination stride 126 * 127 * @param[in] nt 128 * integer Transform Block size 129 * 130 * @param[in] mode 131 * integer intraprediction mode 132 * 133 * @returns 134 * 135 * @remarks 136 * None 137 * 138 
******************************************************************************* 139 */ 140 141 void ihevc_intra_pred_chroma_planar_ssse3(UWORD8 *pu1_ref, 142 WORD32 src_strd, 143 UWORD8 *pu1_dst, 144 WORD32 dst_strd, 145 WORD32 nt, 146 WORD32 mode) 147 { 148 149 WORD32 row, col; 150 WORD32 log2nt = 5; 151 WORD32 two_nt, three_nt; 152 153 __m128i const_temp_4x32b, const_temp1_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b; 154 __m128i col_8x16b, const_temp5_4x32b, const_temp6_4x32b, zero_8x16b, const_temp7_4x32b; 155 UNUSED(src_strd); 156 UNUSED(mode); 157 switch(nt) 158 { 159 case 16: 160 log2nt = 4; 161 break; 162 case 8: 163 log2nt = 3; 164 break; 165 case 4: 166 log2nt = 2; 167 break; 168 default: 169 break; 170 } 171 two_nt = 2 * nt; 172 three_nt = 3 * nt; 173 174 /* Planar filtering */ 175 176 /* setting vallues in registera*/ 177 178 // pu1_ref[2*(two_nt - 1 - row)] 179 // pu1_ref[2 * (three_nt + 1)] 180 // pu1_ref[2 * (two_nt + 1) + col] 181 // pu1_ref[2 * (nt - 1)] 182 183 const_temp_4x32b = _mm_set_epi16(pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1], 184 pu1_ref[2 * (three_nt + 1)], pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)], 185 pu1_ref[2 * (three_nt + 1) + 1], pu1_ref[2 * (three_nt + 1)]); 186 187 const_temp1_4x32b = _mm_set_epi16(pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], 188 pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)], pu1_ref[2 * (nt - 1) + 1], pu1_ref[2 * (nt - 1)]); 189 190 const_temp4_4x32b = _mm_set1_epi16(nt - 1); 191 const_temp6_4x32b = _mm_set1_epi16(nt); 192 const_temp7_4x32b = _mm_set1_epi16(4); 193 194 zero_8x16b = _mm_set1_epi32(0); 195 196 197 if(nt % 4 == 0) 198 { 199 const_temp7_4x32b = _mm_set1_epi16(4); 200 201 for(row = 0; row < nt; row++) 202 { 203 __m128i res_temp_8x16b, row_8x16b, res_temp1_8x16b, res_temp2_8x16b; 204 __m128i res_temp3_8x16b; 205 206 const_temp2_4x32b = 
_mm_set_epi16(pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1], 207 pu1_ref[2 * (two_nt - 1 - row)], pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)], 208 pu1_ref[2 * (two_nt - 1 - row) + 1], pu1_ref[2 * (two_nt - 1 - row)]); 209 210 const_temp3_4x32b = _mm_set1_epi16((row + 1)); 211 row_8x16b = _mm_set1_epi16((nt - 1 - row)); 212 213 const_temp5_4x32b = _mm_set_epi16(3, 3, 2, 2, 1, 1, 0, 0); 214 col_8x16b = _mm_set_epi16(4, 4, 3, 3, 2, 2, 1, 1); 215 216 const_temp5_4x32b = _mm_sub_epi16(const_temp4_4x32b, const_temp5_4x32b); 217 218 /*(row + 1) * pu1_ref[nt - 1]*/ 219 res_temp_8x16b = _mm_mullo_epi16(const_temp3_4x32b, const_temp1_4x32b); 220 221 /*(row + 1) * pu1_ref[nt - 1] + nt)*/ 222 res_temp_8x16b = _mm_add_epi16(res_temp_8x16b, const_temp6_4x32b); 223 224 for(col = 0; col < 2 * nt; col += 8) 225 { 226 __m128i src_temp_8x16b; 227 228 /* loding 8bit 16 pixles*/ 229 src_temp_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (two_nt + 1) + col)); 230 231 //src_temp_8x16b = _mm_cvtepu8_epi16 (src_temp_8x16b); /* row=0*/ 232 src_temp_8x16b = _mm_unpacklo_epi8(src_temp_8x16b, zero_8x16b); 233 234 /* (nt - 1 - row) * pu1_ref[two_nt + 1 + col] */ 235 res_temp1_8x16b = _mm_mullo_epi16(src_temp_8x16b, row_8x16b); 236 237 /*(col + 1) * pu1_ref[three_nt + 1]*/ 238 res_temp2_8x16b = _mm_mullo_epi16(const_temp_4x32b, col_8x16b); 239 240 /*(nt - 1 - col)* pu1_ref[two_nt - 1 - row]*/ 241 res_temp3_8x16b = _mm_mullo_epi16(const_temp2_4x32b, const_temp5_4x32b); 242 243 res_temp1_8x16b = _mm_add_epi16(res_temp_8x16b, res_temp1_8x16b); 244 res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b); 245 res_temp1_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp3_8x16b); 246 247 res_temp1_8x16b = _mm_srli_epi16(res_temp1_8x16b, (log2nt + 1)); 248 res_temp1_8x16b = _mm_packus_epi16(res_temp1_8x16b, zero_8x16b); 249 250 _mm_storel_epi64((__m128i *)(pu1_dst + (row * dst_strd) + col), res_temp1_8x16b); 
                /* advance the per-lane column terms to the next 4 U/V pairs */
                const_temp5_4x32b = _mm_sub_epi16(const_temp5_4x32b, const_temp7_4x32b);
                col_8x16b = _mm_add_epi16(col_8x16b, const_temp7_4x32b);
            } /* inner loop ends here */
        }
    }
}


/**
 *******************************************************************************
 *
 * @brief
 *  Intraprediction for DC mode with reference neighboring samples location
 *  pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'  Refer
 *  to section 8.4.4.2.5 in the standard
 *
 * @par Description:
 *
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[in] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] nt
 *  integer Transform Block size (Chroma)
 *
 * @param[in] mode
 *  integer intraprediction mode
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

void ihevc_intra_pred_chroma_dc_ssse3(UWORD8 *pu1_ref,
                                      WORD32 src_strd,
                                      UWORD8 *pu1_dst,
                                      WORD32 dst_strd,
                                      WORD32 nt,
                                      WORD32 mode)
{

    WORD32 acc_dc_u, acc_dc_v;
    WORD32 dc_val_u, dc_val_v;
    WORD32 row;
    WORD32 log2nt = 5;
    __m128i src_temp1, src_temp3, src_temp4, src_temp5, src_temp6, m_mask;
    __m128i src_temp7, src_temp8, src_temp9, src_temp10;
    __m128i m_zero = _mm_set1_epi32(0);
    UNUSED(src_strd);
    UNUSED(mode);

    switch(nt)
    {
        case 32:
            log2nt = 5;
            break;
        case 16:
            log2nt = 4;
            break;
        case 8:
            log2nt = 3;
            break;
        case 4:
            log2nt = 2;
            break;
        default:
            break;
    }

    /* separate accumulators for the interleaved U and V planes */
    acc_dc_u = 0;
    acc_dc_v = 0;

    /* Calculate DC value for the transform block */

    /* Shuffle mask used below to gather the per-plane partial sums before the
     * horizontal adds.  NOTE(review): exact lane grouping depends on the
     * IHEVCE_SHUFFLEMASKY9 table defined elsewhere - confirm against
     * ihevc_tables_x86_intr. */
    m_mask = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY9[0]);

    if(nt == 16)
    {
        __m128i temp_sad, sign_8x16b;

        /* 64 reference bytes (32 interleaved U/V pairs) starting at 2*nt */
        src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
        src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16));
        src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 32));
        src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 48));

        /* zero-extend the low 8 bytes of each vector to 16 bit */
        src_temp5 = _mm_unpacklo_epi8(src_temp3, m_zero);
        src_temp6 = _mm_unpacklo_epi8(src_temp4, m_zero);
        src_temp9 = _mm_unpacklo_epi8(src_temp7, m_zero);
        src_temp10 = _mm_unpacklo_epi8(src_temp8, m_zero);

        /* bring the high 8 bytes down, then zero-extend them as well */
        src_temp3 = _mm_srli_si128(src_temp3, 8);
        src_temp4 = _mm_srli_si128(src_temp4, 8);
        src_temp7 = _mm_srli_si128(src_temp7, 8);
        src_temp8 = _mm_srli_si128(src_temp8, 8);

        src_temp3 = _mm_unpacklo_epi8(src_temp3, m_zero);
        src_temp4 = _mm_unpacklo_epi8(src_temp4, m_zero);
        src_temp7 = _mm_unpacklo_epi8(src_temp7, m_zero);
        src_temp8 = _mm_unpacklo_epi8(src_temp8, m_zero);

        /* adder tree; U partial sums stay in even lanes, V in odd lanes */
        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
        src_temp6 = _mm_add_epi16(src_temp3, src_temp5);
        src_temp8 = _mm_add_epi16(src_temp7, src_temp8);
        src_temp10 = _mm_add_epi16(src_temp9, src_temp10);

        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
        src_temp8 = _mm_add_epi16(src_temp8, src_temp10);

        src_temp4 = _mm_add_epi16(src_temp4, src_temp8);
        /* regroup per plane, then reduce to one 16-bit total per plane */
        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

        /* sign-extend the two 16-bit totals to 32 bit */
        sign_8x16b = _mm_cmpgt_epi16(m_zero, src_temp4);
        src_temp4 = _mm_unpacklo_epi16(src_temp4, sign_8x16b);

        temp_sad = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
        acc_dc_u = _mm_cvtsi128_si32(src_temp4);
        acc_dc_v = _mm_cvtsi128_si32(temp_sad);
    }

    else if(nt == 8)
    {
        __m128i temp_sad, sign_8x16b;
        /* 32 reference bytes (16 interleaved U/V pairs) starting at 2*nt */
        src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));
        src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt) + 16));

        src_temp5 = _mm_unpacklo_epi8(src_temp3, m_zero);
        src_temp6 = _mm_unpacklo_epi8(src_temp4, m_zero);

        src_temp3 = _mm_srli_si128(src_temp3, 8);
        src_temp4 = _mm_srli_si128(src_temp4, 8);

        src_temp3 = _mm_unpacklo_epi8(src_temp3, m_zero);
        src_temp4 = _mm_unpacklo_epi8(src_temp4, m_zero);

        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
        src_temp6 = _mm_add_epi16(src_temp3, src_temp5);

        src_temp4 = _mm_add_epi16(src_temp4, src_temp6);
        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

        sign_8x16b = _mm_cmpgt_epi16(m_zero, src_temp4);
        src_temp4 = _mm_unpacklo_epi16(src_temp4, sign_8x16b);

        temp_sad = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
        acc_dc_u = _mm_cvtsi128_si32(src_temp4);
        acc_dc_v = _mm_cvtsi128_si32(temp_sad);
    }

    else if(nt == 4)
    {
        __m128i temp_sad, sign_8x16b;
        /* 16 reference bytes (8 interleaved U/V pairs) starting at 2*nt */
        src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (2 * nt)));

        src_temp5 = _mm_unpacklo_epi8(src_temp3, m_zero);
        src_temp4 = _mm_srli_si128(src_temp3, 8);

        src_temp4 = _mm_unpacklo_epi8(src_temp4, m_zero);

        src_temp4 = _mm_add_epi16(src_temp4, src_temp5);

        src_temp4 = _mm_shuffle_epi8(src_temp4, m_mask);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);
        src_temp4 = _mm_hadd_epi16(src_temp4, m_zero);

        sign_8x16b = _mm_cmpgt_epi16(m_zero, src_temp4);
        src_temp4 = _mm_unpacklo_epi16(src_temp4, sign_8x16b);

        temp_sad = _mm_srli_si128(src_temp4, 4); /* Next 32 bits */
        acc_dc_u = _mm_cvtsi128_si32(src_temp4);
        acc_dc_v = _mm_cvtsi128_si32(temp_sad);
    }


    /* Boundary correction: include the pair at 6*nt and drop the pair at
     * 4*nt so the accumulation covers exactly the 2*nt DC reference samples
     * (top row + left column). */
    acc_dc_u += pu1_ref[6 * nt];
    acc_dc_v += pu1_ref[6 * nt + 1];

    acc_dc_u -= pu1_ref[4 * nt];
    acc_dc_v -= pu1_ref[4 * nt + 1];

    /* rounded average over 2*nt samples per plane */
    dc_val_u = (acc_dc_u + nt) >> (log2nt + 1);
    dc_val_v = (acc_dc_v + nt) >> (log2nt + 1);

    /* pack U into the low byte and V into the high byte for a 16-bit splat */
    dc_val_u = dc_val_u | (dc_val_v << 8);

    /* Fill the remaining rows with DC value*/

    if(nt == 4)
    {
        src_temp1 = _mm_set1_epi16(dc_val_u);

        /* pu1_dst[(row * dst_strd) + col] = dc_val;*/
        _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
        _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
        _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
        _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);

    }
    else if(nt == 8)
    {
        src_temp1 = _mm_set1_epi16(dc_val_u);

        /* pu1_dst[(row * dst_strd) + col] = dc_val;*/
        _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);

        _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1);
        _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1);

    }

    else /* nt == 16 */
    {
        src_temp1 = _mm_set1_epi16(dc_val_u);

        /* each row is 32 bytes wide -> two 16-byte stores per row */
        for(row = 0; row < nt; row += 8)
        {
            /* pu1_dst[(row * dst_strd) + col] = dc_val;*/
            _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp1);
            _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp1);

            _mm_storeu_si128((__m128i *)(pu1_dst + (4 *
dst_strd)), src_temp1); 499 _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp1); 500 _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp1); 501 _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp1); 502 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp1); 503 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp1); 504 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp1); 505 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp1); 506 507 pu1_dst += 8 * dst_strd; 508 } 509 } 510 511 } 512 513 514 /** 515 ******************************************************************************* 516 * 517 * @brief 518 * Horizontal intraprediction(mode 10) with reference samples location 519 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' Refer 520 * to section 8.4.4.2.6 in the standard (Special case) 521 * 522 * @par Description: 523 * 524 * 525 * @param[in] pu1_src 526 * UWORD8 pointer to the source 527 * 528 * @param[in] pu1_dst 529 * UWORD8 pointer to the destination 530 * 531 * @param[in] src_strd 532 * integer source stride 533 * 534 * @param[in] dst_strd 535 * integer destination stride 536 * 537 * @param[in] nt 538 * integer Transform Block size 539 * 540 * @param[in] mode 541 * integer intraprediction mode 542 * 543 * @returns 544 * 545 * @remarks 546 * None 547 * 548 ******************************************************************************* 549 */ 550 551 void ihevc_intra_pred_chroma_horz_ssse3(UWORD8 *pu1_ref, 552 WORD32 src_strd, 553 UWORD8 *pu1_dst, 554 WORD32 dst_strd, 555 WORD32 nt, 556 WORD32 mode) 557 { 558 559 WORD32 row; 560 __m128i temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8; 561 UNUSED(src_strd); 562 UNUSED(mode); 563 564 /* Replication to next rows*/ 565 566 if(nt == 8) 567 { 568 for(row = 0; row < nt; row += 4) 569 { 570 temp1 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 0)]); 571 temp2 = 
_mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 0)]); 572 temp3 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 1)]); 573 temp4 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 1)]); 574 temp5 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 2)]); 575 temp6 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 2)]); 576 temp7 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 3)]); 577 temp8 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 3)]); 578 579 temp2 = _mm_unpacklo_epi8(temp1, temp2); 580 temp4 = _mm_unpacklo_epi8(temp3, temp4); 581 temp6 = _mm_unpacklo_epi8(temp5, temp6); 582 temp8 = _mm_unpacklo_epi8(temp7, temp8); 583 584 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd)), temp2); 585 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd)), temp4); 586 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd)), temp6); 587 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd)), temp8); 588 589 } 590 } 591 else if(nt == 16) 592 { 593 for(row = 0; row < nt; row += 4) 594 { 595 temp1 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 0)]); 596 temp2 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 0)]); 597 598 temp3 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 1)]); 599 temp4 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 1)]); 600 601 temp5 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 2)]); 602 temp6 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 2)]); 603 604 temp7 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * (row + 3)]); 605 temp8 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * (row + 3)]); 606 607 temp2 = _mm_unpacklo_epi8(temp1, temp2); 608 temp4 = _mm_unpacklo_epi8(temp3, temp4); 609 temp6 = _mm_unpacklo_epi8(temp5, temp6); 610 temp8 = _mm_unpacklo_epi8(temp7, temp8); 611 612 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd) + 0), temp2); 613 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 0) * dst_strd) + 16), temp2); 614 615 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd) + 0), 
temp4); 616 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 1) * dst_strd) + 16), temp4); 617 618 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd) + 0), temp6); 619 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 2) * dst_strd) + 16), temp6); 620 621 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd) + 0), temp8); 622 _mm_storeu_si128((__m128i *)(pu1_dst + ((row + 3) * dst_strd) + 16), temp8); 623 624 625 } 626 } 627 else 628 { 629 temp1 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * 0]); 630 temp2 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * 0]); 631 632 temp3 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * 1]); 633 temp4 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * 1]); 634 635 temp5 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * 2]); 636 temp6 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * 2]); 637 638 temp7 = _mm_set1_epi8(pu1_ref[(4 * nt) - 2 - 2 * 3]); 639 temp8 = _mm_set1_epi8(pu1_ref[(4 * nt) - 1 - 2 * 3]); 640 641 temp2 = _mm_unpacklo_epi8(temp1, temp2); 642 temp4 = _mm_unpacklo_epi8(temp3, temp4); 643 temp6 = _mm_unpacklo_epi8(temp5, temp6); 644 temp8 = _mm_unpacklo_epi8(temp7, temp8); 645 646 _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), temp2); 647 _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), temp4); 648 _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), temp6); 649 _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), temp8); 650 } 651 } 652 653 654 /** 655 ******************************************************************************* 656 * 657 * @brief 658 * Horizontal intraprediction with reference neighboring samples location 659 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst' Refer 660 * to section 8.4.4.2.6 in the standard (Special case) 661 * 662 * @par Description: 663 * 664 * 665 * @param[in] pu1_src 666 * UWORD8 pointer to the source 667 * 668 * @param[in] pu1_dst 669 * UWORD8 pointer to the destination 670 * 671 * @param[in] src_strd 672 * integer source stride 673 * 674 * @param[in] dst_strd 
675 * integer destination stride 676 * 677 * @param[in] nt 678 * integer Transform Block size 679 * 680 * @param[in] mode 681 * integer intraprediction mode 682 * 683 * @returns 684 * 685 * @remarks 686 * None 687 * 688 ******************************************************************************* 689 */ 690 691 void ihevc_intra_pred_chroma_ver_ssse3(UWORD8 *pu1_ref, 692 WORD32 src_strd, 693 UWORD8 *pu1_dst, 694 WORD32 dst_strd, 695 WORD32 nt, 696 WORD32 mode) 697 { 698 __m128i src_temp1; 699 UNUSED(src_strd); 700 UNUSED(mode); 701 702 /* Replication to next columns*/ 703 if(nt == 8) 704 { 705 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 0)); 706 707 _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), src_temp1); 708 _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), src_temp1); 709 _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), src_temp1); 710 _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), src_temp1); 711 712 _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), src_temp1); 713 _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), src_temp1); 714 _mm_storeu_si128((__m128i *)(pu1_dst + ((6) * dst_strd)), src_temp1); 715 _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), src_temp1); 716 717 } 718 if(nt == 16) 719 { 720 __m128i temp1, temp2; 721 722 temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 0)); 723 temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 16)); 724 725 /* pu1_dst[(row * dst_strd) + col] = dc_val;*/ 726 _mm_storeu_si128((__m128i *)(pu1_dst + ((0) * dst_strd)), temp1); 727 _mm_storeu_si128((__m128i *)(pu1_dst + ((1) * dst_strd)), temp1); 728 _mm_storeu_si128((__m128i *)(pu1_dst + ((2) * dst_strd)), temp1); 729 _mm_storeu_si128((__m128i *)(pu1_dst + ((3) * dst_strd)), temp1); 730 _mm_storeu_si128((__m128i *)(pu1_dst + ((4) * dst_strd)), temp1); 731 _mm_storeu_si128((__m128i *)(pu1_dst + ((5) * dst_strd)), temp1); 732 _mm_storeu_si128((__m128i *)(pu1_dst + ((6) 
* dst_strd)), temp1); 733 _mm_storeu_si128((__m128i *)(pu1_dst + ((7) * dst_strd)), temp1); 734 735 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((0) * dst_strd)), temp2); 736 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((1) * dst_strd)), temp2); 737 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((2) * dst_strd)), temp2); 738 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((3) * dst_strd)), temp2); 739 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((4) * dst_strd)), temp2); 740 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((5) * dst_strd)), temp2); 741 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((6) * dst_strd)), temp2); 742 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((7) * dst_strd)), temp2); 743 744 _mm_storeu_si128((__m128i *)(pu1_dst + ((8) * dst_strd)), temp1); 745 _mm_storeu_si128((__m128i *)(pu1_dst + ((9) * dst_strd)), temp1); 746 _mm_storeu_si128((__m128i *)(pu1_dst + ((10) * dst_strd)), temp1); 747 _mm_storeu_si128((__m128i *)(pu1_dst + ((11) * dst_strd)), temp1); 748 _mm_storeu_si128((__m128i *)(pu1_dst + ((12) * dst_strd)), temp1); 749 _mm_storeu_si128((__m128i *)(pu1_dst + ((13) * dst_strd)), temp1); 750 _mm_storeu_si128((__m128i *)(pu1_dst + ((14) * dst_strd)), temp1); 751 _mm_storeu_si128((__m128i *)(pu1_dst + ((15) * dst_strd)), temp1); 752 753 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((8) * dst_strd)), temp2); 754 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((9) * dst_strd)), temp2); 755 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((10) * dst_strd)), temp2); 756 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((11) * dst_strd)), temp2); 757 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((12) * dst_strd)), temp2); 758 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((13) * dst_strd)), temp2); 759 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((14) * dst_strd)), temp2); 760 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + ((15) * dst_strd)), temp2); 761 762 } 763 else 764 { 765 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) + 2 + 0)); 766 767 
        _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1);
        _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp1);
        _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp1);
        _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp1);


    }

}

/**
 *******************************************************************************
 *
 * @brief
 *  Intraprediction for mode 2 (sw angle) with reference neighboring samples
 *  location pointed by 'pu1_ref' to the TU block location pointed by
 *  'pu1_dst'  Refer to section 8.4.4.2.6 in the standard
 *
 * @par Description:
 *
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[in] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] nt
 *  integer Transform Block size
 *
 * @param[in] mode
 *  integer intraprediction mode
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

void ihevc_intra_pred_chroma_mode2_ssse3(UWORD8 *pu1_ref,
                                         WORD32 src_strd,
                                         UWORD8 *pu1_dst,
                                         WORD32 dst_strd,
                                         WORD32 nt,
                                         WORD32 mode)
{
    WORD32 row, col;


    __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8, sm2, sm3;
    UNUSED(src_strd);
    UNUSED(mode);

    /* Shuffle masks used to reverse the order of the loaded U/V pairs while
     * keeping each pair interleaved.  NOTE(review): exact byte ordering comes
     * from the IHEVCE_SHUFFLEMASKY7/8 tables defined elsewhere - confirm
     * against ihevc_tables_x86_intr. */
    sm2 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY7[0]);
    sm3 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY8[0]);

    /* For the angle 45, replication is done from the corresponding angle */
    /* intra_pred_ang = tan(angle) in q5 format */

    if(nt == 4)
    {
        /*pu1_ref[two_nt - row - (col+1) - 1]*/
        /* one 16-byte load per row, stepping 1 U/V pair back per row */
        src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 0 - 8 - 2));
        src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 1 - 8 - 2));
        src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 2 - 8 - 2));
        src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 3 - 8 - 2));

        /* reverse the pair order and store the low 8 bytes (4 pairs) */
        _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), _mm_shuffle_epi8(src_temp1, sm2));
        _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), _mm_shuffle_epi8(src_temp2, sm2));
        _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), _mm_shuffle_epi8(src_temp3, sm2));
        _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), _mm_shuffle_epi8(src_temp4, sm2));

    }
    else if(nt == 8)
    {
        /*pu1_ref[two_nt - row - (col+1) - 1]*/
        src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 0 - 16 - 2));
        src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 1 - 16 - 2));
        src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 2 - 16 - 2));
        src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 3 - 16 - 2));
        src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 4 - 16 - 2));
        src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 5 - 16 - 2));
        src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 6 - 16 - 2));
        src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * 7 - 16 - 2));

        /* reverse all 8 pairs and store the full 16 bytes per row */
        _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), _mm_shuffle_epi8(src_temp1, sm3));
        _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), _mm_shuffle_epi8(src_temp2, sm3));
        _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), _mm_shuffle_epi8(src_temp3, sm3));
        _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), _mm_shuffle_epi8(src_temp4, sm3));
        _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), _mm_shuffle_epi8(src_temp5, sm3));
        _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), _mm_shuffle_epi8(src_temp6, sm3));
        _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), _mm_shuffle_epi8(src_temp7, sm3));
        _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), _mm_shuffle_epi8(src_temp8, sm3));


    }
    else
    {
        /* nt == 16: process 8 rows x 16 destination bytes per iteration */
        for(row = 0; row < nt; row += 8)
        {
            for(col = 0; col < 2 * nt; col += 16)
            { /*pu1_ref[two_nt - row - (col+1) - 1]*/
                src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 0) - (col + 16) - 2));
                src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 1) - (col + 16) - 2));
                src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 2) - (col + 16) - 2));
                src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 3) - (col + 16) - 2));
                src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 4) - (col + 16) - 2));
                src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 5) - (col + 16) - 2));
                src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 6) - (col + 16) - 2));
                src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + (4 * nt) - 2 * (row + 7) - (col + 16) - 2));

                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 0) * dst_strd)), _mm_shuffle_epi8(src_temp1, sm3));
                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 1) * dst_strd)), _mm_shuffle_epi8(src_temp2, sm3));
                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 2) * dst_strd)), _mm_shuffle_epi8(src_temp3, sm3));
                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 3) * dst_strd)), _mm_shuffle_epi8(src_temp4, sm3));
                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 4) * dst_strd)), _mm_shuffle_epi8(src_temp5, sm3));
                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 5) * dst_strd)), _mm_shuffle_epi8(src_temp6, sm3));
                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 6) * dst_strd)), _mm_shuffle_epi8(src_temp7, sm3));
                _mm_storeu_si128((__m128i *)(pu1_dst + col + ((row + 7) * dst_strd)), _mm_shuffle_epi8(src_temp8, sm3));
            }
        }
    }
}

/**
******************************************************************************* 901 * 902 * @brief 903 * Intraprediction for mode 34 (ne angle) and mode 18 (nw angle) with 904 * reference neighboring samples location pointed by 'pu1_ref' to the TU 905 * block location pointed by 'pu1_dst' 906 * 907 * @par Description: 908 * 909 * 910 * @param[in] pu1_src 911 * UWORD8 pointer to the source 912 * 913 * @param[in] pu1_dst 914 * UWORD8 pointer to the destination 915 * 916 * @param[in] src_strd 917 * integer source stride 918 * 919 * @param[in] dst_strd 920 * integer destination stride 921 * 922 * @param[in] nt 923 * integer Transform Block size 924 * 925 * @param[in] mode 926 * integer intraprediction mode 927 * 928 * @returns 929 * 930 * @remarks 931 * None 932 * 933 ******************************************************************************* 934 */ 935 936 void ihevc_intra_pred_chroma_mode_18_34_ssse3(UWORD8 *pu1_ref, 937 WORD32 src_strd, 938 UWORD8 *pu1_dst, 939 WORD32 dst_strd, 940 WORD32 nt, 941 WORD32 mode) 942 { 943 WORD32 row; 944 WORD32 idx = 0; 945 946 __m128i src_temp1, src_temp2, src_temp3, src_temp4, src_temp5, src_temp6, src_temp7, src_temp8; 947 UNUSED(src_strd); 948 949 if(mode == 34) 950 { 951 if(nt == 4) 952 { 953 /*pu1_ref[two_nt + col + idx + 1]*/ 954 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) + (4 * nt) + 2 * idx + 2)); 955 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) + (4 * nt) + 2 * idx + 2)); 956 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) + (4 * nt) + 2 * idx + 2)); 957 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) + (4 * nt) + 2 * idx + 2)); 958 959 _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1); 960 _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2); 961 _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3); 962 _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4); 963 964 } 965 else if(nt == 8) 966 { 967 
/*pu1_ref[two_nt + col + idx + 1]*/ 968 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) + (4 * nt) + 2 * idx + 2)); 969 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) + (4 * nt) + 2 * idx + 2)); 970 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) + (4 * nt) + 2 * idx + 2)); 971 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) + (4 * nt) + 2 * idx + 2)); 972 src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (4 + 1) + (4 * nt) + 2 * idx + 2)); 973 src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (5 + 1) + (4 * nt) + 2 * idx + 2)); 974 src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (6 + 1) + (4 * nt) + 2 * idx + 2)); 975 src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (7 + 1) + (4 * nt) + 2 * idx + 2)); 976 977 _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1); 978 _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2); 979 _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3); 980 _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4); 981 _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp5); 982 _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp6); 983 _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp7); 984 _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp8); 985 986 987 } 988 else 989 { 990 __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16; 991 for(row = 0; row < nt; row += 8) 992 { 993 /*pu1_ref[two_nt + col + idx + 1]*/ 994 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) + 0 + (4 * nt) + 2 * idx + 2)); 995 src_temp9 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (0 + 1) + 16 + (4 * nt) + 2 * idx + 2)); 996 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) + 0 + (4 * nt) + 2 * idx + 2)); 997 src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (1 + 1) + 16 + (4 * nt) + 2 * idx + 2)); 998 src_temp3 = 
_mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) + 0 + (4 * nt) + 2 * idx + 2)); 999 src_temp11 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (2 + 1) + 16 + (4 * nt) + 2 * idx + 2)); 1000 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) + 0 + (4 * nt) + 2 * idx + 2)); 1001 src_temp12 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (3 + 1) + 16 + (4 * nt) + 2 * idx + 2)); 1002 1003 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (0 * dst_strd)), src_temp1); 1004 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp9); 1005 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (1 * dst_strd)), src_temp2); 1006 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp10); 1007 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (2 * dst_strd)), src_temp3); 1008 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp11); 1009 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (3 * dst_strd)), src_temp4); 1010 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp12); 1011 1012 src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (4 + 1) + 0 + (4 * nt) + 2 * idx + 2)); 1013 src_temp13 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (4 + 1) + 16 + (4 * nt) + 2 * idx + 2)); 1014 src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (5 + 1) + 0 + (4 * nt) + 2 * idx + 2)); 1015 src_temp14 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (5 + 1) + 16 + (4 * nt) + 2 * idx + 2)); 1016 src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (6 + 1) + 0 + (4 * nt) + 2 * idx + 2)); 1017 src_temp15 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (6 + 1) + 16 + (4 * nt) + 2 * idx + 2)); 1018 src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (7 + 1) + 0 + (4 * nt) + 2 * idx + 2)); 1019 src_temp16 = _mm_loadu_si128((__m128i *)(pu1_ref + 2 * (7 + 1) + 16 + (4 * nt) + 2 * idx + 2)); 1020 1021 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (4 * dst_strd)), src_temp5); 1022 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp13); 1023 
_mm_storeu_si128((__m128i *)(pu1_dst + 0 + (5 * dst_strd)), src_temp6); 1024 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp14); 1025 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (6 * dst_strd)), src_temp7); 1026 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp15); 1027 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (7 * dst_strd)), src_temp8); 1028 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp16); 1029 1030 pu1_ref += 2 * 8; 1031 pu1_dst += 8 * dst_strd; 1032 } 1033 } 1034 } 1035 else 1036 { 1037 if(nt == 4) 1038 { 1039 /*pu1_ref[two_nt + col + idx + 1]*/ 1040 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) + (4 * nt) + 2 * idx + 2)); 1041 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) + (4 * nt) + 2 * idx + 2)); 1042 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) + (4 * nt) + 2 * idx + 2)); 1043 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) + (4 * nt) + 2 * idx + 2)); 1044 1045 _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1); 1046 _mm_storel_epi64((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2); 1047 _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3); 1048 _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4); 1049 1050 1051 } 1052 else if(nt == 8) 1053 { 1054 /*pu1_ref[two_nt + col + idx + 1]*/ 1055 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) + (4 * nt) + 2 * idx + 2)); 1056 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) + (4 * nt) + 2 * idx + 2)); 1057 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) + (4 * nt) + 2 * idx + 2)); 1058 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) + (4 * nt) + 2 * idx + 2)); 1059 src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (4 + 1) + (4 * nt) + 2 * idx + 2)); 1060 src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (5 + 1) + (4 * nt) + 2 * idx + 2)); 1061 src_temp7 = _mm_loadu_si128((__m128i 
*)(pu1_ref - 2 * (6 + 1) + (4 * nt) + 2 * idx + 2)); 1062 src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (7 + 1) + (4 * nt) + 2 * idx + 2)); 1063 1064 _mm_storeu_si128((__m128i *)(pu1_dst + (0 * dst_strd)), src_temp1); 1065 _mm_storeu_si128((__m128i *)(pu1_dst + (1 * dst_strd)), src_temp2); 1066 _mm_storeu_si128((__m128i *)(pu1_dst + (2 * dst_strd)), src_temp3); 1067 _mm_storeu_si128((__m128i *)(pu1_dst + (3 * dst_strd)), src_temp4); 1068 _mm_storeu_si128((__m128i *)(pu1_dst + (4 * dst_strd)), src_temp5); 1069 _mm_storeu_si128((__m128i *)(pu1_dst + (5 * dst_strd)), src_temp6); 1070 _mm_storeu_si128((__m128i *)(pu1_dst + (6 * dst_strd)), src_temp7); 1071 _mm_storeu_si128((__m128i *)(pu1_dst + (7 * dst_strd)), src_temp8); 1072 1073 1074 } 1075 else 1076 { 1077 __m128i src_temp9, src_temp10, src_temp11, src_temp12, src_temp13, src_temp14, src_temp15, src_temp16; 1078 for(row = 0; row < nt; row += 8) 1079 { 1080 /*pu1_ref[two_nt + col + idx + 1]*/ 1081 src_temp1 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) + 0 + (4 * nt) + 2 * idx + 2)); 1082 src_temp9 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (0 + 1) + 16 + (4 * nt) + 2 * idx + 2)); 1083 src_temp2 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) + 0 + (4 * nt) + 2 * idx + 2)); 1084 src_temp10 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (1 + 1) + 16 + (4 * nt) + 2 * idx + 2)); 1085 src_temp3 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) + 0 + (4 * nt) + 2 * idx + 2)); 1086 src_temp11 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (2 + 1) + 16 + (4 * nt) + 2 * idx + 2)); 1087 src_temp4 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) + 0 + (4 * nt) + 2 * idx + 2)); 1088 src_temp12 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (3 + 1) + 16 + (4 * nt) + 2 * idx + 2)); 1089 1090 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (0 * dst_strd)), src_temp1); 1091 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (0 * dst_strd)), src_temp9); 1092 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (1 * dst_strd)), 
src_temp2); 1093 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (1 * dst_strd)), src_temp10); 1094 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (2 * dst_strd)), src_temp3); 1095 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (2 * dst_strd)), src_temp11); 1096 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (3 * dst_strd)), src_temp4); 1097 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (3 * dst_strd)), src_temp12); 1098 1099 src_temp5 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (4 + 1) + 0 + (4 * nt) + 2 * idx + 2)); 1100 src_temp13 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (4 + 1) + 16 + (4 * nt) + 2 * idx + 2)); 1101 src_temp6 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (5 + 1) + 0 + (4 * nt) + 2 * idx + 2)); 1102 src_temp14 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (5 + 1) + 16 + (4 * nt) + 2 * idx + 2)); 1103 src_temp7 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (6 + 1) + 0 + (4 * nt) + 2 * idx + 2)); 1104 src_temp15 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (6 + 1) + 16 + (4 * nt) + 2 * idx + 2)); 1105 src_temp8 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (7 + 1) + 0 + (4 * nt) + 2 * idx + 2)); 1106 src_temp16 = _mm_loadu_si128((__m128i *)(pu1_ref - 2 * (7 + 1) + 16 + (4 * nt) + 2 * idx + 2)); 1107 1108 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (4 * dst_strd)), src_temp5); 1109 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (4 * dst_strd)), src_temp13); 1110 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (5 * dst_strd)), src_temp6); 1111 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (5 * dst_strd)), src_temp14); 1112 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (6 * dst_strd)), src_temp7); 1113 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (6 * dst_strd)), src_temp15); 1114 _mm_storeu_si128((__m128i *)(pu1_dst + 0 + (7 * dst_strd)), src_temp8); 1115 _mm_storeu_si128((__m128i *)(pu1_dst + 16 + (7 * dst_strd)), src_temp16); 1116 1117 pu1_ref -= 2 * 8; 1118 pu1_dst += 8 * dst_strd; 1119 } 1120 } 1121 } 1122 1123 } 1124 1125 /** 1126 
*******************************************************************************
*
* @brief
*  Intraprediction for mode 3 to 9 (positive angle, horizontal mode ) with
*  reference neighboring samples location pointed by 'pu1_ref' to the TU
*  block location pointed by 'pu1_dst'
*
* @par Description:
*  Horizontal angular modes walk the left reference column backwards from
*  the top-left corner; chroma samples are interleaved Cb/Cr, so each
*  pixel step is 2 bytes. Destination samples are interpolated between
*  two neighboring reference pixels with 1/32-pel weights, then the
*  column-wise results are transposed into rows before storing.
*
* @param[in] pu1_ref
*  UWORD8 pointer to the reference neighboring samples
*
* @param[in] src_strd
*  integer source stride (unused)
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode (3 to 9)
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_chroma_mode_3_to_9_ssse3(UWORD8 *pu1_ref,
                                               WORD32 src_strd,
                                               UWORD8 *pu1_dst,
                                               WORD32 dst_strd,
                                               WORD32 nt,
                                               WORD32 mode)
{
    WORD32 row, col;

    WORD32 intra_pred_ang;

    __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b;
    __m128i fract_4x32b, zero_8x16b, intra_pred_ang_4x32b;
    __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b, sm1;
    UNUSED(src_strd);

    /* Intra Pred Angle according to the mode */
    intra_pred_ang = gai4_ihevc_ang_table[mode];

    /* For the angles other then 45 degree, interpolation btw 2 neighboring */
    /* samples dependent on distance to obtain destination sample */

    /* byte-pair reversal mask used to flip the transposed output order */
    sm1 = _mm_load_si128((__m128i *)&IHEVCE_SHUFFLEMASKY7[0]);
    const_temp_4x32b = _mm_set1_epi16(16);   /* rounding offset */
    const_temp2_4x32b = _mm_set1_epi32(31);  /* fract mask */
    const_temp3_4x32b = _mm_set1_epi16(32);  /* for (32 - fract) */
    const_temp4_4x32b = _mm_set1_epi32(4);   /* row increment */

    two_nt_4x32b = _mm_set1_epi32(1);

    zero_8x16b = _mm_set1_epi16(0);


    /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
    intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang);

    row_4x32b = _mm_set_epi32(4, 3, 2, 1);

    if(nt == 4)
    {
        /* reinitialize the vector constants as 8 x 16-bit lanes for the
           small-block path */
        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
        const_temp2_4x32b = _mm_set1_epi16(31);
        const_temp4_4x32b = _mm_set1_epi16(4);
        /* byte offset of the last left reference pixel: (4*nt) - 2 */
        two_nt_4x32b = _mm_set1_epi16((4 * nt) - 2);

        {
            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
            WORD8 ai1_fract_temp_val[16], ai1_src_temp_val[16];

            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
            __m128i src_values10;

            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;

            /* pos = ((row + 1) * intra_pred_ang); */
            res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);

            /* fract = pos & (31); */
            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);

            /* idx = pos >> 5 */
            ref_main_idx_4x32b = _mm_srai_epi16(res_temp5_4x32b, 5);

            /* 2 * idx : byte offset for interleaved Cb/Cr */
            ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b, ref_main_idx_4x32b);

            /* ref_main_idx = ((4*nt) - 2) - 2*idx : walk backwards */
            ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, ref_main_idx_4x32b);

            row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);

            /*(32 - fract) */
            src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);

            _mm_storel_epi64((__m128i *)(ai1_fract_temp_val), fract_4x32b);
            _mm_storel_epi64((__m128i *)(ai1_src_temp_val), src_values10);

            /* per-column broadcast of fract (low byte of each 16-bit lane) */
            fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]);  /* col=0*/
            fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]);  /* col=1*/
            fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]);  /* col=2*/
            fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]);  /* col=3*/

            /* per-column broadcast of (32 - fract) */
            temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]);  /* col=0*/
            temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]);  /* col=1*/
            temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]);  /* col=2*/
            temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]);  /* col=3*/

            /* interleave (32 - fract) / fract pairs for maddubs */
            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
            temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
            temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
            temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);

            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);  /* col=0*/
            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);  /* col=1*/
            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);  /* col=2*/
            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);  /* col=3*/

            {
                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;

                /* loading 8-bit 16 pixels (ending at ref_main_idx + 8) */
                src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - 8));  /* col=0*/
                src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - 8));  /* col=1*/
                src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - 8));  /* col=2*/
                src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - 8));  /* col=3*/

                /* shift by one chroma pixel to get the neighboring sample */
                src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2);  /* col=0*/
                src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2);  /* col=1*/
                src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2);  /* col=2*/
                src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2);  /* col=3*/

                /* interleave adjacent sample pairs for maddubs */
                src_temp1_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp5_8x16b);  /* col=0*/
                src_temp2_8x16b = _mm_unpacklo_epi8(src_temp2_8x16b, src_temp6_8x16b);  /* col=1*/
                src_temp3_8x16b = _mm_unpacklo_epi8(src_temp3_8x16b, src_temp7_8x16b);  /* col=2*/
                src_temp4_8x16b = _mm_unpacklo_epi8(src_temp4_8x16b, src_temp8_8x16b);  /* col=3*/

                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5);  /* col=0*/
                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5);  /* col=1*/
                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5);  /* col=2*/
                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5);  /* col=3*/

                /* converting 16 bit to 8 bit */
                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b);  /* col=0*/
                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b);  /* col=1*/
                src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b);  /* col=2*/
                src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b);  /* col=3*/

                /* reverse chroma pixel order (reference was walked backwards) */
                src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm1);
                src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm1);
                src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm1);
                src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm1);

                /* 4x4 chroma-pair transpose: columns -> rows */
                src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
                src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);

                src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b);
                src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b);

                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 0)), src_temp8_8x16b);       /* row=0*/

                src_temp2_8x16b = _mm_shuffle_epi32(src_temp8_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (1))), src_temp2_8x16b);       /* row=1*/

                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (2))), src_temp7_8x16b);       /* row=2*/

                src_temp4_8x16b = _mm_shuffle_epi32(src_temp7_8x16b, _MM_SHUFFLE(3, 2, 3, 2));
                _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (3))), src_temp4_8x16b);       /* row=3*/

            }
        }
    }
    else
    {
        /* nt >= 8: process 8 destination columns (16 bytes) per outer pass */
        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
        row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1);
        const_temp2_4x32b = _mm_set1_epi16(31);
        const_temp4_4x32b = _mm_set1_epi16(8);
        two_nt_4x32b = _mm_set1_epi16((4 * nt) - 2);

        for(col = 0; col < 2 * nt; col += 16)
        {
            WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4;
            WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8;
            WORD8 ai1_fract_temp_val[16], ai1_src_temp_val[16];

            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b;
            __m128i fract5_8x16b, fract6_8x16b, fract7_8x16b, fract8_8x16b, src_values10;

            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
            __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b;

            /* pos = ((row + 1) * intra_pred_ang); */
            res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);

            /* fract = pos & (31); */
            fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);

            /* idx = pos >> 5 */
            ref_main_idx_4x32b = _mm_srai_epi16(res_temp5_4x32b, 5);

            /* 2 * idx : byte offset for interleaved Cb/Cr */
            ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b, ref_main_idx_4x32b);

            /* ref_main_idx = ((4*nt) - 2) - 2*idx : walk backwards */
            ref_main_idx_4x32b = _mm_sub_epi16(two_nt_4x32b, ref_main_idx_4x32b);

            row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b);

            /*(32 - fract) */
            src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b);

            _mm_storeu_si128((__m128i *)(ai1_fract_temp_val), fract_4x32b);
            _mm_storeu_si128((__m128i *)(ai1_src_temp_val), src_values10);

            fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]);  /* col=0*/
            fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]);  /* col=1*/
            fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]);  /* col=2*/
            fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]);  /* col=3*/

            temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]);  /* col=0*/
            temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]);  /* col=1*/
            temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]);  /* col=2*/
            temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]);  /* col=3*/

            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
            temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
            temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
            temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);

            pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0);  /* col=0*/
            pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1);  /* col=1*/
            pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2);  /* col=2*/
            pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3);  /* col=3*/

            fract5_8x16b = _mm_set1_epi8(ai1_fract_temp_val[8]);   /* col=4*/
            fract6_8x16b = _mm_set1_epi8(ai1_fract_temp_val[10]);  /* col=5*/
            fract7_8x16b = _mm_set1_epi8(ai1_fract_temp_val[12]);  /* col=6*/
            fract8_8x16b = _mm_set1_epi8(ai1_fract_temp_val[14]);  /* col=7*/

            temp11_8x16b = _mm_set1_epi8(ai1_src_temp_val[8]);   /* col=4*/
            temp12_8x16b = _mm_set1_epi8(ai1_src_temp_val[10]);  /* col=5*/
            temp13_8x16b = _mm_set1_epi8(ai1_src_temp_val[12]);  /* col=6*/
            temp14_8x16b = _mm_set1_epi8(ai1_src_temp_val[14]);  /* col=7*/

            temp11_8x16b = _mm_unpacklo_epi8(temp11_8x16b, fract5_8x16b);
            temp12_8x16b = _mm_unpacklo_epi8(temp12_8x16b, fract6_8x16b);
            temp13_8x16b = _mm_unpacklo_epi8(temp13_8x16b, fract7_8x16b);
            temp14_8x16b = _mm_unpacklo_epi8(temp14_8x16b, fract8_8x16b);

            pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4);  /* col=4*/
            pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5);  /* col=5*/
            pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6);  /* col=6*/
            pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7);  /* col=7*/

            for(row = 0; row < nt; row += 4)
            {
                __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
                __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;

                __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b;
                __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b;

                /* loading 8-bit 16 pixels; each 4-row step moves the
                   reference window back by 2*row bytes */
                src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx1 - row - (8 + row)));  /* col=0*/
                src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx2 - row - (8 + row)));  /* col=1*/
                src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx3 - row - (8 + row)));  /* col=2*/
                src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx4 - row - (8 + row)));  /* col=3*/

                src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2);  /* col=0*/
                src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2);  /* col=1*/
                src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2);  /* col=2*/
                src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2);  /* col=3*/

                src_temp1_8x16b = _mm_unpacklo_epi8(src_temp1_8x16b, src_temp5_8x16b);  /* col=0*/
                src_temp2_8x16b = _mm_unpacklo_epi8(src_temp2_8x16b, src_temp6_8x16b);  /* col=1*/
                src_temp3_8x16b = _mm_unpacklo_epi8(src_temp3_8x16b, src_temp7_8x16b);  /* col=2*/
                src_temp4_8x16b = _mm_unpacklo_epi8(src_temp4_8x16b, src_temp8_8x16b);  /* col=3*/

                /* loading 8-bit 16 pixels */
                src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx5 - row - row - 8));  /* col=4*/
                src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx6 - row - row - 8));  /* col=5*/
                src_temp17_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx7 - row - row - 8));  /* col=6*/
                src_temp18_8x16b = _mm_loadu_si128((__m128i *)(pu1_ref + pi2_ref_main_idx8 - row - row - 8));  /* col=7*/

                src_temp11_8x16b = _mm_srli_si128(src_temp15_8x16b, 2);  /* col=4*/
                src_temp12_8x16b = _mm_srli_si128(src_temp16_8x16b, 2);  /* col=5*/
                src_temp13_8x16b = _mm_srli_si128(src_temp17_8x16b, 2);  /* col=6*/
                src_temp14_8x16b = _mm_srli_si128(src_temp18_8x16b, 2);  /* col=7*/

                src_temp11_8x16b = _mm_unpacklo_epi8(src_temp11_8x16b, src_temp15_8x16b);  /* col=4*/
                src_temp12_8x16b = _mm_unpacklo_epi8(src_temp12_8x16b, src_temp16_8x16b);  /* col=5*/
                src_temp13_8x16b = _mm_unpacklo_epi8(src_temp13_8x16b, src_temp17_8x16b);  /* col=6*/
                src_temp14_8x16b = _mm_unpacklo_epi8(src_temp14_8x16b, src_temp18_8x16b);  /* col=7*/

                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
                src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b);
                src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b);
                src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b);
                src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b);

                /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
                src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b);
                src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b);
                src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b);
                src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b);

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
                src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b);
                src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b);
                src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b);
                src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b);

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5);  /* col=0*/
                src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5);  /* col=1*/
                src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5);  /* col=2*/
                src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5);  /* col=3*/

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
                src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b);
                src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b);
                src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b);
                src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b);

                /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
                src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5);  /* col=4*/
                src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5);  /* col=5*/
                src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5);  /* col=6*/
                src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5);  /* col=7*/

                /* converting 16 bit to 8 bit */
                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b);  /* col=0*/
                src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b);  /* col=1*/
                src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b);  /* col=2*/
                src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b);  /* col=3*/

                /* reverse chroma pixel order (reference was walked backwards) */
                src_temp1_8x16b = _mm_shuffle_epi8(src_temp1_8x16b, sm1);
                src_temp2_8x16b = _mm_shuffle_epi8(src_temp2_8x16b, sm1);
                src_temp3_8x16b = _mm_shuffle_epi8(src_temp3_8x16b, sm1);
                src_temp4_8x16b = _mm_shuffle_epi8(src_temp4_8x16b, sm1);

                /* converting 16 bit to 8 bit */
                src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, zero_8x16b);  /* col=4*/
                src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, zero_8x16b);  /* col=5*/
                src_temp13_8x16b = _mm_packus_epi16(src_temp13_8x16b, zero_8x16b);  /* col=6*/
                src_temp14_8x16b = _mm_packus_epi16(src_temp14_8x16b, zero_8x16b);  /* col=7*/

                src_temp11_8x16b = _mm_shuffle_epi8(src_temp11_8x16b, sm1);
                src_temp12_8x16b = _mm_shuffle_epi8(src_temp12_8x16b, sm1);
                src_temp13_8x16b = _mm_shuffle_epi8(src_temp13_8x16b, sm1);
                src_temp14_8x16b = _mm_shuffle_epi8(src_temp14_8x16b, sm1);

                /* 8x4 chroma-pair transpose: columns -> rows */
                src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b);
                src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b);

                src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b);
                src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b);

                src_temp15_8x16b = _mm_unpacklo_epi16(src_temp11_8x16b, src_temp12_8x16b);
                src_temp16_8x16b = _mm_unpacklo_epi16(src_temp13_8x16b, src_temp14_8x16b);

                src_temp18_8x16b = _mm_unpacklo_epi32(src_temp15_8x16b, src_temp16_8x16b);
                src_temp17_8x16b = _mm_unpackhi_epi32(src_temp15_8x16b, src_temp16_8x16b);

                src_temp11_8x16b = _mm_unpacklo_epi64(src_temp8_8x16b, src_temp18_8x16b);
                src_temp12_8x16b = _mm_unpackhi_epi64(src_temp8_8x16b, src_temp18_8x16b);
                src_temp13_8x16b = _mm_unpacklo_epi64(src_temp7_8x16b, src_temp17_8x16b);
                src_temp14_8x16b = _mm_unpackhi_epi64(src_temp7_8x16b, src_temp17_8x16b);

                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp11_8x16b);             /* row=0*/
                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp12_8x16b);       /* row=1*/
                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp13_8x16b);       /* row=2*/
_mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp14_8x16b); /* row=4*/ 1524 1525 } 1526 } 1527 } 1528 } 1529 1530 /** 1531 ******************************************************************************* 1532 * 1533 * @brief 1534 * Intraprediction for mode 11 to 17 (negative angle, horizontal mode ) 1535 * with reference neighboring samples location pointed by 'pu1_ref' to the 1536 * TU block location pointed by 'pu1_dst' 1537 * 1538 * @par Description: 1539 * 1540 * 1541 * @param[in] pu1_src 1542 * UWORD8 pointer to the source 1543 * 1544 * @param[in] pu1_dst 1545 * UWORD8 pointer to the destination 1546 * 1547 * @param[in] src_strd 1548 * integer source stride 1549 * 1550 * @param[in] dst_strd 1551 * integer destination stride 1552 * 1553 * @param[in] nt 1554 * integer Transform Block size 1555 * 1556 * @param[in] mode 1557 * integer intraprediction mode 1558 * 1559 * @returns 1560 * 1561 * @remarks 1562 * None 1563 * 1564 ******************************************************************************* 1565 */ 1566 1567 1568 void ihevc_intra_pred_chroma_mode_11_to_17_ssse3(UWORD8 *pu1_ref, 1569 WORD32 src_strd, 1570 UWORD8 *pu1_dst, 1571 WORD32 dst_strd, 1572 WORD32 nt, 1573 WORD32 mode) 1574 { 1575 /* This function and ihevc_intra_pred_CHROMA_mode_19_to_25 are same except*/ 1576 /* for ref main & side samples assignment,can be combined for */ 1577 /* optimzation*/ 1578 1579 WORD32 row, col, k; 1580 WORD32 intra_pred_ang, inv_ang, inv_ang_sum; 1581 WORD32 ref_idx; 1582 1583 1584 __m128i const_temp_4x32b, const_temp2_4x32b, const_temp3_4x32b, const_temp4_4x32b; 1585 __m128i fract_4x32b, zero_8x16b, intra_pred_ang_4x32b; 1586 __m128i row_4x32b, two_nt_4x32b, ref_main_idx_4x32b, res_temp5_4x32b; 1587 1588 UWORD8 ref_temp[2 * MAX_CU_SIZE + 2]; 1589 UWORD8 *ref_main; 1590 UNUSED(src_strd); 1591 1592 inv_ang_sum = 128; 1593 1594 intra_pred_ang = gai4_ihevc_ang_table[mode]; 1595 1596 inv_ang = gai4_ihevc_inv_ang_table[mode - 11]; 1597 /* 
Intermediate reference samples for negative angle modes */ 1598 /* This have to be removed during optimization*/ 1599 1600 /* For horizontal modes, (ref main = ref left) (ref side = ref above) */ 1601 1602 1603 ref_main = ref_temp + 2 * nt; 1604 for(k = 0; k < (2 * (nt + 1)); k += 2) 1605 { 1606 ref_temp[k + (2 * (nt - 1))] = pu1_ref[(4 * nt) - k]; 1607 ref_temp[k + 1 + (2 * (nt - 1))] = pu1_ref[(4 * nt) - k + 1]; 1608 } 1609 1610 ref_main = ref_temp + (2 * (nt - 1)); 1611 ref_idx = (nt * intra_pred_ang) >> 5; 1612 1613 /* SIMD Optimization can be done using look-up table for the loop */ 1614 /* For negative angled derive the main reference samples from side */ 1615 /* reference samples refer to section 8.4.4.2.6 */ 1616 1617 for(k = -2; k > (2 * ref_idx); k -= 2) 1618 { 1619 inv_ang_sum += inv_ang; 1620 ref_main[k] = pu1_ref[(4 * nt) + ((inv_ang_sum >> 8) << 1)]; 1621 ref_main[k + 1] = pu1_ref[((4 * nt) + 1) + ((inv_ang_sum >> 8) << 1)]; 1622 } 1623 1624 /* For the angles other then 45 degree, interpolation btw 2 neighboring */ 1625 /* samples dependent on distance to obtain destination sample */ 1626 1627 const_temp_4x32b = _mm_set1_epi16(16); 1628 const_temp2_4x32b = _mm_set1_epi32(31); 1629 const_temp3_4x32b = _mm_set1_epi16(32); 1630 const_temp4_4x32b = _mm_set1_epi32(4); 1631 1632 two_nt_4x32b = _mm_set1_epi32(1); 1633 1634 zero_8x16b = _mm_set1_epi16(0); 1635 1636 1637 /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ 1638 intra_pred_ang_4x32b = _mm_set1_epi32(intra_pred_ang); 1639 1640 row_4x32b = _mm_set_epi32(4, 3, 2, 1); 1641 1642 if(nt == 4) 1643 { 1644 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 1645 row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); 1646 const_temp2_4x32b = _mm_set1_epi16(31); 1647 const_temp4_4x32b = _mm_set1_epi16(4); 1648 two_nt_4x32b = _mm_set1_epi16(1); 1649 1650 { 1651 WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, pi2_ref_main_idx3, pi2_ref_main_idx4; 1652 WORD8 ai1_fract_temp_val[16], ai1_src_temp_val[16]; 1653 1654 
__m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b; 1655 __m128i src_values10; 1656 1657 __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b; 1658 1659 /* pos = ((row + 1) * intra_pred_ang); */ 1660 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 1661 1662 /* fract = pos & (31); */ 1663 fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 1664 1665 ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); 1666 ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b, ref_main_idx_4x32b); 1667 1668 row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b); 1669 1670 /*(32 - fract) */ 1671 src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b); 1672 1673 _mm_storel_epi64((__m128i *)(ai1_fract_temp_val), fract_4x32b); 1674 _mm_storel_epi64((__m128i *)(ai1_src_temp_val), src_values10); 1675 1676 fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]); /* col=0*/ 1677 fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]); /* col=1*/ 1678 fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]); /* col=2*/ 1679 fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]); /* col=3*/ 1680 1681 temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]); /* col=0*/ 1682 temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]); /* col=1*/ 1683 temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]); /* col=2*/ 1684 temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]); /* col=3*/ 1685 1686 temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b); 1687 temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b); 1688 temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b); 1689 temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b); 1690 1691 pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/ 1692 pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/ 1693 pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/ 1694 pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* 
col=3*/ 1695 1696 { 1697 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; 1698 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b; 1699 1700 /* loding 8-bit 16 pixels */ 1701 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1)); /* col=0*/ 1702 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2)); /* col=1*/ 1703 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3)); /* col=2*/ 1704 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4)); /* col=3*/ 1705 1706 src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/ 1707 src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/ 1708 src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/ 1709 src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/ 1710 1711 src_temp1_8x16b = _mm_unpacklo_epi8(src_temp5_8x16b, src_temp1_8x16b); /* col=0*/ 1712 src_temp2_8x16b = _mm_unpacklo_epi8(src_temp6_8x16b, src_temp2_8x16b); /* col=1*/ 1713 src_temp3_8x16b = _mm_unpacklo_epi8(src_temp7_8x16b, src_temp3_8x16b); /* col=2*/ 1714 src_temp4_8x16b = _mm_unpacklo_epi8(src_temp8_8x16b, src_temp4_8x16b); /* col=3*/ 1715 1716 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ 1717 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b); 1718 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b); 1719 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b); 1720 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b); 1721 1722 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 1723 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b); 1724 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b); 1725 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b); 1726 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b); 1727 1728 /*((32 - 
fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 1729 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/ 1730 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/ 1731 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/ 1732 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/ 1733 1734 /* converting 16 bit to 8 bit */ 1735 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/ 1736 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/ 1737 src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/ 1738 src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/ 1739 1740 src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b); 1741 src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b); 1742 1743 src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b); 1744 src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b); 1745 1746 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * 0)), src_temp8_8x16b); /* row=0*/ 1747 1748 src_temp2_8x16b = _mm_shuffle_epi32(src_temp8_8x16b, _MM_SHUFFLE(3, 2, 3, 2)); 1749 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (1))), src_temp2_8x16b); /* row=1*/ 1750 1751 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (2))), src_temp7_8x16b); /* row=2*/ 1752 1753 src_temp4_8x16b = _mm_shuffle_epi32(src_temp7_8x16b, _MM_SHUFFLE(3, 2, 3, 2)); 1754 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd * (3))), src_temp4_8x16b); /* row=4*/ 1755 1756 } 1757 } 1758 } 1759 else 1760 { 1761 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 1762 row_4x32b = _mm_set_epi16(8, 7, 6, 5, 4, 3, 2, 1); 1763 const_temp2_4x32b = _mm_set1_epi16(31); 1764 const_temp4_4x32b = _mm_set1_epi16(8); 1765 two_nt_4x32b = _mm_set1_epi16(1); 1766 1767 for(col = 0; col < 2 * nt; col += 16) 1768 { 1769 WORD16 pi2_ref_main_idx1, pi2_ref_main_idx2, 
pi2_ref_main_idx3, pi2_ref_main_idx4; 1770 WORD16 pi2_ref_main_idx5, pi2_ref_main_idx6, pi2_ref_main_idx7, pi2_ref_main_idx8; 1771 WORD8 ai1_fract_temp_val[16], ai1_src_temp_val[16]; 1772 1773 __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b; 1774 __m128i fract5_8x16b, fract6_8x16b, fract7_8x16b, fract8_8x16b, src_values10; 1775 1776 __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b; 1777 __m128i temp11_8x16b, temp12_8x16b, temp13_8x16b, temp14_8x16b; 1778 1779 /* pos = ((row + 1) * intra_pred_ang); */ 1780 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 1781 1782 /* fract = pos & (31); */ 1783 fract_4x32b = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 1784 1785 ref_main_idx_4x32b = _mm_add_epi16(two_nt_4x32b, _mm_srai_epi16(res_temp5_4x32b, 5)); 1786 ref_main_idx_4x32b = _mm_add_epi16(ref_main_idx_4x32b, ref_main_idx_4x32b); 1787 1788 row_4x32b = _mm_add_epi16(row_4x32b, const_temp4_4x32b); 1789 1790 /*(32 - fract) */ 1791 src_values10 = _mm_sub_epi16(const_temp3_4x32b, fract_4x32b); 1792 1793 _mm_storeu_si128((__m128i *)(ai1_fract_temp_val), fract_4x32b); 1794 _mm_storeu_si128((__m128i *)(ai1_src_temp_val), src_values10); 1795 1796 fract1_8x16b = _mm_set1_epi8(ai1_fract_temp_val[0]); /* col=0*/ 1797 fract2_8x16b = _mm_set1_epi8(ai1_fract_temp_val[2]); /* col=1*/ 1798 fract3_8x16b = _mm_set1_epi8(ai1_fract_temp_val[4]); /* col=2*/ 1799 fract4_8x16b = _mm_set1_epi8(ai1_fract_temp_val[6]); /* col=3*/ 1800 1801 temp1_8x16b = _mm_set1_epi8(ai1_src_temp_val[0]); /* col=0*/ 1802 temp2_8x16b = _mm_set1_epi8(ai1_src_temp_val[2]); /* col=1*/ 1803 temp3_8x16b = _mm_set1_epi8(ai1_src_temp_val[4]); /* col=2*/ 1804 temp4_8x16b = _mm_set1_epi8(ai1_src_temp_val[6]); /* col=3*/ 1805 1806 temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b); 1807 temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b); 1808 temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b); 1809 temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, 
fract4_8x16b); 1810 1811 pi2_ref_main_idx1 = _mm_extract_epi16(ref_main_idx_4x32b, 0); /* col=0*/ 1812 pi2_ref_main_idx2 = _mm_extract_epi16(ref_main_idx_4x32b, 1); /* col=1*/ 1813 pi2_ref_main_idx3 = _mm_extract_epi16(ref_main_idx_4x32b, 2); /* col=2*/ 1814 pi2_ref_main_idx4 = _mm_extract_epi16(ref_main_idx_4x32b, 3); /* col=3*/ 1815 1816 fract5_8x16b = _mm_set1_epi8(ai1_fract_temp_val[8]); /* col=5*/ 1817 fract6_8x16b = _mm_set1_epi8(ai1_fract_temp_val[10]); /* col=6*/ 1818 fract7_8x16b = _mm_set1_epi8(ai1_fract_temp_val[12]); /* col=7*/ 1819 fract8_8x16b = _mm_set1_epi8(ai1_fract_temp_val[14]); /* col=8*/ 1820 1821 temp11_8x16b = _mm_set1_epi8(ai1_src_temp_val[8]); /* col=0*/ 1822 temp12_8x16b = _mm_set1_epi8(ai1_src_temp_val[10]); /* col=1*/ 1823 temp13_8x16b = _mm_set1_epi8(ai1_src_temp_val[12]); /* col=2*/ 1824 temp14_8x16b = _mm_set1_epi8(ai1_src_temp_val[14]); /* col=3*/ 1825 1826 temp11_8x16b = _mm_unpacklo_epi8(temp11_8x16b, fract5_8x16b); 1827 temp12_8x16b = _mm_unpacklo_epi8(temp12_8x16b, fract6_8x16b); 1828 temp13_8x16b = _mm_unpacklo_epi8(temp13_8x16b, fract7_8x16b); 1829 temp14_8x16b = _mm_unpacklo_epi8(temp14_8x16b, fract8_8x16b); 1830 1831 pi2_ref_main_idx5 = _mm_extract_epi16(ref_main_idx_4x32b, 4); /* col=5*/ 1832 pi2_ref_main_idx6 = _mm_extract_epi16(ref_main_idx_4x32b, 5); /* col=6*/ 1833 pi2_ref_main_idx7 = _mm_extract_epi16(ref_main_idx_4x32b, 6); /* col=7*/ 1834 pi2_ref_main_idx8 = _mm_extract_epi16(ref_main_idx_4x32b, 7); /* col=8*/ 1835 1836 for(row = 0; row < nt; row += 4) 1837 { 1838 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; 1839 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b; 1840 1841 __m128i src_temp11_8x16b, src_temp12_8x16b, src_temp13_8x16b, src_temp14_8x16b; 1842 __m128i src_temp15_8x16b, src_temp16_8x16b, src_temp17_8x16b, src_temp18_8x16b; 1843 1844 /* loding 8-bit 16 pixels */ 1845 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx1 + row + 
row)); /* col=0*/ 1846 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx2 + row + row)); /* col=1*/ 1847 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx3 + row + row)); /* col=2*/ 1848 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx4 + row + row)); /* col=3*/ 1849 1850 src_temp1_8x16b = _mm_srli_si128(src_temp5_8x16b, 2); /* col=0*/ 1851 src_temp2_8x16b = _mm_srli_si128(src_temp6_8x16b, 2); /* col=1*/ 1852 src_temp3_8x16b = _mm_srli_si128(src_temp7_8x16b, 2); /* col=2*/ 1853 src_temp4_8x16b = _mm_srli_si128(src_temp8_8x16b, 2); /* col=3*/ 1854 1855 src_temp1_8x16b = _mm_unpacklo_epi8(src_temp5_8x16b, src_temp1_8x16b); /* col=0*/ 1856 src_temp2_8x16b = _mm_unpacklo_epi8(src_temp6_8x16b, src_temp2_8x16b); /* col=1*/ 1857 src_temp3_8x16b = _mm_unpacklo_epi8(src_temp7_8x16b, src_temp3_8x16b); /* col=2*/ 1858 src_temp4_8x16b = _mm_unpacklo_epi8(src_temp8_8x16b, src_temp4_8x16b); /* col=3*/ 1859 1860 /* loding 8-bit 16 pixels */ 1861 src_temp15_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx5 + row + row)); /* col=5*/ 1862 src_temp16_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx6 + row + row)); /* col=6*/ 1863 src_temp17_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx7 + row + row)); /* col=7*/ 1864 src_temp18_8x16b = _mm_loadu_si128((__m128i *)(ref_main + pi2_ref_main_idx8 + row + row)); /* col=8*/ 1865 1866 src_temp11_8x16b = _mm_srli_si128(src_temp15_8x16b, 2); /* col=5*/ 1867 src_temp12_8x16b = _mm_srli_si128(src_temp16_8x16b, 2); /* col=6*/ 1868 src_temp13_8x16b = _mm_srli_si128(src_temp17_8x16b, 2); /* col=7*/ 1869 src_temp14_8x16b = _mm_srli_si128(src_temp18_8x16b, 2); /* col=8*/ 1870 1871 src_temp11_8x16b = _mm_unpacklo_epi8(src_temp15_8x16b, src_temp11_8x16b); /* col=0*/ 1872 src_temp12_8x16b = _mm_unpacklo_epi8(src_temp16_8x16b, src_temp12_8x16b); /* col=1*/ 1873 src_temp13_8x16b = _mm_unpacklo_epi8(src_temp17_8x16b, src_temp13_8x16b); 
/* col=2*/ 1874 src_temp14_8x16b = _mm_unpacklo_epi8(src_temp18_8x16b, src_temp14_8x16b); /* col=3*/ 1875 1876 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ 1877 src_temp1_8x16b = _mm_maddubs_epi16(src_temp1_8x16b, temp1_8x16b); 1878 src_temp2_8x16b = _mm_maddubs_epi16(src_temp2_8x16b, temp2_8x16b); 1879 src_temp3_8x16b = _mm_maddubs_epi16(src_temp3_8x16b, temp3_8x16b); 1880 src_temp4_8x16b = _mm_maddubs_epi16(src_temp4_8x16b, temp4_8x16b); 1881 1882 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ 1883 src_temp11_8x16b = _mm_maddubs_epi16(src_temp11_8x16b, temp11_8x16b); 1884 src_temp12_8x16b = _mm_maddubs_epi16(src_temp12_8x16b, temp12_8x16b); 1885 src_temp13_8x16b = _mm_maddubs_epi16(src_temp13_8x16b, temp13_8x16b); 1886 src_temp14_8x16b = _mm_maddubs_epi16(src_temp14_8x16b, temp14_8x16b); 1887 1888 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 1889 src_temp1_8x16b = _mm_add_epi16(src_temp1_8x16b, const_temp_4x32b); 1890 src_temp2_8x16b = _mm_add_epi16(src_temp2_8x16b, const_temp_4x32b); 1891 src_temp3_8x16b = _mm_add_epi16(src_temp3_8x16b, const_temp_4x32b); 1892 src_temp4_8x16b = _mm_add_epi16(src_temp4_8x16b, const_temp_4x32b); 1893 1894 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 1895 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, 5); /* col=0*/ 1896 src_temp2_8x16b = _mm_srai_epi16(src_temp2_8x16b, 5); /* col=1*/ 1897 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, 5); /* col=2*/ 1898 src_temp4_8x16b = _mm_srai_epi16(src_temp4_8x16b, 5); /* col=3*/ 1899 1900 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 1901 src_temp11_8x16b = _mm_add_epi16(src_temp11_8x16b, const_temp_4x32b); 1902 src_temp12_8x16b = _mm_add_epi16(src_temp12_8x16b, const_temp_4x32b); 1903 src_temp13_8x16b = _mm_add_epi16(src_temp13_8x16b, const_temp_4x32b); 1904 src_temp14_8x16b = _mm_add_epi16(src_temp14_8x16b, const_temp_4x32b); 1905 1906 
/*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 1907 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, 5); /* col=5*/ 1908 src_temp12_8x16b = _mm_srai_epi16(src_temp12_8x16b, 5); /* col=6*/ 1909 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, 5); /* col=7*/ 1910 src_temp14_8x16b = _mm_srai_epi16(src_temp14_8x16b, 5); /* col=8*/ 1911 1912 /* converting 16 bit to 8 bit */ 1913 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, zero_8x16b); /* col=0*/ 1914 src_temp2_8x16b = _mm_packus_epi16(src_temp2_8x16b, zero_8x16b); /* col=1*/ 1915 src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, zero_8x16b); /* col=2*/ 1916 src_temp4_8x16b = _mm_packus_epi16(src_temp4_8x16b, zero_8x16b); /* col=3*/ 1917 1918 /* converting 16 bit to 8 bit */ 1919 src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, zero_8x16b); /* col=5*/ 1920 src_temp12_8x16b = _mm_packus_epi16(src_temp12_8x16b, zero_8x16b); /* col=6*/ 1921 src_temp13_8x16b = _mm_packus_epi16(src_temp13_8x16b, zero_8x16b); /* col=7*/ 1922 src_temp14_8x16b = _mm_packus_epi16(src_temp14_8x16b, zero_8x16b); /* col=8*/ 1923 1924 src_temp5_8x16b = _mm_unpacklo_epi16(src_temp1_8x16b, src_temp2_8x16b); 1925 src_temp6_8x16b = _mm_unpacklo_epi16(src_temp3_8x16b, src_temp4_8x16b); 1926 1927 src_temp8_8x16b = _mm_unpacklo_epi32(src_temp5_8x16b, src_temp6_8x16b); 1928 src_temp7_8x16b = _mm_unpackhi_epi32(src_temp5_8x16b, src_temp6_8x16b); 1929 1930 src_temp15_8x16b = _mm_unpacklo_epi16(src_temp11_8x16b, src_temp12_8x16b); 1931 src_temp16_8x16b = _mm_unpacklo_epi16(src_temp13_8x16b, src_temp14_8x16b); 1932 1933 src_temp18_8x16b = _mm_unpacklo_epi32(src_temp15_8x16b, src_temp16_8x16b); 1934 src_temp17_8x16b = _mm_unpackhi_epi32(src_temp15_8x16b, src_temp16_8x16b); 1935 1936 src_temp11_8x16b = _mm_unpacklo_epi64(src_temp8_8x16b, src_temp18_8x16b); 1937 src_temp12_8x16b = _mm_unpackhi_epi64(src_temp8_8x16b, src_temp18_8x16b); 1938 src_temp13_8x16b = _mm_unpacklo_epi64(src_temp7_8x16b, 
src_temp17_8x16b);
                src_temp14_8x16b = _mm_unpackhi_epi64(src_temp7_8x16b, src_temp17_8x16b);

                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * row)), src_temp11_8x16b);       /* row=0*/
                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 1))), src_temp12_8x16b); /* row=1*/
                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 2))), src_temp13_8x16b); /* row=2*/
                _mm_storeu_si128((__m128i *)(pu1_dst + col + (dst_strd * (row + 3))), src_temp14_8x16b); /* row=3*/

            }
        }
    }
}

/**
*******************************************************************************
*
* @brief
*    Intraprediction for mode 19 to 25  (negative angle, vertical mode ) with
*    reference  neighboring samples location pointed by 'pu1_ref' to the  TU
*    block location pointed by 'pu1_dst'
*
* @par Description:
*
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[in] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intraprediction mode
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_intra_pred_chroma_mode_19_to_25_ssse3(UWORD8 *pu1_ref,
                                                 WORD32 src_strd,
                                                 UWORD8 *pu1_dst,
                                                 WORD32 dst_strd,
                                                 WORD32 nt,
                                                 WORD32 mode)
{
    WORD32 row, k;
    WORD32 intra_pred_ang, idx;
    WORD32 inv_ang, inv_ang_sum, pos, fract;
    WORD32 ref_main_idx, ref_idx;
    /* Scratch buffer holding the extended main reference row; chroma samples
       are interleaved Cb/Cr pairs, hence the factor-of-2 sizing/indexing
       throughout. */
    UWORD8 ref_temp[(2 * MAX_CU_SIZE) + 2];
    UWORD8 *ref_main;

    __m128i zero_8x16b, fract_8x16b, const_temp_8x16b;
    UNUSED(src_strd);

    intra_pred_ang = gai4_ihevc_ang_table_chroma[mode];
    inv_ang = gai4_ihevc_inv_ang_table_chroma[mode - 12];

    /* Intermediate reference samples for negative angle modes */
    /* This have to be removed during optimization*/
    /* For these vertical modes (19-25): ref main = above row, ref side = left column */
    /* Copy the above reference row (starting at pu1_ref[4*nt]) into ref_temp,
       two bytes (one Cb/Cr pair) per step. */
    ref_main = ref_temp + 2 * nt;
    for(k = 0; k < (2 * (nt + 1)); k += 2)
    {
        ref_temp[k + (2 * (nt - 1))] = pu1_ref[(4 * nt) + k];
        ref_temp[k + 1 + (2 * (nt - 1))] = pu1_ref[(4 * nt) + k + 1];
    }

    ref_idx = (nt * intra_pred_ang) >> 5; /* negative for these modes */
    inv_ang_sum = 128;
    ref_main = ref_temp + (2 * (nt - 1));
    /* SIMD Optimization can be done using look-up table for the loop */
    /* For negative angled derive the main reference samples from side */
    /*  reference samples refer to section 8.4.4.2.6 */
    /* Projects left-column samples onto negative indices of ref_main using the
       inverse-angle accumulator (inv_ang_sum >> 8 selects the side sample). */
    for(k = -2; k > (2 * ref_idx); k -= 2)
    {
        inv_ang_sum += inv_ang;
        ref_main[k] = pu1_ref[(4 * nt) - (inv_ang_sum >> 8) * 2];
        ref_main[k + 1] = pu1_ref[((4 * nt) + 1) - (inv_ang_sum >> 8) * 2];
    }

    /* Rounding offset for the ((32-fract)*a + fract*b + 16) >> 5 interpolation */
    const_temp_8x16b = _mm_set1_epi16(16);

    if(nt == 4) /* if nt =4*/
    {
        __m128i const_temp2_4x32b, const_temp3_4x32b;
        __m128i src_values10, src_values11, zero_8x16b, intra_pred_ang_4x32b;
        __m128i row_4x32b, two_nt_4x32b, src_values12;


        const_temp2_4x32b = _mm_set1_epi32(31);
        const_temp3_4x32b = _mm_set1_epi32(32);

        two_nt_4x32b = _mm_set1_epi32(2);

        zero_8x16b = _mm_set1_epi16(0);

        /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */
        intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang);
        row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1);

        {
            WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4;
            WORD8 ai1_src_temp0_val[16], ai1_src_temp1_val[16];

            __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b, res_temp5_4x32b;
            __m128i src_values0, src_values1, src_values2, src_values3, src_values13;
            __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b;
            __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2, sign_8x16b;

            /* pos = ((row + 1) * intra_pred_ang); */
            /* 16-bit multiply, then sign-extend to 32 bits (pos may be negative). */
            res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b);
            sign_8x16b = _mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b);
            res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b);

            /* ref_main_idx = 2 + 2*(pos >> 5): doubled because samples are Cb/Cr pairs */
            src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5));
            src_values12 = _mm_add_epi32(src_values12, _mm_srai_epi32(res_temp5_4x32b, 5));

            ref_main_temp0 = _mm_srli_si128(src_values12, 4);  /* next 32 bit values */
            ref_main_temp1 = _mm_srli_si128(src_values12, 8);  /* next 32 bit values */
            ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */
            ref_main_idx1 = _mm_cvtsi128_si32(src_values12);   /* row=0*/
            ref_main_idx2 = _mm_cvtsi128_si32(ref_main_temp0); /* row=1*/
            ref_main_idx3 = _mm_cvtsi128_si32(ref_main_temp1); /* row=2*/
            ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* row=3*/

            /* fract = pos & (31); */
            src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b);

            /*(32 - fract) */
            src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11);

            _mm_storeu_si128((__m128i *)(ai1_src_temp1_val), src_values11);
            _mm_storeu_si128((__m128i *)(ai1_src_temp0_val), src_values10);

            fract1_8x16b = _mm_set1_epi8(ai1_src_temp1_val[0]);  /* row=0*/
            fract2_8x16b = _mm_set1_epi8(ai1_src_temp1_val[4]);  /* row=1*/
            fract3_8x16b = _mm_set1_epi8(ai1_src_temp1_val[8]);  /* row=2*/
            fract4_8x16b = _mm_set1_epi8(ai1_src_temp1_val[12]); /* row=3*/

            temp1_8x16b = _mm_set1_epi8(ai1_src_temp0_val[0]);  /* row=0*/
            temp2_8x16b = _mm_set1_epi8(ai1_src_temp0_val[4]);  /* row=1*/
            temp3_8x16b = _mm_set1_epi8(ai1_src_temp0_val[8]);  /* row=2*/
            temp4_8x16b = _mm_set1_epi8(ai1_src_temp0_val[12]); /* row=3*/

            /* Interleave (32-fract, fract) weight bytes for _mm_maddubs_epi16 */
            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);
            temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b);
            temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b);
            temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b);

            // inner loop starts from here
            src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1)); /* row=0 */
            src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx2)); /* row=1 */
            src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx3)); /* row=2 */
            src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx4)); /* row=3 */

            /* Shift by 2 bytes = next Cb/Cr pair; pairing sample i with sample i+1 */
            src_values10 = _mm_srli_si128(src_values0, 2);
            src_values11 = _mm_srli_si128(src_values1, 2);
            src_values12 = _mm_srli_si128(src_values2, 2);
            src_values13 = _mm_srli_si128(src_values3, 2);

            src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
            src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
            src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
            src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);

            /* (32-fract)*ref[idx] + fract*ref[idx+1] per 16-bit lane */
            src_values0 = _mm_maddubs_epi16(src_values0, temp1_8x16b);
            src_values1 = _mm_maddubs_epi16(src_values1, temp2_8x16b);
            src_values2 = _mm_maddubs_epi16(src_values2, temp3_8x16b);
            src_values3 = _mm_maddubs_epi16(src_values3, temp4_8x16b);

            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);

            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
            src_values0 = _mm_srai_epi16(src_values0, 5);
            src_values1 = _mm_srai_epi16(src_values1, 5);
            src_values2 = _mm_srai_epi16(src_values2, 5);
            src_values3 = _mm_srai_epi16(src_values3, 5);

            /* converting 16 bit to 8 bit */
            src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
            src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
            src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
            src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);

            /* store 8 predicted bytes (4 Cb/Cr pairs) per row */
            _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_values0);   /* row=0*/
            _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_values1); /* row=1*/
            _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_values2); /* row=2*/
            _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_values3); /* row=3*/

        }
    }
    else if(nt == 8) /* nt == 8: two rows (16 interleaved samples each) per iteration */
    {
        WORD32 ref_main_idx1, fract1, temp, temp1;
        __m128i fract1_8x16b, temp_8x16b, temp1_8x16b;

        zero_8x16b = _mm_set1_epi16(0);

        for(row = 0; row < nt; row += 2)
        {
            __m128i src_values0, src_values1, src_values2, src_values3;
            __m128i src_values10, src_values11, src_values12, src_values13;

            /* Scalar index/weight computation for row */
            pos = ((row + 1) * intra_pred_ang);
            idx = pos >> 5;
            fract = pos & (31);
            temp = 32 - fract;
            ref_main_idx = 2 * idx + 2; /* col from 0-15 */

            /* ... and for row+1 */
            pos = ((row + 2) * intra_pred_ang);
            idx = pos >> 5;
            fract1 = pos & (31);
            temp1 = 32 - fract1;
            ref_main_idx1 = 2 * idx + 2; /* col from 0-15 */

            fract_8x16b = _mm_set1_epi8(fract);
            fract1_8x16b = _mm_set1_epi8(fract1);
            temp_8x16b = _mm_set1_epi8(temp);
            temp1_8x16b = _mm_set1_epi8(temp1);

            /* Interleave (32-fract, fract) weight bytes for _mm_maddubs_epi16 */
            temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);
            temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b);

            /* row=0 */
            src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx));     /* col = 0-7 */
            src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 8)); /* col = 8-15 */

            /* row=1 */
            src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1));     /* col = 0-7 */
            src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx1 + 8)); /* col = 8-15 */

            /* Pair each Cb/Cr sample with its right neighbour for interpolation */
            src_values10 = _mm_srli_si128(src_values0, 2);
            src_values11 = _mm_srli_si128(src_values1, 2);
            src_values12 = _mm_srli_si128(src_values2, 2);
            src_values13 = _mm_srli_si128(src_values3, 2);

            src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
            src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
            src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
            src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);

            src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
            src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);

            src_values2 = _mm_maddubs_epi16(src_values2, temp1_8x16b);
            src_values3 = _mm_maddubs_epi16(src_values3, temp1_8x16b);

            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);

            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);

            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
            src_values0 = _mm_srai_epi16(src_values0, 5);
            src_values1 = _mm_srai_epi16(src_values1, 5);

            src_values2 = _mm_srai_epi16(src_values2, 5);
            src_values3 = _mm_srai_epi16(src_values3, 5);

            /* converting 16 bit to 8 bit */
            src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
            src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);

            src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
            src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);

            /* storing 8-bit 8 pixels values */
            _mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
            _mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);

            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), src_values2);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + 8), src_values3);

            pu1_dst += 2 * dst_strd;
        }
    }
    else if(nt == 16)
    {
        WORD32 temp;
        /* unroll the col loop (inner): one row of 32 interleaved samples per iteration */
        zero_8x16b = _mm_set1_epi16(0);

        for(row = 0; row < nt; row += 1)
        {
            __m128i src_values0, src_values1, src_values2, src_values3, temp_8x16b;
            __m128i src_values10, src_values11, src_values12, src_values13;

            pos = ((row + 1) * intra_pred_ang);
            idx = pos >> 5;
            fract = pos & (31);
            temp = 32 - fract;
            ref_main_idx = 2 * idx + 2; /* col from 0-31 */

            fract_8x16b = _mm_set1_epi8(fract);
            temp_8x16b = _mm_set1_epi8(temp);

            /* Interleave (32-fract, fract) weight bytes for _mm_maddubs_epi16 */
            temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b);

            src_values0 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx));      /* col = 0-7 */
            src_values1 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 8));  /* col = 8-15 */
            src_values2 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 16)); /* col = 16-23 */
            src_values3 = _mm_loadu_si128((__m128i *)(ref_main + ref_main_idx + 24)); /* col = 24-31 */

            /* Pair each Cb/Cr sample with its right neighbour for interpolation */
            src_values10 = _mm_srli_si128(src_values0, 2);
            src_values11 = _mm_srli_si128(src_values1, 2);
            src_values12 = _mm_srli_si128(src_values2, 2);
            src_values13 = _mm_srli_si128(src_values3, 2);

            src_values0 = _mm_unpacklo_epi8(src_values0, src_values10);
            src_values1 = _mm_unpacklo_epi8(src_values1, src_values11);
            src_values2 = _mm_unpacklo_epi8(src_values2, src_values12);
            src_values3 = _mm_unpacklo_epi8(src_values3, src_values13);

            /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */
            src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b);
            src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b);
            src_values2 = _mm_maddubs_epi16(src_values2, temp_8x16b);
            src_values3 = _mm_maddubs_epi16(src_values3, temp_8x16b);

            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/
            src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b);
            src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b);
            src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b);
            src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b);

            /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/
            src_values0 = _mm_srai_epi16(src_values0, 5);
            src_values1 = _mm_srai_epi16(src_values1, 5);
            src_values2 = _mm_srai_epi16(src_values2, 5);
            src_values3 = _mm_srai_epi16(src_values3, 5);

            /* converting 16 bit to 8 bit */
            src_values0 = _mm_packus_epi16(src_values0, zero_8x16b);
            src_values1 = _mm_packus_epi16(src_values1, zero_8x16b);
            src_values2 = _mm_packus_epi16(src_values2, zero_8x16b);
            src_values3 = _mm_packus_epi16(src_values3, zero_8x16b);

            /* storing 8-bit 8 pixels values */
            _mm_storel_epi64((__m128i *)(pu1_dst), src_values0);
            _mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1);
            _mm_storel_epi64((__m128i *)(pu1_dst + 16), src_values2);
            _mm_storel_epi64((__m128i *)(pu1_dst + 24), src_values3);

            pu1_dst += dst_strd;

        }
    }
}


/**
*******************************************************************************
*
* @brief
*    Intraprediction for mode 27 to 33  (positive angle, vertical mode ) with
*    reference  neighboring samples location pointed by 'pu1_ref' to the  TU
*    block location pointed by 'pu1_dst'
*
* @par Description:
*
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[in] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in]
dst_strd 2328 * integer destination stride 2329 * 2330 * @param[in] nt 2331 * integer Transform Block size 2332 * 2333 * @param[in] mode 2334 * integer intraprediction mode 2335 * 2336 * @returns 2337 * 2338 * @remarks 2339 * None 2340 * 2341 ******************************************************************************* 2342 */ 2343 2344 void ihevc_intra_pred_chroma_mode_27_to_33_ssse3(UWORD8 *pu1_ref, 2345 WORD32 src_strd, 2346 UWORD8 *pu1_dst, 2347 WORD32 dst_strd, 2348 WORD32 nt, 2349 WORD32 mode) 2350 { 2351 WORD32 row; 2352 WORD32 pos, fract; 2353 WORD32 intra_pred_ang; 2354 WORD32 idx, ref_main_idx; 2355 2356 __m128i zero_8x16b, fract_8x16b, const_temp_8x16b; 2357 UNUSED(src_strd); 2358 2359 intra_pred_ang = gai4_ihevc_ang_table_chroma[mode]; 2360 const_temp_8x16b = _mm_set1_epi16(16); 2361 2362 if(nt == 4) /* if nt =4*/ 2363 { 2364 __m128i const_temp2_4x32b, const_temp3_4x32b; 2365 __m128i src_values10, src_values11, zero_8x16b, intra_pred_ang_4x32b; 2366 __m128i row_4x32b, two_nt_4x32b, src_values12; 2367 2368 const_temp2_4x32b = _mm_set1_epi32(31); 2369 const_temp3_4x32b = _mm_set1_epi32(32); 2370 2371 two_nt_4x32b = _mm_set1_epi32((4 * nt) + 2); 2372 2373 zero_8x16b = _mm_set1_epi16(0); 2374 2375 /* intra_pred_ang = gai4_ihevc_ang_table[mode]; */ 2376 intra_pred_ang_4x32b = _mm_set1_epi16(intra_pred_ang); 2377 row_4x32b = _mm_set_epi16(4, 3, 2, 1, 4, 3, 2, 1); 2378 2379 { 2380 WORD32 ref_main_idx1, ref_main_idx2, ref_main_idx3, ref_main_idx4; 2381 WORD8 ai1_src_temp0_val[16], ai1_src_temp1_val[16]; 2382 2383 __m128i fract1_8x16b, fract2_8x16b, fract3_8x16b, fract4_8x16b, res_temp5_4x32b; 2384 __m128i src_values0, src_values1, src_values2, src_values3, src_values13; 2385 __m128i temp1_8x16b, temp2_8x16b, temp3_8x16b, temp4_8x16b; 2386 __m128i ref_main_temp0, ref_main_temp1, ref_main_temp2, sign_8x16b; 2387 2388 /* pos = ((row + 1) * intra_pred_ang); */ 2389 res_temp5_4x32b = _mm_mullo_epi16(row_4x32b, intra_pred_ang_4x32b); 2390 sign_8x16b = 
_mm_cmpgt_epi16(zero_8x16b, res_temp5_4x32b); 2391 res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, sign_8x16b); 2392 2393 src_values12 = _mm_add_epi32(two_nt_4x32b, _mm_srai_epi32(res_temp5_4x32b, 5)); 2394 src_values12 = _mm_add_epi32(src_values12, _mm_srai_epi32(res_temp5_4x32b, 5)); 2395 2396 ref_main_temp0 = _mm_srli_si128(src_values12, 4); /* next 32 bit values */ 2397 ref_main_temp1 = _mm_srli_si128(src_values12, 8); /* next 32 bit values */ 2398 ref_main_temp2 = _mm_srli_si128(src_values12, 12); /* next 32 bit values */ 2399 ref_main_idx1 = _mm_cvtsi128_si32(src_values12); /* row=0*/ 2400 ref_main_idx2 = _mm_cvtsi128_si32(ref_main_temp0); /* row=1*/ 2401 ref_main_idx3 = _mm_cvtsi128_si32(ref_main_temp1); /* row=2*/ 2402 ref_main_idx4 = _mm_cvtsi128_si32(ref_main_temp2); /* row=3*/ 2403 2404 /* fract = pos & (31); */ 2405 src_values11 = _mm_and_si128(res_temp5_4x32b, const_temp2_4x32b); 2406 2407 /*(32 - fract) */ 2408 src_values10 = _mm_sub_epi32(const_temp3_4x32b, src_values11); 2409 2410 _mm_storeu_si128((__m128i *)(ai1_src_temp1_val), src_values11); 2411 _mm_storeu_si128((__m128i *)(ai1_src_temp0_val), src_values10); 2412 2413 fract1_8x16b = _mm_set1_epi8(ai1_src_temp1_val[0]); /* row=0*/ 2414 fract2_8x16b = _mm_set1_epi8(ai1_src_temp1_val[4]); /* row=1*/ 2415 fract3_8x16b = _mm_set1_epi8(ai1_src_temp1_val[8]); /* row=2*/ 2416 fract4_8x16b = _mm_set1_epi8(ai1_src_temp1_val[12]); /* row=3*/ 2417 2418 temp1_8x16b = _mm_set1_epi8(ai1_src_temp0_val[0]); /* row=0*/ 2419 temp2_8x16b = _mm_set1_epi8(ai1_src_temp0_val[4]); /* row=1*/ 2420 temp3_8x16b = _mm_set1_epi8(ai1_src_temp0_val[8]); /* row=2*/ 2421 temp4_8x16b = _mm_set1_epi8(ai1_src_temp0_val[12]); /* row=3*/ 2422 2423 temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b); 2424 temp2_8x16b = _mm_unpacklo_epi8(temp2_8x16b, fract2_8x16b); 2425 temp3_8x16b = _mm_unpacklo_epi8(temp3_8x16b, fract3_8x16b); 2426 temp4_8x16b = _mm_unpacklo_epi8(temp4_8x16b, fract4_8x16b); 2427 2428 // inner loop starts 
from here 2429 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1)); /* col = 0-7 */ 2430 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx2)); /* col = 8-15 */ 2431 src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx3)); /* col = 16-23 */ 2432 src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx4)); /* col = 24-31 */ 2433 2434 src_values10 = _mm_srli_si128(src_values0, 2); 2435 src_values11 = _mm_srli_si128(src_values1, 2); 2436 src_values12 = _mm_srli_si128(src_values2, 2); 2437 src_values13 = _mm_srli_si128(src_values3, 2); 2438 2439 src_values0 = _mm_unpacklo_epi8(src_values0, src_values10); 2440 src_values1 = _mm_unpacklo_epi8(src_values1, src_values11); 2441 src_values2 = _mm_unpacklo_epi8(src_values2, src_values12); 2442 src_values3 = _mm_unpacklo_epi8(src_values3, src_values13); 2443 2444 src_values0 = _mm_maddubs_epi16(src_values0, temp1_8x16b); 2445 src_values1 = _mm_maddubs_epi16(src_values1, temp2_8x16b); 2446 src_values2 = _mm_maddubs_epi16(src_values2, temp3_8x16b); 2447 src_values3 = _mm_maddubs_epi16(src_values3, temp4_8x16b); 2448 2449 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 2450 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); 2451 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); 2452 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); 2453 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); 2454 2455 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 2456 src_values0 = _mm_srai_epi16(src_values0, 5); 2457 src_values1 = _mm_srai_epi16(src_values1, 5); 2458 src_values2 = _mm_srai_epi16(src_values2, 5); 2459 src_values3 = _mm_srai_epi16(src_values3, 5); 2460 2461 /* converting 16 bit to 8 bit */ 2462 src_values0 = _mm_packus_epi16(src_values0, zero_8x16b); 2463 src_values1 = _mm_packus_epi16(src_values1, zero_8x16b); 2464 src_values2 = _mm_packus_epi16(src_values2, zero_8x16b); 2465 
src_values3 = _mm_packus_epi16(src_values3, zero_8x16b); 2466 2467 _mm_storel_epi64((__m128i *)(pu1_dst + (0 * dst_strd)), src_values0); /* row=0*/ 2468 _mm_storel_epi64((__m128i *)(pu1_dst + ((1) * dst_strd)), src_values1); /* row=1*/ 2469 _mm_storel_epi64((__m128i *)(pu1_dst + ((2) * dst_strd)), src_values2); /* row=2*/ 2470 _mm_storel_epi64((__m128i *)(pu1_dst + ((3) * dst_strd)), src_values3); /* row=3*/ 2471 2472 } 2473 } 2474 2475 else if(nt == 8) /* for nt = 16 case */ 2476 { 2477 WORD32 ref_main_idx1, fract1, temp, temp1; 2478 __m128i fract1_8x16b, temp_8x16b, temp1_8x16b; 2479 2480 zero_8x16b = _mm_set1_epi16(0); 2481 2482 for(row = 0; row < nt; row += 2) 2483 { 2484 __m128i src_values0, src_values1, src_values2, src_values3; 2485 __m128i src_values10, src_values11, src_values12, src_values13; 2486 2487 pos = ((row + 1) * intra_pred_ang); 2488 idx = pos >> 5; 2489 fract = pos & (31); 2490 temp = 32 - fract; 2491 ref_main_idx = (4 * nt) + 2 * idx + 2; /* col from 0-15 */ 2492 2493 pos = ((row + 2) * intra_pred_ang); 2494 idx = pos >> 5; 2495 fract1 = pos & (31); 2496 temp1 = 32 - fract1; 2497 ref_main_idx1 = (4 * nt) + 2 * idx + 2; /* col from 0-15 */ 2498 2499 fract_8x16b = _mm_set1_epi8(fract); 2500 fract1_8x16b = _mm_set1_epi8(fract1); 2501 temp_8x16b = _mm_set1_epi8(temp); 2502 temp1_8x16b = _mm_set1_epi8(temp1); 2503 2504 temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b); 2505 temp1_8x16b = _mm_unpacklo_epi8(temp1_8x16b, fract1_8x16b); 2506 2507 /* row=0 */ 2508 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx)); /* col = 0-7 */ 2509 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 8)); /* col = 8-15 */ 2510 2511 /* row=1 */ 2512 src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1)); /* col = 0-7 */ 2513 src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx1 + 8)); /* col = 8-15 */ 2514 2515 src_values10 = _mm_srli_si128(src_values0, 2); 2516 src_values11 = _mm_srli_si128(src_values1, 2); 
2517 src_values12 = _mm_srli_si128(src_values2, 2); 2518 src_values13 = _mm_srli_si128(src_values3, 2); 2519 2520 src_values0 = _mm_unpacklo_epi8(src_values0, src_values10); 2521 src_values1 = _mm_unpacklo_epi8(src_values1, src_values11); 2522 src_values2 = _mm_unpacklo_epi8(src_values2, src_values12); 2523 src_values3 = _mm_unpacklo_epi8(src_values3, src_values13); 2524 2525 src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b); 2526 src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b); 2527 2528 src_values2 = _mm_maddubs_epi16(src_values2, temp1_8x16b); 2529 src_values3 = _mm_maddubs_epi16(src_values3, temp1_8x16b); 2530 2531 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 2532 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); 2533 src_values1 = _mm_add_epi16(src_values1, const_temp_8x16b); 2534 2535 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); 2536 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); 2537 2538 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 2539 src_values0 = _mm_srai_epi16(src_values0, 5); 2540 src_values1 = _mm_srai_epi16(src_values1, 5); 2541 2542 src_values2 = _mm_srai_epi16(src_values2, 5); 2543 src_values3 = _mm_srai_epi16(src_values3, 5); 2544 2545 /* converting 16 bit to 8 bit */ 2546 src_values0 = _mm_packus_epi16(src_values0, zero_8x16b); 2547 src_values1 = _mm_packus_epi16(src_values1, zero_8x16b); 2548 2549 src_values2 = _mm_packus_epi16(src_values2, zero_8x16b); 2550 src_values3 = _mm_packus_epi16(src_values3, zero_8x16b); 2551 2552 /* loding 8-bit 8 pixels values */ 2553 _mm_storel_epi64((__m128i *)(pu1_dst), src_values0); 2554 _mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1); 2555 2556 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), src_values2); 2557 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + 8), src_values3); 2558 2559 pu1_dst += 2 * dst_strd; 2560 } 2561 } 2562 else if(nt == 16) 2563 { 2564 WORD32 temp; 
2565 /* unroll the col loop (inner) */ 2566 zero_8x16b = _mm_set1_epi16(0); 2567 2568 for(row = 0; row < nt; row += 1) 2569 { 2570 __m128i src_values0, src_values1, src_values2, src_values3, temp_8x16b; 2571 __m128i src_values10, src_values11, src_values12, src_values13; 2572 2573 pos = ((row + 1) * intra_pred_ang); 2574 idx = pos >> 5; 2575 fract = pos & (31); 2576 temp = 32 - fract; 2577 ref_main_idx = (4 * nt) + 2 * idx + 2; /* col from 0-31 */ 2578 2579 fract_8x16b = _mm_set1_epi8(fract); 2580 temp_8x16b = _mm_set1_epi8(temp); 2581 2582 temp_8x16b = _mm_unpacklo_epi8(temp_8x16b, fract_8x16b); 2583 2584 src_values0 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx)); /* col = 0-7 */ 2585 src_values1 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 8)); /* col = 8-15 */ 2586 src_values2 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 16)); /* col = 16-23 */ 2587 src_values3 = _mm_loadu_si128((__m128i *)(pu1_ref + ref_main_idx + 24)); /* col = 24-31 */ 2588 2589 src_values10 = _mm_srli_si128(src_values0, 2); 2590 src_values11 = _mm_srli_si128(src_values1, 2); 2591 src_values12 = _mm_srli_si128(src_values2, 2); 2592 src_values13 = _mm_srli_si128(src_values3, 2); 2593 2594 src_values0 = _mm_unpacklo_epi8(src_values0, src_values10); 2595 src_values1 = _mm_unpacklo_epi8(src_values1, src_values11); 2596 src_values2 = _mm_unpacklo_epi8(src_values2, src_values12); 2597 src_values3 = _mm_unpacklo_epi8(src_values3, src_values13); 2598 2599 /* fract*(pu1_ref[ref_main_idx + 1]- pu1_ref[ref_main_idx]) */ 2600 src_values0 = _mm_maddubs_epi16(src_values0, temp_8x16b); 2601 src_values1 = _mm_maddubs_epi16(src_values1, temp_8x16b); 2602 src_values2 = _mm_maddubs_epi16(src_values2, temp_8x16b); 2603 src_values3 = _mm_maddubs_epi16(src_values3, temp_8x16b); 2604 2605 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16)*/ 2606 src_values0 = _mm_add_epi16(src_values0, const_temp_8x16b); 2607 src_values1 = _mm_add_epi16(src_values1, 
const_temp_8x16b); 2608 src_values2 = _mm_add_epi16(src_values2, const_temp_8x16b); 2609 src_values3 = _mm_add_epi16(src_values3, const_temp_8x16b); 2610 2611 /*((32 - fract)* pu1_ref[ref_main_idx]+ fract * pu1_ref[ref_main_idx + 1] + 16) >>5*/ 2612 src_values0 = _mm_srai_epi16(src_values0, 5); 2613 src_values1 = _mm_srai_epi16(src_values1, 5); 2614 src_values2 = _mm_srai_epi16(src_values2, 5); 2615 src_values3 = _mm_srai_epi16(src_values3, 5); 2616 2617 /* converting 16 bit to 8 bit */ 2618 src_values0 = _mm_packus_epi16(src_values0, zero_8x16b); 2619 src_values1 = _mm_packus_epi16(src_values1, zero_8x16b); 2620 src_values2 = _mm_packus_epi16(src_values2, zero_8x16b); 2621 src_values3 = _mm_packus_epi16(src_values3, zero_8x16b); 2622 2623 /* loding 8-bit 8 pixels values */ 2624 _mm_storel_epi64((__m128i *)(pu1_dst), src_values0); 2625 _mm_storel_epi64((__m128i *)(pu1_dst + 8), src_values1); 2626 _mm_storel_epi64((__m128i *)(pu1_dst + 16), src_values2); 2627 _mm_storel_epi64((__m128i *)(pu1_dst + 24), src_values3); 2628 2629 pu1_dst += dst_strd; 2630 2631 } 2632 } 2633 } 2634