1 /****************************************************************************** 2 * 3 * Copyright (C) 2015 The Android Open Source Project 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ***************************************************************************** 18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore 19 */ 20 /** 21 ******************************************************************************* 22 * @file 23 * ih264_luma_intra_pred_filters_ssse3.c 24 * 25 * @brief 26 * Contains function definitions for luma intra prediction filters in x86 27 * intrinsics 28 * 29 * @author 30 * Ittiam 31 * 32 * @par List of Functions: 33 * - ih264_intra_pred_luma_4x4_mode_vert_ssse3 34 * - ih264_intra_pred_luma_4x4_mode_horz_ssse3 35 * - ih264_intra_pred_luma_4x4_mode_dc_ssse3 36 * - ih264_intra_pred_luma_4x4_mode_diag_dl_ssse3 37 * - ih264_intra_pred_luma_4x4_mode_diag_dr_ssse3 38 * - ih264_intra_pred_luma_4x4_mode_vert_r_ssse3 39 * - ih264_intra_pred_luma_4x4_mode_horz_d_ssse3 40 * - ih264_intra_pred_luma_4x4_mode_vert_l_ssse3 41 * - ih264_intra_pred_luma_4x4_mode_horz_u_ssse3 42 * - ih264_intra_pred_luma_8x8_mode_vert_ssse3 43 * - ih264_intra_pred_luma_8x8_mode_horz_ssse3 44 * - ih264_intra_pred_luma_8x8_mode_dc_ssse3 45 * - ih264_intra_pred_luma_8x8_mode_diag_dl_ssse3 46 * - ih264_intra_pred_luma_8x8_mode_diag_dr_ssse3 47 * - ih264_intra_pred_luma_8x8_mode_vert_r_ssse3 48 * - 
ih264_intra_pred_luma_8x8_mode_horz_d_ssse3 49 * - ih264_intra_pred_luma_8x8_mode_vert_l_ssse3 50 * - ih264_intra_pred_luma_8x8_mode_horz_u_ssse3 51 * - ih264_intra_pred_luma_16x16_mode_vert_ssse3 52 * - ih264_intra_pred_luma_16x16_mode_horz_ssse3 53 * - ih264_intra_pred_luma_16x16_mode_dc_ssse3 54 * - ih264_intra_pred_luma_16x16_mode_plane_ssse3 55 * 56 * @remarks 57 * None 58 * 59 ****************************************************************************** 60 */ 61 62 /*****************************************************************************/ 63 /* File Includes */ 64 /*****************************************************************************/ 65 /* System include files */ 66 #include <stdio.h> 67 #include <stddef.h> 68 #include <string.h> 69 #include <immintrin.h> 70 71 /* User include files */ 72 #include "ih264_defs.h" 73 #include "ih264_typedefs.h" 74 #include "ih264_macros.h" 75 #include "ih264_platform_macros.h" 76 #include "ih264_intra_pred_filters.h" 77 78 79 80 /******************* LUMA INTRAPREDICTION *******************/ 81 82 /******************* 4x4 Modes *******************/ 83 84 /** 85 ******************************************************************************* 86 * 87 * ih264_intra_pred_luma_4x4_mode_vert_ssse3 88 * 89 * @brief 90 * Perform Intra prediction for luma_4x4 mode:vertical 91 * 92 * @par Description: 93 * Perform Intra prediction for luma_4x4 mode:vertical ,described in sec 8.3.1.2.1 94 * 95 * @param[in] pu1_src 96 * UWORD8 pointer to the source 97 * 98 * @param[out] pu1_dst 99 * UWORD8 pointer to the destination 100 * 101 * @param[in] src_strd 102 * integer source stride 103 * 104 * @param[in] dst_strd 105 * integer destination stride 106 * 107 * @param[in] ngbr_avail 108 * availability of neighbouring pixels(Not used in this function) 109 * 110 * @returns 111 * 112 * @remarks 113 * None 114 * 115 ******************************************************************************* 116 */ 117 void 
ih264_intra_pred_luma_4x4_mode_vert_ssse3(UWORD8 *pu1_src, 118 UWORD8 *pu1_dst, 119 WORD32 src_strd, 120 WORD32 dst_strd, 121 WORD32 ngbr_avail) 122 { 123 UWORD8 *pu1_top; 124 WORD32 dst_strd2, dst_strd3; 125 WORD32 i4_top; 126 127 UNUSED(src_strd); 128 UNUSED(ngbr_avail); 129 130 pu1_top = pu1_src + BLK_SIZE + 1; 131 132 i4_top = *((WORD32 *)pu1_top); 133 134 dst_strd2 = dst_strd << 1; 135 dst_strd3 = dst_strd + dst_strd2; 136 137 *((WORD32 *)(pu1_dst)) = i4_top; 138 *((WORD32 *)(pu1_dst + dst_strd)) = i4_top; 139 *((WORD32 *)(pu1_dst + dst_strd2)) = i4_top; 140 *((WORD32 *)(pu1_dst + dst_strd3)) = i4_top; 141 } 142 143 /** 144 ******************************************************************************* 145 * 146 *ih264_intra_pred_luma_4x4_mode_horz_ssse3 147 * 148 * @brief 149 * Perform Intra prediction for luma_4x4 mode:horizontal 150 * 151 * @par Description: 152 * Perform Intra prediction for luma_4x4 mode:horizontal ,described in sec 8.3.1.2.2 153 * 154 * @param[in] pu1_src 155 * UWORD8 pointer to the source 156 * 157 * @param[out] pu1_dst 158 * UWORD8 pointer to the destination 159 * 160 * @param[in] src_strd 161 * integer source stride 162 * 163 * @param[in] dst_strd 164 * integer destination stride 165 * 166 * @param[in] ngbr_avail 167 * availability of neighbouring pixels(Not used in this function) 168 * 169 * @returns 170 * 171 * @remarks 172 * None 173 * 174 ******************************************************************************* 175 */ 176 void ih264_intra_pred_luma_4x4_mode_horz_ssse3(UWORD8 *pu1_src, 177 UWORD8 *pu1_dst, 178 WORD32 src_strd, 179 WORD32 dst_strd, 180 WORD32 ngbr_avail) 181 { 182 UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */ 183 WORD32 row1,row2,row3,row4; 184 UWORD8 val; 185 WORD32 dst_strd2, dst_strd3; 186 187 UNUSED(src_strd); 188 UNUSED(ngbr_avail); 189 pu1_left = pu1_src + BLK_SIZE - 1; 190 191 val = *pu1_left; 192 row1 = val + (val << 8) + (val << 16) + (val << 24); 193 val = *(pu1_left - 1); 194 
row2 = val + (val << 8) + (val << 16) + (val << 24); 195 val = *(pu1_left - 2); 196 row3 = val + (val << 8) + (val << 16) + (val << 24); 197 val = *(pu1_left - 3); 198 row4 = val + (val << 8) + (val << 16) + (val << 24); 199 200 dst_strd2 = dst_strd << 1; 201 dst_strd3 = dst_strd + dst_strd2; 202 203 *((WORD32 *)(pu1_dst)) = row1; 204 *((WORD32 *)(pu1_dst + dst_strd)) = row2; 205 *((WORD32 *)(pu1_dst + dst_strd2)) = row3; 206 *((WORD32 *)(pu1_dst + dst_strd3)) = row4; 207 } 208 209 /** 210 ******************************************************************************* 211 * 212 * ih264_intra_pred_luma_4x4_mode_dc_ssse3 213 * 214 * @brief 215 * Perform Intra prediction for luma_4x4 mode:DC 216 * 217 * @par Description: 218 * Perform Intra prediction for luma_4x4 mode:DC ,described in sec 8.3.1.2.3 219 * 220 * @param[in] pu1_src 221 * UWORD8 pointer to the source 222 * 223 * @param[out] pu1_dst 224 * UWORD8 pointer to the destination 225 * 226 * @param[in] src_strd 227 * integer source stride 228 * 229 * @param[in] dst_strd 230 * integer destination stride 231 * 232 * @param[in] ngbr_avail 233 * availability of neighbouring pixels 234 * 235 * @returns 236 * 237 * @remarks 238 * None 239 * 240 *******************************************************************************/ 241 void ih264_intra_pred_luma_4x4_mode_dc_ssse3(UWORD8 *pu1_src, 242 UWORD8 *pu1_dst, 243 WORD32 src_strd, 244 WORD32 dst_strd, 245 WORD32 ngbr_avail) 246 { 247 UWORD8 u1_useleft; /* availability of left predictors (only for DC) */ 248 UWORD8 u1_usetop; /* availability of top predictors (only for DC) */ 249 UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */ 250 UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */ 251 WORD32 dst_strd2, dst_strd3; 252 WORD32 val = 0; 253 UNUSED(src_strd); 254 UNUSED(ngbr_avail); 255 u1_useleft = BOOLEAN(ngbr_avail & LEFT_MB_AVAILABLE_MASK); 256 u1_usetop = BOOLEAN(ngbr_avail & TOP_MB_AVAILABLE_MASK); 257 pu1_top = pu1_src + BLK_SIZE + 1; 
258 pu1_left = pu1_src + BLK_SIZE - 1; 259 260 if(u1_useleft) 261 { 262 val += *pu1_left--; 263 val += *pu1_left--; 264 val += *pu1_left--; 265 val += *pu1_left + 2; 266 } 267 if(u1_usetop) 268 { 269 val += *pu1_top + *(pu1_top + 1) + *(pu1_top + 2) + *(pu1_top + 3) 270 + 2; 271 } 272 /* Since 2 is added if either left/top pred is there, 273 val still being zero implies both preds are not there */ 274 val = (val) ? (val >> (1 + u1_useleft + u1_usetop)) : 128; 275 276 val = val + (val << 8) + (val << 16) + (val << 24); 277 278 dst_strd2 = dst_strd << 1; 279 dst_strd3 = dst_strd + dst_strd2; 280 281 *((WORD32 *)(pu1_dst)) = val; 282 *((WORD32 *)(pu1_dst + dst_strd)) = val; 283 *((WORD32 *)(pu1_dst + dst_strd2)) = val; 284 *((WORD32 *)(pu1_dst + dst_strd3)) = val; 285 } 286 287 /** 288 ******************************************************************************* 289 * 290 * ih264_intra_pred_luma_4x4_mode_diag_dl_ssse3 291 * 292 * @brief 293 * Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Left 294 * 295 * @par Description: 296 * Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Left ,described in sec 8.3.1.2.4 297 * 298 * @param[in] pu1_src 299 * UWORD8 pointer to the source 300 * 301 * @param[out] pu1_dst 302 * UWORD8 pointer to the destination 303 * 304 * @param[in] src_strd 305 * integer source stride 306 * 307 * @param[in] dst_strd 308 * integer destination stride 309 * 310 * @param[in] ngbr_avail 311 * availability of neighbouring pixels(Not used in this function) 312 * 313 * @returns 314 * 315 * @remarks 316 * None 317 * 318 *******************************************************************************/ 319 void ih264_intra_pred_luma_4x4_mode_diag_dl_ssse3(UWORD8 *pu1_src, 320 UWORD8 *pu1_dst, 321 WORD32 src_strd, 322 WORD32 dst_strd, 323 WORD32 ngbr_avail) 324 { 325 UWORD8 *pu1_top; 326 WORD32 dst_strd2, dst_strd3; 327 328 __m128i top_16x8b, top_8x16b, top_sh_8x16b; 329 __m128i res1_8x16b, res2_8x16b, res_16x8b; 330 __m128i zero_vector, 
const_2_8x16b; 331 WORD32 row1,row2,row3,row4; 332 333 UNUSED(src_strd); 334 UNUSED(ngbr_avail); 335 336 pu1_top = pu1_src + BLK_SIZE + 1; 337 338 top_16x8b = _mm_loadl_epi64((__m128i *)pu1_top); 339 zero_vector = _mm_setzero_si128(); 340 top_8x16b = _mm_unpacklo_epi8(top_16x8b, zero_vector); //t0 t1 t2 t3 t4 t5 t6 t7 341 342 top_sh_8x16b = _mm_srli_si128(top_8x16b, 2); //t1 t2 t3 t4 t5 t6 t7 0 343 const_2_8x16b = _mm_set1_epi16(2); 344 345 top_sh_8x16b = _mm_shufflehi_epi16(top_sh_8x16b, 0xa4); //t1 t2 t3 t4 t5 t6 t7 t7 346 res1_8x16b = _mm_add_epi16(top_8x16b, top_sh_8x16b); 347 res2_8x16b = _mm_srli_si128(res1_8x16b, 2); 348 349 res1_8x16b = _mm_add_epi16(res1_8x16b, const_2_8x16b); 350 res1_8x16b = _mm_add_epi16(res2_8x16b, res1_8x16b); 351 res1_8x16b = _mm_srai_epi16(res1_8x16b, 2); 352 353 dst_strd2 = dst_strd << 1; 354 dst_strd3 = dst_strd + dst_strd2; 355 356 res_16x8b = _mm_packus_epi16(res1_8x16b, res1_8x16b); 357 row1 = _mm_cvtsi128_si32(res_16x8b); 358 res_16x8b = _mm_srli_si128(res_16x8b, 1); 359 row2 = _mm_cvtsi128_si32(res_16x8b); 360 res_16x8b = _mm_srli_si128(res_16x8b, 1); 361 row3 = _mm_cvtsi128_si32(res_16x8b); 362 res_16x8b = _mm_srli_si128(res_16x8b, 1); 363 row4 = _mm_cvtsi128_si32(res_16x8b); 364 365 *((WORD32 *)(pu1_dst)) = row1; 366 *((WORD32 *)(pu1_dst + dst_strd)) = row2; 367 *((WORD32 *)(pu1_dst + dst_strd2)) = row3; 368 *((WORD32 *)(pu1_dst + dst_strd3)) = row4; 369 } 370 371 /** 372 ******************************************************************************* 373 * 374 * ih264_intra_pred_luma_4x4_mode_diag_dr_ssse3 375 * 376 * @brief 377 * Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Right 378 * 379 * @par Description: 380 * Perform Intra prediction for luma_4x4 mode:Diagonal_Down_Right ,described in sec 8.3.1.2.5 381 * 382 * @param[in] pu1_src 383 * UWORD8 pointer to the source 384 * 385 * @param[out] pu1_dst 386 * UWORD8 pointer to the destination 387 * 388 * @param[in] src_strd 389 * integer source stride 390 * 391 * 
@param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] ngbr_avail
 *  availability of neighbouring pixels(Not used in this function)
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************/
void ih264_intra_pred_luma_4x4_mode_diag_dr_ssse3(UWORD8 *pu1_src,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 src_strd,
                                                  WORD32 dst_strd,
                                                  WORD32 ngbr_avail)
{
    UWORD8 *pu1_left;
    WORD32 dst_strd2, dst_strd3;

    __m128i top_left_16x8b, top_left_8x16b;
    __m128i top_left_sh_16x8b, top_left_sh_8x16b;
    __m128i res1_8x16b, res2_8x16b;
    __m128i res1_16x8b, res2_16x8b;
    __m128i zero_vector, const_2_8x16b;
    WORD32 row1, row2, row3, row4;

    UNUSED(src_strd);
    UNUSED(ngbr_avail);

    /* pu1_left points at the top-most left sample (l0); the neighbour array
       holds l3 l2 l1 l0 tl t0 t1 ... contiguously, so one unaligned load
       picks up left, top-left and top samples together */
    pu1_left = pu1_src + BLK_SIZE - 1;

    top_left_16x8b = _mm_loadu_si128((__m128i *)(pu1_left - 3)); //l3 l2 l1 l0 tl t0 t1 t2...
    zero_vector = _mm_setzero_si128();
    top_left_sh_16x8b = _mm_srli_si128(top_left_16x8b, 1);       //l2 l1 l0 tl t0 t1 t2 t3...

    /* Widen to 16 bits so the (1,2,1) filter cannot overflow */
    top_left_8x16b = _mm_unpacklo_epi8(top_left_16x8b, zero_vector);
    top_left_sh_8x16b = _mm_unpacklo_epi8(top_left_sh_16x8b, zero_vector);

    res1_8x16b = _mm_add_epi16(top_left_8x16b, top_left_sh_8x16b); //l3+l2 l2+l1 l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2+t3...
    const_2_8x16b = _mm_set1_epi16(2);
    res2_8x16b = _mm_srli_si128(res1_8x16b, 2);                    //l2+l1 l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2+t3...

    /* (a + 2*b + c + 2) >> 2 across the whole diagonal run */
    res1_8x16b = _mm_add_epi16(res1_8x16b, const_2_8x16b);
    res1_8x16b = _mm_add_epi16(res2_8x16b, res1_8x16b);            //l3+2*l2+l1+2 l2+2*l1+l0+2...
    res1_8x16b = _mm_srai_epi16(res1_8x16b, 2);
    res1_16x8b = _mm_packus_epi16(res1_8x16b, res1_8x16b);

    dst_strd2 = dst_strd << 1;
    dst_strd3 = dst_strd + dst_strd2;

    /* Successive rows are the same filtered sequence, one byte less
       shifted per row (the diagonal moves down-right) */
    res2_16x8b = _mm_srli_si128(res1_16x8b, 3);

    row1 = _mm_cvtsi128_si32(res2_16x8b);
    res2_16x8b = _mm_srli_si128(res1_16x8b, 2);
    row2 = _mm_cvtsi128_si32(res2_16x8b);
    res2_16x8b = _mm_srli_si128(res1_16x8b, 1);
    row3 = _mm_cvtsi128_si32(res2_16x8b);
    row4 = _mm_cvtsi128_si32(res1_16x8b);

    *((WORD32 *)(pu1_dst)) = row1;
    *((WORD32 *)(pu1_dst + dst_strd)) = row2;
    *((WORD32 *)(pu1_dst + dst_strd2)) = row3;
    *((WORD32 *)(pu1_dst + dst_strd3)) = row4;
}

/**
 *******************************************************************************
 *
 * ih264_intra_pred_luma_4x4_mode_vert_r_ssse3
 *
 * @brief
 *  Perform Intra prediction for luma_4x4 mode:Vertical_Right
 *
 * @par Description:
 *  Perform Intra prediction for luma_4x4 mode:Vertical_Right, described in
 *  sec 8.3.1.2.6. Rows 1/2 use the 2-tap (1,1)/2 average; rows 3/4 use the
 *  3-tap (1,2,1)/4 filter of the neighbours.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] ngbr_avail
 *  availability of neighbouring pixels(Not used in this function)
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************/
void ih264_intra_pred_luma_4x4_mode_vert_r_ssse3(UWORD8 *pu1_src,
                                                 UWORD8 *pu1_dst,
                                                 WORD32 src_strd,
                                                 WORD32 dst_strd,
                                                 WORD32 ngbr_avail)
{
    UWORD8 *pu1_left;
    WORD32 dst_strd2, dst_strd3;

    __m128i val_16x8b, temp_16x8b;
    __m128i w11_a1_16x8b, w11_a2_16x8b;
    __m128i w121_a1_8x16b, w121_a2_8x16b, w121_sh_8x16b;
    __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;
    __m128i zero_vector, const_2_8x16b;
    WORD32 row1, row2, row3, row4;

    UNUSED(src_strd);
    UNUSED(ngbr_avail);

    pu1_left = pu1_src + BLK_SIZE - 1;

    /* Load l2 l1 l0 tl t0 t1 t2 t3 in one shot */
    val_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 2));
    zero_vector = _mm_setzero_si128();

    /* Two parallel filter pipelines: 8-bit avg for the (1,1)/2 taps and
       widened 16-bit adds for the (1,2,1)/4 taps */
    w121_a1_8x16b = _mm_unpacklo_epi8(val_16x8b, zero_vector);  //l2 l1 l0 tl t0 t1 t2 t3
    w11_a1_16x8b = _mm_srli_si128(val_16x8b, 3);
    w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2);           //l1 l0 tl t0 t1 t2 t3 0
    w11_a2_16x8b = _mm_srli_si128(val_16x8b, 4);

    w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b); //l2+l1 l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2+t3 t3
    row1_16x8b = _mm_avg_epu8(w11_a1_16x8b, w11_a2_16x8b);       /* row1: (tl+t0+1)>>1, (t0+t1+1)>>1 ... */
    w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2);            //l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2+t3 t3 0

    const_2_8x16b = _mm_set1_epi16(2);
    w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b); //l2+2*l1+l0 l1+2*l0+tl ...
    w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, const_2_8x16b);
    w121_a1_8x16b = _mm_srai_epi16(w121_a1_8x16b, 2);

    /* Reorder so the zVR==-1 sample (l0+2*tl+t0)>>2 leads row 2/4 */
    w121_sh_8x16b = _mm_shufflelo_epi16(w121_a1_8x16b, 0xe1);
    w121_sh_8x16b = _mm_srli_si128(w121_sh_8x16b, 2);

    row4_16x8b = _mm_packus_epi16(w121_sh_8x16b, w121_sh_8x16b);
    temp_16x8b = _mm_slli_si128(w121_a1_8x16b, 13);
    row2_16x8b = _mm_srli_si128(row4_16x8b, 1);
    /* row3 = row1 shifted right one sample with the (l1+2*l0+tl)>>2
       edge value spliced in front via alignr */
    row3_16x8b = _mm_alignr_epi8(row1_16x8b, temp_16x8b, 15);

    dst_strd2 = dst_strd << 1;
    dst_strd3 = dst_strd + dst_strd2;

    row1 = _mm_cvtsi128_si32(row1_16x8b);
    row2 = _mm_cvtsi128_si32(row2_16x8b);
    row3 = _mm_cvtsi128_si32(row3_16x8b);
    row4 = _mm_cvtsi128_si32(row4_16x8b);

    *((WORD32 *)(pu1_dst)) = row1;
    *((WORD32 *)(pu1_dst + dst_strd)) = row2;
    *((WORD32 *)(pu1_dst + dst_strd2)) = row3;
    *((WORD32 *)(pu1_dst + dst_strd3)) = row4;
}

/**
 *******************************************************************************
 *
 * ih264_intra_pred_luma_4x4_mode_horz_d_ssse3
 *
 * @brief
 *
Perform Intra prediction for luma_4x4 mode:Horizontal_Down
 *
 * @par Description:
 *  Perform Intra prediction for luma_4x4 mode:Horizontal_Down, described
 *  in sec 8.3.1.2.7. Output samples interleave 2-tap averages and 3-tap
 *  filtered values of the left/top-left/top neighbours.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] ngbr_avail
 *  availability of neighbouring pixels(Not used in this function)
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************/
void ih264_intra_pred_luma_4x4_mode_horz_d_ssse3(UWORD8 *pu1_src,
                                                 UWORD8 *pu1_dst,
                                                 WORD32 src_strd,
                                                 WORD32 dst_strd,
                                                 WORD32 ngbr_avail)
{
    UWORD8 *pu1_left;
    WORD32 dst_strd2, dst_strd3;
    WORD32 val_121_t0t1;

    __m128i val_16x8b, val_sh_16x8b;
    __m128i w11_16x8b;
    __m128i w121_a1_8x16b, w121_a2_8x16b, w121_16x8b;
    __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;

    __m128i zero_vector, const_2_8x16b;
    WORD32 row1, row2, row3, row4;

    UNUSED(src_strd);
    UNUSED(ngbr_avail);

    pu1_left = pu1_src + BLK_SIZE - 1;

    /* Load l3 l2 l1 l0 tl t0 t1 t2 */
    val_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 3));
    zero_vector = _mm_setzero_si128();
    val_sh_16x8b = _mm_srli_si128(val_16x8b, 1);
    /* (1,1)/2 taps: pairwise rounded averages of adjacent neighbours */
    w11_16x8b = _mm_avg_epu8(val_16x8b, val_sh_16x8b);

    /* (1,2,1)/4 taps, computed at 16-bit precision */
    w121_a1_8x16b = _mm_unpacklo_epi8(val_16x8b, zero_vector);   //l3 l2 l1 l0 tl t0 t1 t2
    w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2);            //l2 l1 l0 tl t0 t1 t2 0
    w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b); //l3+l2 l2+l1 l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2
    w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2);            //l2+l1 l1+l0 l0+tl tl+t0 t0+t1 t1+t2 t2 0

    zero_vector = _mm_setzero_si128(); /* (redundant re-init, kept as-is) */
    const_2_8x16b = _mm_set1_epi16(2);

    w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b); //l3+2*l2+l1 l2+2*l1+l0 l1+2*l0+tl ...
    w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, const_2_8x16b);
    w121_a1_8x16b = _mm_srai_epi16(w121_a1_8x16b, 2);

    w121_16x8b = _mm_packus_epi16(w121_a1_8x16b, w121_a1_8x16b);

    /* Interleave avg/filter outputs; patch in the (tl+2*t0+t1)>>2 and
       (t0+2*t1+t2)>>2 pair needed at the right edge of row 1 */
    row4_16x8b = _mm_unpacklo_epi8(w11_16x8b, w121_16x8b);
    val_121_t0t1 = _mm_extract_epi16(w121_16x8b, 2);
    row4_16x8b = _mm_insert_epi16(row4_16x8b, val_121_t0t1, 4);

    dst_strd2 = dst_strd << 1;
    dst_strd3 = dst_strd + dst_strd2;

    /* Each row up is the same interleaved sequence shifted 2 bytes */
    row1_16x8b = _mm_srli_si128(row4_16x8b, 6);
    row2_16x8b = _mm_srli_si128(row4_16x8b, 4);
    row3_16x8b = _mm_srli_si128(row4_16x8b, 2);

    row1 = _mm_cvtsi128_si32(row1_16x8b);
    row2 = _mm_cvtsi128_si32(row2_16x8b);
    row3 = _mm_cvtsi128_si32(row3_16x8b);
    row4 = _mm_cvtsi128_si32(row4_16x8b);

    *((WORD32 *)(pu1_dst)) = row1;
    *((WORD32 *)(pu1_dst + dst_strd)) = row2;
    *((WORD32 *)(pu1_dst + dst_strd2)) = row3;
    *((WORD32 *)(pu1_dst + dst_strd3)) = row4;
}

/**
 *******************************************************************************
 *
 * ih264_intra_pred_luma_4x4_mode_vert_l_ssse3
 *
 * @brief
 *  Perform Intra prediction for luma_4x4 mode:Vertical_Left
 *
 * @par Description:
 *  Perform Intra prediction for luma_4x4 mode:Vertical_Left, described in
 *  sec 8.3.1.2.8. Odd rows use (1,1)/2 averages of the top neighbours,
 *  even rows use the (1,2,1)/4 filter.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] ngbr_avail
 *  availability of neighbouring pixels(Not used in this function)
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************/
void ih264_intra_pred_luma_4x4_mode_vert_l_ssse3(UWORD8 *pu1_src,
                                                 UWORD8 *pu1_dst,
                                                 WORD32 src_strd,
                                                 WORD32 dst_strd,
                                                 WORD32 ngbr_avail)
{
    UWORD8 *pu1_top;
    WORD32 dst_strd2, dst_strd3;

    __m128i val_16x8b, val_sh_16x8b;
    __m128i w121_a1_8x16b, w121_a2_8x16b;
    __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;

    __m128i zero_vector, const_2_8x16b;
    WORD32 row1, row2, row3, row4;

    UNUSED(src_strd);
    UNUSED(ngbr_avail);

    pu1_top = pu1_src + BLK_SIZE + 1;

    val_16x8b = _mm_loadl_epi64((__m128i *)pu1_top);
    zero_vector = _mm_setzero_si128();
    val_sh_16x8b = _mm_srli_si128(val_16x8b, 1);
    /* Row 1: (t[i] + t[i+1] + 1) >> 1 */
    row1_16x8b = _mm_avg_epu8(val_16x8b, val_sh_16x8b);

    /* Row 2: (t[i] + 2*t[i+1] + t[i+2] + 2) >> 2, at 16-bit precision */
    w121_a1_8x16b = _mm_unpacklo_epi8(val_16x8b, zero_vector);   //t0 t1 t2 t3 t4 t5...
    w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2);            //t1 t2 t3 t4 t5 t6...
    w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b); //t0+t1 t1+t2 t2+t3 t3+t4 t4+t5...
    w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2);            //t1+t2 t2+t3 t3+t4 t4+t5 t5+t6...

    zero_vector = _mm_setzero_si128(); /* (redundant re-init, kept as-is) */
    const_2_8x16b = _mm_set1_epi16(2);

    w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b); //t0+2*t1+t2 t1+2*t2+t3 t2+2*t3+t4...
    w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, const_2_8x16b);
    w121_a1_8x16b = _mm_srai_epi16(w121_a1_8x16b, 2);

    row2_16x8b = _mm_packus_epi16(w121_a1_8x16b, w121_a1_8x16b);

    dst_strd2 = dst_strd << 1;
    dst_strd3 = dst_strd + dst_strd2;

    /* Rows 3/4 are rows 1/2 advanced one sample to the right */
    row3_16x8b = _mm_srli_si128(row1_16x8b, 1);
    row4_16x8b = _mm_srli_si128(row2_16x8b, 1);

    row1 = _mm_cvtsi128_si32(row1_16x8b);
    row2 = _mm_cvtsi128_si32(row2_16x8b);
    row3 = _mm_cvtsi128_si32(row3_16x8b);
    row4 = _mm_cvtsi128_si32(row4_16x8b);

    *((WORD32 *)(pu1_dst)) = row1;
    *((WORD32 *)(pu1_dst + dst_strd)) = row2;
    *((WORD32 *)(pu1_dst + dst_strd2)) = row3;
    *((WORD32 *)(pu1_dst + dst_strd3)) = row4;
}

/**
 *******************************************************************************
 *
 * ih264_intra_pred_luma_4x4_mode_horz_u_ssse3
 *
 * @brief
 *  Perform Intra prediction for luma_4x4 mode:Horizontal_Up
 *
 * @par Description:
 *  Perform Intra prediction for luma_4x4 mode:Horizontal_Up, described in
 *  sec 8.3.1.2.9. The left neighbours are reversed into ascending order,
 *  then 2-tap and 3-tap filtered values are interleaved; trailing samples
 *  saturate to l3 per the spec.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] ngbr_avail
 *  availability of neighbouring pixels(Not used in this function)
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************/
void ih264_intra_pred_luma_4x4_mode_horz_u_ssse3(UWORD8 *pu1_src,
                                                 UWORD8 *pu1_dst,
                                                 WORD32 src_strd,
                                                 WORD32 dst_strd,
                                                 WORD32 ngbr_avail)
{
    UWORD8 *pu1_left;
    WORD32 dst_strd2, dst_strd3;

    __m128i val_16x8b, val_sh_16x8b;
    __m128i w11_16x8b;
    __m128i w121_a1_8x16b, w121_a2_8x16b, w121_16x8b;
    __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;

    __m128i zero_vector, const_2_8x16b, rev_16x8b;
    WORD32 row1, row2, row3, row4;

    UNUSED(src_strd);
    UNUSED(ngbr_avail);

    pu1_left = pu1_src + BLK_SIZE - 1;

    zero_vector = _mm_setzero_si128();
    /* Shuffle mask: reverse the 4 left samples, then repeat l3 (index 0
       of the loaded order) so the filter saturates at the bottom sample */
    rev_16x8b = _mm_setr_epi8(3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

    val_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 3));      //l3 l2 l1 l0 0 0 0...
    val_16x8b = _mm_shuffle_epi8(val_16x8b, rev_16x8b);          //l0 l1 l2 l3 l3 l3 l3...

    val_sh_16x8b = _mm_srli_si128(val_16x8b, 1);
    /* (1,1)/2 taps */
    w11_16x8b = _mm_avg_epu8(val_16x8b, val_sh_16x8b);

    /* (1,2,1)/4 taps at 16-bit precision */
    w121_a1_8x16b = _mm_unpacklo_epi8(val_16x8b, zero_vector);   //l0 l1 l2 l3 l3 l3...
    w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2);            //l1 l2 l3 l3 l3 l3...

    w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b); //l0+l1 l1+l2 l2+l3 2*l3 2*l3...
    w121_a2_8x16b = _mm_srli_si128(w121_a1_8x16b, 2);            //l1+l2 l2+l3 2*l3 2*l3 2*l3...

    zero_vector = _mm_setzero_si128(); /* (redundant re-init, kept as-is) */
    const_2_8x16b = _mm_set1_epi16(2);

    w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, w121_a2_8x16b); //l0+2*l1+l2 l1+2*l2+l3 l2+3*l3 4*l3 4*l3...
    w121_a1_8x16b = _mm_add_epi16(w121_a1_8x16b, const_2_8x16b);
    w121_a1_8x16b = _mm_srai_epi16(w121_a1_8x16b, 2);

    w121_16x8b = _mm_packus_epi16(w121_a1_8x16b, w121_a1_8x16b);

    dst_strd2 = dst_strd << 1;
    dst_strd3 = dst_strd + dst_strd2;

    /* Interleave avg/filter outputs; each row down shifts by one pair */
    row1_16x8b = _mm_unpacklo_epi8(w11_16x8b, w121_16x8b);
    row2_16x8b = _mm_srli_si128(row1_16x8b, 2);
    row3_16x8b = _mm_srli_si128(row1_16x8b, 4);
    row4_16x8b = _mm_srli_si128(row1_16x8b, 6);

    row1 = _mm_cvtsi128_si32(row1_16x8b);
    row2 = _mm_cvtsi128_si32(row2_16x8b);
    row3 = _mm_cvtsi128_si32(row3_16x8b);
    row4 = _mm_cvtsi128_si32(row4_16x8b);

    *((WORD32 *)(pu1_dst)) = row1;
    *((WORD32 *)(pu1_dst + dst_strd)) = row2;
    *((WORD32 *)(pu1_dst + dst_strd2)) = row3;
    *((WORD32 *)(pu1_dst + dst_strd3)) = row4;
}

/******************* 8x8 Modes *******************/

/**
 *******************************************************************************
 *
 * ih264_intra_pred_luma_8x8_mode_vert_ssse3
 *
 * @brief
 *  Perform Intra prediction for luma_8x8 mode:vertical
 *
 * @par Description:
 *  Perform Intra prediction for luma_8x8 mode:vertical, described in
 *  sec 8.3.2.2.2. The eight top neighbour samples are replicated into each
 *  of the eight output rows.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] ngbr_avail
 *  availability of neighbouring pixels(Not used in this function)
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
void ih264_intra_pred_luma_8x8_mode_vert_ssse3(UWORD8 *pu1_src,
                                               UWORD8 *pu1_dst,
                                               WORD32 src_strd,
                                               WORD32 dst_strd,
                                               WORD32 ngbr_avail)
{
    UWORD8 *pu1_top = NULL;
    __m128i top_8x8b;

    UNUSED(src_strd);
    UNUSED(ngbr_avail);

    pu1_top = pu1_src + BLK8x8SIZE + 1;

    /* One 64-bit load of the top row, stored into all eight rows */
    top_8x8b = _mm_loadl_epi64((__m128i *)pu1_top);

    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), top_8x8b);
    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), top_8x8b);
    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), top_8x8b);
    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), top_8x8b);
    _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), top_8x8b);
    _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), top_8x8b);
    _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), top_8x8b);
    _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), top_8x8b);
}

/**
 *******************************************************************************
 *
 * ih264_intra_pred_luma_8x8_mode_horz_ssse3
 *
 * @brief
 *  Perform Intra prediction for luma_8x8 mode:horizontal
 *
 * @par Description:
 *  Perform Intra prediction for luma_8x8 mode:horizontal, described in
 *  sec 8.3.2.2.2. Each output row is filled with the corresponding left
 *  neighbour sample.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] ngbr_avail
 *  availability of neighbouring pixels(Not used in this function)
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
void ih264_intra_pred_luma_8x8_mode_horz_ssse3(UWORD8 *pu1_src,
                                               UWORD8 *pu1_dst,
                                               WORD32 src_strd,
                                               WORD32 dst_strd,
                                               WORD32 ngbr_avail)
{
    /* Left neighbours are stored in reverse order; index 0 is the top-most */
    UWORD8 *pu1_left = pu1_src + BLK8x8SIZE - 1;
    __m128i row1_8x8b, row2_8x8b, row3_8x8b, row4_8x8b;
    __m128i row5_8x8b, row6_8x8b, row7_8x8b, row8_8x8b;

    UNUSED(src_strd);
    UNUSED(ngbr_avail);

    /* Broadcast each left sample across a full row */
    row1_8x8b = _mm_set1_epi8(pu1_left[0]);
    row2_8x8b = _mm_set1_epi8(pu1_left[-1]);
    row3_8x8b = _mm_set1_epi8(pu1_left[-2]);
    row4_8x8b = _mm_set1_epi8(pu1_left[-3]);
    row5_8x8b = _mm_set1_epi8(pu1_left[-4]);
    row6_8x8b = _mm_set1_epi8(pu1_left[-5]);
    row7_8x8b = _mm_set1_epi8(pu1_left[-6]);
    row8_8x8b = _mm_set1_epi8(pu1_left[-7]);

    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), row1_8x8b);
    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), row2_8x8b);
    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), row3_8x8b);
    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), row4_8x8b);
    _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), row5_8x8b);
    _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), row6_8x8b);
    _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), row7_8x8b);
    _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), row8_8x8b);
}

/**
 *******************************************************************************
 *
 * ih264_intra_pred_luma_8x8_mode_dc_ssse3
 *
 * @brief
 *  Perform Intra prediction for luma_8x8 mode:DC
 *
 * @par Description:
 *  Perform Intra prediction for luma_8x8 mode:DC, described in
 *  sec 8.3.2.2.4. Uses _mm_sad_epu8 against zero to sum eight neighbour
 *  bytes in one instruction; falls back to 128 when no neighbours exist.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] ngbr_avail
 *  availability of neighbouring pixels
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************/
void ih264_intra_pred_luma_8x8_mode_dc_ssse3(UWORD8 *pu1_src,
                                             UWORD8 *pu1_dst,
                                             WORD32 src_strd,
                                             WORD32 dst_strd,
                                             WORD32 ngbr_avail)
{
    UWORD8 u1_useleft; /* availability of left predictors (only for DC) */
    UWORD8 u1_usetop; /* availability of top predictors (only for DC) */
    UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */
    UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */
    __m128i dc_val_8x8b;
    WORD32 dc_val = 0;

    UNUSED(src_strd);

    u1_useleft = BOOLEAN(ngbr_avail & LEFT_MB_AVAILABLE_MASK);
    u1_usetop = BOOLEAN(ngbr_avail & TOP_MB_AVAILABLE_MASK);
    pu1_top = pu1_src + BLK8x8SIZE + 1;
    pu1_left = pu1_src + BLK8x8SIZE - 1;

    if(u1_useleft || u1_usetop)
    {
        WORD32 shft = 2;
        __m128i val_8x8b, zero_8x8b, sum_8x16b;

        zero_8x8b = _mm_setzero_si128();

        if(u1_useleft)
        {
            /* SAD against zero = sum of the 8 left samples */
            val_8x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 7));
            sum_8x16b = _mm_sad_epu8(zero_8x8b, val_8x8b);

            shft++;     /* divide by 8 more */
            dc_val += 4; /* rounding offset */
            dc_val += _mm_extract_epi16(sum_8x16b, 0);
        }
        if(u1_usetop)
        {
            /* SAD against zero = sum of the 8 top samples */
            val_8x8b = _mm_loadl_epi64((__m128i *)pu1_top);
            sum_8x16b = _mm_sad_epu8(zero_8x8b, val_8x8b);

            shft++;
            dc_val += 4;
            dc_val += _mm_extract_epi16(sum_8x16b, 0);
        }
        dc_val = dc_val >> shft;
    }
    else
        dc_val = 128; /* no neighbours: mid-range default */

    dc_val_8x8b = _mm_set1_epi8(dc_val);

    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), dc_val_8x8b);
    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), dc_val_8x8b);
    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), dc_val_8x8b);
    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), dc_val_8x8b);
    _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), dc_val_8x8b);
    _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), dc_val_8x8b);
    _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), dc_val_8x8b);
    _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), dc_val_8x8b);
}

/**
 *******************************************************************************
 *
 * ih264_intra_pred_luma_8x8_mode_diag_dl_ssse3
 *
 * @brief
 *  Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Left
 *
 * @par Description:
 *  Perform Intra
prediction for luma_8x8 mode:Diagonal_Down_Left ,described in sec 8.3.2.2.5 1059 * 1060 * @param[in] pu1_src 1061 * UWORD8 pointer to the source 1062 * 1063 * @param[out] pu1_dst 1064 * UWORD8 pointer to the destination 1065 * 1066 * @param[in] src_strd 1067 * integer source stride 1068 * 1069 * @param[in] dst_strd 1070 * integer destination stride 1071 * 1072 * @param[in] ngbr_avail 1073 * availability of neighbouring pixels(Not used in this function) 1074 * 1075 * @returns 1076 * 1077 * @remarks 1078 * None 1079 * 1080 *******************************************************************************/ 1081 void ih264_intra_pred_luma_8x8_mode_diag_dl_ssse3(UWORD8 *pu1_src, 1082 UWORD8 *pu1_dst, 1083 WORD32 src_strd, 1084 WORD32 dst_strd, 1085 WORD32 ngbr_avail) 1086 { 1087 UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */ 1088 __m128i top_16x8; 1089 __m128i out_15x16; 1090 __m128i a0_8x16, a1_8x16, a2_8x16; 1091 __m128i temp1, temp2; 1092 __m128i res1_8x16, res2_8x16; 1093 __m128i zero = _mm_setzero_si128(); 1094 __m128i const_val2_8x16 = _mm_set1_epi16(2); 1095 1096 UNUSED(src_strd); 1097 UNUSED(ngbr_avail); 1098 1099 pu1_top = pu1_src + BLK8x8SIZE + 1; 1100 1101 top_16x8 = _mm_loadu_si128((__m128i *)(pu1_top)); 1102 1103 temp1 = _mm_srli_si128(top_16x8, 1); 1104 temp2 = _mm_srli_si128(top_16x8, 2); 1105 a0_8x16 = _mm_unpacklo_epi8(top_16x8, zero); 1106 a1_8x16 = _mm_unpacklo_epi8(temp1, zero); 1107 a2_8x16 = _mm_unpacklo_epi8(temp2, zero); 1108 1109 a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16); 1110 a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16); 1111 a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16); 1112 a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16); 1113 res1_8x16 = _mm_srai_epi16(a0_8x16, 2); 1114 1115 temp2 = _mm_srli_si128(top_16x8, 2); 1116 temp1 = _mm_srli_si128(top_16x8, 1); 1117 a2_8x16 = _mm_unpackhi_epi8(temp2, zero); 1118 a0_8x16 = _mm_unpackhi_epi8(top_16x8, zero); 1119 a2_8x16 = _mm_shufflehi_epi16(a2_8x16, 0x14); 1120 a1_8x16 = 
_mm_unpackhi_epi8(temp1, zero); 1121 1122 a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16); 1123 a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16); 1124 a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16); 1125 a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16); 1126 res2_8x16 = _mm_srai_epi16(a0_8x16, 2); 1127 1128 out_15x16 = _mm_packus_epi16(res1_8x16, res2_8x16); 1129 1130 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), out_15x16); 1131 out_15x16 = _mm_srli_si128(out_15x16, 1); 1132 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), out_15x16); 1133 out_15x16 = _mm_srli_si128(out_15x16, 1); 1134 _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), out_15x16); 1135 out_15x16 = _mm_srli_si128(out_15x16, 1); 1136 _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), out_15x16); 1137 out_15x16 = _mm_srli_si128(out_15x16, 1); 1138 _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), out_15x16); 1139 out_15x16 = _mm_srli_si128(out_15x16, 1); 1140 _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), out_15x16); 1141 out_15x16 = _mm_srli_si128(out_15x16, 1); 1142 _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), out_15x16); 1143 out_15x16 = _mm_srli_si128(out_15x16, 1); 1144 _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), out_15x16); 1145 } 1146 1147 /** 1148 ******************************************************************************* 1149 * 1150 * ih264_intra_pred_luma_8x8_mode_diag_dr_ssse3 1151 * 1152 * @brief 1153 * Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Right 1154 * 1155 * @par Description: 1156 * Perform Intra prediction for luma_8x8 mode:Diagonal_Down_Right ,described in sec 8.3.2.2.6 1157 * 1158 * @param[in] pu1_src 1159 * UWORD8 pointer to the source 1160 * 1161 * @param[out] pu1_dst 1162 * UWORD8 pointer to the destination 1163 * 1164 * @param[in] src_strd 1165 * integer source stride 1166 * 1167 * @param[in] dst_strd 1168 * integer destination stride 1169 * 1170 * @param[in] ngbr_avail 1171 * availability of neighbouring pixels(Not used in 
this function) 1172 * 1173 * @returns 1174 * 1175 * @remarks 1176 * None 1177 * 1178 *******************************************************************************/ 1179 void ih264_intra_pred_luma_8x8_mode_diag_dr_ssse3(UWORD8 *pu1_src, 1180 UWORD8 *pu1_dst, 1181 WORD32 src_strd, 1182 WORD32 dst_strd, 1183 WORD32 ngbr_avail) 1184 { 1185 UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */ 1186 UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */ 1187 __m128i top_8x8, left_16x8; 1188 __m128i out_15x16; 1189 __m128i a0_8x16, a1_8x16, a2_8x16; 1190 __m128i temp1, temp2; 1191 __m128i res1_8x16, res2_8x16; 1192 __m128i zero = _mm_setzero_si128(); 1193 __m128i const_val2_8x16 = _mm_set1_epi16(2); 1194 __m128i str_8x8; 1195 1196 UNUSED(src_strd); 1197 UNUSED(ngbr_avail); 1198 1199 pu1_left = pu1_src + BLK8x8SIZE - 1; 1200 pu1_top = pu1_src + BLK8x8SIZE + 1; 1201 1202 left_16x8 = _mm_loadu_si128((__m128i *)(pu1_left - 7)); 1203 1204 temp1 = _mm_srli_si128(left_16x8, 1); 1205 temp2 = _mm_srli_si128(left_16x8, 2); 1206 a0_8x16 = _mm_unpacklo_epi8(left_16x8, zero); 1207 a1_8x16 = _mm_unpacklo_epi8(temp1, zero); 1208 a2_8x16 = _mm_unpacklo_epi8(temp2, zero); 1209 1210 a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16); 1211 a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16); 1212 a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16); 1213 a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16); 1214 res1_8x16 = _mm_srai_epi16(a0_8x16, 2); 1215 1216 top_8x8 = _mm_loadu_si128((__m128i *)(pu1_top - 1)); 1217 1218 temp1 = _mm_srli_si128(top_8x8, 1); 1219 temp2 = _mm_srli_si128(top_8x8, 2); 1220 a0_8x16 = _mm_unpacklo_epi8(top_8x8, zero); 1221 a1_8x16 = _mm_unpacklo_epi8(temp1, zero); 1222 a2_8x16 = _mm_unpacklo_epi8(temp2, zero); 1223 1224 a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16); 1225 a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16); 1226 a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16); 1227 a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16); 1228 res2_8x16 = _mm_srai_epi16(a0_8x16, 2); 1229 1230 
out_15x16 = _mm_packus_epi16(res1_8x16, res2_8x16); 1231 1232 str_8x8 = _mm_srli_si128(out_15x16, 7); 1233 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), str_8x8); 1234 str_8x8 = _mm_srli_si128(out_15x16, 6); 1235 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), str_8x8); 1236 str_8x8 = _mm_srli_si128(out_15x16, 5); 1237 _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), str_8x8); 1238 str_8x8 = _mm_srli_si128(out_15x16, 4); 1239 _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), str_8x8); 1240 str_8x8 = _mm_srli_si128(out_15x16, 3); 1241 _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), str_8x8); 1242 str_8x8 = _mm_srli_si128(out_15x16, 2); 1243 _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), str_8x8); 1244 str_8x8 = _mm_srli_si128(out_15x16, 1); 1245 _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), str_8x8); 1246 _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), out_15x16); 1247 } 1248 1249 /** 1250 ******************************************************************************* 1251 * 1252 * ih264_intra_pred_luma_8x8_mode_vert_r_ssse3 1253 * 1254 * @brief 1255 * Perform Intra prediction for luma_8x8 mode:Vertical_Right 1256 * 1257 * @par Description: 1258 * Perform Intra prediction for luma_8x8 mode:Vertical_Right ,described in sec 8.3.2.2.7 1259 * 1260 * @param[in] pu1_src 1261 * UWORD8 pointer to the source 1262 * 1263 * @param[out] pu1_dst 1264 * UWORD8 pointer to the destination 1265 * 1266 * @param[in] src_strd 1267 * integer source stride 1268 * 1269 * @param[in] dst_strd 1270 * integer destination stride 1271 * 1272 * @param[in] ngbr_avail 1273 * availability of neighbouring pixels(Not used in this function) 1274 * 1275 * @returns 1276 * 1277 * @remarks 1278 * None 1279 * 1280 *******************************************************************************/ 1281 void ih264_intra_pred_luma_8x8_mode_vert_r_ssse3(UWORD8 *pu1_src, 1282 UWORD8 *pu1_dst, 1283 WORD32 src_strd, 1284 WORD32 dst_strd, 1285 WORD32 ngbr_avail) 1286 
{ 1287 UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */ 1288 UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */ 1289 __m128i top_8x8, left_16x8; 1290 __m128i out1_16x16, out2_16x16; 1291 __m128i a0_8x16, a1_8x16, a2_8x16; 1292 __m128i temp1, temp2; 1293 __m128i res1_8x16, res2_8x16, res3_8x16; 1294 __m128i zero = _mm_setzero_si128(); 1295 __m128i const_val2_8x16 = _mm_set1_epi16(2); 1296 __m128i str_8x8; 1297 __m128i mask = _mm_set1_epi32(0xFFFF); 1298 1299 UNUSED(src_strd); 1300 UNUSED(ngbr_avail); 1301 1302 pu1_left = pu1_src + BLK8x8SIZE - 1; 1303 pu1_top = pu1_src + BLK8x8SIZE + 1; 1304 1305 left_16x8 = _mm_loadu_si128((__m128i *)(pu1_left - 6)); 1306 1307 temp1 = _mm_srli_si128(left_16x8, 1); 1308 temp2 = _mm_srli_si128(left_16x8, 2); 1309 a0_8x16 = _mm_unpacklo_epi8(left_16x8, zero); 1310 a1_8x16 = _mm_unpacklo_epi8(temp1, zero); 1311 a2_8x16 = _mm_unpacklo_epi8(temp2, zero); 1312 1313 a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16); 1314 a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16); 1315 a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16); 1316 a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16); 1317 res1_8x16 = _mm_srai_epi16(a0_8x16, 2); 1318 1319 top_8x8 = _mm_loadu_si128((__m128i *)(pu1_top - 1)); 1320 1321 temp1 = _mm_srli_si128(top_8x8, 1); 1322 temp2 = _mm_srli_si128(top_8x8, 2); 1323 a0_8x16 = _mm_unpacklo_epi8(top_8x8, zero); 1324 a1_8x16 = _mm_unpacklo_epi8(temp1, zero); 1325 a2_8x16 = _mm_unpacklo_epi8(temp2, zero); 1326 1327 res3_8x16 = _mm_avg_epu16(a0_8x16, a1_8x16); 1328 1329 a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16); 1330 a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16); 1331 a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16); 1332 a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16); 1333 res2_8x16 = _mm_srai_epi16(a0_8x16, 2); 1334 1335 str_8x8 = _mm_packus_epi16(res3_8x16, zero); 1336 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), str_8x8); 1337 1338 temp1 = _mm_and_si128(res1_8x16, mask); 1339 temp1 = _mm_packs_epi32(temp1, temp1); 1340 
out1_16x16 = _mm_packus_epi16(temp1, res2_8x16); 1341 1342 res1_8x16 = _mm_slli_si128(res1_8x16, 2); 1343 temp1 = _mm_and_si128(res1_8x16, mask); 1344 temp1 = _mm_packs_epi32(temp1, temp1); 1345 out2_16x16 = _mm_packus_epi16(temp1, res3_8x16); 1346 1347 str_8x8 = _mm_srli_si128(out1_16x16, 7); 1348 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), str_8x8); 1349 1350 str_8x8 = _mm_srli_si128(out2_16x16, 7); 1351 _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), str_8x8); 1352 1353 str_8x8 = _mm_srli_si128(out1_16x16, 6); 1354 _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), str_8x8); 1355 1356 str_8x8 = _mm_srli_si128(out2_16x16, 6); 1357 _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), str_8x8); 1358 1359 str_8x8 = _mm_srli_si128(out1_16x16, 5); 1360 _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), str_8x8); 1361 1362 str_8x8 = _mm_srli_si128(out2_16x16, 5); 1363 _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), str_8x8); 1364 1365 str_8x8 = _mm_srli_si128(out1_16x16, 4); 1366 _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), str_8x8); 1367 } 1368 1369 /* 1370 ******************************************************************************* 1371 * 1372 * ih264_intra_pred_luma_8x8_mode_horz_d_ssse3 1373 * 1374 * @brief 1375 * Perform Intra prediction for luma_8x8 mode:Horizontal_Down 1376 * 1377 * @par Description: 1378 * Perform Intra prediction for luma_8x8 mode:Horizontal_Down ,described in sec 8.3.2.2.8 1379 * 1380 * @param[in] pu1_src 1381 * UWORD8 pointer to the source 1382 * 1383 * @param[out] pu1_dst 1384 * UWORD8 pointer to the destination 1385 * 1386 * @param[in] src_strd 1387 * integer source stride 1388 * 1389 * @param[in] dst_strd 1390 * integer destination stride 1391 * 1392 * @param[in] ngbr_avail 1393 * availability of neighbouring pixels(Not used in this function) 1394 * 1395 * @returns 1396 * 1397 * @remarks 1398 * None 1399 * 1400 *******************************************************************************/ 1401 
void ih264_intra_pred_luma_8x8_mode_horz_d_ssse3(UWORD8 *pu1_src,
                                                 UWORD8 *pu1_dst,
                                                 WORD32 src_strd,
                                                 WORD32 dst_strd,
                                                 WORD32 ngbr_avail)
{
    UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */
    __m128i pels_16x16;
    __m128i temp1, temp2, temp3, temp4;
    __m128i a0_8x16, a1_8x16, a2_8x16;
    __m128i zero = _mm_setzero_si128();
    __m128i const_val2_8x16 = _mm_set1_epi16(2);
    __m128i res1_8x16, res2_8x16;
    __m128i out1_16x16, out2_16x16;
    __m128i str_8x8;
    UNUSED(src_strd);
    UNUSED(ngbr_avail);

    pu1_left = pu1_src + BLK8x8SIZE - 1;

    /* Load left column plus top-left corner and top row in one vector */
    pels_16x16 = _mm_loadu_si128((__m128i *)(pu1_left - 7));

    /* Low half (left column): res1 = rounded two-tap average of        */
    /* adjacent samples, res2 = three-tap (a0 + 2*a1 + a2 + 2) >> 2.    */
    temp1 = _mm_srli_si128(pels_16x16, 1);
    temp2 = _mm_srli_si128(pels_16x16, 2);
    a0_8x16 = _mm_unpacklo_epi8(pels_16x16, zero);
    a1_8x16 = _mm_unpacklo_epi8(temp1, zero);
    a2_8x16 = _mm_unpacklo_epi8(temp2, zero);

    res1_8x16 = _mm_avg_epu16(a0_8x16, a1_8x16);

    a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16);
    a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16);
    a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16);
    a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16);
    res2_8x16 = _mm_srai_epi16(a0_8x16, 2);

    /* Interleave two-tap and three-tap results: consecutive output     */
    /* pixels of a row alternate between the two filters in this mode.  */
    temp3 = _mm_unpacklo_epi16(res1_8x16, res2_8x16);
    temp4 = _mm_unpackhi_epi16(res1_8x16, res2_8x16);
    out2_16x16 = _mm_packus_epi16(temp3, temp4);

    /* High half (corner + top row): three-tap filter only */
    a0_8x16 = _mm_unpackhi_epi8(pels_16x16, zero);
    a1_8x16 = _mm_unpackhi_epi8(temp1, zero);
    a2_8x16 = _mm_unpackhi_epi8(temp2, zero);

    a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16);
    a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16);
    a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16);
    a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16);
    res2_8x16 = _mm_srai_epi16(a0_8x16, 2);

    /* Combine the tail of the interleaved left results with the        */
    /* filtered top samples to form the upper four rows' source vector. */
    out1_16x16 = _mm_packus_epi16(res2_8x16, zero);
    temp1 = _mm_srli_si128(out2_16x16, 8);
    out1_16x16 = _mm_unpacklo_epi64(temp1, out1_16x16);

    /* Each row shifts two bytes (one filtered pair) less than the row  */
    /* above it.                                                        */
    str_8x8 = _mm_srli_si128(out1_16x16, 6);
    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), str_8x8);
    str_8x8 = _mm_srli_si128(out1_16x16, 4);
    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), str_8x8);
    str_8x8 = _mm_srli_si128(out1_16x16, 2);
    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), str_8x8);
    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), out1_16x16);

    str_8x8 = _mm_srli_si128(out2_16x16, 6);
    _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), str_8x8);
    str_8x8 = _mm_srli_si128(out2_16x16, 4);
    _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), str_8x8);
    str_8x8 = _mm_srli_si128(out2_16x16, 2);
    _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), str_8x8);
    _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), out2_16x16);
}

/**
 *******************************************************************************
 *
 * ih264_intra_pred_luma_8x8_mode_vert_l_ssse3
 *
 * @brief
 *  Perform Intra prediction for luma_8x8 mode:Vertical_Left
 *
 * @par Description:
 *  Perform Intra prediction for luma_8x8 mode:Vertical_Left, described in
 *  sec 8.3.2.2.9. Even rows use two-tap averages of the top row, odd rows
 *  three-tap filtered samples; both advance one byte every two rows.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride (unused)
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] ngbr_avail
 *  availability of neighbouring pixels (unused)
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************/
void ih264_intra_pred_luma_8x8_mode_vert_l_ssse3(UWORD8 *pu1_src,
                                                 UWORD8 *pu1_dst,
                                                 WORD32 src_strd,
                                                 WORD32 dst_strd,
                                                 WORD32 ngbr_avail)
{
    UWORD8 *pu1_top = NULL; /* Pointer to start of top predictors */
    __m128i top_16x16;
    __m128i temp1, temp2;
    __m128i a0_8x16, a1_8x16, a2_8x16;
    __m128i zero = _mm_setzero_si128();
    __m128i const_val2_8x16 = _mm_set1_epi16(2);
    __m128i res1_8x16, res2_8x16, res3_8x16, res4_8x16;
    __m128i out1_16x16, out2_16x16;
    UNUSED(src_strd);
    UNUSED(ngbr_avail);
    pu1_top = pu1_src + BLK8x8SIZE + 1;

    /* Low half of the 16 top neighbours: res1 = two-tap averages, */
    /* res2 = three-tap filtered samples.                          */
    top_16x16 = _mm_loadu_si128((__m128i *)(pu1_top));
    temp1 = _mm_srli_si128(top_16x16, 1);
    temp2 = _mm_srli_si128(top_16x16, 2);
    a0_8x16 = _mm_unpacklo_epi8(top_16x16, zero);
    a1_8x16 = _mm_unpacklo_epi8(temp1, zero);
    a2_8x16 = _mm_unpacklo_epi8(temp2, zero);

    res1_8x16 = _mm_avg_epu16(a0_8x16, a1_8x16);

    a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16);
    a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16);
    a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16);
    a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16);
    res2_8x16 = _mm_srai_epi16(a0_8x16, 2);

    /* High half: same two filters (res3, res4) */
    a0_8x16 = _mm_unpackhi_epi8(top_16x16, zero);
    a1_8x16 = _mm_unpackhi_epi8(temp1, zero);
    a2_8x16 = _mm_unpackhi_epi8(temp2, zero);

    res3_8x16 = _mm_avg_epu16(a0_8x16, a1_8x16);

    a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16);
    a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16);
    a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16);
    a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16);
    res4_8x16 = _mm_srai_epi16(a0_8x16, 2);

    /* out1 feeds even rows, out2 odd rows; both shift one byte every */
    /* two output rows.                                               */
    out1_16x16 = _mm_packus_epi16(res1_8x16, res3_8x16);
    out2_16x16 = _mm_packus_epi16(res2_8x16, res4_8x16);

    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), out1_16x16);
    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), out2_16x16);
    out1_16x16 = _mm_srli_si128(out1_16x16, 1);
    out2_16x16 = _mm_srli_si128(out2_16x16, 1);
    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), out1_16x16);
    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), out2_16x16);
    out1_16x16 = _mm_srli_si128(out1_16x16, 1);
    out2_16x16 = _mm_srli_si128(out2_16x16, 1);
    _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), out1_16x16);
    _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), out2_16x16);
    out1_16x16 = _mm_srli_si128(out1_16x16, 1);
    out2_16x16 = _mm_srli_si128(out2_16x16, 1);
    _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), out1_16x16);
    _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), out2_16x16);
}

/**
 *******************************************************************************
 *
 * ih264_intra_pred_luma_8x8_mode_horz_u_ssse3
 *
 * @brief
 *  Perform Intra prediction for luma_8x8 mode:Horizontal_Up
 *
 * @par Description:
 *  Perform Intra prediction for luma_8x8 mode:Horizontal_Up, described in
 *  sec 8.3.2.2.10. Interleaves two-tap and three-tap filtered left-column
 *  samples; the lowest rows saturate to the bottom-most left neighbour.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride (unused)
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] ngbr_avail
 *  availability of neighbouring pixels (unused)
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************/
void ih264_intra_pred_luma_8x8_mode_horz_u_ssse3(UWORD8 *pu1_src,
                                                 UWORD8 *pu1_dst,
                                                 WORD32 src_strd,
                                                 WORD32 dst_strd,
                                                 WORD32 ngbr_avail)
{
    UWORD8 *pu1_left = NULL; /* Pointer to start of left predictors */
    __m128i left_16x16;
    __m128i temp1, temp2;
    __m128i a0_8x16, a1_8x16, a2_8x16;
    __m128i zero = _mm_setzero_si128();
    __m128i const_val2_8x16 = _mm_set1_epi16(2);
    __m128i res1_8x16, res2_8x16;
    __m128i out1_16x16;
    __m128i str_8x8;
    __m128i shuffle_16x16; /* byte-reversal control for pshufb */
    UNUSED(src_strd);
    UNUSED(ngbr_avail);

    pu1_left = pu1_src + BLK8x8SIZE - 1;
    shuffle_16x16 = _mm_set_epi8(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
                                 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
                                 0x0E, 0x0F);

    /* The left column is stored bottom-to-top; a0 is shifted/shuffled   */
    /* so lane pairing matches the Horizontal_Up neighbour ordering.     */
    left_16x16 = _mm_loadu_si128((__m128i *)(pu1_left - 7));
    temp1 = _mm_srli_si128(left_16x16, 1);
    a0_8x16 = _mm_unpacklo_epi8(left_16x16, zero);
    a0_8x16 = _mm_slli_si128(a0_8x16, 2);
    a1_8x16 = _mm_unpacklo_epi8(left_16x16, zero);
    a0_8x16 = _mm_shufflelo_epi16(a0_8x16, 0xE5);
    a2_8x16 = _mm_unpacklo_epi8(temp1, zero);

    /* res1 = rounded two-tap averages, res2 = three-tap filtered values */
    res1_8x16 = _mm_avg_epu16(a0_8x16, a1_8x16);

    a0_8x16 = _mm_add_epi16(a0_8x16, a2_8x16);
    a1_8x16 = _mm_add_epi16(a1_8x16, a1_8x16);
    a0_8x16 = _mm_add_epi16(a0_8x16, const_val2_8x16);
    a0_8x16 = _mm_add_epi16(a0_8x16, a1_8x16);
    res2_8x16 = _mm_srai_epi16(a0_8x16, 2);

    /* Interleave the two filter outputs and reverse the byte order so   */
    /* the vector reads top-of-column first.                             */
    temp1 = _mm_unpacklo_epi16(res1_8x16, res2_8x16);
    temp2 = _mm_unpackhi_epi16(res1_8x16, res2_8x16);
    out1_16x16 = _mm_packus_epi16(temp1, temp2);
    out1_16x16 = _mm_shuffle_epi8(out1_16x16, shuffle_16x16);

    /* Rows 0-3: successive 2-byte advances through the filtered vector */
    str_8x8 = _mm_srli_si128(out1_16x16, 1);
    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), str_8x8);
    str_8x8 = _mm_srli_si128(out1_16x16, 3);
    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), str_8x8);
    str_8x8 = _mm_srli_si128(out1_16x16, 5);
    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), str_8x8);
    str_8x8 = _mm_srli_si128(out1_16x16, 7);
    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), str_8x8);
    /* Rows 4-7: pad with the bottom-most left neighbour (pu1_left[-7])  */
    /* so the remaining rows saturate to it as the shifts continue.      */
    temp1 = _mm_set1_epi8(pu1_left[-7]);
    str_8x8 = _mm_unpacklo_epi64(str_8x8, temp1);
    str_8x8 = _mm_srli_si128(str_8x8, 2);
    _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), str_8x8);
    str_8x8 = _mm_srli_si128(str_8x8, 2);
    _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), str_8x8);
    str_8x8 = _mm_srli_si128(str_8x8, 2);
    _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), str_8x8);
    str_8x8 = _mm_srli_si128(str_8x8, 2);
    _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), str_8x8);

}

/******************* 16x16 Modes *******************/

/**
 *******************************************************************************
 *
 * ih264_intra_pred_luma_16x16_mode_vert_ssse3
 *
 * @brief
 *  Perform Intra prediction for luma_16x16 mode:Vertical
 *
 * @par Description:
 *  Perform Intra prediction for luma_16x16 mode:Vertical, described in
 *  sec 8.3.3.1. The 16 top neighbours are copied into all 16 output rows.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride (unused)
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] ngbr_avail
 *  availability of neighbouring pixels (unused)
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************/
void ih264_intra_pred_luma_16x16_mode_vert_ssse3(UWORD8 *pu1_src,
                                                 UWORD8 *pu1_dst,
                                                 WORD32 src_strd,
                                                 WORD32 dst_strd,
                                                 WORD32 ngbr_avail)
{
    UWORD8 *pu1_top;
    WORD32 dst_strd2, dst_strd3, dst_strd4;

    __m128i top_16x8b;

    UNUSED(src_strd);
    UNUSED(ngbr_avail);

    pu1_top = pu1_src + MB_SIZE + 1;

    /* Precomputed multiples of the stride; rows are written in groups */
    /* of four between pointer bumps.                                  */
    dst_strd2 = dst_strd << 1;
    dst_strd4 = dst_strd << 2;

    top_16x8b = _mm_loadu_si128((__m128i *)pu1_top);

    dst_strd3 = dst_strd + dst_strd2;

    _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), top_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), top_16x8b);
    pu1_dst += dst_strd4;

    _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), top_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), top_16x8b);
    pu1_dst += dst_strd4;

    _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), top_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), top_16x8b);
    pu1_dst += dst_strd4;

    _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), top_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), top_16x8b);
}

/**
 *******************************************************************************
 *
 * ih264_intra_pred_luma_16x16_mode_horz_ssse3
 *
 * @brief
 *  Perform Intra prediction for luma_16x16 mode:Horizontal
 *
 * @par Description:
 *  Perform Intra prediction for luma_16x16 mode:Horizontal, described in
 *  sec 8.3.3.2. Row r is filled with the left neighbour pu1_left[-r]
 *  (left predictors are stored bottom-to-top below pu1_left).
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride (unused)
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] ngbr_avail
 *  availability of neighbouring pixels (unused)
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************/
void ih264_intra_pred_luma_16x16_mode_horz_ssse3(UWORD8 *pu1_src,
                                                 UWORD8 *pu1_dst,
                                                 WORD32 src_strd,
                                                 WORD32 dst_strd,
                                                 WORD32 ngbr_avail)
{
    UWORD8 *pu1_left;
    WORD32 dst_strd2, dst_strd3, dst_strd4;

    __m128i row1_16x8b, row2_16x8b, row3_16x8b, row4_16x8b;

    UNUSED(src_strd);
    UNUSED(ngbr_avail);

    pu1_left = pu1_src + MB_SIZE - 1;

    dst_strd4 = dst_strd << 2;

    dst_strd2 = dst_strd << 1;
    dst_strd3 = dst_strd4 - dst_strd;

    /* Broadcast each left neighbour across a full row, four rows at a time */
    row1_16x8b = _mm_set1_epi8(*(pu1_left));
    row2_16x8b = _mm_set1_epi8(*(pu1_left - 1));
    row3_16x8b = _mm_set1_epi8(*(pu1_left - 2));
    row4_16x8b = _mm_set1_epi8(*(pu1_left - 3));

    _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), row3_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), row4_16x8b);

    pu1_dst += dst_strd4;
    row1_16x8b = _mm_set1_epi8(*(pu1_left - 4));
    row2_16x8b = _mm_set1_epi8(*(pu1_left - 5));
    row3_16x8b = _mm_set1_epi8(*(pu1_left - 6));
    row4_16x8b = _mm_set1_epi8(*(pu1_left - 7));

    _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), row3_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), row4_16x8b);

    pu1_dst += dst_strd4;
    row1_16x8b = _mm_set1_epi8(*(pu1_left - 8));
    row2_16x8b = _mm_set1_epi8(*(pu1_left - 9));
    row3_16x8b = _mm_set1_epi8(*(pu1_left - 10));
    row4_16x8b = _mm_set1_epi8(*(pu1_left - 11));

    _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), row3_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), row4_16x8b);

    pu1_dst += dst_strd4;
    row1_16x8b = _mm_set1_epi8(*(pu1_left - 12));
    row2_16x8b = _mm_set1_epi8(*(pu1_left - 13));
    row3_16x8b = _mm_set1_epi8(*(pu1_left - 14));
    row4_16x8b = _mm_set1_epi8(*(pu1_left - 15));

    _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), row3_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), row4_16x8b);
}

/**
 *******************************************************************************
 *
 * ih264_intra_pred_luma_16x16_mode_dc_ssse3
 *
 * @brief
 *  Perform Intra prediction for luma_16x16 mode:DC
 *
 * @par Description:
 *  Perform Intra prediction for luma_16x16 mode:DC, described in sec 8.3.3.3.
 *  All 256 output pixels take the rounded mean of the available neighbours,
 *  or 128 when neither side is available.
 *
 * @param[in] pu1_src
 *  UWORD8 pointer to the source
 *
 * @param[out] pu1_dst
 *  UWORD8 pointer to the destination
 *
 * @param[in] src_strd
 *  integer source stride (unused)
 *
 * @param[in] dst_strd
 *  integer destination stride
 *
 * @param[in] ngbr_avail
 *  availability of neighbouring pixels
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************/
void ih264_intra_pred_luma_16x16_mode_dc_ssse3(UWORD8 *pu1_src,
                                               UWORD8 *pu1_dst,
                                               WORD32 src_strd,
                                               WORD32 dst_strd,
                                               WORD32 ngbr_avail)
{
    WORD8 u1_useleft, u1_usetop;
    WORD32 dc_val;

    WORD32 dst_strd2, dst_strd3, dst_strd4;

    __m128i dc_val_16x8b;

    UNUSED(src_strd);

    u1_useleft = BOOLEAN(ngbr_avail & LEFT_MB_AVAILABLE_MASK);
    u1_usetop = BOOLEAN(ngbr_avail & TOP_MB_AVAILABLE_MASK);

    if(u1_useleft || u1_usetop)
    {
        /* shft starts at 3 and gains 1 per available side:                  */
        /* one side  -> 16 pixels, shift 4, rounding bias 8                  */
        /* two sides -> 32 pixels, shift 5, rounding bias 8 + 8 = 16         */
        WORD32 shft;
        __m128i val_16x8b, zero_16x8b, sum_8x16b;

        dc_val = 0;
        shft = 3;

        zero_16x8b = _mm_setzero_si128();

        if(u1_useleft)
        {
            UWORD8 *pu1_left;

            pu1_left = pu1_src + MB_SIZE - 1;

            /* psadbw leaves two partial sums: lanes 0 and 4 */
            val_16x8b = _mm_loadu_si128((__m128i *)(pu1_left - 15));
            sum_8x16b = _mm_sad_epu8(zero_16x8b, val_16x8b);

            shft++;
            dc_val += 8;
            dc_val += _mm_extract_epi16(sum_8x16b, 0);
            dc_val += _mm_extract_epi16(sum_8x16b, 4);
        }
        if(u1_usetop)
        {
            UWORD8 *pu1_top;

            pu1_top = pu1_src + MB_SIZE + 1;

            val_16x8b = _mm_loadu_si128((__m128i *)pu1_top);
            sum_8x16b = _mm_sad_epu8(zero_16x8b, val_16x8b);

            shft++;
            dc_val += 8;
            dc_val += _mm_extract_epi16(sum_8x16b, 0);
            dc_val += _mm_extract_epi16(sum_8x16b, 4);
        }
        dc_val = dc_val >> shft;
    }
    else
        dc_val = 128; /* no neighbours: mid-grey per the spec */

    dc_val_16x8b = _mm_set1_epi8(dc_val);

    dst_strd2 = dst_strd << 1;
    dst_strd4 = dst_strd << 2;
    dst_strd3 = dst_strd + dst_strd2;

    _mm_storeu_si128((__m128i *)pu1_dst, dc_val_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), dc_val_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), dc_val_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), dc_val_16x8b);
    pu1_dst += dst_strd4;

    _mm_storeu_si128((__m128i *)pu1_dst, dc_val_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), dc_val_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), dc_val_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), dc_val_16x8b);
    pu1_dst += dst_strd4;

    _mm_storeu_si128((__m128i *)pu1_dst, dc_val_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), dc_val_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), dc_val_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), dc_val_16x8b);
    pu1_dst += dst_strd4;

    _mm_storeu_si128((__m128i *)pu1_dst, dc_val_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), dc_val_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd2), dc_val_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd3), dc_val_16x8b);
}

/**
 *******************************************************************************
 *
 * ih264_intra_pred_luma_16x16_mode_plane_ssse3
 *
 * @brief
 *  Perform Intra prediction for luma_16x16 mode:PLANE
 *
 * @par Description:
 *  Perform Intra prediction for luma_16x16 mode:PLANE, described in
 *  sec 8.3.3.4
 *
* @param[in] pu1_src 1981 * UWORD8 pointer to the source 1982 * 1983 * @param[out] pu1_dst 1984 * UWORD8 pointer to the destination 1985 * 1986 * @param[in] src_strd 1987 * integer source stride 1988 * 1989 * @param[in] dst_strd 1990 * integer destination stride 1991 * 1992 * @param[in] ngbr_avail 1993 * availability of neighbouring pixels(Not used in this function) 1994 * 1995 * @returns 1996 * 1997 * @remarks 1998 * None 1999 * 2000 *******************************************************************************/ 2001 void ih264_intra_pred_luma_16x16_mode_plane_ssse3(UWORD8 *pu1_src, 2002 UWORD8 *pu1_dst, 2003 WORD32 src_strd, 2004 WORD32 dst_strd, 2005 WORD32 ngbr_avail) 2006 { 2007 UWORD8 *pu1_left, *pu1_top; 2008 WORD32 a, b, c; 2009 2010 __m128i rev_8x16b, mul_8x16b, zero_16x8b; 2011 2012 UNUSED(src_strd); 2013 UNUSED(ngbr_avail); 2014 2015 pu1_top = pu1_src + MB_SIZE + 1; 2016 pu1_left = pu1_src + MB_SIZE - 1; 2017 2018 rev_8x16b = _mm_setr_epi16(0x0f0e, 0x0d0c, 0x0b0a, 0x0908, 0x0706, 0x0504, 0x0302, 0x0100); 2019 //used to reverse the order of 16-bit values in a vector 2020 2021 mul_8x16b = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); 2022 zero_16x8b = _mm_setzero_si128(); 2023 2024 //calculating a, b and c 2025 { 2026 WORD32 h, v; 2027 2028 __m128i h_val1_16x8b, h_val2_16x8b; 2029 __m128i h_val1_8x16b, h_val2_8x16b, h_val_4x32b; 2030 __m128i v_val1_16x8b, v_val2_16x8b; 2031 __m128i v_val1_8x16b, v_val2_8x16b, v_val_4x32b; 2032 __m128i hv_val_4x32b; 2033 2034 a = (pu1_top[15] + pu1_left[-15]) << 4; 2035 2036 h_val1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_top + 8)); 2037 h_val2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_top - 1)); 2038 v_val1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 15)); 2039 v_val2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 6)); 2040 2041 h_val1_8x16b = _mm_unpacklo_epi8(h_val1_16x8b, zero_16x8b); 2042 h_val2_8x16b = _mm_unpacklo_epi8(h_val2_16x8b, zero_16x8b); 2043 v_val1_8x16b = _mm_unpacklo_epi8(v_val1_16x8b, zero_16x8b); 2044 
v_val2_8x16b = _mm_unpacklo_epi8(v_val2_16x8b, zero_16x8b); 2045 2046 h_val2_8x16b = _mm_shuffle_epi8(h_val2_8x16b, rev_8x16b); 2047 v_val1_8x16b = _mm_shuffle_epi8(v_val1_8x16b, rev_8x16b); 2048 2049 h_val1_8x16b = _mm_sub_epi16(h_val1_8x16b, h_val2_8x16b); 2050 v_val1_8x16b = _mm_sub_epi16(v_val1_8x16b, v_val2_8x16b); 2051 2052 h_val_4x32b = _mm_madd_epi16(mul_8x16b, h_val1_8x16b); 2053 v_val_4x32b = _mm_madd_epi16(mul_8x16b, v_val1_8x16b); 2054 2055 hv_val_4x32b = _mm_hadd_epi32(h_val_4x32b, v_val_4x32b); 2056 hv_val_4x32b = _mm_hadd_epi32(hv_val_4x32b, hv_val_4x32b); 2057 2058 h = _mm_extract_epi16(hv_val_4x32b, 0); 2059 v = _mm_extract_epi16(hv_val_4x32b, 2); 2060 h = (h << 16) >> 16; 2061 v = (v << 16) >> 16; 2062 2063 b = ((h << 2) + h + 32) >> 6; 2064 c = ((v << 2) + v + 32) >> 6; 2065 } 2066 2067 //using a, b and c to compute the fitted plane values 2068 { 2069 __m128i const_8x16b, b_8x16b, c_8x16b, c2_8x16b; 2070 __m128i res1_l_8x16b, res1_h_8x16b; 2071 __m128i res2_l_8x16b, res2_h_8x16b; 2072 __m128i res1_sh_l_8x16b, res1_sh_h_8x16b, res1_16x8b; 2073 __m128i res2_sh_l_8x16b, res2_sh_h_8x16b, res2_16x8b; 2074 2075 b_8x16b = _mm_set1_epi16(b); 2076 c_8x16b = _mm_set1_epi16(c); 2077 c2_8x16b = _mm_set1_epi16(c << 1); 2078 const_8x16b = _mm_set1_epi16(a - c*7 + 16); 2079 2080 res1_h_8x16b = _mm_mullo_epi16(mul_8x16b, b_8x16b); 2081 //contains {b*1, b*2, b*3,... b*8} 2082 2083 res1_l_8x16b = _mm_shuffle_epi8(res1_h_8x16b, rev_8x16b); 2084 res1_l_8x16b = _mm_srli_si128(res1_l_8x16b, 2); 2085 res1_l_8x16b = _mm_sub_epi16(zero_16x8b, res1_l_8x16b); 2086 //contains {-b*7, -b*6,... 
-b*1, b*0} 2087 2088 // rows 1, 2 2089 res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, const_8x16b); 2090 res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, const_8x16b); 2091 res2_h_8x16b = _mm_add_epi16(res1_h_8x16b, c_8x16b); 2092 res2_l_8x16b = _mm_add_epi16(res1_l_8x16b, c_8x16b); 2093 2094 res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5); 2095 res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5); 2096 res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5); 2097 res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5); 2098 2099 res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b); 2100 res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b); 2101 2102 _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b); 2103 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b); 2104 2105 // rows 3, 4 2106 res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b); 2107 res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b); 2108 res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b); 2109 res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b); 2110 2111 res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5); 2112 res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5); 2113 res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5); 2114 res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5); 2115 2116 pu1_dst += dst_strd << 1; 2117 2118 res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b); 2119 res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b); 2120 2121 _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b); 2122 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b); 2123 2124 // rows 5, 6 2125 res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b); 2126 res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b); 2127 res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b); 2128 res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b); 2129 2130 res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5); 2131 res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5); 2132 res2_sh_h_8x16b = 
_mm_srai_epi16(res2_h_8x16b, 5); 2133 res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5); 2134 2135 pu1_dst += dst_strd << 1; 2136 2137 res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b); 2138 res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b); 2139 2140 _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b); 2141 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b); 2142 2143 // rows 7, 8 2144 res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b); 2145 res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b); 2146 res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b); 2147 res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b); 2148 2149 res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5); 2150 res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5); 2151 res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5); 2152 res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5); 2153 2154 pu1_dst += dst_strd << 1; 2155 2156 res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b); 2157 res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b); 2158 2159 _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b); 2160 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b); 2161 2162 // rows 9, 10 2163 res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b); 2164 res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b); 2165 res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b); 2166 res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b); 2167 2168 res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5); 2169 res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5); 2170 res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5); 2171 res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5); 2172 2173 pu1_dst += dst_strd << 1; 2174 2175 res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b); 2176 res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b); 2177 2178 _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b); 2179 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b); 2180 
2181 // rows 11, 12 2182 res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b); 2183 res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b); 2184 res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b); 2185 res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b); 2186 2187 res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5); 2188 res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5); 2189 res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5); 2190 res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5); 2191 2192 pu1_dst += dst_strd << 1; 2193 2194 res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b); 2195 res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b); 2196 2197 _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b); 2198 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b); 2199 2200 // rows 13, 14 2201 res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b); 2202 res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b); 2203 res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b); 2204 res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b); 2205 2206 res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5); 2207 res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5); 2208 res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5); 2209 res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5); 2210 2211 pu1_dst += dst_strd << 1; 2212 2213 res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b); 2214 res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b); 2215 2216 _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b); 2217 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b); 2218 2219 // rows 15, 16 2220 res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b); 2221 res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b); 2222 res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b); 2223 res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b); 2224 2225 res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5); 2226 res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5); 2227 res2_sh_h_8x16b = 
_mm_srai_epi16(res2_h_8x16b, 5); 2228 res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5); 2229 2230 pu1_dst += dst_strd << 1; 2231 2232 res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b); 2233 res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b); 2234 2235 _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b); 2236 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b); 2237 } 2238 } 2239