1 /****************************************************************************** 2 * 3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ******************************************************************************/ 18 /** 19 ******************************************************************************* 20 * @file 21 * ihevc_weighted_pred_atom_intr.c 22 * 23 * @brief 24 * Contains function definitions for weighted prediction used in inter 25 * prediction 26 * 27 * @author 28 * 29 * 30 * @par List of Functions: 31 * - ihevc_weighted_pred_uni_ssse3() 32 * - ihevc_weighted_pred_bi_ssse3() 33 * - ihevc_weighted_pred_bi_default_ssse3() 34 * - ihevc_weighted_pred_chroma_uni_ssse3() 35 * - ihevc_weighted_pred_chroma_bi_ssse3() 36 * - ihevc_weighted_pred_chroma_bi_default_ssse3() 37 * 38 * @remarks 39 * None 40 * 41 ******************************************************************************* 42 */ 43 /*****************************************************************************/ 44 /* File Includes */ 45 /*****************************************************************************/ 46 #include <stdio.h> 47 #include <assert.h> 48 49 #include "ihevc_debug.h" 50 #include "ihevc_typedefs.h" 51 #include "ihevc_macros.h" 52 #include "ihevc_platform_macros.h" 53 #include "ihevc_func_selector.h" 54 #include "ihevc_defs.h" 55 #include "ihevc_weighted_pred.h" 56 #include "ihevc_inter_pred.h" 57 58 59 #include <immintrin.h> 60 61 /** 62 ******************************************************************************* 63 * 64 * @brief 65 * Does uni-weighted prediction on the array pointed by pi2_src and stores 66 * it at the location pointed by pi2_dst 67 * 68 * @par Description: 69 * dst = ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift + 70 * offset 71 * 72 * @param[in] pi2_src 73 * Pointer to the source 74 * 75 * @param[out] pu1_dst 76 * Pointer to the destination 77 * 78 * @param[in] src_strd 79 * Source stride 80 * 81 * @param[in] dst_strd 82 * Destination stride 83 * 84 * @param[in] wgt0 85 * weight to be multiplied to the source 86 * 87 * @param[in] off0 88 * offset to be added after rounding and 89 * 90 * @param[in] shifting 91 * 92 * 93 * @param[in] shift 94 * (14 Bit depth) + log2_weight_denominator 95 * 96 * @param[in] lvl_shift 97 * added before shift and offset 98 * 99 * @param[in] ht 100 * height of the source 101 * 102 * @param[in] wd 103 * width of the source 104 * 105 * @returns 106 * 107 * @remarks 108 * None 109 * 110 ******************************************************************************* 111 */ 112 113 void ihevc_weighted_pred_uni_ssse3(WORD16 *pi2_src, 114 UWORD8 *pu1_dst, 115 WORD32 src_strd, 116 WORD32 dst_strd, 117 WORD32 wgt0, 118 WORD32 off0, 119 WORD32 shift, 120 WORD32 lvl_shift, 121 WORD32 ht, 122 WORD32 wd) 123 { 124 WORD32 row, col, temp; 125 126 /* all 128 bit registers are named with a suffix mxnb, where m is the */ 127 /* number of n bits packed in the register */ 128 __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b; 129 __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_8x16b, off0_4x32b; 130 __m128i res_temp0_4x32b, res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b; 131 132 ASSERT(wd % 4 == 0); /* checking assumption*/ 133 ASSERT(ht % 4 == 0); /* checking assumption*/ 134 135 temp = 1 << (shift - 1); 136 137 // seting values in register 138 lvl_shift_4x32b = _mm_set1_epi16(lvl_shift); 139 wgt0_8x16b = _mm_set1_epi16(wgt0); 140 141 /* lvl_shift * wgt0 */ 142 res_temp0_4x32b = _mm_mullo_epi16(lvl_shift_4x32b, wgt0_8x16b); 143 res_temp1_4x32b = _mm_mulhi_epi16(lvl_shift_4x32b, wgt0_8x16b); 144 145 const_temp_4x32b = _mm_set1_epi32(temp); 146 off0_4x32b = _mm_set1_epi32(off0); 147 148 149 /* lvl_shift * wgt0 */ 150 lvl_shift_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, res_temp1_4x32b); 151 /* lvl_shift * wgt0 + 1 << (shift - 1) */ 152 lvl_shift_4x32b = _mm_add_epi32(lvl_shift_4x32b, const_temp_4x32b); 153 154 if(0 == (wd & 7)) /* wd multiple of 8 case */ 155 { 156 __m128i res_temp4_4x32b, res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b; 157 158 /* outer for loop starts from here */ 159 for(row = 0; row < ht; row += 4) 160 { 161 for(col = 0; col < wd; col += 8) 162 { /* for row =0 ,1,2,3*/ 163 164 /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 165 src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src)); 166 /* row = 1 */ 167 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd)); 168 /* row = 2 */ 169 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd)); 170 /* row = 3 */ 171 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd)); 172 173 /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */ 174 res_temp0_4x32b = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b); 175 res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b); 176 res_temp2_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt0_8x16b); 177 res_temp3_4x32b = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b); 178 179 /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */ 180 src_temp0_8x16b = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b); 181 src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b); 182 src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt0_8x16b); 183 src_temp3_8x16b = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b); 184 185 /* Get 32 bit Result */ 186 res_temp4_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b); 187 res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b); 188 res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b); 189 res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b); 190 191 res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b); 192 res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b); 193 res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b); 194 res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b); 195 196 /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */ 197 res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift_4x32b); 198 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift_4x32b); 199 res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift_4x32b); 200 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift_4x32b); 201 res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b); 202 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b); 203 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b); 204 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b); 205 206 /* (i4_tmp >> shift) */ /* First 4 pixels */ 207 res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift); 208 res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift); 209 res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift); 210 res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift); 211 212 /* (i4_tmp >> shift) */ /* Last 4 pixels */ 213 res_temp4_4x32b = _mm_srai_epi32(res_temp4_4x32b, shift); 214 res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b, shift); 215 res_temp6_4x32b = _mm_srai_epi32(res_temp6_4x32b, shift); 216 res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b, shift); 217 218 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* First 4 pixels */ 219 res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b); 220 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b); 221 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b); 222 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b); 223 224 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */ 225 res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, off0_4x32b); 226 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, off0_4x32b); 227 res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, off0_4x32b); 228 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, off0_4x32b); 229 230 res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp4_4x32b); 231 res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b); 232 res_temp2_4x32b = _mm_packs_epi32(res_temp2_4x32b, res_temp6_4x32b); 233 res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b); 234 /* pu1_dst[col] = CLIP_U8(i4_tmp); */ 235 res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b); 236 res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b); 237 res_temp2_4x32b = _mm_packus_epi16(res_temp2_4x32b, res_temp2_4x32b); 238 res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b); 239 240 /* store four 8-bit output values */ 241 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/ 242 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 2*/ 243 _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), res_temp2_4x32b); /* row = 1*/ 244 _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), res_temp3_4x32b); /* row = 3*/ 245 246 /* To update pointer */ 247 pi2_src += 8; 248 pu1_dst += 8; 249 250 } /* inner loop ends here(4-output values in single iteration) */ 251 252 pi2_src = pi2_src - wd + 4 * src_strd; /* Pointer update */ 253 pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */ 254 255 } 256 } 257 else /* wd multiple of 4 case */ 258 { 259 WORD32 dst0, dst1, dst2, dst3; 260 /* outer for loop starts from here */ 261 for(row = 0; row < ht; row += 4) 262 { 263 for(col = 0; col < wd; col += 4) 264 { /* for row =0 ,1,2,3*/ 265 266 /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 267 src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src)); 268 /* row = 1 */ 269 src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd)); 270 /* row = 2 */ 271 src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + 2 * src_strd)); 272 /* row = 3 */ 273 src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + 3 * src_strd)); 274 275 /* 2 rows together */ 276 src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp2_8x16b); 277 src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b); 278 279 /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */ 280 res_temp0_4x32b = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b); 281 res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b); 282 /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Higher 16 bit */ 283 src_temp0_8x16b = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b); 284 src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b); 285 286 /* Get 32 bit Result */ 287 res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b); 288 res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b); 289 290 res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b); 291 res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b); 292 293 /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */ 294 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b); 295 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b); 296 res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b); 297 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b); 298 299 /* (i4_tmp >> shift) */ 300 res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift); 301 res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift); 302 res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift); 303 res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift); 304 305 /*i4_tmp = (i4_tmp >> shift) + off0; */ 306 res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b); 307 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b); 308 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b); 309 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b); 310 311 res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp1_4x32b); 312 res_temp2_4x32b = _mm_packs_epi32(res_temp2_4x32b, res_temp3_4x32b); 313 314 /* pu1_dst[col] = CLIP_U8(i4_tmp); */ 315 res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp2_4x32b); 316 317 dst0 = _mm_cvtsi128_si32(res_temp0_4x32b); 318 /* dst row = 1 to 3 */ 319 res_temp1_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 1); 320 res_temp2_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 2); 321 res_temp3_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 3); 322 323 /* store four 8-bit output values */ 324 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; 325 326 dst1 = _mm_cvtsi128_si32(res_temp1_4x32b); 327 dst2 = _mm_cvtsi128_si32(res_temp2_4x32b); 328 dst3 = _mm_cvtsi128_si32(res_temp3_4x32b); 329 330 /* row = 1 to row = 3 */ 331 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; 332 *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2; 333 *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3; 334 335 /* To update pointer */ 336 pi2_src += 4; 337 pu1_dst += 4; 338 339 } /* inner loop ends here(4-output values in single iteration) */ 340 341 pi2_src = pi2_src - wd + 4 * src_strd; /* Pointer update */ 342 pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */ 343 344 } 345 } 346 } 347 348 /** 349 ******************************************************************************* 350 * 351 * @brief 352 * Does chroma uni-weighted prediction on array pointed by pi2_src and stores 353 * it at the location pointed by pi2_dst 354 * 355 * @par Description: 356 * dst = ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift + 357 * offset 358 * 359 * @param[in] pi2_src 360 * Pointer to the source 361 * 362 * @param[out] pu1_dst 363 * Pointer to the destination 364 * 365 * @param[in] src_strd 366 * Source stride 367 * 368 * @param[in] dst_strd 369 * Destination stride 370 * 371 * @param[in] wgt0 372 * weight to be multiplied to the source 373 * 374 * @param[in] off0 375 * offset to be added after rounding and 376 * 377 * @param[in] shifting 378 * 379 * 380 * @param[in] shift 381 * (14 Bit depth) + log2_weight_denominator 382 * 383 * @param[in] lvl_shift 384 * added before shift and offset 385 * 386 * @param[in] ht 387 * height of the source 388 * 389 * @param[in] wd 390 * width of the source (each colour component) 391 * 392 * @returns 393 * 394 * @remarks 395 * None 396 * 397 ******************************************************************************* 398 */ 399 400 401 void ihevc_weighted_pred_chroma_uni_ssse3(WORD16 *pi2_src, 402 UWORD8 *pu1_dst, 403 WORD32 src_strd, 404 WORD32 dst_strd, 405 WORD32 wgt0_cb, 406 WORD32 wgt0_cr, 407 WORD32 off0_cb, 408 WORD32 off0_cr, 409 WORD32 shift, 410 WORD32 lvl_shift, 411 WORD32 ht, 412 WORD32 wd) 413 { 414 WORD32 row, col, temp, wdx2; 415 /* all 128 bit registers are named with a suffix mxnb, where m is the */ 416 /* number of n bits packed in the register */ 417 418 __m128i src_temp0_8x16b, src_temp1_8x16b; 419 __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_8x16b, off0_4x32b; 420 __m128i res_temp0_4x32b, res_temp1_4x32b; 421 422 ASSERT(wd % 2 == 0); /* checking assumption*/ 423 ASSERT(ht % 2 == 0); /* checking assumption*/ 424 425 temp = 1 << (shift - 1); 426 wdx2 = 2 * wd; 427 428 // seting values in register 429 lvl_shift_4x32b = _mm_set1_epi16(lvl_shift); 430 wgt0_8x16b = _mm_set_epi16(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb); 431 432 /* lvl_shift * wgt0 */ 433 res_temp0_4x32b = _mm_mullo_epi16(lvl_shift_4x32b, wgt0_8x16b); 434 res_temp1_4x32b = _mm_mulhi_epi16(lvl_shift_4x32b, wgt0_8x16b); 435 436 const_temp_4x32b = _mm_set1_epi32(temp); 437 off0_4x32b = _mm_set_epi32(off0_cr, off0_cb, off0_cr, off0_cb); 438 439 /* lvl_shift * wgt0 */ 440 lvl_shift_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, res_temp1_4x32b); 441 /* lvl_shift * wgt0 + 1 << (shift - 1) */ 442 lvl_shift_4x32b = _mm_add_epi32(lvl_shift_4x32b, const_temp_4x32b); 443 444 { 445 if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */ 446 { 447 __m128i src_temp2_8x16b, src_temp3_8x16b; 448 __m128i res_temp2_4x32b, res_temp3_4x32b; 449 __m128i res_temp4_4x32b, res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b; 450 451 /* outer for loop starts from here */ 452 for(row = 0; row < ht; row += 2) 453 { 454 for(col = 0; col < wdx2; col += 16) 455 { 456 /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 457 src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src)); 458 /* row = 1 */ 459 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd)); 460 /* row = 0 */ /* Next 8 pixels */ 461 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + 8)); 462 /* row = 1 */ 463 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 8)); 464 465 /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */ 466 res_temp0_4x32b = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b); 467 res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b); 468 res_temp4_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt0_8x16b); 469 res_temp5_4x32b = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b); 470 471 /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */ 472 src_temp0_8x16b = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b); 473 src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b); 474 src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt0_8x16b); 475 src_temp3_8x16b = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b); 476 477 /* Get 32 bit Result */ 478 res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b); 479 res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b); 480 res_temp6_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp2_8x16b); 481 res_temp7_4x32b = _mm_unpackhi_epi16(res_temp5_4x32b, src_temp3_8x16b); 482 483 res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b); 484 res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b); 485 res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp2_8x16b); 486 res_temp5_4x32b = _mm_unpacklo_epi16(res_temp5_4x32b, src_temp3_8x16b); 487 488 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */ 489 res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b); 490 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b); 491 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b); 492 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b); 493 res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift_4x32b); 494 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift_4x32b); 495 res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift_4x32b); 496 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift_4x32b); 497 498 /* (i4_tmp >> shift) */ 499 res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift); 500 res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift); 501 res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift); 502 res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift); 503 /*i4_tmp = (i4_tmp >> shift) + off0; */ 504 res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b); 505 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b); 506 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */ 507 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b); 508 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b); 509 510 /* (i4_tmp >> shift) */ 511 res_temp4_4x32b = _mm_srai_epi32(res_temp4_4x32b, shift); 512 res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b, shift); 513 res_temp6_4x32b = _mm_srai_epi32(res_temp6_4x32b, shift); 514 res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b, shift); 515 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */ 516 res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, off0_4x32b); 517 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, off0_4x32b); 518 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */ 519 res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, off0_4x32b); 520 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, off0_4x32b); 521 522 res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp2_4x32b); 523 res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b); 524 res_temp4_4x32b = _mm_packs_epi32(res_temp4_4x32b, res_temp6_4x32b); 525 res_temp5_4x32b = _mm_packs_epi32(res_temp5_4x32b, res_temp7_4x32b); 526 /* pu1_dst[col] = CLIP_U8(i4_tmp); */ 527 res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp4_4x32b); 528 res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp5_4x32b); 529 530 /* store 16 8-bit output values */ 531 _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/ 532 _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 1*/ 533 534 pi2_src += 16; /* Pointer update */ 535 pu1_dst += 16; /* Pointer update */ 536 537 } /* inner loop ends here(4-output values in single iteration) */ 538 pi2_src = pi2_src - wdx2 + 2 * src_strd; /* Pointer update */ 539 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */ 540 } 541 } 542 else if(0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */ 543 { 544 __m128i res_temp2_4x32b, res_temp3_4x32b; 545 /* outer for loop starts from here */ 546 for(row = 0; row < ht; row += 2) 547 { 548 for(col = 0; col < wdx2; col += 8) 549 { 550 /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 551 src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pi2_src)); 552 /* row = 1 */ 553 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd)); 554 555 /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */ 556 res_temp0_4x32b = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b); 557 res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b); 558 /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */ 559 src_temp0_8x16b = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b); 560 src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b); 561 562 /* Get 32 bit Result */ 563 res_temp2_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b); 564 res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b); 565 566 res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b); 567 res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b); 568 569 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */ 570 res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b); 571 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b); 572 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift_4x32b); 573 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift_4x32b); 574 575 /* (i4_tmp >> shift) */ 576 res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift); 577 res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift); 578 res_temp2_4x32b = _mm_srai_epi32(res_temp2_4x32b, shift); 579 res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift); 580 581 /*i4_tmp = (i4_tmp >> shift) + off0; */ 582 res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b); 583 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b); 584 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */ 585 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, off0_4x32b); 586 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, off0_4x32b); 587 588 res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp2_4x32b); 589 res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b); 590 591 /* pu1_dst[col] = CLIP_U8(i4_tmp); */ 592 res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b); 593 res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b); 594 595 /* store four 8-bit output values */ 596 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp0_4x32b); /* row = 0*/ 597 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp1_4x32b); /* row = 1*/ 598 599 pi2_src += 8; /* Pointer update */ 600 pu1_dst += 8; /* Pointer update */ 601 602 } /* inner loop ends here(4-output values in single iteration) */ 603 pi2_src = pi2_src - wdx2 + 2 * src_strd; /* Pointer update */ 604 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */ 605 } 606 } 607 else /* 2*wd multiple of 4 case */ 608 { 609 WORD32 dst0, dst1; 610 /* outer for loop starts from here */ 611 for(row = 0; row < ht; row += 2) 612 { 613 for(col = 0; col < wdx2; col += 4) 614 { 615 /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 616 src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src)); 617 /* row = 1 */ 618 src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd)); 619 620 /* 2 rows together */ 621 src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b); 622 623 /*i4_tmp = (pi2_src[col]) * wgt0*/ /* Lower 16 bit */ 624 res_temp0_4x32b = _mm_mullo_epi16(src_temp0_8x16b, wgt0_8x16b); 625 /*i4_tmp = (pi2_src[col] ) * wgt0*/ /* Higher 16 bit */ 626 src_temp0_8x16b = _mm_mulhi_epi16(src_temp0_8x16b, wgt0_8x16b); 627 628 /* Get 32 bit Result */ 629 res_temp1_4x32b = _mm_unpackhi_epi16(res_temp0_4x32b, src_temp0_8x16b); 630 res_temp0_4x32b = _mm_unpacklo_epi16(res_temp0_4x32b, src_temp0_8x16b); 631 632 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 + 1 << (shift - 1) */ 633 res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, lvl_shift_4x32b); 634 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift_4x32b); 635 636 /* (i4_tmp >> shift) */ 637 res_temp0_4x32b = _mm_srai_epi32(res_temp0_4x32b, shift); 638 res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift); 639 640 /*i4_tmp = (i4_tmp >> shift) + off0; */ 641 res_temp0_4x32b = _mm_add_epi32(res_temp0_4x32b, off0_4x32b); 642 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, off0_4x32b); 643 644 res_temp0_4x32b = _mm_packs_epi32(res_temp0_4x32b, res_temp1_4x32b); 645 646 /* pu1_dst[col] = CLIP_U8(i4_tmp); */ 647 res_temp0_4x32b = _mm_packus_epi16(res_temp0_4x32b, res_temp0_4x32b); 648 649 dst0 = _mm_cvtsi128_si32(res_temp0_4x32b); 650 /* dst row = 1 to 3 */ 651 res_temp1_4x32b = _mm_shuffle_epi32(res_temp0_4x32b, 1); 652 653 /* store four 8-bit output values */ 654 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; 655 656 dst1 = _mm_cvtsi128_si32(res_temp1_4x32b); 657 /* row = 1 */ 658 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; 659 660 pi2_src += 4; /* Pointer update */ 661 pu1_dst += 4; /* Pointer update */ 662 663 } /* inner loop ends here(4-output values in single iteration) */ 664 pi2_src = pi2_src - wdx2 + 2 * src_strd; /* Pointer update */ 665 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */ 666 } 667 } 668 } 669 } 670 671 /** 672 ******************************************************************************* 673 * 674 * @brief 675 * Does bi-weighted prediction on the arrays pointed by pi2_src1 and 676 * pi2_src2 and stores it at location pointed by pi2_dst 677 * 678 * @par Description: 679 * dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 + (off0 + 680 * off1 + 1) << (shift - 1) ) >> shift 681 * 682 * @param[in] pi2_src1 683 * Pointer to source 1 684 * 685 * @param[in] pi2_src2 686 * Pointer to source 2 687 * 688 * @param[out] pu1_dst 689 * Pointer to destination 690 * 691 * @param[in] src_strd1 692 * Source stride 1 693 * 694 * @param[in] src_strd2 695 * Source stride 2 696 * 697 * @param[in] dst_strd 698 * Destination stride 699 * 700 * @param[in] wgt0 701 * weight to be multiplied to source 1 702 * 703 * @param[in] off0 704 * offset 0 705 * 706 * @param[in] wgt1 707 * weight to be multiplied to source 2 708 * 709 * @param[in] off1 710 * offset 1 711 * 712 * @param[in] shift 713 * (14 Bit depth) + log2_weight_denominator 714 * 715 * @param[in] lvl_shift1 716 * added before shift and offset 717 * 718 * @param[in] lvl_shift2 719 * added before shift and offset 720 * 721 * @param[in] ht 722 * height of the source 723 * 724 * @param[in] wd 725 * width of the source 726 * 727 * @returns 728 * 729 * @remarks 730 * None 731 * 732 ******************************************************************************* 733 */ 734 735 736 void ihevc_weighted_pred_bi_ssse3(WORD16 *pi2_src1, 737 WORD16 *pi2_src2, 738 UWORD8 *pu1_dst, 739 WORD32 src_strd1, 740 WORD32 src_strd2, 741 WORD32 dst_strd, 742 WORD32 wgt0, 743 WORD32 off0, 744 WORD32 wgt1, 745 WORD32 off1, 746 WORD32 shift, 747 WORD32 lvl_shift1, 748 WORD32 lvl_shift2, 749 WORD32 ht, 750 WORD32 wd) 751 { 752 WORD32 row, col, temp; 753 754 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; 755 __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_8x16b, wgt1_8x16b; 756 __m128i res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b, res_temp4_4x32b; 757 758 #include <assert.h> 759 ASSERT(wd % 4 == 0); /* checking assumption*/ 760 ASSERT(ht % 4 == 0); /* checking assumption*/ 761 762 temp = (off0 + off1 + 1) << (shift - 1); 763 764 // seting values in register 765 lvl_shift1_4x32b = _mm_set1_epi16(lvl_shift1); 766 wgt0_8x16b = _mm_set1_epi16(wgt0); 767 lvl_shift2_4x32b = _mm_set1_epi16(lvl_shift2); 768 wgt1_8x16b = _mm_set1_epi16(wgt1); 769 770 /* lvl_shift1 * wgt0 */ 771 res_temp1_4x32b = _mm_mullo_epi16(lvl_shift1_4x32b, wgt0_8x16b); 772 res_temp2_4x32b = _mm_mulhi_epi16(lvl_shift1_4x32b, wgt0_8x16b); 773 /* lvl_shift2 * wgt1 */ 774 res_temp3_4x32b = _mm_mullo_epi16(lvl_shift2_4x32b, wgt1_8x16b); 775 res_temp4_4x32b = _mm_mulhi_epi16(lvl_shift2_4x32b, wgt1_8x16b); 776 777 const_temp_4x32b = _mm_set1_epi32(temp); 778 779 /* lvl_shift1 * wgt0 */ 780 lvl_shift1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, res_temp2_4x32b); 781 /* lvl_shift2 * wgt1 */ 782 lvl_shift2_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, res_temp4_4x32b); 783 784 if(0 == (wd & 7)) /* wd multiple of 8 case */ 785 { 786 __m128i res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b, res_temp8_4x32b; 787 /* outer for loop starts from here */ 788 for(row = 0; row < ht; row += 2) 789 { 790 for(col = 0; col < wd; col += 8) 791 { 792 /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 793 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */ 794 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */ 795 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */ 796 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */ 797 798 /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */ 799 res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b); 800 res_temp2_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b); 801 res_temp3_4x32b = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b); 802 res_temp4_4x32b = _mm_mullo_epi16(src_temp4_8x16b, wgt1_8x16b); 803 /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */ 804 src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b); 805 src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b); 806 src_temp3_8x16b = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b); 807 src_temp4_8x16b = _mm_mulhi_epi16(src_temp4_8x16b, wgt1_8x16b); 808 809 /* Get 32 bit Result */ 810 res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b); 811 res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b); 812 res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b); 813 res_temp8_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp4_8x16b); 814 815 res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b); 816 res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b); 817 res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b); 818 res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp4_8x16b); 819 820 /* (pi2_src[col] + lvl_shift) * wgt */ 821 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift1_4x32b); 822 res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift2_4x32b); 823 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift1_4x32b); 824 res_temp8_4x32b = _mm_add_epi32(res_temp8_4x32b, lvl_shift2_4x32b); 825 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b); 826 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b); 827 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b); 828 res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b); 829 830 /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */ 831 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b); 832 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b); 833 /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */ 834 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b); 835 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b); 836 /* (i4_tmp >> shift) */ 837 res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift); 838 res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift); 839 840 /* Next 4 Pixels */ 841 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, res_temp6_4x32b); 842 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, res_temp8_4x32b); 843 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, const_temp_4x32b); 844 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, const_temp_4x32b); 845 res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b, shift); 846 res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b, shift); 847 848 res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b); 849 res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b); 850 851 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ 852 res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b); 853 res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b); 854 855 /* store four 8-bit output values */ 856 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp1_4x32b); /* row = 0*/ 857 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp3_4x32b); /* row = 1*/ 858 859 pi2_src1 += 8; /* Pointer update */ 860 pi2_src2 += 8; /* Pointer update */ 861 pu1_dst += 8; /* Pointer update */ 862 863 } /* inner loop ends here(4-output values in single iteration) */ 864 865 pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */ 866 pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */ 867 pu1_dst = pu1_dst - wd + 2 * dst_strd; /* Pointer update */ 868 869 } /* outer loop ends */ 870 } 871 else /* wd multiple of 4 case */ 872 { 873 WORD32 dst0, dst1; 874 /* outer for loop starts from here */ 875 for(row = 0; row < ht; row += 2) 876 { 877 for(col = 0; col < wd; col += 4) 878 { 879 /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 880 src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); /* row = 0 */ 881 src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); /* row = 0 */ 882 src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */ 883 src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */ 884 885 /* 2 rows together */ 886 src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b); 887 src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b); 888 889 /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */ 890 res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b); 891 res_temp2_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b); 892 /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */ 893 src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b); 894 src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b); 895 896 /* Get 32 bit Result */ 897 res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b); 898 res_temp4_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b); 899 900 res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b); 901 res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b); 902 903 /* (pi2_src[col] + lvl_shift) * wgt */ 904 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b); 905 res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b); 906 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b); 907 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b); 908 909 /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */ 910 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b); 911 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b); 912 913 /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */ 914 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b); 915 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b); 916 917 /* (i4_tmp >> shift) */ 918 res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift); 919 res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift); 920 921 res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b); 922 923 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ 924 res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b); 925 926 dst0 = _mm_cvtsi128_si32(res_temp1_4x32b); 927 928 /* dst row = 1 to 3 */ 929 res_temp2_4x32b = _mm_shuffle_epi32(res_temp1_4x32b, 1); 930 931 /* store four 8-bit output values */ 932 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; 933 934 dst1 = _mm_cvtsi128_si32(res_temp2_4x32b); 935 936 /* row = 1 */ 937 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; 938 939 pi2_src1 += 4; /* Pointer update */ 940 pi2_src2 += 4; /* Pointer update */ 941 pu1_dst += 4; /* Pointer update */ 942 943 } /* inner loop ends here(4-output values in single iteration) */ 944 945 pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */ 946 pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */ 947 pu1_dst = pu1_dst - wd + 2 * dst_strd; /* Pointer update */ 948 949 } /* outer loop ends */ 950 } 951 952 } 953 954 /** 955 ******************************************************************************* 956 * 957 * @brief 958 * Does chroma bi-weighted prediction on the arrays pointed by pi2_src1 and 959 * pi2_src2 and stores it at location pointed by pi2_dst 960 * 961 * @par Description: 962 * dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 + (off0 + 963 * off1 + 1) << (shift - 1) ) >> shift 964 * 965 * @param[in] pi2_src1 966 * Pointer to source 1 967 * 968 * @param[in] pi2_src2 969 * Pointer to source 2 970 * 971 * @param[out] pu1_dst 972 * Pointer to destination 973 * 974 * @param[in] src_strd1 975 * Source stride 1 976 * 977 * @param[in] src_strd2 978 * Source stride 2 979 * 980 * @param[in] dst_strd 981 * Destination stride 982 * 983 * @param[in] wgt0 984 * weight to be multiplied to source 1 985 * 986 * @param[in] off0 987 * offset 0 988 * 989 * @param[in] wgt1 990 * weight to be multiplied to source 2 991 * 992 * @param[in] off1 993 * offset 1 994 * 995 * @param[in] shift 996 * (14 Bit depth) + log2_weight_denominator 997 * 998 * @param[in] lvl_shift1 999 * added before shift and offset 1000 * 1001 * @param[in] lvl_shift2 1002 * added before shift and offset 1003 * 1004 * @param[in] ht 1005 * height of the source 1006 * 1007 * @param[in] wd 1008 * width of the source (each colour component) 1009 * 1010 * @returns 1011 * 1012 * @remarks 1013 * None 1014 * 1015 ******************************************************************************* 1016 */ 1017 1018 1019 void ihevc_weighted_pred_chroma_bi_ssse3(WORD16 *pi2_src1, 1020 WORD16 *pi2_src2, 1021 UWORD8 *pu1_dst, 1022 WORD32 src_strd1, 1023 WORD32 src_strd2, 1024 WORD32 dst_strd, 1025 WORD32 wgt0_cb, 1026 WORD32 wgt0_cr, 1027 WORD32 off0_cb, 1028 WORD32 off0_cr, 1029 WORD32 wgt1_cb, 1030 WORD32 wgt1_cr, 1031 WORD32 off1_cb, 1032 WORD32 off1_cr, 1033 WORD32 shift, 1034 WORD32 lvl_shift1, 1035 WORD32 lvl_shift2, 1036 WORD32 ht, 1037 WORD32 wd) 1038 { 1039 WORD32 row, col, temp1, temp2; 1040 WORD32 wdx2; 1041 1042 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; 1043 __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_8x16b, wgt1_8x16b; 1044 __m128i res_temp1_4x32b, res_temp2_4x32b, res_temp3_4x32b, res_temp4_4x32b; 1045 1046 ASSERT(wd % 2 == 0); /* checking assumption*/ 1047 ASSERT(ht % 2 == 0); /* checking assumption*/ 1048 1049 temp1 = (off0_cb + off1_cb + 1) << (shift - 1); 1050 temp2 = (off0_cr + off1_cr + 1) << (shift - 1); 1051 1052 // seting values in register 1053 lvl_shift1_4x32b = _mm_set1_epi16(lvl_shift1); 1054 wgt0_8x16b = _mm_set_epi16(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb); 1055 lvl_shift2_4x32b = _mm_set1_epi16(lvl_shift2); 1056 wgt1_8x16b = _mm_set_epi16(wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb); 1057 1058 /* lvl_shift1 * wgt0 */ 1059 res_temp1_4x32b = _mm_mullo_epi16(lvl_shift1_4x32b, wgt0_8x16b); 1060 res_temp2_4x32b = _mm_mulhi_epi16(lvl_shift1_4x32b, wgt0_8x16b); 1061 /* lvl_shift2 * wgt1 */ 1062 res_temp3_4x32b = _mm_mullo_epi16(lvl_shift2_4x32b, wgt1_8x16b); 1063 res_temp4_4x32b = _mm_mulhi_epi16(lvl_shift2_4x32b, wgt1_8x16b); 1064 1065 const_temp_4x32b = _mm_set_epi32(temp2, temp1, temp2, temp1); 1066 wdx2 = wd * 2; 1067 1068 /* lvl_shift1 * wgt0 */ 1069 lvl_shift1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, res_temp2_4x32b); 1070 /* lvl_shift2 * wgt1 */ 1071 lvl_shift2_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, res_temp4_4x32b); 1072 1073 if(0 == (wdx2 & 7)) /* wdx2 multiple of 8 case */ 1074 { 1075 __m128i res_temp5_4x32b, res_temp6_4x32b, res_temp7_4x32b, res_temp8_4x32b; 1076 /* outer for loop starts from here */ 1077 for(row = 0; row < ht; row += 2) 1078 { 1079 for(col = 0; col < wdx2; col += 8) 1080 { 1081 /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 1082 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */ 1083 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */ 1084 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */ 1085 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */ 1086 1087 /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */ 1088 res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b); 1089 res_temp2_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b); 1090 res_temp3_4x32b = _mm_mullo_epi16(src_temp3_8x16b, wgt0_8x16b); 1091 res_temp4_4x32b = _mm_mullo_epi16(src_temp4_8x16b, wgt1_8x16b); 1092 /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */ 1093 src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b); 1094 src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b); 1095 src_temp3_8x16b = _mm_mulhi_epi16(src_temp3_8x16b, wgt0_8x16b); 1096 src_temp4_8x16b = _mm_mulhi_epi16(src_temp4_8x16b, wgt1_8x16b); 1097 1098 /* Get 32 bit Result */ 1099 res_temp5_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b); 1100 res_temp6_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b); 1101 res_temp7_4x32b = _mm_unpackhi_epi16(res_temp3_4x32b, src_temp3_8x16b); 1102 res_temp8_4x32b = _mm_unpackhi_epi16(res_temp4_4x32b, src_temp4_8x16b); 1103 1104 res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b); 1105 res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b); 1106 res_temp3_4x32b = _mm_unpacklo_epi16(res_temp3_4x32b, src_temp3_8x16b); 1107 res_temp4_4x32b = _mm_unpacklo_epi16(res_temp4_4x32b, src_temp4_8x16b); 1108 1109 /* (pi2_src[col] + lvl_shift) * wgt */ 1110 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, lvl_shift1_4x32b); 1111 res_temp6_4x32b = _mm_add_epi32(res_temp6_4x32b, lvl_shift2_4x32b); 1112 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, lvl_shift1_4x32b); 1113 res_temp8_4x32b = _mm_add_epi32(res_temp8_4x32b, lvl_shift2_4x32b); 1114 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b); 1115 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b); 1116 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b); 1117 res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b); 1118 1119 /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */ 1120 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b); 1121 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b); 1122 /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */ 1123 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b); 1124 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b); 1125 /* (i4_tmp >> shift) */ 1126 res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift); 1127 res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift); 1128 1129 /* Next 4 Pixels */ 1130 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, res_temp6_4x32b); 1131 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, res_temp8_4x32b); 1132 res_temp5_4x32b = _mm_add_epi32(res_temp5_4x32b, const_temp_4x32b); 1133 res_temp7_4x32b = _mm_add_epi32(res_temp7_4x32b, const_temp_4x32b); 1134 res_temp5_4x32b = _mm_srai_epi32(res_temp5_4x32b, shift); 1135 res_temp7_4x32b = _mm_srai_epi32(res_temp7_4x32b, shift); 1136 1137 res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp5_4x32b); 1138 res_temp3_4x32b = _mm_packs_epi32(res_temp3_4x32b, res_temp7_4x32b); 1139 1140 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ 1141 res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b); 1142 res_temp3_4x32b = _mm_packus_epi16(res_temp3_4x32b, res_temp3_4x32b); 1143 1144 /* store four 8-bit output values */ 1145 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), res_temp1_4x32b); /* row = 0*/ 1146 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), res_temp3_4x32b); /* row = 1*/ 1147 1148 pi2_src1 += 8; /* Pointer update */ 1149 pi2_src2 += 8; /* Pointer update */ 1150 pu1_dst += 8; /* Pointer update */ 1151 1152 } /* inner loop ends here(4-output values in single iteration) */ 1153 1154 pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */ 1155 pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */ 1156 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */ 1157 1158 } /* outer loop ends */ 1159 } 1160 else /* wdx2 multiple of 4 case */ 1161 { 1162 WORD32 dst0, dst1; 1163 /* outer for loop starts from here */ 1164 for(row = 0; row < ht; row += 2) 1165 { 1166 for(col = 0; col < wdx2; col += 4) 1167 { 1168 /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 1169 src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); /* row = 0 */ 1170 src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); /* row = 0 */ 1171 src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */ 1172 src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */ 1173 1174 /* 2 rows together */ 1175 src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b); 1176 src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b); 1177 1178 /*i4_tmp = (pi2_src[col]) * wgt*/ /* Lower 16 bit */ 1179 res_temp1_4x32b = _mm_mullo_epi16(src_temp1_8x16b, wgt0_8x16b); 1180 res_temp2_4x32b = _mm_mullo_epi16(src_temp2_8x16b, wgt1_8x16b); 1181 /*i4_tmp = (pi2_src[col] ) * wgt*/ /* Higher 16 bit */ 1182 src_temp1_8x16b = _mm_mulhi_epi16(src_temp1_8x16b, wgt0_8x16b); 1183 src_temp2_8x16b = _mm_mulhi_epi16(src_temp2_8x16b, wgt1_8x16b); 1184 1185 /* Get 32 bit Result */ 1186 res_temp3_4x32b = _mm_unpackhi_epi16(res_temp1_4x32b, src_temp1_8x16b); 1187 res_temp4_4x32b = _mm_unpackhi_epi16(res_temp2_4x32b, src_temp2_8x16b); 1188 1189 res_temp1_4x32b = _mm_unpacklo_epi16(res_temp1_4x32b, src_temp1_8x16b); 1190 res_temp2_4x32b = _mm_unpacklo_epi16(res_temp2_4x32b, src_temp2_8x16b); 1191 1192 /* (pi2_src[col] + lvl_shift) * wgt */ 1193 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, lvl_shift1_4x32b); 1194 res_temp4_4x32b = _mm_add_epi32(res_temp4_4x32b, lvl_shift2_4x32b); 1195 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, lvl_shift1_4x32b); 1196 res_temp2_4x32b = _mm_add_epi32(res_temp2_4x32b, lvl_shift2_4x32b); 1197 1198 /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */ 1199 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, res_temp2_4x32b); 1200 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, res_temp4_4x32b); 1201 1202 /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */ 1203 res_temp1_4x32b = _mm_add_epi32(res_temp1_4x32b, const_temp_4x32b); 1204 res_temp3_4x32b = _mm_add_epi32(res_temp3_4x32b, const_temp_4x32b); 1205 1206 /* (i4_tmp >> shift) */ 1207 res_temp1_4x32b = _mm_srai_epi32(res_temp1_4x32b, shift); 1208 res_temp3_4x32b = _mm_srai_epi32(res_temp3_4x32b, shift); 1209 1210 res_temp1_4x32b = _mm_packs_epi32(res_temp1_4x32b, res_temp3_4x32b); 1211 1212 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ 1213 res_temp1_4x32b = _mm_packus_epi16(res_temp1_4x32b, res_temp1_4x32b); 1214 1215 dst0 = _mm_cvtsi128_si32(res_temp1_4x32b); 1216 1217 /* dst row = 1 to 3 */ 1218 res_temp2_4x32b = _mm_shuffle_epi32(res_temp1_4x32b, 1); 1219 1220 /* store four 8-bit output values */ 1221 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; 1222 1223 dst1 = _mm_cvtsi128_si32(res_temp2_4x32b); 1224 1225 /* row = 1 */ 1226 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; 1227 1228 pi2_src1 += 4; /* Pointer update */ 1229 pi2_src2 += 4; /* Pointer update */ 1230 pu1_dst += 4; /* Pointer update */ 1231 1232 } /* inner loop ends here(4-output values in single iteration) */ 1233 1234 pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */ 1235 pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */ 1236 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */ 1237 } 1238 } 1239 1240 } 1241 1242 /** 1243 ******************************************************************************* 1244 * 1245 * @brief 1246 * Does default bi-weighted prediction on the arrays pointed by pi2_src1 and 1247 * pi2_src2 and stores it at location pointed by pi2_dst 1248 * 1249 * @par Description: 1250 * dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + 1 << (shift - 1) ) 1251 * >> shift where shift = 15 - BitDepth 1252 * 1253 * @param[in] pi2_src1 1254 * Pointer to source 1 1255 * 1256 * @param[in] pi2_src2 1257 * Pointer to source 2 1258 * 1259 * @param[out] pu1_dst 1260 * Pointer to destination 1261 * 1262 * @param[in] src_strd1 1263 * Source stride 1 1264 * 1265 * @param[in] src_strd2 1266 * Source stride 2 1267 * 1268 * @param[in] dst_strd 1269 * Destination stride 1270 * 1271 * @param[in] lvl_shift1 1272 * added before shift and offset 1273 * 1274 * @param[in] lvl_shift2 1275 * added before shift and offset 1276 * 1277 * @param[in] ht 1278 * height of the source 1279 * 1280 * @param[in] wd 1281 * width of the source 1282 * 1283 * @returns 1284 * 1285 * @remarks 1286 * None 1287 * 1288 * Assumption : ht%4 == 0, wd%4 == 0 1289 * shift == 7, (lvl_shift1+lvl_shift2) can take {0, 8K, 16K}. In that case, 1290 * final result will match even if intermediate precision is in 16 bit. 1291 * 1292 ******************************************************************************* 1293 */ 1294 void ihevc_weighted_pred_bi_default_ssse3(WORD16 *pi2_src1, 1295 WORD16 *pi2_src2, 1296 UWORD8 *pu1_dst, 1297 WORD32 src_strd1, 1298 WORD32 src_strd2, 1299 WORD32 dst_strd, 1300 WORD32 lvl_shift1, 1301 WORD32 lvl_shift2, 1302 WORD32 ht, 1303 WORD32 wd) 1304 { 1305 { 1306 WORD32 row, col, temp; 1307 WORD32 shift; 1308 1309 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; 1310 __m128i const_temp_8x16b, lvl_shift1_8x16b, lvl_shift2_8x16b; 1311 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b; 1312 1313 ASSERT(wd % 4 == 0); /* checking assumption*/ 1314 ASSERT(ht % 2 == 0); /* checking assumption*/ 1315 1316 shift = SHIFT_14_MINUS_BIT_DEPTH + 1; 1317 temp = 1 << (shift - 1); 1318 1319 // seting values in register 1320 lvl_shift1_8x16b = _mm_set1_epi16(lvl_shift1); 1321 lvl_shift2_8x16b = _mm_set1_epi16(lvl_shift2); 1322 const_temp_8x16b = _mm_set1_epi16(temp); 1323 1324 lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, lvl_shift2_8x16b); 1325 lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, const_temp_8x16b); 1326 1327 if(0 == (ht & 3)) /* ht multiple of 4*/ 1328 { 1329 if(0 == (wd & 15)) /* wd multiple of 16 case */ 1330 { 1331 __m128i src_temp9_8x16b, src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b; 1332 __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b; 1333 /* outer for loop starts from here */ 1334 for(row = 0; row < ht; row += 4) 1335 { 1336 for(col = 0; col < wd; col += 16) 1337 { 1338 /*load 8 pixel values */ /* First 8 Values */ 1339 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); 1340 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); 1341 /* row = 1 */ 1342 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1)); 1343 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2)); 1344 /* row = 2 */ 1345 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1)); 1346 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2)); 1347 /* row = 3 */ 1348 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1)); 1349 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2)); 1350 1351 /*load 8 pixel values */ /* Second 8 Values */ 1352 src_temp9_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 8)); 1353 src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8)); 1354 /* row = 1 */ 1355 src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8)); 1356 src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8)); 1357 /* row = 2 */ 1358 src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8)); 1359 src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8)); 1360 1361 /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */ 1362 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b); 1363 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b); 1364 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b); 1365 src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b); 1366 1367 /*load 8 pixel values */ /* Second 8 Values */ 1368 /* row = 3 */ 1369 src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8)); 1370 src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8)); 1371 1372 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */ 1373 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b); 1374 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b); 1375 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b); 1376 src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b); 1377 1378 /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */ 1379 src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, src_temp10_8x16b); 1380 src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b); 1381 src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b); 1382 src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b); 1383 1384 /* (i4_tmp >> shift) */ /* First 8 Values */ 1385 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift); 1386 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift); 1387 src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift); 1388 src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift); 1389 1390 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */ 1391 src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b); 1392 src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b); 1393 src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b); 1394 src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b); 1395 1396 /* (i4_tmp >> shift) */ /* Second 8 Values */ 1397 src_temp9_8x16b = _mm_srai_epi16(src_temp9_8x16b, shift); 1398 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, shift); 1399 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, shift); 1400 src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b, shift); 1401 1402 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* 16 8 Values */ 1403 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp9_8x16b); 1404 src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp11_8x16b); 1405 src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp13_8x16b); 1406 src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp15_8x16b); 1407 1408 /* store four 8-bit output values */ /* 16 8 Values */ 1409 _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/ 1410 _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 2*/ 1411 _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 1*/ 1412 _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/ 1413 1414 /* To update pointer */ 1415 pi2_src1 += 16; 1416 pi2_src2 += 16; 1417 pu1_dst += 16; 1418 1419 } /* inner loop ends here(8-output values in single iteration) */ 1420 1421 pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */ 1422 pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */ 1423 pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */ 1424 1425 } 1426 } 1427 else if(0 == (wd & 7)) /* multiple of 8 case */ 1428 { 1429 /* outer for loop starts from here */ 1430 for(row = 0; row < ht; row += 4) 1431 { 1432 for(col = 0; col < wd; col += 8) 1433 { 1434 /*load 8 pixel values */ 1435 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); 1436 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); 1437 /* row = 1 */ 1438 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1)); 1439 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2)); 1440 /* row = 2 */ 1441 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1)); 1442 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2)); 1443 /* row = 3 */ 1444 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1)); 1445 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2)); 1446 1447 /* (pi2_src1[col] + pi2_src2[col]) */ 1448 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b); 1449 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b); 1450 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b); 1451 src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b); 1452 1453 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ 1454 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b); 1455 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b); 1456 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b); 1457 src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b); 1458 1459 /* (i4_tmp >> shift) */ 1460 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift); 1461 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift); 1462 src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift); 1463 src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift); 1464 1465 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ 1466 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b); 1467 src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b); 1468 src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b); 1469 src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b); 1470 1471 /* store four 8-bit output values */ 1472 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/ 1473 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 2*/ 1474 _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 1*/ 1475 _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/ 1476 1477 /* To update pointer */ 1478 pi2_src1 += 8; 1479 pi2_src2 += 8; 1480 pu1_dst += 8; 1481 1482 } /* inner loop ends here(8-output values in single iteration) */ 1483 1484 pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */ 1485 pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */ 1486 pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */ 1487 1488 } 1489 } 1490 else /* wd multiple of 4 case*/ 1491 { 1492 WORD32 dst0, dst1, dst2, dst3; 1493 1494 /* outer for loop starts from here */ 1495 for(row = 0; row < ht; row += 4) 1496 { 1497 for(col = 0; col < wd; col += 4) 1498 { 1499 /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/ 1500 src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); 1501 /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 1502 src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); 1503 1504 /* row = 1 */ 1505 src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1)); 1506 src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2)); 1507 /* row = 2 */ 1508 src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1)); 1509 src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2)); 1510 /* row = 3 */ 1511 src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1)); 1512 src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2)); 1513 1514 /* Pack two rows together */ 1515 src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b); 1516 src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b); 1517 src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b); 1518 src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b); 1519 1520 /* (pi2_src1[col] + pi2_src2[col]) */ 1521 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b); 1522 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b); 1523 1524 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ 1525 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b); 1526 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b); 1527 1528 /* (i4_tmp >> shift) */ 1529 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift); 1530 src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift); 1531 1532 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ 1533 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b); 1534 src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b); 1535 1536 dst0 = _mm_cvtsi128_si32(src_temp1_8x16b); 1537 /* dst row = 1 to 3 */ 1538 src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1); 1539 src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1); 1540 1541 /* store four 8-bit output values */ 1542 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; 1543 1544 dst1 = _mm_cvtsi128_si32(src_temp2_8x16b); 1545 dst2 = _mm_cvtsi128_si32(src_temp5_8x16b); 1546 dst3 = _mm_cvtsi128_si32(src_temp4_8x16b); 1547 1548 /* row = 1 to row = 3 */ 1549 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; 1550 *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2; 1551 *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3; 1552 1553 /* To update pointer */ 1554 pi2_src1 += 4; 1555 pi2_src2 += 4; 1556 pu1_dst += 4; 1557 1558 } /* inner loop ends here(4-output values in single iteration) */ 1559 1560 pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */ 1561 pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */ 1562 pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */ 1563 1564 } 1565 } 1566 } 1567 else /* ht multiple of 2 case and wd multiple of 4 case*/ 1568 { 1569 1570 WORD32 dst0, dst1; 1571 1572 /* outer for loop starts from here */ 1573 for(row = 0; row < ht; row += 2) 1574 { 1575 for(col = 0; col < wd; col += 4) 1576 { 1577 /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/ 1578 src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); 1579 /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 1580 src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); 1581 1582 /* row = 1 */ 1583 src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1)); 1584 src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2)); 1585 1586 /* Pack two rows together */ 1587 src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b); 1588 src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b); 1589 1590 /* (pi2_src1[col] + pi2_src2[col]) */ 1591 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b); 1592 1593 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ 1594 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b); 1595 1596 /* (i4_tmp >> shift) */ 1597 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift); 1598 1599 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ 1600 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b); 1601 1602 dst0 = _mm_cvtsi128_si32(src_temp1_8x16b); 1603 /* dst row = 1 to 3 */ 1604 src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1); 1605 1606 /* store four 8-bit output values */ 1607 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; 1608 1609 dst1 = _mm_cvtsi128_si32(src_temp2_8x16b); 1610 1611 /* row = 1 to row = 3 */ 1612 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; 1613 1614 /* To update pointer */ 1615 pi2_src1 += 4; 1616 pi2_src2 += 4; 1617 pu1_dst += 4; 1618 1619 } /* inner loop ends here(4-output values in single iteration) */ 1620 1621 pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */ 1622 pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */ 1623 pu1_dst = pu1_dst - wd + 2 * dst_strd; /* Pointer update */ 1624 1625 } 1626 1627 } 1628 1629 } 1630 } 1631 1632 1633 /** 1634 ******************************************************************************* 1635 * 1636 * @brief 1637 * Does chroma default bi-weighted prediction on arrays pointed by pi2_src1 and 1638 * pi2_src2 and stores it at location pointed by pi2_dst 1639 * 1640 * @par Description: 1641 * dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + 1 << (shift - 1) ) 1642 * >> shift where shift = 15 - BitDepth 1643 * 1644 * @param[in] pi2_src1 1645 * Pointer to source 1 1646 * 1647 * @param[in] pi2_src2 1648 * Pointer to source 2 1649 * 1650 * @param[out] pu1_dst 1651 * Pointer to destination 1652 * 1653 * @param[in] src_strd1 1654 * Source stride 1 1655 * 1656 * @param[in] src_strd2 1657 * Source stride 2 1658 * 1659 * @param[in] dst_strd 1660 * Destination stride 1661 * 1662 * @param[in] lvl_shift1 1663 * added before shift and offset 1664 * 1665 * @param[in] lvl_shift2 1666 * added before shift and offset 1667 * 1668 * @param[in] ht 1669 * height of the source 1670 * 1671 * @param[in] wd 1672 * width of the source (each colour component) 1673 * 1674 * @returns 1675 * 1676 * @remarks 1677 * None 1678 * 1679 * Assumption : ht%2 == 0, wd%2 == 0, lvl_shift1==0, lvl_shift2==0. 1680 * shift == 7, (lvl_shift1+lvl_shift2) can take {0, 8K, 16K}. In that case, 1681 * final result will match even if intermediate precision is in 16 bit. 1682 ******************************************************************************* 1683 */ 1684 1685 void ihevc_weighted_pred_chroma_bi_default_ssse3(WORD16 *pi2_src1, 1686 WORD16 *pi2_src2, 1687 UWORD8 *pu1_dst, 1688 WORD32 src_strd1, 1689 WORD32 src_strd2, 1690 WORD32 dst_strd, 1691 WORD32 lvl_shift1, 1692 WORD32 lvl_shift2, 1693 WORD32 ht, 1694 WORD32 wd) 1695 { 1696 WORD32 row, col, temp; 1697 WORD32 shift, wdx2; 1698 1699 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b; 1700 __m128i lvl_shift1_8x16b; 1701 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b; 1702 1703 ASSERT(wd % 2 == 0); /* checking assumption*/ 1704 ASSERT(ht % 2 == 0); /* checking assumption*/ 1705 UNUSED(lvl_shift1); 1706 UNUSED(lvl_shift2); 1707 shift = SHIFT_14_MINUS_BIT_DEPTH + 1; 1708 temp = 1 << (shift - 1); 1709 wdx2 = wd * 2; 1710 1711 // seting values in register 1712 lvl_shift1_8x16b = _mm_set1_epi16(temp); 1713 1714 if(0 == (ht & 3)) /* ht multiple of 4 case */ 1715 { 1716 if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */ 1717 { 1718 __m128i src_temp9_8x16b, src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b; 1719 __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b; 1720 /* outer for loop starts from here */ 1721 for(row = 0; row < ht; row += 4) 1722 { 1723 for(col = 0; col < wdx2; col += 16) 1724 { 1725 /*load 8 pixel values */ /* First 8 Values */ 1726 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); 1727 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); 1728 /* row = 1 */ 1729 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1)); 1730 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2)); 1731 /* row = 2 */ 1732 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1)); 1733 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2)); 1734 /* row = 3 */ 1735 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1)); 1736 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2)); 1737 1738 /*load 8 pixel values */ /* Second 8 Values */ 1739 src_temp9_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 8)); 1740 src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8)); 1741 /* row = 1 */ 1742 src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8)); 1743 src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8)); 1744 /* row = 2 */ 1745 src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8)); 1746 src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8)); 1747 1748 /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */ 1749 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b); 1750 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b); 1751 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b); 1752 src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b); 1753 1754 /*load 8 pixel values */ /* Second 8 Values */ 1755 /* row = 3 */ 1756 src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8)); 1757 src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8)); 1758 1759 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */ 1760 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b); 1761 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b); 1762 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b); 1763 src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b); 1764 1765 /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */ 1766 src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, src_temp10_8x16b); 1767 src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b); 1768 src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b); 1769 src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b); 1770 1771 /* (i4_tmp >> shift) */ /* First 8 Values */ 1772 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift); 1773 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift); 1774 src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift); 1775 src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift); 1776 1777 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */ 1778 src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b); 1779 src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b); 1780 src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b); 1781 src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b); 1782 1783 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* First 8 Values */ 1784 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b); 1785 src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b); 1786 src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b); 1787 src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b); 1788 1789 /* (i4_tmp >> shift) */ /* Second 8 Values */ 1790 src_temp9_8x16b = _mm_srai_epi16(src_temp9_8x16b, shift); 1791 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, shift); 1792 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, shift); 1793 src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b, shift); 1794 1795 /* store four 8-bit output values */ /* First 8 Values */ 1796 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/ 1797 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 2*/ 1798 _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 1*/ 1799 _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/ 1800 1801 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* Second 8 Values */ 1802 src_temp9_8x16b = _mm_packus_epi16(src_temp9_8x16b, src_temp9_8x16b); 1803 src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp11_8x16b); 1804 src_temp13_8x16b = _mm_packus_epi16(src_temp13_8x16b, src_temp13_8x16b); 1805 src_temp15_8x16b = _mm_packus_epi16(src_temp15_8x16b, src_temp15_8x16b); 1806 1807 /* store four 8-bit output values */ /* Second 8 Values */ 1808 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd + 8), src_temp9_8x16b); /* row = 0*/ 1809 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd + 8), src_temp11_8x16b); /* row = 2*/ 1810 _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd + 8), src_temp13_8x16b); /* row = 1*/ 1811 _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd + 8), src_temp15_8x16b); /* row = 3*/ 1812 1813 /* To update pointer */ 1814 pi2_src1 += 16; 1815 pi2_src2 += 16; 1816 pu1_dst += 16; 1817 1818 } /* inner loop ends here(8-output values in single iteration) */ 1819 1820 pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1; /* Pointer update */ 1821 pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2; /* Pointer update */ 1822 pu1_dst = pu1_dst - wdx2 + 4 * dst_strd; /* Pointer update */ 1823 1824 } 1825 } 1826 else if(0 == (wdx2 & 7)) /* multiple of 8 case */ 1827 { 1828 /* outer for loop starts from here */ 1829 for(row = 0; row < ht; row += 4) 1830 { 1831 for(col = 0; col < wdx2; col += 8) 1832 { 1833 /*load 8 pixel values */ 1834 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); 1835 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); 1836 /* row = 1 */ 1837 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1)); 1838 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2)); 1839 /* row = 2 */ 1840 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1)); 1841 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2)); 1842 /* row = 3 */ 1843 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1)); 1844 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2)); 1845 1846 /* (pi2_src1[col] + pi2_src2[col]) */ 1847 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b); 1848 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b); 1849 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b); 1850 src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b); 1851 1852 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ 1853 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b); 1854 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b); 1855 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b); 1856 src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b); 1857 1858 /* (i4_tmp >> shift) */ 1859 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift); 1860 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift); 1861 src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift); 1862 src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift); 1863 1864 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ 1865 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b); 1866 src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b); 1867 src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b); 1868 src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b); 1869 1870 /* store four 8-bit output values */ 1871 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/ 1872 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 2*/ 1873 _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 1*/ 1874 _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/ 1875 1876 /* To update pointer */ 1877 pi2_src1 += 8; 1878 pi2_src2 += 8; 1879 pu1_dst += 8; 1880 1881 } /* inner loop ends here(8-output values in single iteration) */ 1882 1883 pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1; /* Pointer update */ 1884 pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2; /* Pointer update */ 1885 pu1_dst = pu1_dst - wdx2 + 4 * dst_strd; /* Pointer update */ 1886 1887 } 1888 } 1889 else /* 2*wd multiple of 4 case */ 1890 { 1891 WORD32 dst0, dst1, dst2, dst3; 1892 /* outer for loop starts from here */ 1893 for(row = 0; row < ht; row += 4) 1894 { 1895 for(col = 0; col < wdx2; col += 4) 1896 { 1897 /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/ 1898 src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); 1899 /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 1900 src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); 1901 1902 /* row = 1 */ 1903 src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1)); 1904 src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2)); 1905 /* row = 2 */ 1906 src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1)); 1907 src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2)); 1908 /* row = 3 */ 1909 src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1)); 1910 src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2)); 1911 1912 /* Pack two rows together */ 1913 src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b); 1914 src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b); 1915 src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b); 1916 src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b); 1917 1918 /* (pi2_src1[col] + pi2_src2[col]) */ 1919 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b); 1920 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b); 1921 1922 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ 1923 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b); 1924 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b); 1925 1926 /* (i4_tmp >> shift) */ 1927 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift); 1928 src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift); 1929 1930 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ 1931 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b); 1932 src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b); 1933 1934 dst0 = _mm_cvtsi128_si32(src_temp1_8x16b); 1935 /* dst row = 1 to 3 */ 1936 src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1); 1937 src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1); 1938 1939 /* store four 8-bit output values */ 1940 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; 1941 1942 dst1 = _mm_cvtsi128_si32(src_temp2_8x16b); 1943 dst2 = _mm_cvtsi128_si32(src_temp5_8x16b); 1944 dst3 = _mm_cvtsi128_si32(src_temp4_8x16b); 1945 1946 /* row = 1 to row = 3 */ 1947 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; 1948 *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2; 1949 *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3; 1950 1951 /* To update pointer */ 1952 pi2_src1 += 4; 1953 pi2_src2 += 4; 1954 pu1_dst += 4; 1955 1956 } /* inner loop ends here(4-output values in single iteration) */ 1957 1958 pi2_src1 = pi2_src1 - wdx2 + 4 * src_strd1; /* Pointer update */ 1959 pi2_src2 = pi2_src2 - wdx2 + 4 * src_strd2; /* Pointer update */ 1960 pu1_dst = pu1_dst - wdx2 + 4 * dst_strd; /* Pointer update */ 1961 1962 } 1963 } 1964 } 1965 else /* ht multiple of 2 case */ 1966 { 1967 if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */ 1968 { 1969 __m128i src_temp9_8x16b, src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b; 1970 /* outer for loop starts from here */ 1971 for(row = 0; row < ht; row += 2) 1972 { 1973 for(col = 0; col < wdx2; col += 16) 1974 { 1975 /*load 8 pixel values */ /* First 8 Values */ 1976 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); 1977 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); 1978 /* row = 1 */ 1979 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1)); 1980 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2)); 1981 1982 /*load 8 pixel values */ /* Second 8 Values */ 1983 src_temp9_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 8)); 1984 src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8)); 1985 /* row = 1 */ 1986 src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8)); 1987 src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8)); 1988 1989 /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */ 1990 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b); 1991 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b); 1992 1993 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */ 1994 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b); 1995 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b); 1996 1997 /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */ 1998 src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, src_temp10_8x16b); 1999 src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b); 2000 2001 /* (i4_tmp >> shift) */ /* First 8 Values */ 2002 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift); 2003 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift); 2004 2005 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */ 2006 src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b); 2007 src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b); 2008 2009 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* First 8 Values */ 2010 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b); 2011 src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b); 2012 2013 /* (i4_tmp >> shift) */ /* Second 8 Values */ 2014 src_temp9_8x16b = _mm_srai_epi16(src_temp9_8x16b, shift); 2015 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, shift); 2016 2017 /* store four 8-bit output values */ /* First 8 Values */ 2018 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/ 2019 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 2*/ 2020 2021 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* Second 8 Values */ 2022 src_temp9_8x16b = _mm_packus_epi16(src_temp9_8x16b, src_temp9_8x16b); 2023 src_temp11_8x16b = _mm_packus_epi16(src_temp11_8x16b, src_temp11_8x16b); 2024 2025 /* store four 8-bit output values */ /* Second 8 Values */ 2026 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd + 8), src_temp9_8x16b); /* row = 0*/ 2027 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd + 8), src_temp11_8x16b); /* row = 2*/ 2028 2029 /* To update pointer */ 2030 pi2_src1 += 16; 2031 pi2_src2 += 16; 2032 pu1_dst += 16; 2033 2034 } /* inner loop ends here(8-output values in single iteration) */ 2035 2036 pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */ 2037 pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */ 2038 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */ 2039 2040 } 2041 } 2042 else if(0 == (wdx2 & 7)) /* multiple of 8 case */ 2043 { 2044 /* outer for loop starts from here */ 2045 for(row = 0; row < ht; row += 2) 2046 { 2047 for(col = 0; col < wdx2; col += 8) 2048 { 2049 /*load 8 pixel values */ 2050 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1)); 2051 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2)); 2052 /* row = 1 */ 2053 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1)); 2054 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2)); 2055 2056 /* (pi2_src1[col] + pi2_src2[col]) */ 2057 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b); 2058 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b); 2059 2060 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ 2061 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b); 2062 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b); 2063 2064 /* (i4_tmp >> shift) */ 2065 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift); 2066 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift); 2067 2068 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ 2069 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b); 2070 src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b); 2071 2072 /* store four 8-bit output values */ 2073 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/ 2074 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/ 2075 2076 /* To update pointer */ 2077 pi2_src1 += 8; 2078 pi2_src2 += 8; 2079 pu1_dst += 8; 2080 2081 } /* inner loop ends here(8-output values in single iteration) */ 2082 2083 pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */ 2084 pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */ 2085 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */ 2086 2087 } 2088 } 2089 else /* 2*wd multiple of 4 case */ 2090 { 2091 WORD32 dst0, dst1; 2092 /* outer for loop starts from here */ 2093 for(row = 0; row < ht; row += 2) 2094 { 2095 for(col = 0; col < wdx2; col += 4) 2096 { 2097 /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/ 2098 src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1)); 2099 /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 2100 src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2)); 2101 /* row = 1 */ 2102 src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1)); 2103 src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2)); 2104 2105 /* Pack two rows together */ 2106 src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b); 2107 src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b); 2108 2109 /* (pi2_src1[col] + pi2_src2[col]) */ 2110 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b); 2111 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ 2112 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b); 2113 2114 /* (i4_tmp >> shift) */ 2115 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift); 2116 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ 2117 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b); 2118 2119 dst0 = _mm_cvtsi128_si32(src_temp1_8x16b); 2120 /* dst row = 1 */ 2121 src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1); 2122 2123 /* store four 8-bit output values */ 2124 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; 2125 2126 dst1 = _mm_cvtsi128_si32(src_temp2_8x16b); 2127 /* row = 1 */ 2128 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; 2129 2130 /* To update pointer */ 2131 pi2_src1 += 4; 2132 pi2_src2 += 4; 2133 pu1_dst += 4; 2134 } /* inner loop ends here(4-output values in single iteration) */ 2135 2136 pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */ 2137 pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */ 2138 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */ 2139 2140 } 2141 } 2142 } 2143 } 2144