/******************************************************************************
 *
 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 ******************************************************************************/
/**
 *******************************************************************************
 * @file
 *  ihevc_weighted_pred_x86_intr.c
 *
 * @brief
 *  Contains function definitions for weighted prediction used in inter
 *  prediction
 *
 * @author
 *
 *
 * @par List of Functions:
 *  - ihevc_weighted_pred_uni_sse42()
 *  - ihevc_weighted_pred_bi_sse42()
 *  - ihevc_weighted_pred_bi_default_sse42()
 *  - ihevc_weighted_pred_chroma_uni_sse42()
 *  - ihevc_weighted_pred_chroma_bi_sse42()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include <stdio.h>
#include <assert.h>

#include "ihevc_debug.h"
#include "ihevc_typedefs.h"
#include "ihevc_macros.h"
#include "ihevc_platform_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_defs.h"
#include "ihevc_weighted_pred.h"
#include "ihevc_inter_pred.h"

#include <immintrin.h>

/**
 *******************************************************************************
 *
 * @brief
 *  Does uni-weighted prediction on the array pointed to by pi2_src and stores
 *  the result at the location pointed to by pu1_dst
 *
 * @par Description:
 *  dst = ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift +
 *  offset
 *
 * @param[in] pi2_src
 *  Pointer to the source
 *
 * @param[out] pu1_dst
 *  Pointer to the destination
 *
 * @param[in] src_strd
 *  Source stride
 *
 * @param[in] dst_strd
 *  Destination stride
 *
 * @param[in] wgt0
 *  Weight to be multiplied with the source
 *
 * @param[in] off0
 *  Offset to be added after rounding and shifting
 *
 * @param[in] shift
 *  (14 - bit depth) + log2_weight_denominator
 *
 * @param[in] lvl_shift
 *  Added before shift and offset
 *
 * @param[in] ht
 *  Height of the source
 *
 * @param[in] wd
 *  Width of the source
 *
 * @returns
 *  None
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

void ihevc_weighted_pred_uni_sse42(WORD16 *pi2_src,
                                   UWORD8 *pu1_dst,
                                   WORD32 src_strd,
                                   WORD32 dst_strd,
                                   WORD32 wgt0,
                                   WORD32 off0,
                                   WORD32 shift,
                                   WORD32 lvl_shift,
                                   WORD32 ht,
                                   WORD32 wd)
{
    WORD32 row, col, temp;
    WORD32 dst0, dst1, dst2, dst3;

    /* all 128 bit registers are named with a suffix mxnb, where m is the */
    /* number of n bits packed in the register                            */
    __m128i src_temp0_4x32b,
src_temp1_4x32b, src_temp2_4x32b, src_temp3_4x32b; 128 __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_4x32b, off0_4x32b; 129 130 ASSERT(wd % 4 == 0); /* checking assumption*/ 131 ASSERT(ht % 4 == 0); /* checking assumption*/ 132 133 temp = 1 << (shift - 1); 134 135 // seting values in register 136 const_temp_4x32b = _mm_set1_epi32(temp); 137 lvl_shift_4x32b = _mm_set1_epi32(lvl_shift); 138 wgt0_4x32b = _mm_set1_epi32(wgt0); 139 off0_4x32b = _mm_set1_epi32(off0); 140 141 if(0 == (wd & 7)) /* wd multiple of 8 case */ 142 { 143 __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b; 144 145 /* outer for loop starts from here */ 146 for(row = 0; row < ht; row += 4) 147 { 148 for(col = 0; col < wd; col += 8) 149 { /* for row =0 ,1,2,3*/ 150 151 /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 152 src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src)); 153 /* row = 1 */ 154 src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd)); 155 /* row = 2 */ 156 src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd)); 157 /* row = 3 */ 158 src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd)); 159 160 /* row = 0 */ /* Last 4 pixels */ 161 src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4)); 162 /* row = 1 */ 163 src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4)); 164 /* row = 2 */ 165 src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd + 4)); 166 /* row = 3 */ 167 src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd + 4)); 168 169 /* considering pix. 4:0 by converting 16-into 32 bit */ /* First 4 pixels */ 170 src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b); 171 src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b); 172 src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b); 173 src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b); 174 175 /* (pi2_src[col] + lvl_shift)*/ /* First 4 pixels */ 176 src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b); 177 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b); 178 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b); 179 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b); 180 181 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ /* First 4 pixels */ 182 src_temp0_4x32b = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b); 183 src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b); 184 src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b); 185 src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b); 186 187 /* considering pix. 
4:0 by converting 16-into 32 bit */ /* Last 4 pixels */ 188 src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b); 189 src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b); 190 src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b); 191 src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b); 192 193 /* (pi2_src[col] + lvl_shift)*/ /* Last 4 pixels */ 194 src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift_4x32b); 195 src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift_4x32b); 196 src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift_4x32b); 197 src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift_4x32b); 198 199 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ /* Last 4 pixels */ 200 src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt0_4x32b); 201 src_temp5_4x32b = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b); 202 src_temp6_4x32b = _mm_mullo_epi32(src_temp6_4x32b, wgt0_4x32b); 203 src_temp7_4x32b = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b); 204 205 /* i4_tmp += 1 << (shift - 1) */ /* First 4 pixels */ 206 src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b); 207 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b); 208 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b); 209 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b); 210 211 /* (i4_tmp >> shift) */ /* First 4 pixels */ 212 src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift); 213 src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift); 214 src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift); 215 src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift); 216 217 /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */ 218 src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, const_temp_4x32b); 219 src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b); 220 src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, const_temp_4x32b); 221 src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b); 222 223 /* (i4_tmp >> shift) */ /* Last 4 pixels */ 224 src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b, shift); 225 src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift); 226 src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b, shift); 227 src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift); 228 229 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* First 4 pixels */ 230 src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b); 231 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b); 232 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b); 233 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b); 234 235 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */ 236 src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, off0_4x32b); 237 src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, off0_4x32b); 238 src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, off0_4x32b); 239 src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, off0_4x32b); 240 241 src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp4_4x32b); 242 src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp5_4x32b); 243 src_temp2_4x32b = _mm_packs_epi32(src_temp2_4x32b, src_temp6_4x32b); 244 src_temp3_4x32b = _mm_packs_epi32(src_temp3_4x32b, src_temp7_4x32b); 245 /* pu1_dst[col] = CLIP_U8(i4_tmp); */ 246 src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b); 247 src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b); 248 src_temp2_4x32b = _mm_packus_epi16(src_temp2_4x32b, src_temp2_4x32b); 249 src_temp3_4x32b = 
_mm_packus_epi16(src_temp3_4x32b, src_temp3_4x32b); 250 251 /* store four 8-bit output values */ 252 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0*/ 253 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 2*/ 254 _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp2_4x32b); /* row = 1*/ 255 _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp3_4x32b); /* row = 3*/ 256 257 /* To update pointer */ 258 pi2_src += 8; 259 pu1_dst += 8; 260 261 } /* inner loop ends here(4-output values in single iteration) */ 262 263 pi2_src = pi2_src - wd + 4 * src_strd; /* Pointer update */ 264 pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */ 265 266 } 267 } 268 else /* wd multiple of 4 case */ 269 { 270 /* outer for loop starts from here */ 271 for(row = 0; row < ht; row += 4) 272 { 273 for(col = 0; col < wd; col += 4) 274 { /* for row =0 ,1,2,3*/ 275 276 /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 277 src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src)); 278 /* row = 1 */ 279 src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd)); 280 /* row = 2 */ 281 src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd)); 282 /* row = 3 */ 283 src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd)); 284 285 /* considering pix. 4:0 by converting 16-into 32 bit */ 286 src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b); 287 src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b); 288 src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b); 289 src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b); 290 291 /* (pi2_src[col] + lvl_shift)*/ 292 src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b); 293 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b); 294 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b); 295 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b); 296 297 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ 298 src_temp0_4x32b = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b); 299 src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b); 300 src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b); 301 src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b); 302 303 /* i4_tmp += 1 << (shift - 1) */ 304 src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b); 305 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b); 306 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b); 307 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b); 308 309 /* (i4_tmp >> shift) */ 310 src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift); 311 src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift); 312 src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift); 313 src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift); 314 315 /*i4_tmp = (i4_tmp >> shift) + off0; */ 316 src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b); 317 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b); 318 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b); 319 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b); 320 321 src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp1_4x32b); 322 src_temp2_4x32b = _mm_packs_epi32(src_temp2_4x32b, src_temp3_4x32b); 323 324 /* pu1_dst[col] = CLIP_U8(i4_tmp); */ 325 src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp2_4x32b); 326 327 dst0 = _mm_cvtsi128_si32(src_temp0_4x32b); 328 
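                /* Descriptive note: after the packs/packus sequence above, each 32-bit
                   lane of src_temp0_4x32b holds the four clipped output bytes of one
                   row (lane 0 = row 0, ..., lane 3 = row 3), extracted below with
                   _mm_cvtsi128_si32 / _mm_shuffle_epi32. The per-pixel operation being
                   vectorised here, mirroring the inline comments, is roughly:
                       i4_tmp  = (pi2_src[col] + lvl_shift) * wgt0;
                       i4_tmp += 1 << (shift - 1);
                       pu1_dst[col] = CLIP_U8((i4_tmp >> shift) + off0);
                */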
                /* dst row = 1 to 3 */
                src_temp1_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 1);
                src_temp2_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 2);
                src_temp3_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 3);

                /* store four 8-bit output values */
                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                dst1 = _mm_cvtsi128_si32(src_temp1_4x32b);
                dst2 = _mm_cvtsi128_si32(src_temp2_4x32b);
                dst3 = _mm_cvtsi128_si32(src_temp3_4x32b);

                /* row = 1 to row = 3 */
                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
                *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
                *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;

                /* To update pointer */
                pi2_src += 4;
                pu1_dst += 4;

            } /* inner loop ends here (4 output values per iteration) */

            pi2_src = pi2_src - wd + 4 * src_strd; /* Pointer update */
            pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */

        }
    }
}

/**
 *******************************************************************************
 *
 * @brief
 *  Does chroma uni-weighted prediction on the array pointed to by pi2_src and
 *  stores the result at the location pointed to by pu1_dst
 *
 * @par Description:
 *  dst = ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift +
 *  offset
 *
 * @param[in] pi2_src
 *  Pointer to the source
 *
 * @param[out] pu1_dst
 *  Pointer to the destination
 *
 * @param[in] src_strd
 *  Source stride
 *
 * @param[in] dst_strd
 *  Destination stride
 *
 * @param[in] wgt0_cb
 *  Weight to be multiplied with the Cb source samples
 *
 * @param[in] wgt0_cr
 *  Weight to be multiplied with the Cr source samples
 *
 * @param[in] off0_cb
 *  Cb offset to be added after rounding and shifting
 *
 * @param[in] off0_cr
 *  Cr offset to be added after rounding and shifting
 *
 * @param[in] shift
 *  (14 - bit depth) + log2_weight_denominator
 *
 * @param[in] lvl_shift
 *  Added before shift and offset
 *
 * @param[in] ht
 *  Height of the source
 *
 * @param[in] wd
 *  Width of the source (each colour component)
 *
 * @returns
 *  None
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

void ihevc_weighted_pred_chroma_uni_sse42(WORD16 *pi2_src,
                                          UWORD8 *pu1_dst,
                                          WORD32 src_strd,
                                          WORD32 dst_strd,
                                          WORD32 wgt0_cb,
                                          WORD32 wgt0_cr,
                                          WORD32 off0_cb,
                                          WORD32 off0_cr,
                                          WORD32 shift,
                                          WORD32 lvl_shift,
                                          WORD32 ht,
                                          WORD32 wd)
{
    WORD32 row, col, temp, wdx2;
    /* all 128 bit registers are named with a suffix mxnb, where m is the */
    /* number of n bits packed in the register                            */

    __m128i src_temp0_4x32b, src_temp1_4x32b;
    __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_4x32b, off0_4x32b;

    ASSERT(wd % 2 == 0); /* checking assumption*/
    ASSERT(ht % 2 == 0); /* checking assumption*/

    temp = 1 << (shift - 1);
    wdx2 = 2 * wd;

    // setting values in register
    const_temp_4x32b = _mm_set1_epi32(temp);
    lvl_shift_4x32b = _mm_set1_epi32(lvl_shift);
    wgt0_4x32b = _mm_set_epi32(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
    off0_4x32b = _mm_set_epi32(off0_cr, off0_cb, off0_cr, off0_cb);

#if 0 /* Enable this for ht%4=0 case. But was degrading performance for lower sizes and improving for higher sizes!!!
*/ 443 if( 0 == (ht & 3)) /* ht multiple of 4 case */ 444 { 445 if( 0 == (wdx2 & 15)) /* 2*wd multiple of 168 case */ 446 { 447 __m128i src_temp2_4x32b, src_temp3_4x32b; 448 __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b; 449 __m128i src_temp8_4x32b, src_temp9_4x32b, src_temp10_4x32b, src_temp11_4x32b; 450 __m128i src_temp12_4x32b, src_temp13_4x32b, src_temp14_4x32b, src_temp15_4x32b; 451 /* outer for loop starts from here */ 452 for(row = 0; row < ht; row +=4) 453 { 454 for(col = 0; col < wdx2; col +=16) 455 { 456 /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 457 src_temp0_4x32b = _mm_loadu_si128((__m128i*)(pi2_src)); 458 /* row = 1 */ 459 src_temp1_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd)); 460 /* row = 0 */ /* Second 4 pixels */ 461 src_temp2_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+4)); 462 /* row = 1 */ 463 src_temp3_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+4)); 464 /* row = 0 */ /* Third 4 pixels */ 465 src_temp4_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+8)); 466 /* row = 1 */ 467 src_temp5_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+8)); 468 /* row = 0 */ /* Last 4 pixels */ 469 src_temp6_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+12)); 470 /* row = 1 */ 471 src_temp7_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+12)); 472 473 /* considering pix. 4:0 by converting 16-into 32 bit */ 474 src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b); 475 src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b); 476 /* (pi2_src[col] + lvl_shift)*/ 477 src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, lvl_shift_4x32b); 478 src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, lvl_shift_4x32b); 479 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ 480 src_temp0_4x32b = _mm_mullo_epi32 (src_temp0_4x32b, wgt0_4x32b); 481 src_temp1_4x32b = _mm_mullo_epi32 (src_temp1_4x32b, wgt0_4x32b); 482 483 /* considering pix. 4:0 by converting 16-into 32 bit */ /* Second 4 pixels */ 484 src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b); 485 src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b); 486 /* (pi2_src[col] + lvl_shift)*/ 487 src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, lvl_shift_4x32b); 488 src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, lvl_shift_4x32b); 489 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ 490 src_temp2_4x32b = _mm_mullo_epi32 (src_temp2_4x32b, wgt0_4x32b); 491 src_temp3_4x32b = _mm_mullo_epi32 (src_temp3_4x32b, wgt0_4x32b); 492 493 /* considering pix. 4:0 by converting 16-into 32 bit */ /* Third 4 pixels */ 494 src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b); 495 src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b); 496 /* (pi2_src[col] + lvl_shift)*/ 497 src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, lvl_shift_4x32b); 498 src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, lvl_shift_4x32b); 499 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ 500 src_temp4_4x32b = _mm_mullo_epi32 (src_temp4_4x32b, wgt0_4x32b); 501 src_temp5_4x32b = _mm_mullo_epi32 (src_temp5_4x32b, wgt0_4x32b); 502 503 /* considering pix. 
4:0 by converting 16-into 32 bit */ /* Last 4 pixels */ 504 src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b); 505 src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b); 506 /* (pi2_src[col] + lvl_shift)*/ 507 src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, lvl_shift_4x32b); 508 src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, lvl_shift_4x32b); 509 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ 510 src_temp6_4x32b = _mm_mullo_epi32 (src_temp6_4x32b, wgt0_4x32b); 511 src_temp7_4x32b = _mm_mullo_epi32 (src_temp7_4x32b, wgt0_4x32b); 512 513 /* i4_tmp += 1 << (shift - 1) */ 514 src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, const_temp_4x32b); 515 src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, const_temp_4x32b); 516 /* (i4_tmp >> shift) */ 517 src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift); 518 src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift); 519 520 /* i4_tmp += 1 << (shift - 1) */ /* Second 4 pixels */ 521 src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, const_temp_4x32b); 522 src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, const_temp_4x32b); 523 /* (i4_tmp >> shift) */ 524 src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift); 525 src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift); 526 527 /* i4_tmp += 1 << (shift - 1) */ /* Third 4 pixels */ 528 src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, const_temp_4x32b); 529 src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, const_temp_4x32b); 530 /* (i4_tmp >> shift) */ 531 src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b, shift); 532 src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift); 533 534 /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */ 535 src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, const_temp_4x32b); 536 src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, const_temp_4x32b); 537 /* (i4_tmp >> shift) */ 538 src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b, shift); 539 src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift); 540 541 /*i4_tmp = (i4_tmp >> shift) + off0; */ 542 src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, off0_4x32b); 543 src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, off0_4x32b); 544 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */ 545 src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, off0_4x32b); 546 src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, off0_4x32b); 547 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */ 548 src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, off0_4x32b); 549 src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, off0_4x32b); 550 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */ 551 src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, off0_4x32b); 552 src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, off0_4x32b); 553 554 src_temp0_4x32b = _mm_packs_epi32 (src_temp0_4x32b, src_temp2_4x32b); 555 src_temp1_4x32b = _mm_packs_epi32 (src_temp1_4x32b, src_temp3_4x32b); 556 src_temp4_4x32b = _mm_packs_epi32 (src_temp4_4x32b, src_temp6_4x32b); 557 src_temp5_4x32b = _mm_packs_epi32 (src_temp5_4x32b, src_temp7_4x32b); 558 /* pu1_dst[col] = CLIP_U8(i4_tmp); */ 559 src_temp0_4x32b = _mm_packus_epi16 (src_temp0_4x32b, src_temp4_4x32b); 560 src_temp1_4x32b = _mm_packus_epi16 (src_temp1_4x32b, src_temp5_4x32b); 561 562 /* store 16 8-bit output values */ 563 _mm_storeu_si128((__m128i*)(pu1_dst+0*dst_strd), src_temp0_4x32b); /* row = 0*/ 564 _mm_storeu_si128((__m128i*)(pu1_dst+1*dst_strd), src_temp1_4x32b); /* row = 1*/ 565 566 /* row = 2 */ /*load 8 pixel values from 7:0 pos. relative to cur. 
pos.*/ 567 src_temp8_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd)); 568 /* row = 3 */ 569 src_temp9_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd)); 570 /* row = 2 */ /* Second 4 pixels */ 571 src_temp10_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+4)); 572 /* row = 3 */ 573 src_temp11_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+4)); 574 /* row = 2 */ /* Third 4 pixels */ 575 src_temp12_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+8)); 576 /* row = 3 */ 577 src_temp13_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+8)); 578 /* row = 2 */ /* Last 4 pixels */ 579 src_temp14_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+12)); 580 /* row = 3 */ 581 src_temp15_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+12)); 582 583 /* considering pix. 4:0 by converting 16-into 32 bit */ 584 src_temp8_4x32b = _mm_cvtepi16_epi32(src_temp8_4x32b); 585 src_temp9_4x32b = _mm_cvtepi16_epi32(src_temp9_4x32b); 586 /* (pi2_src[col] + lvl_shift)*/ 587 src_temp8_4x32b = _mm_add_epi32 (src_temp8_4x32b, lvl_shift_4x32b); 588 src_temp9_4x32b = _mm_add_epi32 (src_temp9_4x32b, lvl_shift_4x32b); 589 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ 590 src_temp8_4x32b = _mm_mullo_epi32 (src_temp8_4x32b, wgt0_4x32b); 591 src_temp9_4x32b = _mm_mullo_epi32 (src_temp9_4x32b, wgt0_4x32b); 592 593 /* considering pix. 4:0 by converting 16-into 32 bit */ /* Second 4 pixels */ 594 src_temp10_4x32b = _mm_cvtepi16_epi32(src_temp10_4x32b); 595 src_temp11_4x32b = _mm_cvtepi16_epi32(src_temp11_4x32b); 596 /* (pi2_src[col] + lvl_shift)*/ 597 src_temp10_4x32b = _mm_add_epi32 (src_temp10_4x32b, lvl_shift_4x32b); 598 src_temp11_4x32b = _mm_add_epi32 (src_temp11_4x32b, lvl_shift_4x32b); 599 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ 600 src_temp10_4x32b = _mm_mullo_epi32 (src_temp10_4x32b, wgt0_4x32b); 601 src_temp11_4x32b = _mm_mullo_epi32 (src_temp11_4x32b, wgt0_4x32b); 602 603 /* considering pix. 4:0 by converting 16-into 32 bit */ /* Third 4 pixels */ 604 src_temp12_4x32b = _mm_cvtepi16_epi32(src_temp12_4x32b); 605 src_temp13_4x32b = _mm_cvtepi16_epi32(src_temp13_4x32b); 606 /* (pi2_src[col] + lvl_shift)*/ 607 src_temp12_4x32b = _mm_add_epi32 (src_temp12_4x32b, lvl_shift_4x32b); 608 src_temp13_4x32b = _mm_add_epi32 (src_temp13_4x32b, lvl_shift_4x32b); 609 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ 610 src_temp12_4x32b = _mm_mullo_epi32 (src_temp12_4x32b, wgt0_4x32b); 611 src_temp13_4x32b = _mm_mullo_epi32 (src_temp13_4x32b, wgt0_4x32b); 612 613 /* considering pix. 
4:0 by converting 16-into 32 bit */ /* Last 4 pixels */ 614 src_temp14_4x32b = _mm_cvtepi16_epi32(src_temp14_4x32b); 615 src_temp15_4x32b = _mm_cvtepi16_epi32(src_temp15_4x32b); 616 /* (pi2_src[col] + lvl_shift)*/ 617 src_temp14_4x32b = _mm_add_epi32 (src_temp14_4x32b, lvl_shift_4x32b); 618 src_temp15_4x32b = _mm_add_epi32 (src_temp15_4x32b, lvl_shift_4x32b); 619 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ 620 src_temp14_4x32b = _mm_mullo_epi32 (src_temp14_4x32b, wgt0_4x32b); 621 src_temp15_4x32b = _mm_mullo_epi32 (src_temp15_4x32b, wgt0_4x32b); 622 623 /* i4_tmp += 1 << (shift - 1) */ 624 src_temp8_4x32b = _mm_add_epi32 (src_temp8_4x32b, const_temp_4x32b); 625 src_temp9_4x32b = _mm_add_epi32 (src_temp9_4x32b, const_temp_4x32b); 626 /* (i4_tmp >> shift) */ 627 src_temp8_4x32b = _mm_srai_epi32(src_temp8_4x32b, shift); 628 src_temp9_4x32b = _mm_srai_epi32(src_temp9_4x32b, shift); 629 630 /* i4_tmp += 1 << (shift - 1) */ /* Second 4 pixels */ 631 src_temp10_4x32b = _mm_add_epi32 (src_temp10_4x32b, const_temp_4x32b); 632 src_temp11_4x32b = _mm_add_epi32 (src_temp11_4x32b, const_temp_4x32b); 633 /* (i4_tmp >> shift) */ 634 src_temp10_4x32b = _mm_srai_epi32(src_temp10_4x32b, shift); 635 src_temp11_4x32b = _mm_srai_epi32(src_temp11_4x32b, shift); 636 637 /* i4_tmp += 1 << (shift - 1) */ /* Third 4 pixels */ 638 src_temp12_4x32b = _mm_add_epi32 (src_temp12_4x32b, const_temp_4x32b); 639 src_temp13_4x32b = _mm_add_epi32 (src_temp13_4x32b, const_temp_4x32b); 640 /* (i4_tmp >> shift) */ 641 src_temp12_4x32b = _mm_srai_epi32(src_temp12_4x32b, shift); 642 src_temp13_4x32b = _mm_srai_epi32(src_temp13_4x32b, shift); 643 644 /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */ 645 src_temp14_4x32b = _mm_add_epi32 (src_temp14_4x32b, const_temp_4x32b); 646 src_temp15_4x32b = _mm_add_epi32 (src_temp15_4x32b, const_temp_4x32b); 647 /* (i4_tmp >> shift) */ 648 src_temp14_4x32b = _mm_srai_epi32(src_temp14_4x32b, shift); 649 src_temp15_4x32b = _mm_srai_epi32(src_temp15_4x32b, shift); 650 651 /*i4_tmp = (i4_tmp >> shift) + off0; */ 652 src_temp8_4x32b = _mm_add_epi32 (src_temp8_4x32b, off0_4x32b); 653 src_temp9_4x32b = _mm_add_epi32 (src_temp9_4x32b, off0_4x32b); 654 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */ 655 src_temp10_4x32b = _mm_add_epi32 (src_temp10_4x32b, off0_4x32b); 656 src_temp11_4x32b = _mm_add_epi32 (src_temp11_4x32b, off0_4x32b); 657 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */ 658 src_temp12_4x32b = _mm_add_epi32 (src_temp12_4x32b, off0_4x32b); 659 src_temp13_4x32b = _mm_add_epi32 (src_temp13_4x32b, off0_4x32b); 660 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */ 661 src_temp14_4x32b = _mm_add_epi32 (src_temp14_4x32b, off0_4x32b); 662 src_temp15_4x32b = _mm_add_epi32 (src_temp15_4x32b, off0_4x32b); 663 664 src_temp8_4x32b = _mm_packs_epi32 (src_temp8_4x32b, src_temp10_4x32b); 665 src_temp9_4x32b = _mm_packs_epi32 (src_temp9_4x32b, src_temp11_4x32b); 666 src_temp12_4x32b = _mm_packs_epi32 (src_temp12_4x32b, src_temp14_4x32b); 667 src_temp13_4x32b = _mm_packs_epi32 (src_temp13_4x32b, src_temp15_4x32b); 668 /* pu1_dst[col] = CLIP_U8(i4_tmp); */ 669 src_temp8_4x32b = _mm_packus_epi16 (src_temp8_4x32b, src_temp12_4x32b); 670 src_temp9_4x32b = _mm_packus_epi16 (src_temp9_4x32b, src_temp13_4x32b); 671 672 /* store 16 8-bit output values */ 673 _mm_storeu_si128((__m128i*)(pu1_dst+2*dst_strd), src_temp8_4x32b); /* row = 2*/ 674 _mm_storeu_si128((__m128i*)(pu1_dst+3*dst_strd), src_temp9_4x32b); /* row = 3*/ 675 676 pi2_src += 16; /* Pointer update */ 677 
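                    /* Note: each inner-loop iteration in this path consumes 16
                       interleaved WORD16 chroma samples (8 Cb/Cr pairs) per row for
                       4 rows; wgt0_4x32b and off0_4x32b are laid out as
                       {cb, cr, cb, cr} from the low lane upwards, so every lane
                       applies the weight and offset of its own colour component. */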
pu1_dst += 16; /* Pointer update */ 678 679 } /* inner loop ends here(4-output values in single iteration) */ 680 pi2_src = pi2_src - wdx2 + 4*src_strd; /* Pointer update */ 681 pu1_dst = pu1_dst - wdx2 + 4*dst_strd; /* Pointer update */ 682 } 683 } 684 else if( 0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */ 685 { 686 __m128i src_temp2_4x32b,src_temp3_4x32b; 687 __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b; 688 /* outer for loop starts from here */ 689 for(row = 0; row < ht; row +=4) 690 { 691 for(col = 0; col < wdx2; col +=8) 692 { 693 /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 694 src_temp0_4x32b = _mm_loadu_si128((__m128i*)(pi2_src)); 695 /* row = 1 */ 696 src_temp1_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd)); 697 /* row = 2 */ 698 src_temp2_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd)); 699 /* row = 3 */ 700 src_temp3_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd)); 701 702 /* row = 0 */ /* Last 4 pixels */ 703 src_temp4_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+4)); 704 /* row = 1 */ 705 src_temp5_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+4)); 706 /* row = 2 */ 707 src_temp6_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+4)); 708 /* row = 3 */ 709 src_temp7_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+4)); 710 711 /* considering pix. 4:0 by converting 16-into 32 bit */ 712 src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b); 713 src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b); 714 /* (pi2_src[col] + lvl_shift)*/ 715 src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, lvl_shift_4x32b); 716 src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, lvl_shift_4x32b); 717 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ 718 src_temp0_4x32b = _mm_mullo_epi32 (src_temp0_4x32b, wgt0_4x32b); 719 src_temp1_4x32b = _mm_mullo_epi32 (src_temp1_4x32b, wgt0_4x32b); 720 721 /* considering pix. 4:0 by converting 16-into 32 bit */ /* Last 4 pixels */ 722 src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b); 723 src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b); 724 /* (pi2_src[col] + lvl_shift)*/ 725 src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, lvl_shift_4x32b); 726 src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, lvl_shift_4x32b); 727 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ 728 src_temp2_4x32b = _mm_mullo_epi32 (src_temp2_4x32b, wgt0_4x32b); 729 src_temp3_4x32b = _mm_mullo_epi32 (src_temp3_4x32b, wgt0_4x32b); 730 731 /* considering pix. 4:0 by converting 16-into 32 bit */ 732 src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b); 733 src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b); 734 /* (pi2_src[col] + lvl_shift)*/ 735 src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, lvl_shift_4x32b); 736 src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, lvl_shift_4x32b); 737 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ 738 src_temp4_4x32b = _mm_mullo_epi32 (src_temp4_4x32b, wgt0_4x32b); 739 src_temp5_4x32b = _mm_mullo_epi32 (src_temp5_4x32b, wgt0_4x32b); 740 741 /* considering pix. 
4:0 by converting 16-into 32 bit */ 742 src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b); 743 src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b); 744 /* (pi2_src[col] + lvl_shift)*/ 745 src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, lvl_shift_4x32b); 746 src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, lvl_shift_4x32b); 747 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ 748 src_temp6_4x32b = _mm_mullo_epi32 (src_temp6_4x32b, wgt0_4x32b); 749 src_temp7_4x32b = _mm_mullo_epi32 (src_temp7_4x32b, wgt0_4x32b); 750 751 /* i4_tmp += 1 << (shift - 1) */ 752 src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, const_temp_4x32b); 753 src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, const_temp_4x32b); 754 /* (i4_tmp >> shift) */ 755 src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift); 756 src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift); 757 758 /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */ 759 src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, const_temp_4x32b); 760 src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, const_temp_4x32b); 761 /* (i4_tmp >> shift) */ 762 src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift); 763 src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift); 764 765 /* i4_tmp += 1 << (shift - 1) */ 766 src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, const_temp_4x32b); 767 src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, const_temp_4x32b); 768 /* (i4_tmp >> shift) */ 769 src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b, shift); 770 src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift); 771 772 /* i4_tmp += 1 << (shift - 1) */ 773 src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, const_temp_4x32b); 774 src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, const_temp_4x32b); 775 /* (i4_tmp >> shift) */ 776 src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b, shift); 777 src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift); 778 779 /*i4_tmp = (i4_tmp >> shift) + off0; */ 780 src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, off0_4x32b); 781 src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, off0_4x32b); 782 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */ 783 src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, off0_4x32b); 784 src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, off0_4x32b); 785 /*i4_tmp = (i4_tmp >> shift) + off0; */ 786 src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, off0_4x32b); 787 src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, off0_4x32b); 788 /*i4_tmp = (i4_tmp >> shift) + off0; */ 789 src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, off0_4x32b); 790 src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, off0_4x32b); 791 792 src_temp0_4x32b = _mm_packs_epi32 (src_temp0_4x32b, src_temp4_4x32b); 793 src_temp1_4x32b = _mm_packs_epi32 (src_temp1_4x32b, src_temp5_4x32b); 794 src_temp2_4x32b = _mm_packs_epi32 (src_temp2_4x32b, src_temp6_4x32b); 795 src_temp3_4x32b = _mm_packs_epi32 (src_temp3_4x32b, src_temp7_4x32b); 796 797 /* pu1_dst[col] = CLIP_U8(i4_tmp); */ 798 src_temp0_4x32b = _mm_packus_epi16 (src_temp0_4x32b, src_temp0_4x32b); 799 src_temp1_4x32b = _mm_packus_epi16 (src_temp1_4x32b, src_temp1_4x32b); 800 src_temp2_4x32b = _mm_packus_epi16 (src_temp2_4x32b, src_temp2_4x32b); 801 src_temp3_4x32b = _mm_packus_epi16 (src_temp3_4x32b, src_temp3_4x32b); 802 803 /* store four 8-bit output values */ 804 _mm_storel_epi64((__m128i*)(pu1_dst+0*dst_strd), src_temp0_4x32b); /* row = 0*/ 805 _mm_storel_epi64((__m128i*)(pu1_dst+1*dst_strd), src_temp1_4x32b); /* row = 1*/ 806 _mm_storel_epi64((__m128i*)(pu1_dst+2*dst_strd), 
src_temp2_4x32b); /* row = 0*/ 807 _mm_storel_epi64((__m128i*)(pu1_dst+3*dst_strd), src_temp3_4x32b); /* row = 1*/ 808 809 pi2_src += 8; /* Pointer update */ 810 pu1_dst += 8; /* Pointer update */ 811 812 } /* inner loop ends here(4-output values in single iteration) */ 813 pi2_src = pi2_src - wdx2 + 4*src_strd; /* Pointer update */ 814 pu1_dst = pu1_dst - wdx2 + 4*dst_strd; /* Pointer update */ 815 } 816 } 817 else /* 2*wd multiple of 4 case */ 818 { 819 WORD32 dst0, dst1, dst2, dst3; 820 __m128i src_temp2_4x32b,src_temp3_4x32b; 821 /* outer for loop starts from here */ 822 for(row = 0; row < ht; row +=4) 823 { 824 for(col = 0; col < wdx2; col +=4) 825 { 826 /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 827 src_temp0_4x32b = _mm_loadu_si128((__m128i*)(pi2_src)); 828 /* row = 1 */ 829 src_temp1_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+1*src_strd)); 830 /* row = 2 */ 831 src_temp2_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd)); 832 /* row = 3 */ 833 src_temp3_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd)); 834 835 /* considering pix. 4:0 by converting 16-into 32 bit */ 836 src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b); 837 src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b); 838 src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b); 839 src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b); 840 841 /* (pi2_src[col] + lvl_shift)*/ 842 src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, lvl_shift_4x32b); 843 src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, lvl_shift_4x32b); 844 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ 845 src_temp0_4x32b = _mm_mullo_epi32 (src_temp0_4x32b, wgt0_4x32b); 846 src_temp1_4x32b = _mm_mullo_epi32 (src_temp1_4x32b, wgt0_4x32b); 847 848 /* (pi2_src[col] + lvl_shift)*/ 849 src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, lvl_shift_4x32b); 850 src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, lvl_shift_4x32b); 851 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ 852 src_temp2_4x32b = _mm_mullo_epi32 (src_temp2_4x32b, wgt0_4x32b); 853 src_temp3_4x32b = _mm_mullo_epi32 (src_temp3_4x32b, wgt0_4x32b); 854 855 /* i4_tmp += 1 << (shift - 1) */ 856 src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, const_temp_4x32b); 857 src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, const_temp_4x32b); 858 /* (i4_tmp >> shift) */ 859 src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift); 860 src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift); 861 /*i4_tmp = (i4_tmp >> shift) + off0; */ 862 src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, off0_4x32b); 863 src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, off0_4x32b); 864 865 /* i4_tmp += 1 << (shift - 1) */ 866 src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, const_temp_4x32b); 867 src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, const_temp_4x32b); 868 /* (i4_tmp >> shift) */ 869 src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift); 870 src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift); 871 /*i4_tmp = (i4_tmp >> shift) + off0; */ 872 src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, off0_4x32b); 873 src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, off0_4x32b); 874 875 src_temp0_4x32b = _mm_packs_epi32 (src_temp0_4x32b, src_temp1_4x32b); 876 src_temp2_4x32b = _mm_packs_epi32 (src_temp2_4x32b, src_temp3_4x32b); 877 878 /* pu1_dst[col] = CLIP_U8(i4_tmp); */ 879 src_temp0_4x32b = _mm_packus_epi16 (src_temp0_4x32b, src_temp2_4x32b); 880 881 dst0 = _mm_cvtsi128_si32(src_temp0_4x32b); 882 /* dst row = 1 to 3 */ 883 src_temp1_4x32b = _mm_shuffle_epi32 
(src_temp0_4x32b, 1); 884 src_temp2_4x32b = _mm_shuffle_epi32 (src_temp0_4x32b, 2); 885 src_temp3_4x32b = _mm_shuffle_epi32 (src_temp0_4x32b, 3); 886 887 /* store four 8-bit output values */ 888 *(WORD32 *) (&pu1_dst[0*dst_strd]) = dst0; 889 890 dst1 = _mm_cvtsi128_si32(src_temp1_4x32b); 891 dst2 = _mm_cvtsi128_si32(src_temp2_4x32b); 892 dst3 = _mm_cvtsi128_si32(src_temp3_4x32b); 893 /* row = 1 */ 894 *(WORD32 *) (&pu1_dst[1*dst_strd]) = dst1; 895 /* row = 2 */ 896 *(WORD32 *) (&pu1_dst[2*dst_strd]) = dst2; 897 /* row = 3 */ 898 *(WORD32 *) (&pu1_dst[3*dst_strd]) = dst3; 899 900 pi2_src += 4; /* Pointer update */ 901 pu1_dst += 4; /* Pointer update */ 902 903 } /* inner loop ends here(4-output values in single iteration) */ 904 pi2_src = pi2_src - wdx2 + 4*src_strd; /* Pointer update */ 905 pu1_dst = pu1_dst - wdx2 + 4*dst_strd; /* Pointer update */ 906 } 907 } 908 } 909 else /* ht multiple of 2 case */ 910 #endif 911 912 { 913 if(0 == (wdx2 & 15)) /* 2*wd multiple of 168 case */ 914 { 915 __m128i src_temp2_4x32b, src_temp3_4x32b; 916 __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b; 917 /* outer for loop starts from here */ 918 for(row = 0; row < ht; row += 2) 919 { 920 for(col = 0; col < wdx2; col += 16) 921 { 922 /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 923 src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src)); 924 /* row = 1 */ 925 src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd)); 926 927 /* row = 0 */ /* Second 4 pixels */ 928 src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4)); 929 /* row = 1 */ 930 src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4)); 931 /* row = 0 */ /* Third 4 pixels */ 932 src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 8)); 933 /* row = 1 */ 934 src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 8)); 935 /* row = 0 */ /* Last 4 pixels */ 936 src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 12)); 937 /* row = 1 */ 938 src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 12)); 939 940 /* considering pix. 4:0 by converting 16-into 32 bit */ 941 src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b); 942 src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b); 943 /* (pi2_src[col] + lvl_shift)*/ 944 src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b); 945 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b); 946 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ 947 src_temp0_4x32b = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b); 948 src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b); 949 950 /* considering pix. 4:0 by converting 16-into 32 bit */ /* Second 4 pixels */ 951 src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b); 952 src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b); 953 /* (pi2_src[col] + lvl_shift)*/ 954 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b); 955 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b); 956 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ 957 src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b); 958 src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b); 959 960 /* considering pix. 
4:0 by converting 16-into 32 bit */ /* Third 4 pixels */ 961 src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b); 962 src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b); 963 /* (pi2_src[col] + lvl_shift)*/ 964 src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift_4x32b); 965 src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift_4x32b); 966 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ 967 src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt0_4x32b); 968 src_temp5_4x32b = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b); 969 970 /* considering pix. 4:0 by converting 16-into 32 bit */ /* Last 4 pixels */ 971 src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b); 972 src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b); 973 /* (pi2_src[col] + lvl_shift)*/ 974 src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift_4x32b); 975 src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift_4x32b); 976 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ 977 src_temp6_4x32b = _mm_mullo_epi32(src_temp6_4x32b, wgt0_4x32b); 978 src_temp7_4x32b = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b); 979 980 /* i4_tmp += 1 << (shift - 1) */ 981 src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b); 982 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b); 983 /* (i4_tmp >> shift) */ 984 src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift); 985 src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift); 986 987 /* i4_tmp += 1 << (shift - 1) */ /* Second 4 pixels */ 988 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b); 989 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b); 990 /* (i4_tmp >> shift) */ 991 src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift); 992 src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift); 993 994 /* i4_tmp += 1 << (shift - 1) */ /* Third 4 pixels */ 995 src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, const_temp_4x32b); 996 src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b); 997 /* (i4_tmp >> shift) */ 998 src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b, shift); 999 src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift); 1000 1001 /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */ 1002 src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, const_temp_4x32b); 1003 src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b); 1004 /* (i4_tmp >> shift) */ 1005 src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b, shift); 1006 src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift); 1007 1008 /*i4_tmp = (i4_tmp >> shift) + off0; */ 1009 src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b); 1010 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b); 1011 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */ 1012 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b); 1013 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b); 1014 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */ 1015 src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, off0_4x32b); 1016 src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, off0_4x32b); 1017 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */ 1018 src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, off0_4x32b); 1019 src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, off0_4x32b); 1020 1021 src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp2_4x32b); 1022 src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b); 1023 src_temp4_4x32b = _mm_packs_epi32(src_temp4_4x32b, src_temp6_4x32b); 1024 
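                    /* Note: the CLIP_U8() of the reference code is realised by this
                       pack sequence -- _mm_packs_epi32 saturates the 32-bit results
                       to signed 16 bit, and the _mm_packus_epi16 below saturates
                       those to unsigned 8 bit, clamping every output sample to
                       [0, 255]. */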
src_temp5_4x32b = _mm_packs_epi32(src_temp5_4x32b, src_temp7_4x32b); 1025 /* pu1_dst[col] = CLIP_U8(i4_tmp); */ 1026 src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp4_4x32b); 1027 src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp5_4x32b); 1028 1029 /* store 16 8-bit output values */ 1030 _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0*/ 1031 _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1*/ 1032 1033 pi2_src += 16; /* Pointer update */ 1034 pu1_dst += 16; /* Pointer update */ 1035 1036 } /* inner loop ends here(4-output values in single iteration) */ 1037 pi2_src = pi2_src - wdx2 + 2 * src_strd; /* Pointer update */ 1038 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */ 1039 } 1040 } 1041 else if(0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */ 1042 { 1043 __m128i src_temp2_4x32b, src_temp3_4x32b; 1044 /* outer for loop starts from here */ 1045 for(row = 0; row < ht; row += 2) 1046 { 1047 for(col = 0; col < wdx2; col += 8) 1048 { 1049 /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 1050 src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src)); 1051 /* row = 1 */ 1052 src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd)); 1053 1054 /* row = 0 */ /* Last 4 pixels */ 1055 src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4)); 1056 /* row = 1 */ 1057 src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4)); 1058 1059 /* considering pix. 4:0 by converting 16-into 32 bit */ 1060 src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b); 1061 src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b); 1062 /* (pi2_src[col] + lvl_shift)*/ 1063 src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b); 1064 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b); 1065 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ 1066 src_temp0_4x32b = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b); 1067 src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b); 1068 1069 /* considering pix. 
4:0 by converting 16-into 32 bit */ /* Last 4 pixels */ 1070 src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b); 1071 src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b); 1072 /* (pi2_src[col] + lvl_shift)*/ 1073 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b); 1074 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b); 1075 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ 1076 src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b); 1077 src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b); 1078 1079 /* i4_tmp += 1 << (shift - 1) */ 1080 src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b); 1081 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b); 1082 /* (i4_tmp >> shift) */ 1083 src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift); 1084 src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift); 1085 1086 /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */ 1087 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b); 1088 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b); 1089 /* (i4_tmp >> shift) */ 1090 src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift); 1091 src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift); 1092 1093 /*i4_tmp = (i4_tmp >> shift) + off0; */ 1094 src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b); 1095 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b); 1096 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */ 1097 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b); 1098 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b); 1099 1100 src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp2_4x32b); 1101 src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b); 1102 1103 /* pu1_dst[col] = CLIP_U8(i4_tmp); */ 1104 src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b); 1105 src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b); 1106 1107 /* store four 8-bit output values */ 1108 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0*/ 1109 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1*/ 1110 1111 pi2_src += 8; /* Pointer update */ 1112 pu1_dst += 8; /* Pointer update */ 1113 1114 } /* inner loop ends here(4-output values in single iteration) */ 1115 pi2_src = pi2_src - wdx2 + 2 * src_strd; /* Pointer update */ 1116 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */ 1117 } 1118 } 1119 else /* 2*wd multiple of 4 case */ 1120 { 1121 WORD32 dst0, dst1; 1122 /* outer for loop starts from here */ 1123 for(row = 0; row < ht; row += 2) 1124 { 1125 for(col = 0; col < wdx2; col += 4) 1126 { 1127 /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 1128 src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src)); 1129 /* row = 1 */ 1130 src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd)); 1131 1132 /* considering pix. 
4:0 by converting 16-into 32 bit */
                    src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b);
                    src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);

                    /* (pi2_src[col] + lvl_shift)*/
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);

                    /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
                    src_temp0_4x32b = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
                    src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);

                    /* (i4_tmp >> shift) */
                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);

                    /*i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);

                    src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp1_4x32b);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b);

                    dst0 = _mm_cvtsi128_si32(src_temp0_4x32b);
                    /* dst row = 1 */
                    src_temp1_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 1);

                    /* store four 8-bit output values */
                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                    dst1 = _mm_cvtsi128_si32(src_temp1_4x32b);
                    /* row = 1 */
                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;

                    pi2_src += 4; /* Pointer update */
                    pu1_dst += 4; /* Pointer update */

                } /* inner loop ends here (4 output values per iteration) */
                pi2_src = pi2_src - wdx2 + 2 * src_strd; /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
            }
        }
    }
}

/**
 *******************************************************************************
 *
 * @brief
 *  Does bi-weighted prediction on the arrays pointed to by pi2_src1 and
 *  pi2_src2 and stores the result at the location pointed to by pu1_dst
 *
 * @par Description:
 *  dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 + (off0 +
 *  off1 + 1) << (shift - 1) ) >> shift
 *
 * @param[in] pi2_src1
 *  Pointer to source 1
 *
 * @param[in] pi2_src2
 *  Pointer to source 2
 *
 * @param[out] pu1_dst
 *  Pointer to destination
 *
 * @param[in] src_strd1
 *  Source stride 1
 *
 * @param[in] src_strd2
 *  Source stride 2
 *
 * @param[in] dst_strd
 *  Destination stride
 *
 * @param[in] wgt0
 *  Weight to be multiplied with source 1
 *
 * @param[in] off0
 *  Offset 0
 *
 * @param[in] wgt1
 *  Weight to be multiplied with source 2
 *
 * @param[in] off1
 *  Offset 1
 *
 * @param[in] shift
 *  (14 - bit depth) + log2_weight_denominator
 *
 * @param[in] lvl_shift1
 *  Added before shift and offset
 *
 * @param[in] lvl_shift2
 *  Added before shift and offset
 *
 * @param[in] ht
 *  Height of the source
 *
 * @param[in] wd
 *  Width of the source
 *
 * @returns
 *  None
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

void ihevc_weighted_pred_bi_sse42(WORD16 *pi2_src1,
                                  WORD16 *pi2_src2,
                                  UWORD8
*pu1_dst, 1250 WORD32 src_strd1, 1251 WORD32 src_strd2, 1252 WORD32 dst_strd, 1253 WORD32 wgt0, 1254 WORD32 off0, 1255 WORD32 wgt1, 1256 WORD32 off1, 1257 WORD32 shift, 1258 WORD32 lvl_shift1, 1259 WORD32 lvl_shift2, 1260 WORD32 ht, 1261 WORD32 wd) 1262 { 1263 WORD32 row, col, temp; 1264 1265 __m128i src_temp1_4x32b, src_temp2_4x32b, src_temp3_4x32b, src_temp4_4x32b; 1266 __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_4x32b, wgt1_4x32b; 1267 1268 1269 ASSERT(wd % 4 == 0); /* checking assumption*/ 1270 ASSERT(ht % 2 == 0); /* checking assumption*/ 1271 1272 temp = (off0 + off1 + 1) << (shift - 1); 1273 1274 // seting values in register 1275 const_temp_4x32b = _mm_set1_epi32(temp); 1276 lvl_shift1_4x32b = _mm_set1_epi32(lvl_shift1); 1277 lvl_shift2_4x32b = _mm_set1_epi32(lvl_shift2); 1278 wgt0_4x32b = _mm_set1_epi32(wgt0); 1279 wgt1_4x32b = _mm_set1_epi32(wgt1); 1280 1281 if(0 == (wd & 7)) /* wd multiple of 8 case */ 1282 { 1283 __m128i src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b, src_temp8_4x32b; 1284 /* outer for loop starts from here */ 1285 for(row = 0; row < ht; row += 2) 1286 { 1287 for(col = 0; col < wd; col += 8) 1288 { 1289 /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/ 1290 src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */ 1291 src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */ 1292 src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */ 1293 src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */ 1294 /* Next 4 pixels */ 1295 src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 4)); /* row = 0 */ 1296 src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 4)); /* row = 0 */ 1297 src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1 + 4)); /* row = 1 */ 1298 src_temp8_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2 + 4)); /* row = 1 */ 1299 1300 /* considering pix. 
4:0 by converting 16-into 32 bit */ 1301 src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b); 1302 src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b); 1303 /* (pi2_src1[col] + lvl_shift1) */ 1304 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b); 1305 /* (pi2_src2[col] + lvl_shift2) */ 1306 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b); 1307 /*i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */ 1308 src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b); 1309 /*(pi2_src2[col] + lvl_shift2) * wgt1 */ 1310 src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b); 1311 1312 src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b); 1313 src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b); 1314 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b); 1315 src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b); 1316 src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b); 1317 src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b); 1318 1319 /* Next 4 Pixels */ 1320 src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b); 1321 src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b); 1322 src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift1_4x32b); 1323 src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift2_4x32b); 1324 src_temp5_4x32b = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b); 1325 src_temp6_4x32b = _mm_mullo_epi32(src_temp6_4x32b, wgt1_4x32b); 1326 src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b); 1327 src_temp8_4x32b = _mm_cvtepi16_epi32(src_temp8_4x32b); 1328 src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift1_4x32b); 1329 src_temp8_4x32b = _mm_add_epi32(src_temp8_4x32b, lvl_shift2_4x32b); 1330 src_temp7_4x32b = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b); 1331 src_temp8_4x32b = _mm_mullo_epi32(src_temp8_4x32b, wgt1_4x32b); 1332 1333 /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */ 1334 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b); 1335 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b); 1336 /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */ 1337 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b); 1338 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b); 1339 /* (i4_tmp >> shift) */ 1340 src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift); 1341 src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift); 1342 1343 /* Next 4 Pixels */ 1344 src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, src_temp6_4x32b); 1345 src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, src_temp8_4x32b); 1346 src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b); 1347 src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b); 1348 src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift); 1349 src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift); 1350 1351 src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp5_4x32b); 1352 src_temp3_4x32b = _mm_packs_epi32(src_temp3_4x32b, src_temp7_4x32b); 1353 1354 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ 1355 src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b); 1356 src_temp3_4x32b = _mm_packus_epi16(src_temp3_4x32b, src_temp3_4x32b); 1357 1358 /* store four 8-bit output values */ 1359 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_4x32b); /* row = 0*/ 1360 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_4x32b); /* row = 1*/ 1361 1362 pi2_src1 += 8; /* Pointer update */ 1363 pi2_src2 += 
                pu1_dst += 8; /* Pointer update */

            } /* inner loop ends here(8-output values in single iteration) */

            pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */
            pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */
            pu1_dst = pu1_dst - wd + 2 * dst_strd; /* Pointer update */

        } /* outer loop ends */
    }
    else /* wd multiple of 4 case */
    {
        WORD32 dst0, dst1;
        /* outer for loop starts from here */
        for(row = 0; row < ht; row += 2)
        {
            for(col = 0; col < wd; col += 4)
            {
                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
                src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
                src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
                src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
                src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */

                /* considering pix. 4:0 by converting 16-bit into 32-bit */
                src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
                src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
                /* (pi2_src1[col] + lvl_shift1) */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
                /* (pi2_src2[col] + lvl_shift2) */
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
                /* i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
                src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
                /* (pi2_src2[col] + lvl_shift2) * wgt1 */
                src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);

                src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
                src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
                src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
                src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);

                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);

                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);

                /* (i4_tmp >> shift) */
                src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
                src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);

                src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);

                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);

                dst0 = _mm_cvtsi128_si32(src_temp1_4x32b);

                /* dst row = 1 */
                src_temp2_4x32b = _mm_shuffle_epi32(src_temp1_4x32b, 1);

                /* store four 8-bit output values */
                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                dst1 = _mm_cvtsi128_si32(src_temp2_4x32b);

                /* row = 1 */
                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;

                pi2_src1 += 4; /* Pointer update */
                pi2_src2 += 4; /* Pointer update */
                pu1_dst += 4; /* Pointer update */

            } /* inner loop ends here(4-output values in single iteration) */

            pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */
            pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */
            pu1_dst = pu1_dst - wd + 2 * dst_strd; /* Pointer update */

        } /* outer loop ends */
    }

}

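/* Illustrative scalar reference for the bi-weighted prediction computed by the
 * SSE4.2 kernel above: a minimal sketch, assuming the same per-pixel formula the
 * inline comments describe, i.e.
 *     i4_tmp = (src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1
 *              + ((off0 + off1 + 1) << (shift - 1));
 *     dst    = CLIP_U8(i4_tmp >> shift);
 * The function name is hypothetical and not part of the library interface; it is
 * kept only for illustration and is not called anywhere. */
static void ihevc_weighted_pred_bi_scalar_sketch(WORD16 *pi2_src1,
                                                 WORD16 *pi2_src2,
                                                 UWORD8 *pu1_dst,
                                                 WORD32 src_strd1,
                                                 WORD32 src_strd2,
                                                 WORD32 dst_strd,
                                                 WORD32 wgt0,
                                                 WORD32 off0,
                                                 WORD32 wgt1,
                                                 WORD32 off1,
                                                 WORD32 shift,
                                                 WORD32 lvl_shift1,
                                                 WORD32 lvl_shift2,
                                                 WORD32 ht,
                                                 WORD32 wd)
{
    WORD32 row, col;

    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            WORD32 i4_tmp;

            i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0;
            i4_tmp += (pi2_src2[col] + lvl_shift2) * wgt1;
            i4_tmp += (off0 + off1 + 1) << (shift - 1);
            i4_tmp = i4_tmp >> shift;

            /* clip to [0, 255], equivalent to CLIP_U8() */
            pu1_dst[col] = (UWORD8)((i4_tmp < 0) ? 0 : ((i4_tmp > 255) ? 255 : i4_tmp));
        }
        pi2_src1 += src_strd1;
        pi2_src2 += src_strd2;
        pu1_dst += dst_strd;
    }
}
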
/**
*******************************************************************************
*
* @brief
*  Does chroma bi-weighted prediction on the arrays pointed by pi2_src1 and
*  pi2_src2 and stores it at the location pointed by pu1_dst
*
* @par Description:
*  dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 + (off0 +
*  off1 + 1) << (shift - 1) ) >> shift
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  weight to be multiplied to source 1
*
* @param[in] off0
*  offset 0
*
* @param[in] wgt1
*  weight to be multiplied to source 2
*
* @param[in] off1
*  offset 1
*
* @param[in] shift
*  (14 Bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift1
*  added before shift and offset
*
* @param[in] lvl_shift2
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source (each colour component)
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_chroma_bi_sse42(WORD16 *pi2_src1,
                                         WORD16 *pi2_src2,
                                         UWORD8 *pu1_dst,
                                         WORD32 src_strd1,
                                         WORD32 src_strd2,
                                         WORD32 dst_strd,
                                         WORD32 wgt0_cb,
                                         WORD32 wgt0_cr,
                                         WORD32 off0_cb,
                                         WORD32 off0_cr,
                                         WORD32 wgt1_cb,
                                         WORD32 wgt1_cr,
                                         WORD32 off1_cb,
                                         WORD32 off1_cr,
                                         WORD32 shift,
                                         WORD32 lvl_shift1,
                                         WORD32 lvl_shift2,
                                         WORD32 ht,
                                         WORD32 wd)
{
    WORD32 row, col, temp1, temp2;
    WORD32 wdx2;

    __m128i src_temp1_4x32b, src_temp2_4x32b, src_temp3_4x32b, src_temp4_4x32b;
    __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_4x32b, wgt1_4x32b;

    ASSERT(wd % 2 == 0); /* checking assumption*/
    ASSERT(ht % 2 == 0); /* checking assumption*/

    temp1 = (off0_cb + off1_cb + 1) << (shift - 1);
    temp2 = (off0_cr + off1_cr + 1) << (shift - 1);

    // setting values in register
    const_temp_4x32b = _mm_set_epi32(temp2, temp1, temp2, temp1);
    lvl_shift1_4x32b = _mm_set1_epi32(lvl_shift1);
    lvl_shift2_4x32b = _mm_set1_epi32(lvl_shift2);
    wgt0_4x32b = _mm_set_epi32(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
    wgt1_4x32b = _mm_set_epi32(wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb);

    wdx2 = wd * 2;

    if(0 == (wdx2 & 7)) /* wdx2 multiple of 8 case */
    {
        __m128i src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b, src_temp8_4x32b;
        /* outer for loop starts from here */
        for(row = 0; row < ht; row += 2)
        {
            for(col = 0; col < wdx2; col += 8)
            {
                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
                src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
                src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
                src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
                src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
                /* Next 4 pixels */
                src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 4)); /* row = 0 */
                src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 4)); /* row = 0 */
                src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1 + 4)); /* row = 1 */
                src_temp8_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2 + 4)); /* row = 1 */

                /* considering pix. 4:0 by converting 16-bit into 32-bit */
                src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
                src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
                /* (pi2_src1[col] + lvl_shift1) */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
                /* (pi2_src2[col] + lvl_shift2) */
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
                /* i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
                src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
                /* (pi2_src2[col] + lvl_shift2) * wgt1 */
                src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);

                src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
                src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
                src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
                src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);

                /* Next 4 Pixels */
                src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b);
                src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b);
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift1_4x32b);
                src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift2_4x32b);
                src_temp5_4x32b = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);
                src_temp6_4x32b = _mm_mullo_epi32(src_temp6_4x32b, wgt1_4x32b);
                src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b);
                src_temp8_4x32b = _mm_cvtepi16_epi32(src_temp8_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift1_4x32b);
                src_temp8_4x32b = _mm_add_epi32(src_temp8_4x32b, lvl_shift2_4x32b);
                src_temp7_4x32b = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);
                src_temp8_4x32b = _mm_mullo_epi32(src_temp8_4x32b, wgt1_4x32b);

                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);
                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
                /* (i4_tmp >> shift) */
                src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
                src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);

                /* Next 4 Pixels */
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, src_temp6_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, src_temp8_4x32b);
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);
                src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift);
                src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift);

                src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp5_4x32b);
                src_temp3_4x32b = _mm_packs_epi32(src_temp3_4x32b, src_temp7_4x32b);

                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
                src_temp3_4x32b = _mm_packus_epi16(src_temp3_4x32b, src_temp3_4x32b);

                /* store eight 8-bit output values */
                _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_4x32b); /* row = 0 */
                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_4x32b); /* row = 1 */

                pi2_src1 += 8; /* Pointer update */
                pi2_src2 += 8; /* Pointer update */
                pu1_dst += 8; /* Pointer update */

            } /* inner loop ends here(8-output values in single iteration) */

            pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */
            pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */
            pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */

        } /* outer loop ends */
    }
    else /* wdx2 multiple of 4 case */
    {
        WORD32 dst0, dst1;
        /* outer for loop starts from here */
        for(row = 0; row < ht; row += 2)
        {
            for(col = 0; col < wdx2; col += 4)
            {
                /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
                src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
                src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
                src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
                src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */

                /* considering pix. 4:0 by converting 16-bit into 32-bit */
                src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
                src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
                /* (pi2_src1[col] + lvl_shift1) */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
                /* (pi2_src2[col] + lvl_shift2) */
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
                /* i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
                src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
                /* (pi2_src2[col] + lvl_shift2) * wgt1 */
                src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);

                src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
                src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
                src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
                src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);

                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);

                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);

                /* (i4_tmp >> shift) */
                src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
                src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);

                src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);

                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);

                dst0 = _mm_cvtsi128_si32(src_temp1_4x32b);

                /* dst row = 1 */
                src_temp2_4x32b = _mm_shuffle_epi32(src_temp1_4x32b, 1);

                /* store four 8-bit output values */
                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                dst1 = _mm_cvtsi128_si32(src_temp2_4x32b);

                /* row = 1 */
                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;

                pi2_src1 += 4; /* Pointer update */
                pi2_src2 += 4; /* Pointer update */
                pu1_dst += 4; /* Pointer update */

            } /* inner loop ends here(4-output values in single iteration) */

            pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */
            pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */
            pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
        } /* outer loop ends */
    }

}

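/* Illustrative scalar reference for the chroma bi-weighted prediction above: a
 * minimal sketch, assuming interleaved Cb/Cr input (even columns carry Cb, odd
 * columns carry Cr), which mirrors the _mm_set_epi32(wgt_cr, wgt_cb, wgt_cr,
 * wgt_cb) weight layout and the wdx2 = wd * 2 column count used by the kernel.
 * The function name is hypothetical and not part of the library interface. */
static void ihevc_weighted_pred_chroma_bi_scalar_sketch(WORD16 *pi2_src1,
                                                        WORD16 *pi2_src2,
                                                        UWORD8 *pu1_dst,
                                                        WORD32 src_strd1,
                                                        WORD32 src_strd2,
                                                        WORD32 dst_strd,
                                                        WORD32 wgt0_cb, WORD32 wgt0_cr,
                                                        WORD32 off0_cb, WORD32 off0_cr,
                                                        WORD32 wgt1_cb, WORD32 wgt1_cr,
                                                        WORD32 off1_cb, WORD32 off1_cr,
                                                        WORD32 shift,
                                                        WORD32 lvl_shift1,
                                                        WORD32 lvl_shift2,
                                                        WORD32 ht,
                                                        WORD32 wd)
{
    WORD32 row, col;

    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < 2 * wd; col++)
        {
            /* even col -> Cb sample, odd col -> Cr sample */
            WORD32 wgt0 = (col & 1) ? wgt0_cr : wgt0_cb;
            WORD32 wgt1 = (col & 1) ? wgt1_cr : wgt1_cb;
            WORD32 off0 = (col & 1) ? off0_cr : off0_cb;
            WORD32 off1 = (col & 1) ? off1_cr : off1_cb;
            WORD32 i4_tmp;

            i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0;
            i4_tmp += (pi2_src2[col] + lvl_shift2) * wgt1;
            i4_tmp += (off0 + off1 + 1) << (shift - 1);
            i4_tmp = i4_tmp >> shift;

            /* clip to [0, 255], equivalent to CLIP_U8() */
            pu1_dst[col] = (UWORD8)((i4_tmp < 0) ? 0 : ((i4_tmp > 255) ? 255 : i4_tmp));
        }
        pi2_src1 += src_strd1;
        pi2_src2 += src_strd2;
        pu1_dst += dst_strd;
    }
}
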
/**
*******************************************************************************
*
* @brief
*  Does default bi-weighted prediction on the arrays pointed by pi2_src1 and
*  pi2_src2 and stores it at the location pointed by pu1_dst
*
* @par Description:
*  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + 1 << (shift - 1) )
*  >> shift  where shift = 15 - BitDepth
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] lvl_shift1
*  added before shift and offset
*
* @param[in] lvl_shift2
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*
* @remarks
*  None
*
*  Assumption : ht % 2 == 0, wd % 4 == 0
*  shift == 7, and (lvl_shift1 + lvl_shift2) can take values in {0, 8K, 16K}.
*  Under these conditions the final result matches even if the intermediate
*  precision is only 16 bit.
*
*******************************************************************************
*/

void ihevc_weighted_pred_bi_default_sse42(WORD16 *pi2_src1,
                                          WORD16 *pi2_src2,
                                          UWORD8 *pu1_dst,
                                          WORD32 src_strd1,
                                          WORD32 src_strd2,
                                          WORD32 dst_strd,
                                          WORD32 lvl_shift1,
                                          WORD32 lvl_shift2,
                                          WORD32 ht,
                                          WORD32 wd)
{
    WORD32 row, col, temp;
    WORD32 shift;

    __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
    __m128i const_temp_8x16b, lvl_shift1_8x16b, lvl_shift2_8x16b;
    __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;

    ASSERT(wd % 4 == 0); /* checking assumption*/
    ASSERT(ht % 2 == 0); /* checking assumption*/

    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    temp = 1 << (shift - 1);

    // setting values in register
    lvl_shift1_8x16b = _mm_set1_epi16(lvl_shift1);
    lvl_shift2_8x16b = _mm_set1_epi16(lvl_shift2);
    const_temp_8x16b = _mm_set1_epi16(temp);

    lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, lvl_shift2_8x16b);
    lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, const_temp_8x16b);

    if(0 == (ht & 3)) /* ht multiple of 4*/
    {
        if(0 == (wd & 15)) /* wd multiple of 16 case */
        {
            __m128i src_temp9_8x16b, src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
            __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b;
            /* outer for loop starts from here */
            for(row = 0; row < ht; row += 4)
            {
                for(col = 0; col < wd; col += 16)
                {
                    /*load 8 pixel values */ /* First 8 Values */
                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
                    /* row = 2 */
                    src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
                    src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
                    /* row = 3 */
                    src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
                    src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));

                    /*load 8 pixel values */ /* Second 8 Values */
                    src_temp9_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
                    src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
                    /* row = 1 */
                    src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
                    src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
                    /* row = 2 */
                    src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8));
                    src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8));

                    /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);

                    /*load 8 pixel values */ /* Second 8 Values */
                    /* row = 3 */
                    src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8));
                    src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8));

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);

                    /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
                    src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, src_temp10_8x16b);
                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
                    src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b);
                    src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b);

                    /* (i4_tmp >> shift) */ /* First 8 Values */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift);
                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift);
                    src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift);

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
                    src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
                    src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
                    src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b);
                    src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b);

                    /* (i4_tmp >> shift) */ /* Second 8 Values */
                    src_temp9_8x16b = _mm_srai_epi16(src_temp9_8x16b, shift);
                    src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, shift);
                    src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, shift);
                    src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b, shift);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* All 16 Values */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp9_8x16b);
                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp11_8x16b);
                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp13_8x16b);
                    src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp15_8x16b);

                    /* store sixteen 8-bit output values */ /* All 16 Values */
                    _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0 */
                    _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1 */
                    _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2 */
                    _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3 */

                    /* To update pointer */
                    pi2_src1 += 16;
                    pi2_src2 += 16;
                    pu1_dst += 16;

                } /* inner loop ends here(16-output values in single iteration) */

                pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */
                pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */
                pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */

            }
        }
        else if(0 == (wd & 7)) /* multiple of 8 case */
        {
            /* outer for loop starts from here */
            for(row = 0; row < ht; row += 4)
            {
                for(col = 0; col < wd; col += 8)
                {
                    /*load 8 pixel values */
                    src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
                    src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
                    /* row = 2 */
                    src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
                    src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
                    /* row = 3 */
                    src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
                    src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));

                    /* (pi2_src1[col] + pi2_src2[col]) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                    src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
                    src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);

                    /* (i4_tmp >> shift) */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
                    src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift);
                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift);
                    src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
                    src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
                    src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);

                    /* store eight 8-bit output values */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0 */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1 */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2 */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3 */

                    /* To update pointer */
                    pi2_src1 += 8;
                    pi2_src2 += 8;
                    pu1_dst += 8;

                } /* inner loop ends here(8-output values in single iteration) */

                pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */
                pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */
                pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */

            }
        }
        else /* wd multiple of 4 case*/
        {
            WORD32 dst0, dst1, dst2, dst3;

            /* outer for loop starts from here */
            for(row = 0; row < ht; row += 4)
            {
                for(col = 0; col < wd; col += 4)
                {
                    /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
                    src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
                    /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
                    src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));

                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
                    /* row = 2 */
                    src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1));
                    src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2));
                    /* row = 3 */
                    src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1));
                    src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2));

                    /* Pack two rows together */
                    src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
                    src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
                    src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b);
                    src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b);

                    /* (pi2_src1[col] + pi2_src2[col]) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);

                    /* (i4_tmp >> shift) */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);

                    dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
                    /* dst rows 1 and 3 */
                    src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
                    src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1);

                    /* store four 8-bit output values */
                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                    dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
                    dst2 = _mm_cvtsi128_si32(src_temp5_8x16b);
                    dst3 = _mm_cvtsi128_si32(src_temp4_8x16b);

                    /* row = 1 to row = 3 */
                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
                    *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
                    *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;

                    /* To update pointer */
                    pi2_src1 += 4;
                    pi2_src2 += 4;
                    pu1_dst += 4;

                } /* inner loop ends here(4-output values in single iteration) */

                pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */
                pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */
                pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */

            }
        }
    }
    else /* ht multiple of 2 case and wd multiple of 4 case*/
    {

        WORD32 dst0, dst1;

        /* outer for loop starts from here */
        for(row = 0; row < ht; row += 2)
        {
            for(col = 0; col < wd; col += 4)
            {
                /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
                /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));

                /* row = 1 */
                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
                src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));

                /* Pack two rows together */
                src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
                src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);

                /* (pi2_src1[col] + pi2_src2[col]) */
                src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);

                /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);

                /* (i4_tmp >> shift) */
                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);

                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);

                dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
                /* dst row = 1 */
                src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);

                /* store four 8-bit output values */
                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);

                /* row = 1 */
                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;

                /* To update pointer */
                pi2_src1 += 4;
                pi2_src2 += 4;
                pu1_dst += 4;

            } /* inner loop ends here(4-output values in single iteration) */

            pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */
            pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */
            pu1_dst = pu1_dst - wd + 2 * dst_strd; /* Pointer update */

        }

    }

}
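
/* Illustrative scalar reference for the default bi-prediction above: a minimal
 * sketch, assuming 8-bit output and shift = SHIFT_14_MINUS_BIT_DEPTH + 1 as in
 * the SIMD kernel. The kernel's saturating 16-bit adds are safe because, per the
 * assumption noted in the function header, the per-pixel sum stays within WORD16
 * range. The function name is hypothetical and not part of the library interface. */
static void ihevc_weighted_pred_bi_default_scalar_sketch(WORD16 *pi2_src1,
                                                         WORD16 *pi2_src2,
                                                         UWORD8 *pu1_dst,
                                                         WORD32 src_strd1,
                                                         WORD32 src_strd2,
                                                         WORD32 dst_strd,
                                                         WORD32 lvl_shift1,
                                                         WORD32 lvl_shift2,
                                                         WORD32 ht,
                                                         WORD32 wd)
{
    WORD32 row, col;
    WORD32 shift = SHIFT_14_MINUS_BIT_DEPTH + 1;

    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            WORD32 i4_tmp;

            i4_tmp = pi2_src1[col] + lvl_shift1;
            i4_tmp += pi2_src2[col] + lvl_shift2;
            i4_tmp += 1 << (shift - 1); /* rounding offset */
            i4_tmp = i4_tmp >> shift;

            /* clip to [0, 255], equivalent to CLIP_U8() */
            pu1_dst[col] = (UWORD8)((i4_tmp < 0) ? 0 : ((i4_tmp > 255) ? 255 : i4_tmp));
        }
        pi2_src1 += src_strd1;
        pi2_src2 += src_strd2;
        pu1_dst += dst_strd;
    }
}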