/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/


/**
*******************************************************************************
* @file
*  ihevc_inter_pred_filters_x86_intr.c
*
* @brief
*  Contains function definitions for inter prediction interpolation filters
*  coded in x86 intrinsics
*
*
* @author
*
*
* @par List of Functions:
*  - ihevc_inter_pred_luma_copy_w16out_sse42()
*  - ihevc_inter_pred_chroma_copy_sse42()
*  - ihevc_inter_pred_chroma_copy_w16out_sse42()
*
* @remarks
*  None
*
*******************************************************************************
*/


/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include <assert.h>
#include <string.h>

#include "ihevc_debug.h"
#include "ihevc_typedefs.h"
#include "ihevc_defs.h"
#include "ihevc_inter_pred.h"
#include "ihevc_macros.h"
#include "ihevc_platform_macros.h"
#include "ihevc_func_selector.h"

#include <immintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
#include <tmmintrin.h>

/*****************************************************************************/ 64 /* Function Definitions */ 65 /*****************************************************************************/ 66 67 68 69 /** 70 ******************************************************************************* 71 * 72 * @brief 73 * Interprediction luma filter for copy 16bit output 74 * 75 * @par Description: 76 * Copies the array of width 'wd' and height 'ht' from the location pointed 77 * by 'src' to the location pointed by 'dst' The output is upshifted by 6 78 * bits and is used as input for vertical filtering or weighted prediction 79 * 80 * @param[in] pu1_src 81 * UWORD8 pointer to the source 82 * 83 * @param[out] pi2_dst 84 * WORD16 pointer to the destination 85 * 86 * @param[in] src_strd 87 * integer source stride 88 * 89 * @param[in] dst_strd 90 * integer destination stride 91 * 92 * @param[in] pi1_coeff 93 * WORD8 pointer to the filter coefficients 94 * 95 * @param[in] ht 96 * integer height of the array 97 * 98 * @param[in] wd 99 * integer width of the array 100 * 101 * @returns 102 * 103 * @remarks 104 * None 105 * 106 ******************************************************************************* 107 */ 108 109 void ihevc_inter_pred_luma_copy_w16out_sse42(UWORD8 *pu1_src, 110 WORD16 *pi2_dst, 111 WORD32 src_strd, 112 WORD32 dst_strd, 113 WORD8 *pi1_coeff, 114 WORD32 ht, 115 WORD32 wd) 116 { 117 WORD32 row, col; 118 __m128i src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b; 119 UNUSED(pi1_coeff); 120 ASSERT(wd % 4 == 0); /* checking assumption*/ 121 ASSERT(ht % 4 == 0); /* checking assumption*/ 122 123 if(0 == (wd & 7)) /* multiple of 8 case */ 124 { 125 for(row = 0; row < ht; row += 4) 126 { 127 for(col = 0; col < wd; col += 8) 128 { 129 /*load 8 pixel values from 7:0 pos. relative to cur. 
pos.*/ 130 src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */ 131 src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */ 132 src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */ 133 src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */ 134 135 src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b); 136 src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b); 137 src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b); 138 src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b); 139 140 src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */ 141 src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH); 142 src2_16x8b = _mm_slli_epi16(src2_16x8b, SHIFT_14_MINUS_BIT_DEPTH); 143 src3_16x8b = _mm_slli_epi16(src3_16x8b, SHIFT_14_MINUS_BIT_DEPTH); 144 145 /* storing 16 8-bit output values */ 146 _mm_storeu_si128((__m128i *)(pi2_dst), src0_16x8b); /* row =0 */ 147 _mm_storeu_si128((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */ 148 _mm_storeu_si128((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b); /* row =2 */ 149 _mm_storeu_si128((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b); /* row =3 */ 150 151 pu1_src += 8; /* pointer update */ 152 pi2_dst += 8; /* pointer update */ 153 } /* inner for loop ends here(8-output values in single iteration) */ 154 155 pu1_src += 4 * src_strd - wd; /* pointer update */ 156 pi2_dst += 4 * dst_strd - wd; /* pointer update */ 157 } 158 } 159 else /* wd = multiple of 4 case */ 160 { 161 for(row = 0; row < ht; row += 4) 162 { 163 for(col = 0; col < wd; col += 4) 164 { 165 /*load 8 pixel values from 7:0 pos. relative to cur. 
pos.*/ 166 src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */ 167 src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */ 168 src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */ 169 src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */ 170 171 src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b); 172 src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b); 173 src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b); 174 src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b); 175 176 src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */ 177 src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH); 178 src2_16x8b = _mm_slli_epi16(src2_16x8b, SHIFT_14_MINUS_BIT_DEPTH); 179 src3_16x8b = _mm_slli_epi16(src3_16x8b, SHIFT_14_MINUS_BIT_DEPTH); 180 181 /* storing 16 8-bit output values */ 182 _mm_storel_epi64((__m128i *)(pi2_dst), src0_16x8b); /* row =0 */ 183 _mm_storel_epi64((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */ 184 _mm_storel_epi64((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b); /* row =2 */ 185 _mm_storel_epi64((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b); /* row =3 */ 186 187 pu1_src += 4; /* pointer update */ 188 pi2_dst += 4; /* pointer update */ 189 } /* inner for loop ends here(4-output values in single iteration) */ 190 191 pu1_src += 4 * src_strd - wd; /* pointer update */ 192 pi2_dst += 4 * dst_strd - wd; /* pointer update */ 193 } 194 } 195 } 196 197 /** 198 ******************************************************************************* 199 * 200 * @brief 201 * Chroma interprediction filter for copy 202 * 203 * @par Description: 204 * Copies the array of width 'wd' and height 'ht' from the location pointed 205 * by 'src' to the location pointed by 'dst' 206 * 207 * @param[in] pu1_src 208 * UWORD8 pointer to the source 209 * 210 * @param[out] pu1_dst 211 * UWORD8 pointer to the destination 212 * 213 * @param[in] 
src_strd 214 * integer source stride 215 * 216 * @param[in] dst_strd 217 * integer destination stride 218 * 219 * @param[in] pi1_coeff 220 * WORD8 pointer to the filter coefficients 221 * 222 * @param[in] ht 223 * integer height of the array 224 * 225 * @param[in] wd 226 * integer width of the array 227 * 228 * @returns 229 * 230 * @remarks 231 * None 232 * 233 ******************************************************************************* 234 */ 235 236 void ihevc_inter_pred_chroma_copy_sse42(UWORD8 *pu1_src, 237 UWORD8 *pu1_dst, 238 WORD32 src_strd, 239 WORD32 dst_strd, 240 WORD8 *pi1_coeff, 241 WORD32 ht, 242 WORD32 wd) 243 { 244 WORD32 row, col, wdx2; 245 __m128i src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b; 246 247 ASSERT(wd % 2 == 0); /* checking assumption*/ 248 ASSERT(ht % 2 == 0); /* checking assumption*/ 249 UNUSED(pi1_coeff); 250 wdx2 = wd * 2; 251 252 if(0 == (ht & 3)) /* ht multiple of 4 */ 253 { 254 if(0 == (wdx2 & 15)) /* wdx2 multiple of 16 case */ 255 { 256 for(row = 0; row < ht; row += 4) 257 { 258 for(col = 0; col < wdx2; col += 16) 259 { 260 /*load 16 pixel values from 15:0 pos. relative to cur. 
pos.*/ 261 src0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); /* row =0 */ 262 src1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */ 263 src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */ 264 src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */ 265 266 /* storing 16 8-bit output values */ 267 _mm_storeu_si128((__m128i *)(pu1_dst), src0_16x8b); /* row =0 */ 268 _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b); /* row =1 */ 269 _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b); /* row =2 */ 270 _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b); /* row =3 */ 271 272 pu1_src += 16; /* pointer update */ 273 pu1_dst += 16; /* pointer update */ 274 } /* inner for loop ends here(16-output values in single iteration) */ 275 276 pu1_src += 4 * src_strd - wdx2; /* pointer update */ 277 pu1_dst += 4 * dst_strd - wdx2; /* pointer update */ 278 } 279 280 } 281 else if(0 == (wdx2 & 7)) /* multiple of 8 case */ 282 { 283 for(row = 0; row < ht; row += 4) 284 { 285 for(col = 0; col < wdx2; col += 8) 286 { 287 /*load 16 pixel values from 15:0 pos. relative to cur. 
pos.*/ 288 src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */ 289 src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */ 290 src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */ 291 src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */ 292 293 /* storing 16 8-bit output values */ 294 _mm_storel_epi64((__m128i *)(pu1_dst), src0_16x8b); /* row =0 */ 295 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b); /* row =1 */ 296 _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b); /* row =2 */ 297 _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b); /* row =3 */ 298 299 pu1_src += 8; /* pointer update */ 300 pu1_dst += 8; /* pointer update */ 301 } /* inner for loop ends here(8-output values in single iteration) */ 302 303 pu1_src += 4 * src_strd - wdx2; /* pointer update */ 304 pu1_dst += 4 * dst_strd - wdx2; /* pointer update */ 305 } 306 } 307 else /* wdx2 = multiple of 4 case */ 308 { 309 WORD32 dst0, dst1, dst2, dst3; 310 for(row = 0; row < ht; row += 4) 311 { 312 for(col = 0; col < wdx2; col += 4) 313 { 314 /*load 16 pixel values from 15:0 pos. relative to cur. 
pos.*/ 315 src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */ 316 src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */ 317 src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */ 318 src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */ 319 320 dst0 = _mm_cvtsi128_si32(src0_16x8b); 321 dst1 = _mm_cvtsi128_si32(src1_16x8b); 322 dst2 = _mm_cvtsi128_si32(src2_16x8b); 323 dst3 = _mm_cvtsi128_si32(src3_16x8b); 324 325 /* storing 4 8-bit output values */ 326 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; /* row =0 */ 327 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; /* row =1 */ 328 *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2; /* row =2 */ 329 *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3; /* row =3 */ 330 331 pu1_src += 4; /* pointer update */ 332 pu1_dst += 4; /* pointer update */ 333 } /* inner for loop ends here(4- output values in single iteration) */ 334 335 pu1_src += 4 * src_strd - wdx2; /* pointer update */ 336 pu1_dst += 4 * dst_strd - wdx2; /* pointer update */ 337 } 338 } 339 } 340 else /* ht multiple of 2 */ 341 { 342 if(0 == (wdx2 & 15)) /* wdx2 multiple of 16 case */ 343 { 344 for(row = 0; row < ht; row += 2) 345 { 346 for(col = 0; col < wdx2; col += 16) 347 { 348 /*load 16 pixel values from 15:0 pos. relative to cur. 
pos.*/ 349 src0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); /* row =0 */ 350 src1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */ 351 352 /* storing 16 8-bit output values */ 353 _mm_storeu_si128((__m128i *)(pu1_dst), src0_16x8b); /* row =0 */ 354 _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b); /* row =1 */ 355 356 pu1_src += 16; /* pointer update */ 357 pu1_dst += 16; /* pointer update */ 358 } /* inner for loop ends here(16-output values in single iteration) */ 359 360 pu1_src += 2 * src_strd - wdx2; /* pointer update */ 361 pu1_dst += 2 * dst_strd - wdx2; /* pointer update */ 362 } 363 364 } 365 else if(0 == (wdx2 & 7)) /* multiple of 8 case */ 366 { 367 for(row = 0; row < ht; row += 2) 368 { 369 for(col = 0; col < wdx2; col += 8) 370 { 371 /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/ 372 src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */ 373 src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */ 374 375 /* storing 16 8-bit output values */ 376 _mm_storel_epi64((__m128i *)(pu1_dst), src0_16x8b); /* row =0 */ 377 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b); /* row =1 */ 378 379 pu1_src += 8; /* pointer update */ 380 pu1_dst += 8; /* pointer update */ 381 } /* inner for loop ends here(8-output values in single iteration) */ 382 383 pu1_src += 2 * src_strd - wdx2; /* pointer update */ 384 pu1_dst += 2 * dst_strd - wdx2; /* pointer update */ 385 } 386 } 387 else /* wdx2 = multiple of 4 case */ 388 { 389 WORD32 dst0, dst1; 390 for(row = 0; row < ht; row += 2) 391 { 392 for(col = 0; col < wdx2; col += 4) 393 { 394 /*load 16 pixel values from 15:0 pos. relative to cur. 
pos.*/ 395 src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */ 396 src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */ 397 398 dst0 = _mm_cvtsi128_si32(src0_16x8b); 399 dst1 = _mm_cvtsi128_si32(src1_16x8b); 400 401 402 /* storing 4 8-bit output values */ 403 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; /* row =0 */ 404 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; /* row =1 */ 405 406 pu1_src += 4; /* pointer update */ 407 pu1_dst += 4; /* pointer update */ 408 } /* inner for loop ends here(4- output values in single iteration) */ 409 410 pu1_src += 2 * src_strd - wdx2; /* pointer update */ 411 pu1_dst += 2 * dst_strd - wdx2; /* pointer update */ 412 } 413 } 414 } 415 } 416 417 /** 418 ******************************************************************************* 419 * 420 * @brief 421 * chroma interprediction filter for copying 16bit output 422 * 423 * @par Description: 424 * Copies the array of width 'wd' and height 'ht' from the location pointed 425 * by 'src' to the location pointed by 'dst' The output is upshifted by 6 426 * bits and is used as input for vertical filtering or weighted prediction 427 * 428 * @param[in] pu1_src 429 * UWORD8 pointer to the source 430 * 431 * @param[out] pi2_dst 432 * WORD16 pointer to the destination 433 * 434 * @param[in] src_strd 435 * integer source stride 436 * 437 * @param[in] dst_strd 438 * integer destination stride 439 * 440 * @param[in] pi1_coeff 441 * WORD8 pointer to the filter coefficients 442 * 443 * @param[in] ht 444 * integer height of the array 445 * 446 * @param[in] wd 447 * integer width of the array 448 * 449 * @returns 450 * 451 * @remarks 452 * None 453 * 454 ******************************************************************************* 455 */ 456 457 void ihevc_inter_pred_chroma_copy_w16out_sse42(UWORD8 *pu1_src, 458 WORD16 *pi2_dst, 459 WORD32 src_strd, 460 WORD32 dst_strd, 461 WORD8 *pi1_coeff, 462 WORD32 ht, 463 WORD32 wd) 464 { 465 WORD32 row, col, wdx2; 466 
    __m128i src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;

    /* Copies a chroma block and widens it to 16 bit, upshifted by
       SHIFT_14_MINUS_BIT_DEPTH, for later vertical filtering or weighted
       prediction. Chroma is Cb/Cr interleaved, so 'wd' samples = wd*2 bytes. */
    ASSERT(wd % 2 == 0); /* checking assumption*/
    ASSERT(ht % 2 == 0); /* checking assumption*/
    UNUSED(pi1_coeff);
    wdx2 = wd * 2;

    if(0 == (ht & 3)) /* ht multiple of 4: process 4 rows per iteration */
    {
        if(0 == (wdx2 & 7)) /* wdx2 multiple of 8: 8 bytes per row per iteration */
        {
            for(row = 0; row < ht; row += 4)
            {
                for(col = 0; col < wdx2; col += 8)
                {
                    /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
                    src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
                    src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */

                    /* zero-extend the 8 bytes to 8 16-bit lanes */
                    src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
                    src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
                    src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b);
                    src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b);

                    src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
                    src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
                    src2_16x8b = _mm_slli_epi16(src2_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
                    src3_16x8b = _mm_slli_epi16(src3_16x8b, SHIFT_14_MINUS_BIT_DEPTH);

                    /* storing 8 16-bit output values (16 bytes) per row */
                    _mm_storeu_si128((__m128i *)(pi2_dst), src0_16x8b);                /* row =0 */
                    _mm_storeu_si128((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
                    _mm_storeu_si128((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b); /* row =2 */
                    _mm_storeu_si128((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b); /* row =3 */

                    pu1_src += 8; /* pointer update */
                    pi2_dst += 8; /* pointer update */
                } /* inner for loop ends here(8-output values in single iteration) */

                pu1_src += 4 * src_strd - wdx2; /* rewind columns, advance 4 rows */
                pi2_dst += 4 * dst_strd - wdx2; /* pointer update */
            }
        }
        else /* wdx2 = multiple of 4 case: 4 bytes per row per iteration */
        {
            for(row = 0; row < ht; row += 4)
            {
                for(col = 0; col < wdx2; col += 4)
                {
                    /* 8-byte load, only the low 4 bytes are consumed.
                       NOTE(review): reads 4 bytes past the needed span —
                       presumably callers guarantee readability; confirm. */
                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */
                    src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */
                    src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */

                    /* zero-extend to 16-bit lanes */
                    src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
                    src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);
                    src2_16x8b = _mm_cvtepu8_epi16(src2_16x8b);
                    src3_16x8b = _mm_cvtepu8_epi16(src3_16x8b);

                    src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
                    src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
                    src2_16x8b = _mm_slli_epi16(src2_16x8b, SHIFT_14_MINUS_BIT_DEPTH);
                    src3_16x8b = _mm_slli_epi16(src3_16x8b, SHIFT_14_MINUS_BIT_DEPTH);

                    /* storing 4 16-bit output values (8 bytes) per row */
                    _mm_storel_epi64((__m128i *)(pi2_dst), src0_16x8b);                /* row =0 */
                    _mm_storel_epi64((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */
                    _mm_storel_epi64((__m128i *)(pi2_dst + 2 * dst_strd), src2_16x8b); /* row =2 */
                    _mm_storel_epi64((__m128i *)(pi2_dst + 3 * dst_strd), src3_16x8b); /* row =3 */

                    pu1_src += 4; /* pointer update */
                    pi2_dst += 4; /* pointer update */
                } /* inner for loop ends here(4-output values in single iteration) */

                pu1_src += 4 * src_strd - wdx2; /* rewind columns, advance 4 rows */
                pi2_dst += 4 * dst_strd - wdx2; /* pointer update */
            }
        }
    }
    else /* ht multiple of 2 case: process 2 rows per iteration */
    {
        if(0 == (wdx2 & 7)) /* multiple of 8 case */
        {
            for(row = 0; row < ht; row += 2)
            {
                for(col = 0; col < wdx2; col += 8)
                {
                    /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */

                    /* zero-extend to 16-bit lanes */
                    src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
                    src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);

                    src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
                    src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH);

                    /* storing 8 16-bit output values (16 bytes) per row */
                    _mm_storeu_si128((__m128i *)(pi2_dst), src0_16x8b);                /* row =0 */
                    _mm_storeu_si128((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */

                    pu1_src += 8; /* pointer update */
                    pi2_dst += 8; /* pointer update */
                } /* inner for loop ends here(8-output values in single iteration) */

                pu1_src += 2 * src_strd - wdx2; /* rewind columns, advance 2 rows */
                pi2_dst += 2 * dst_strd - wdx2; /* pointer update */
            }
        }
        else /* wdx2 = multiple of 4 case */
        {
            for(row = 0; row < ht; row += 2)
            {
                for(col = 0; col < wdx2; col += 4)
                {
                    /* 8-byte load, only the low 4 bytes are consumed.
                       NOTE(review): reads past the needed span — confirm
                       trailing bytes are readable. */
                    src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));                /* row =0 */
                    src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */

                    /* zero-extend to 16-bit lanes */
                    src0_16x8b = _mm_cvtepu8_epi16(src0_16x8b);
                    src1_16x8b = _mm_cvtepu8_epi16(src1_16x8b);

                    src0_16x8b = _mm_slli_epi16(src0_16x8b, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */
                    src1_16x8b = _mm_slli_epi16(src1_16x8b, SHIFT_14_MINUS_BIT_DEPTH);

                    /* storing 4 16-bit output values (8 bytes) per row */
                    _mm_storel_epi64((__m128i *)(pi2_dst), src0_16x8b);                /* row =0 */
                    _mm_storel_epi64((__m128i *)(pi2_dst + 1 * dst_strd), src1_16x8b); /* row =1 */

                    pu1_src += 4; /* pointer update */
                    pi2_dst += 4; /* pointer update */
                } /* inner for loop ends here(4-output values in single iteration) */

                pu1_src += 2 * src_strd - wdx2; /* rewind columns, advance 2 rows */
                pi2_dst += 2 * dst_strd - wdx2; /* pointer update */
            }
        }
    }
}