/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/

/**
*******************************************************************************
* @file
*  ihevc_inter_pred_filters_atom_intr.c
*
* @brief
*  Contains function definitions for inter prediction interpolation filters
*  coded in x86 intrinsics
*
*
* @author
*
*
* @par List of Functions:
*  - ihevc_inter_pred_luma_copy_ssse3()
*  - ihevc_inter_pred_luma_horz_ssse3()
*  - ihevc_inter_pred_luma_vert_ssse3()
*  - ihevc_inter_pred_luma_copy_w16out_ssse3()
*  - ihevc_inter_pred_luma_horz_w16out_ssse3()
*  - ihevc_inter_pred_luma_vert_w16out_ssse3()
*  - ihevc_inter_pred_luma_vert_w16inp_ssse3()
*  - ihevc_inter_pred_luma_vert_w16inp_w16out_ssse3()
*  - ihevc_inter_pred_chroma_copy_ssse3()
*  - ihevc_inter_pred_chroma_horz_ssse3()
*  - ihevc_inter_pred_chroma_vert_ssse3()
*  - ihevc_inter_pred_chroma_copy_w16out_ssse3()
*  - ihevc_inter_pred_chroma_horz_w16out_ssse3()
*  - ihevc_inter_pred_chroma_vert_w16out_ssse3()
*  - ihevc_inter_pred_chroma_vert_w16inp_ssse3()
*  - ihevc_inter_pred_chroma_vert_w16inp_w16out_ssse3()
*
* @remarks
*  None
*
*******************************************************************************
*/


/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include <assert.h>

#include "ihevc_debug.h"
#include "ihevc_typedefs.h"
#include "ihevc_defs.h"
#include "ihevc_inter_pred.h"
#include "ihevc_platform_macros.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"

#include <immintrin.h>

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/

/**
*******************************************************************************
*
* @brief
*  Interprediction luma function for copy
*
* @par Description:
*  Copies the array of width 'wd' and height 'ht' from the location pointed
*  by 'src' to the location pointed by 'dst'
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] pi1_coeff
*  WORD8 pointer to the filter coefficients
*
* @param[in] ht
*  integer height of the array
*
* @param[in] wd
*  integer width of the array
*
* @returns
*
* @remarks
*  None
*
* Assumption : ht%4 == 0, wd%4 == 0
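*
* For reference, the behaviour implemented below is equivalent to the plain-C
* strided copy shown in this illustrative sketch (not part of the build):
*
* @code
*  WORD32 row, col;
*  for(row = 0; row < ht; row++)
*  {
*      for(col = 0; col < wd; col++)
*          pu1_dst[col] = pu1_src[col];
*      pu1_src += src_strd;
*      pu1_dst += dst_strd;
*  }
* @endcode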
113 * 114 ******************************************************************************* 115 */ 116 117 118 void ihevc_inter_pred_luma_copy_ssse3(UWORD8 *pu1_src, 119 UWORD8 *pu1_dst, 120 WORD32 src_strd, 121 WORD32 dst_strd, 122 WORD8 *pi1_coeff, 123 WORD32 ht, 124 WORD32 wd) 125 { 126 127 WORD32 row, col; 128 __m128i src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b; 129 UNUSED(pi1_coeff); 130 ASSERT(wd % 4 == 0); /* checking assumption*/ 131 ASSERT(ht % 4 == 0); /* checking assumption*/ 132 133 /* outer for loop starts from here */ 134 if(0 == (wd & 15)) /* wd multiple of 16 case */ 135 { 136 for(row = 0; row < ht; row += 4) 137 { 138 for(col = 0; col < wd; col += 16) 139 { 140 /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/ 141 src0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src)); /* row =0 */ 142 src1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */ 143 src2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */ 144 src3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */ 145 146 /* storing 16 8-bit output values */ 147 _mm_storeu_si128((__m128i *)(pu1_dst), src0_16x8b); /* row =0 */ 148 _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b); /* row =1 */ 149 _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b); /* row =2 */ 150 _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b); /* row =3 */ 151 152 pu1_src += 16; /* pointer update */ 153 pu1_dst += 16; /* pointer update */ 154 } /* inner for loop ends here(16-output values in single iteration) */ 155 156 pu1_src += 4 * src_strd - wd; /* pointer update */ 157 pu1_dst += 4 * dst_strd - wd; /* pointer update */ 158 } 159 160 } 161 else if(0 == (wd & 7)) /* multiple of 8 case */ 162 { 163 for(row = 0; row < ht; row += 4) 164 { 165 for(col = 0; col < wd; col += 8) 166 { 167 /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/ 168 src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */ 169 src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */ 170 src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */ 171 src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */ 172 173 /* storing 16 8-bit output values */ 174 _mm_storel_epi64((__m128i *)(pu1_dst), src0_16x8b); /* row =0 */ 175 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src1_16x8b); /* row =1 */ 176 _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src2_16x8b); /* row =2 */ 177 _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src3_16x8b); /* row =3 */ 178 179 pu1_src += 8; /* pointer update */ 180 pu1_dst += 8; /* pointer update */ 181 } /* inner for loop ends here(8-output values in single iteration) */ 182 183 pu1_src += 4 * src_strd - wd; /* pointer update */ 184 pu1_dst += 4 * dst_strd - wd; /* pointer update */ 185 } 186 } 187 else /* wd = multiple of 4 case */ 188 { 189 WORD32 dst0, dst1, dst2, dst3; 190 for(row = 0; row < ht; row += 4) 191 { 192 for(col = 0; col < wd; col += 4) 193 { 194 /*load 16 pixel values from 15:0 pos. relative to cur. 
pos.*/ 195 src0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); /* row =0 */ 196 src1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 1 * src_strd)); /* row =1 */ 197 src2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd)); /* row =2 */ 198 src3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd)); /* row =3 */ 199 200 dst0 = _mm_cvtsi128_si32(src0_16x8b); 201 dst1 = _mm_cvtsi128_si32(src1_16x8b); 202 dst2 = _mm_cvtsi128_si32(src2_16x8b); 203 dst3 = _mm_cvtsi128_si32(src3_16x8b); 204 205 /* storing 4 8-bit output values */ 206 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0; /* row =0 */ 207 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1; /* row =1 */ 208 *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2; /* row =2 */ 209 *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3; /* row =3 */ 210 211 pu1_src += 4; /* pointer update */ 212 pu1_dst += 4; /* pointer update */ 213 } /* inner for loop ends here(4- output values in single iteration) */ 214 215 pu1_src += 4 * src_strd - wd; /* pointer update */ 216 pu1_dst += 4 * dst_strd - wd; /* pointer update */ 217 } 218 } 219 } 220 221 /* INTER_PRED_LUMA_COPY */ 222 223 /** 224 ******************************************************************************* 225 * 226 * @brief 227 * Interprediction luma filter for horizontal input 228 * 229 * @par Description: 230 * Applies a horizontal filter with coefficients pointed to by 'pi1_coeff' 231 * to the elements pointed by 'pu1_src' and writes to the location pointed 232 * by 'pu1_dst' The output is downshifted by 6 and clipped to 8 bits 233 * 234 * @param[in] pu1_src 235 * UWORD8 pointer to the source 236 * 237 * @param[out] pu1_dst 238 * UWORD8 pointer to the destination 239 * 240 * @param[in] src_strd 241 * integer source stride 242 * 243 * @param[in] dst_strd 244 * integer destination stride 245 * 246 * @param[in] pi1_coeff 247 * WORD8 pointer to the filter coefficients 248 * 249 * @param[in] ht 250 * integer height of the array 251 * 252 * @param[in] wd 253 * integer width of the array 254 * 255 * @returns 256 * 257 * @remarks 258 * None 259 * 260 ******************************************************************************* 261 */ 262 void ihevc_inter_pred_luma_horz_ssse3(UWORD8 *pu1_src, 263 UWORD8 *pu1_dst, 264 WORD32 src_strd, 265 WORD32 dst_strd, 266 WORD8 *pi1_coeff, 267 WORD32 ht, 268 WORD32 wd) 269 { 270 WORD32 row, col; 271 272 /* all 128 bit registers are named with a suffix mxnb, where m is the */ 273 /* number of n bits packed in the register */ 274 __m128i zero_8x16b, offset_8x16b, mask_low_32b, mask_high_96b; 275 __m128i src_temp1_16x8b, src_temp2_16x8b, src_temp3_16x8b, src_temp4_16x8b, src_temp5_16x8b, src_temp6_16x8b; 276 __m128i src_temp11_16x8b, src_temp12_16x8b, src_temp13_16x8b, src_temp14_16x8b, src_temp15_16x8b, src_temp16_16x8b; 277 __m128i res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b, res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b, res_temp7_8x16b, res_temp8_8x16b; 278 __m128i res_temp11_8x16b, res_temp12_8x16b, res_temp13_8x16b, res_temp14_8x16b, res_temp15_8x16b, res_temp16_8x16b, res_temp17_8x16b, res_temp18_8x16b; 279 __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b; 280 __m128i control_mask_1_8x16b, control_mask_2_8x16b, control_mask_3_8x16b, control_mask_4_8x16b; 281 282 ASSERT(wd % 4 == 0); /* checking assumption*/ 283 284 PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0) 285 PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0) 286 PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0) 287 PREFETCH((char const 
*)(pu1_src + (3 * src_strd)), _MM_HINT_T0) 288 PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0) 289 PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0) 290 291 /* load 8 8-bit coefficients and convert 8-bit into 16-bit */ 292 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)pi1_coeff); 293 zero_8x16b = _mm_set1_epi32(0); 294 offset_8x16b = _mm_set1_epi16(OFFSET_14_MINUS_BIT_DEPTH); /* for offset addition */ 295 296 mask_low_32b = _mm_cmpeq_epi16(zero_8x16b, zero_8x16b); 297 mask_high_96b = _mm_srli_si128(mask_low_32b, 12); 298 mask_low_32b = _mm_slli_si128(mask_low_32b, 4); 299 300 control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */ 301 control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */ 302 control_mask_3_8x16b = _mm_set1_epi32(0x05040504); /* Control Mask register */ 303 control_mask_4_8x16b = _mm_set1_epi32(0x07060706); /* Control Mask register */ 304 305 coeff0_1_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_1_8x16b); /* pi1_coeff[4] */ 306 coeff2_3_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_2_8x16b); /* pi1_coeff[4] */ 307 308 coeff4_5_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_3_8x16b); /* pi1_coeff[4] */ 309 coeff6_7_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_4_8x16b); /* pi1_coeff[4] */ 310 311 if(0 == (ht & 1)) /* ht multiple of 2 case */ 312 { 313 314 if(0 == (wd & 7)) /* wd = multiple of 8 case */ 315 { 316 for(row = 0; row < ht; row += 2) 317 { 318 319 int offset = 0; 320 321 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0) 322 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0) 323 324 325 for(col = 0; col < wd; col += 8) 326 { 327 /*load 16 pixel values from row 0*/ 328 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset)); /* row = 0 */ 329 330 /*load 16 pixel values from row 1*/ 331 src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */ 332 333 src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1); /* row = 0 */ 334 /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 0 */ 335 src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 336 res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b); /* row = 0 */ 337 /* row = 0 */ 338 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 339 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 340 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 341 src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 342 res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b); /* row = 0 */ 343 344 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 345 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 346 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 347 src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 348 res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b); /* row = 0 */ 349 350 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 351 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 352 /* pix. 
|7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 353 src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 354 res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b); /* row = 0 */ 355 356 res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b); 357 res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b); 358 res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b); 359 360 res_temp6_8x16b = _mm_adds_epi16(res_temp5_8x16b, offset_8x16b); /* row = 0 */ 361 res_temp6_8x16b = _mm_srai_epi16(res_temp6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */ 362 res_temp5_8x16b = _mm_packus_epi16(res_temp6_8x16b, res_temp6_8x16b); /* row = 0 */ 363 364 _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp5_8x16b); 365 366 src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1); /* row =1 */ 367 /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row =1 */ 368 src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */ 369 res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b); /* row = 1 */ 370 /* row = 1 */ 371 src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */ 372 src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */ 373 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row =1 */ 374 src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */ 375 res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b); /* row = 1 */ 376 377 src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */ 378 src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */ 379 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row =1 */ 380 src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */ 381 res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b); /* row = 1 */ 382 383 src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */ 384 src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */ 385 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row =1 */ 386 src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */ 387 res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b); /* row = 1 */ 388 389 res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b); 390 res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b); 391 res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b); 392 393 res_temp16_8x16b = _mm_adds_epi16(res_temp15_8x16b, offset_8x16b); /* row = 1 */ 394 res_temp16_8x16b = _mm_srai_epi16(res_temp16_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 1 */ 395 res_temp15_8x16b = _mm_packus_epi16(res_temp16_8x16b, res_temp16_8x16b); /* row = 1 */ 396 397 /* to store the 1st 4 pixels res. */ 398 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), res_temp15_8x16b); 399 400 offset += 8; /* To pointer updates*/ 401 } 402 pu1_src += 2 * src_strd; /* pointer updates*/ 403 pu1_dst += 2 * dst_strd; /* pointer updates*/ 404 } 405 } 406 else /* wd = multiple of 4 case */ 407 { 408 for(row = 0; row < ht; row += 2) 409 { 410 int offset = 0; 411 412 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0) 413 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0) 414 415 416 for(col = 0; col < wd; col += 4) 417 { 418 /*load 16 pixel values from 12:-3 pos. relative to cur. 
pos.*/ 419 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset)); /* row = 0 */ 420 src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */ 421 422 src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1); /* row = 0 */ 423 /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 0 */ 424 src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 425 res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b); /* row = 0 */ 426 /* row = 0 */ 427 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 428 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 429 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 430 src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 431 res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b); /* row = 0 */ 432 433 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 434 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 435 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 436 src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 437 res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b); /* row = 0 */ 438 439 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 440 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 441 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 442 src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 443 res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b); /* row = 0 */ 444 445 res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b); 446 res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b); 447 res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b); 448 449 res_temp6_8x16b = _mm_adds_epi16(res_temp5_8x16b, offset_8x16b); /* row = 0 */ 450 res_temp6_8x16b = _mm_srai_epi16(res_temp6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */ 451 res_temp5_8x16b = _mm_packus_epi16(res_temp6_8x16b, res_temp6_8x16b); /* row = 0 */ 452 453 res_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + offset)); 454 res_temp8_8x16b = _mm_and_si128(res_temp7_8x16b, mask_low_32b); 455 res_temp7_8x16b = _mm_and_si128(res_temp5_8x16b, mask_high_96b); 456 res_temp5_8x16b = _mm_or_si128(res_temp7_8x16b, res_temp8_8x16b); 457 458 _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp5_8x16b); 459 460 src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1); /* row = 1 */ 461 /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 1 */ 462 src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */ 463 res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b); /* row = 1 */ 464 /* row = 1 */ 465 src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */ 466 src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */ 467 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 1 */ 468 src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */ 469 res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b); /* row = 1 */ 470 471 src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */ 472 src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */ 473 /* pix. 
|7:0|6:-1| to do two dot-products at same time*/ /* row = 1 */ 474 src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */ 475 res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b); /* row = 1 */ 476 477 src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */ 478 src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */ 479 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 1 */ 480 src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */ 481 res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b); /* row = 1 */ 482 483 res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b); 484 res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b); 485 res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b); 486 487 res_temp16_8x16b = _mm_adds_epi16(res_temp15_8x16b, offset_8x16b); /* row = 1 */ 488 res_temp16_8x16b = _mm_srai_epi16(res_temp16_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 1 */ 489 res_temp15_8x16b = _mm_packus_epi16(res_temp16_8x16b, res_temp16_8x16b); /* row = 1 */ 490 491 res_temp17_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd + offset)); 492 res_temp18_8x16b = _mm_and_si128(res_temp17_8x16b, mask_low_32b); 493 res_temp17_8x16b = _mm_and_si128(res_temp15_8x16b, mask_high_96b); 494 res_temp15_8x16b = _mm_or_si128(res_temp17_8x16b, res_temp18_8x16b); 495 496 /* to store the 1st 4 pixels res. */ 497 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), res_temp15_8x16b); 498 499 offset += 4; /* To pointer updates*/ 500 } 501 pu1_src += 2 * src_strd; /* Pointer update */ 502 pu1_dst += 2 * dst_strd; /* Pointer update */ 503 } 504 } 505 } 506 else /* odd ht */ 507 { 508 if(0 == (wd & 7)) /* multiple of 8 case */ 509 { 510 for(row = 0; row < ht; row++) 511 { 512 int offset = 0; 513 514 515 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0) 516 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0) 517 518 519 for(col = 0; col < wd; col += 8) 520 { 521 /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/ 522 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset)); /* row = 0 */ 523 524 src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1); /* row = 0 */ 525 /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 0 */ 526 src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 527 res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b); /* row = 0 */ 528 /* row = 0 */ 529 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 530 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 531 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 532 src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 533 res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b); /* row = 0 */ 534 535 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 536 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 537 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 538 src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 539 res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b); /* row = 0 */ 540 541 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 542 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 543 /* pix. 
|7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 544 src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 545 res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b); /* row = 0 */ 546 547 res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b); 548 res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b); 549 res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b); 550 551 res_temp6_8x16b = _mm_adds_epi16(res_temp5_8x16b, offset_8x16b); /* row = 0 */ 552 res_temp6_8x16b = _mm_srai_epi16(res_temp6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */ 553 res_temp5_8x16b = _mm_packus_epi16(res_temp6_8x16b, res_temp6_8x16b); /* row = 0 */ 554 555 /* to store the 1st 4 pixels res. */ 556 _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp5_8x16b); 557 558 offset += 8; /* To pointer updates*/ 559 } 560 pu1_src += src_strd; /* pointer updates*/ 561 pu1_dst += dst_strd; /* pointer updates*/ 562 } 563 } 564 else /* wd = multiple of 4 case */ 565 { 566 for(row = 0; row < (ht - 1); row += 2) 567 { 568 int offset = 0; 569 570 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0) 571 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0) 572 573 574 for(col = 0; col < wd; col += 4) 575 { 576 /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/ 577 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset)); /* row = 0 */ 578 src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */ 579 580 src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1); /* row = 0 */ 581 /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 0 */ 582 src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 583 res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b); /* row = 0 */ 584 /* row = 0 */ 585 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 586 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 587 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 588 src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 589 res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b); /* row = 0 */ 590 591 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 592 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 593 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 594 src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 595 res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b); /* row = 0 */ 596 597 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 598 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 599 /* pix. 
|7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 600 src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 601 res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b); /* row = 0 */ 602 603 res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b); 604 res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b); 605 res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b); 606 607 res_temp6_8x16b = _mm_adds_epi16(res_temp5_8x16b, offset_8x16b); /* row = 0 */ 608 res_temp6_8x16b = _mm_srai_epi16(res_temp6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */ 609 res_temp5_8x16b = _mm_packus_epi16(res_temp6_8x16b, res_temp6_8x16b); /* row = 0 */ 610 611 res_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + offset)); 612 res_temp8_8x16b = _mm_and_si128(res_temp7_8x16b, mask_low_32b); 613 res_temp7_8x16b = _mm_and_si128(res_temp5_8x16b, mask_high_96b); 614 res_temp5_8x16b = _mm_or_si128(res_temp7_8x16b, res_temp8_8x16b); 615 616 _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp5_8x16b); 617 618 src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1); /* row = 1 */ 619 /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 1 */ 620 src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */ 621 res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b); /* row = 1 */ 622 /* row = 1 */ 623 src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */ 624 src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */ 625 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 1 */ 626 src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */ 627 res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b); /* row = 1 */ 628 629 src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */ 630 src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */ 631 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 1 */ 632 src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */ 633 res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b); /* row = 1 */ 634 635 src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */ 636 src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */ 637 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 1 */ 638 src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */ 639 res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b); /* row = 1 */ 640 641 res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b); 642 res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b); 643 res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b); 644 645 res_temp16_8x16b = _mm_adds_epi16(res_temp15_8x16b, offset_8x16b); /* row = 1 */ 646 res_temp16_8x16b = _mm_srai_epi16(res_temp16_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 1 */ 647 res_temp15_8x16b = _mm_packus_epi16(res_temp16_8x16b, res_temp16_8x16b); /* row = 1 */ 648 649 res_temp17_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd + offset)); 650 res_temp18_8x16b = _mm_and_si128(res_temp17_8x16b, mask_low_32b); 651 res_temp17_8x16b = _mm_and_si128(res_temp15_8x16b, mask_high_96b); 652 res_temp15_8x16b = _mm_or_si128(res_temp17_8x16b, res_temp18_8x16b); 653 654 /* to store the 1st 4 pixels res. 
*/ 655 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), res_temp15_8x16b); 656 657 offset += 4; /* To pointer updates*/ 658 } 659 pu1_src += 2 * src_strd; /* Pointer update */ 660 pu1_dst += 2 * dst_strd; /* Pointer update */ 661 } 662 { /* last repeat at outside the loop */ 663 int offset = 0; 664 for(col = 0; col < wd; col += 4) 665 { 666 /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/ 667 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset)); /* row = 0 */ 668 669 src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1); /* row = 0 */ 670 /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 0 */ 671 src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 672 res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b); /* row = 0 */ 673 /* row = 0 */ 674 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 675 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 676 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 677 src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 678 res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b); /* row = 0 */ 679 680 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 681 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 682 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 683 src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 684 res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b); /* row = 0 */ 685 686 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 687 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 688 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 689 src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 690 res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b); /* row = 0 */ 691 692 res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b); 693 res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b); 694 res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b); 695 696 res_temp6_8x16b = _mm_adds_epi16(res_temp5_8x16b, offset_8x16b); /* row = 0 */ 697 res_temp6_8x16b = _mm_srai_epi16(res_temp6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */ 698 res_temp5_8x16b = _mm_packus_epi16(res_temp6_8x16b, res_temp6_8x16b); /* row = 0 */ 699 700 res_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + offset)); 701 res_temp8_8x16b = _mm_and_si128(res_temp7_8x16b, mask_low_32b); 702 res_temp7_8x16b = _mm_and_si128(res_temp5_8x16b, mask_high_96b); 703 res_temp5_8x16b = _mm_or_si128(res_temp7_8x16b, res_temp8_8x16b); 704 705 /* to store the 1st 4 pixels res. 
*/ 706 _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp5_8x16b); 707 708 offset += 4; /* To pointer updates*/ 709 } 710 } 711 } 712 } 713 } 714 715 716 /** 717 ******************************************************************************* 718 * 719 * @brief 720 * Interprediction luma filter for vertical input 721 * 722 * @par Description: 723 * Applies a vertcal filter with coefficients pointed to by 'pi1_coeff' to 724 * the elements pointed by 'pu1_src' and writes to the location pointed by 725 * 'pu1_dst' The output is downshifted by 6 and clipped to 8 bits 726 * 727 * @param[in] pu1_src 728 * UWORD8 pointer to the source 729 * 730 * @param[out] pu1_dst 731 * UWORD8 pointer to the destination 732 * 733 * @param[in] src_strd 734 * integer source stride 735 * 736 * @param[in] dst_strd 737 * integer destination stride 738 * 739 * @param[in] pi1_coeff 740 * WORD8 pointer to the filter coefficients 741 * 742 * @param[in] ht 743 * integer height of the array 744 * 745 * @param[in] wd 746 * integer width of the array 747 * 748 * @returns 749 * 750 * @remarks 751 * None 752 * 753 ******************************************************************************* 754 */ 755 void ihevc_inter_pred_luma_vert_ssse3(UWORD8 *pu1_src, 756 UWORD8 *pu1_dst, 757 WORD32 src_strd, 758 WORD32 dst_strd, 759 WORD8 *pi1_coeff, 760 WORD32 ht, 761 WORD32 wd) 762 { 763 WORD32 row, col; 764 UWORD8 *pu1_src_copy; 765 UWORD8 *pu1_dst_copy; 766 __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b; 767 __m128i s0_8x16b, s1_8x16b, s2_8x16b, s3_8x16b, s4_8x16b, s5_8x16b, s6_8x16b, s7_8x16b, s8_8x16b, s9_8x16b; 768 __m128i s2_0_16x8b, s2_1_16x8b, s2_2_16x8b, s2_3_16x8b, s2_4_16x8b, s2_5_16x8b, s2_6_16x8b, s2_7_16x8b, s2_8_16x8b, s2_9_16x8b, s2_10_16x8b; 769 __m128i s3_0_16x8b, s3_1_16x8b, s3_2_16x8b, s3_3_16x8b, s3_4_16x8b; 770 __m128i s4_0_16x8b, s4_1_16x8b, s4_2_16x8b, s4_3_16x8b, s4_4_16x8b; 771 __m128i s10_8x16b, s11_8x16b, s12_8x16b, s13_8x16b, s14_8x16b, s15_8x16b, s16_8x16b, s17_8x16b, s18_8x16b, s19_8x16b; 772 __m128i s20_8x16b, s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b, s26_8x16b, s27_8x16b, s28_8x16b, s29_8x16b; 773 __m128i s30_8x16b, s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b, s36_8x16b, s37_8x16b, s38_8x16b, s39_8x16b; 774 775 __m128i zero_8x16b, offset_8x16b, mask_low_32b, mask_high_96b; 776 __m128i control_mask_1_8x16b, control_mask_2_8x16b, control_mask_3_8x16b, control_mask_4_8x16b; 777 778 PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0) 779 PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0) 780 PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0) 781 PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0) 782 PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0) 783 PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0) 784 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0) 785 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0) 786 787 /* load 8 8-bit coefficients and convert 8-bit into 16-bit */ 788 s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff); 789 790 control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */ 791 control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */ 792 control_mask_3_8x16b = _mm_set1_epi32(0x05040504); /* Control Mask register */ 793 control_mask_4_8x16b = _mm_set1_epi32(0x07060706); /* Control Mask register */ 794 795 coeff0_1_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_1_8x16b); /* pi1_coeff[4] 
*/ 796 coeff2_3_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_2_8x16b); /* pi1_coeff[4] */ 797 798 coeff4_5_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_3_8x16b); /* pi1_coeff[4] */ 799 coeff6_7_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_4_8x16b); /* pi1_coeff[4] */ 800 801 /* seting values in register */ 802 zero_8x16b = _mm_setzero_si128(); /* for saturated clipping */ 803 offset_8x16b = _mm_set1_epi16(OFFSET_14_MINUS_BIT_DEPTH); /* for offset addition */ 804 mask_low_32b = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000); 805 mask_high_96b = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF); 806 807 /* outer for loop starts from here */ 808 if(wd % 8 == 0) 809 { /* wd = multiple of 8 case */ 810 811 pu1_src_copy = pu1_src; 812 pu1_dst_copy = pu1_dst; 813 814 for(col = 0; col < wd; col += 8) 815 { 816 817 pu1_src = pu1_src_copy + col; 818 pu1_dst = pu1_dst_copy + col; 819 820 PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0) 821 PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0) 822 PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0) 823 PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0) 824 825 /*load 8 pixel values.*/ 826 s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-3 * src_strd))); 827 828 /*load 8 pixel values*/ 829 s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-2 * src_strd))); 830 831 s3_0_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b); 832 833 s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b); 834 835 /*load 8 pixel values*/ 836 s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd))); 837 838 /*load 8 pixel values*/ 839 s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd))); 840 841 s3_1_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b); 842 843 s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b); 844 845 /*load 8 pixel values*/ 846 s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd))); 847 848 /*load 8 pixel values*/ 849 s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd))); 850 851 s3_2_16x8b = _mm_unpacklo_epi8(s2_4_16x8b, s2_5_16x8b); 852 853 s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b); 854 855 /*load 8 pixel values*/ 856 s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd))); 857 858 /*load 8 pixel values*/ 859 s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (4 * src_strd))); 860 861 s3_3_16x8b = _mm_unpacklo_epi8(s2_6_16x8b, s2_7_16x8b); 862 863 s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b); 864 865 s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b); 866 s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b); 867 s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b); 868 869 s7_8x16b = _mm_add_epi16(s6_8x16b, offset_8x16b); 870 871 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 872 s8_8x16b = _mm_srai_epi16(s7_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 873 874 /* i2_tmp = CLIP_U8(i2_tmp);*/ 875 s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b); 876 877 /* store 8 8-bit output values */ 878 /* Store the output pixels of row 0*/ 879 _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b); 880 881 /* ROW 2*/ 882 s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b); 883 s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b); 884 s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b); 885 886 /*load 8 pixel values*/ 887 s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (5 * src_strd))); 888 889 /*load 8 pixel values*/ 890 s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (6 * 
src_strd))); 891 892 s3_4_16x8b = _mm_unpacklo_epi8(s2_8_16x8b, s2_9_16x8b); 893 894 s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b); 895 896 s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b); 897 s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b); 898 s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b); 899 900 s27_8x16b = _mm_add_epi16(s26_8x16b, offset_8x16b); 901 902 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 903 s28_8x16b = _mm_srai_epi16(s27_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 904 905 /* i2_tmp = CLIP_U8(i2_tmp);*/ 906 s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b); 907 908 /* store 8 8-bit output values */ 909 /* Store the output pixels of row 2*/ 910 _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s29_8x16b); 911 912 913 /*ROW 1*/ 914 s4_0_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b); 915 916 s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b); 917 918 s4_1_16x8b = _mm_unpacklo_epi8(s2_3_16x8b, s2_4_16x8b); 919 920 s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b); 921 922 s4_2_16x8b = _mm_unpacklo_epi8(s2_5_16x8b, s2_6_16x8b); 923 924 s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b); 925 926 s4_3_16x8b = _mm_unpacklo_epi8(s2_7_16x8b, s2_8_16x8b); 927 928 s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b); 929 930 s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b); 931 s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b); 932 s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b); 933 934 s17_8x16b = _mm_add_epi16(s16_8x16b, offset_8x16b); 935 936 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 937 s18_8x16b = _mm_srai_epi16(s17_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 938 939 /* i2_tmp = CLIP_U8(i2_tmp);*/ 940 s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b); 941 942 /* store 8 8-bit output values */ 943 /* Store the output pixels of row 1*/ 944 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd)), s19_8x16b); 945 946 947 /* ROW 3*/ 948 s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b); 949 s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b); 950 s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b); 951 952 /*load 8 pixel values*/ 953 s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (7 * src_strd))); 954 955 s4_4_16x8b = _mm_unpacklo_epi8(s2_9_16x8b, s2_10_16x8b); 956 957 s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b); 958 959 s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b); 960 s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b); 961 s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b); 962 963 s37_8x16b = _mm_add_epi16(s36_8x16b, offset_8x16b); 964 965 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 966 s38_8x16b = _mm_srai_epi16(s37_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 967 968 /* i2_tmp = CLIP_U8(i2_tmp);*/ 969 s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b); 970 971 /* store 8 8-bit output values */ 972 /* Store the output pixels of row 2*/ 973 _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s39_8x16b); 974 975 pu1_src += (8 * src_strd); 976 pu1_dst += (4 * dst_strd); 977 978 for(row = 4; row < ht; row += 4) 979 { 980 PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0) 981 PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0) 982 PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0) 983 PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0) 984 985 986 s3_0_16x8b = s3_2_16x8b; 987 s3_1_16x8b = s3_3_16x8b; 988 s3_2_16x8b = s3_4_16x8b; 989 990 s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b); 991 
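/* Descriptive note (behaviour unchanged): in this steady-state loop the
 * unpacked row-pairs s3_0/s3_1/s3_2 (and s4_0/s4_1/s4_2 for the odd rows)
 * are slid down from the previous iteration, so only four fresh rows are
 * loaded per four output rows instead of the eleven loads of the prologue.
 * Per output pixel this computes the usual 8-tap vertical filter, roughly
 * (sketch, using the reference-code names):
 *     i2_tmp = c0*r[-3] + c1*r[-2] + c2*r[-1] + c3*r[0]
 *            + c4*r[+1] + c5*r[+2] + c6*r[+3] + c7*r[+4];
 *     *dst   = CLIP_U8((i2_tmp + OFFSET_14_MINUS_BIT_DEPTH)
 *                       >> SHIFT_14_MINUS_BIT_DEPTH);
 * where r[] walks down one column of pu1_src and c0..c7 = pi1_coeff[0..7].
 */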
s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b); 992 s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b); 993 994 /*load 8 pixel values from (cur_row + 4)th row*/ 995 s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); 996 997 s3_3_16x8b = _mm_unpacklo_epi8(s2_10_16x8b, s2_0_16x8b); 998 s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b); 999 1000 s4_0_16x8b = s4_2_16x8b; 1001 s4_1_16x8b = s4_3_16x8b; 1002 s4_2_16x8b = s4_4_16x8b; 1003 1004 s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b); 1005 s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b); 1006 s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b); 1007 1008 s7_8x16b = _mm_add_epi16(s6_8x16b, offset_8x16b); 1009 1010 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 1011 s8_8x16b = _mm_srai_epi16(s7_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 1012 1013 /* i2_tmp = CLIP_U8(i2_tmp);*/ 1014 s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b); 1015 1016 /* store 8 8-bit output values */ 1017 /* Store the output pixels of row 4*/ 1018 _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b); 1019 1020 /* row + 2*/ 1021 s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b); 1022 s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b); 1023 s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b); 1024 1025 /*load 8 pixel values from (cur_row + 5)th row*/ 1026 s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); 1027 1028 /*load 8 pixel values from (cur_row + 6)th row*/ 1029 s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd))); 1030 1031 /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/ 1032 s3_4_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b); 1033 1034 s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b); 1035 1036 s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b); 1037 s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b); 1038 s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b); 1039 1040 s27_8x16b = _mm_add_epi16(s26_8x16b, offset_8x16b); 1041 1042 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 1043 s28_8x16b = _mm_srai_epi16(s27_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 1044 1045 /* i2_tmp = CLIP_U8(i2_tmp);*/ 1046 s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b); 1047 1048 /* store 8 8-bit output values */ 1049 /* Store the output pixels of (cur_row+2)*/ 1050 _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s29_8x16b); 1051 1052 1053 /*row + 1*/ 1054 s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b); 1055 s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b); 1056 s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b); 1057 1058 /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/ 1059 s4_3_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b); 1060 s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b); 1061 1062 s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b); 1063 s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b); 1064 s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b); 1065 1066 s17_8x16b = _mm_add_epi16(s16_8x16b, offset_8x16b); 1067 1068 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 1069 s18_8x16b = _mm_srai_epi16(s17_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 1070 1071 /* i2_tmp = CLIP_U8(i2_tmp);*/ 1072 s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b); 1073 1074 /* store 8 8-bit output values */ 1075 /* Store the output pixels of (cur_row + 1)*/ 1076 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s19_8x16b); 1077 1078 1079 /* row + 3*/ 1080 s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b); 1081 s31_8x16b = 
_mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b); 1082 s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b); 1083 1084 /*load 8 pixel values from (cur_row + 7)th row*/ 1085 s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd))); 1086 1087 /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/ 1088 s4_4_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b); 1089 1090 s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b); 1091 1092 s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b); 1093 s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b); 1094 s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b); 1095 1096 s37_8x16b = _mm_add_epi16(s36_8x16b, offset_8x16b); 1097 1098 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 1099 s38_8x16b = _mm_srai_epi16(s37_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 1100 1101 /* i2_tmp = CLIP_U8(i2_tmp);*/ 1102 s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b); 1103 1104 /* store 8 8-bit output values */ 1105 /* Store the output pixels of (cur_row+3)*/ 1106 _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s39_8x16b); 1107 1108 s2_10_16x8b = s2_3_16x8b; 1109 1110 pu1_src += 4 * src_strd; /* pointer update */ 1111 pu1_dst += 4 * dst_strd; /* pointer update */ 1112 } 1113 } 1114 } 1115 else /* wd = multiple of 8 case */ 1116 { 1117 1118 pu1_src_copy = pu1_src; 1119 pu1_dst_copy = pu1_dst; 1120 1121 for(col = 0; col < wd; col += 4) 1122 { 1123 1124 pu1_src = pu1_src_copy + col; 1125 pu1_dst = pu1_dst_copy + col; 1126 1127 PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0) 1128 PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0) 1129 PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0) 1130 PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0) 1131 1132 1133 /*load 8 pixel values */ 1134 s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-3 * src_strd))); 1135 1136 /*load 8 pixel values */ 1137 s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-2 * src_strd))); 1138 1139 s3_0_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b); 1140 1141 s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b); 1142 1143 /*load 8 pixel values */ 1144 s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd))); 1145 1146 /*load 8 pixel values */ 1147 s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd))); 1148 1149 s3_1_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b); 1150 1151 s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b); 1152 1153 /*load 8 pixel values */ 1154 s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd))); 1155 1156 /*load 8 pixel values */ 1157 s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd))); 1158 1159 s3_2_16x8b = _mm_unpacklo_epi8(s2_4_16x8b, s2_5_16x8b); 1160 1161 s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b); 1162 1163 /*load 8 pixel values */ 1164 s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd))); 1165 1166 /*load 8 pixel values */ 1167 s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (4 * src_strd))); 1168 1169 s3_3_16x8b = _mm_unpacklo_epi8(s2_6_16x8b, s2_7_16x8b); 1170 1171 s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b); 1172 1173 s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b); 1174 s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b); 1175 s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b); 1176 1177 s7_8x16b = _mm_add_epi16(s6_8x16b, offset_8x16b); 1178 1179 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 1180 s8_8x16b = _mm_srai_epi16(s7_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 1181 
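/* Descriptive note (behaviour unchanged): this column pass produces only 4
 * valid output pixels, but _mm_storel_epi64() always writes 8 bytes.  The
 * store just below is therefore done read-modify-write:
 *     old = 8 bytes already at pu1_dst
 *     new = packus(filter result)          -> valid pixels in bytes 0..3
 *     out = (new & mask_high_96b) | (old & mask_low_32b)
 * i.e. the 4 new pixels land in the low dword and the remaining destination
 * bytes are written back unchanged.
 */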
1182 /* i2_tmp = CLIP_U8(i2_tmp);*/ 1183 s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b); 1184 s5_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst)); 1185 s6_8x16b = _mm_and_si128(s5_8x16b, mask_low_32b); 1186 s7_8x16b = _mm_and_si128(s9_8x16b, mask_high_96b); 1187 s8_8x16b = _mm_or_si128(s6_8x16b, s7_8x16b); 1188 /* store 8 8-bit output values */ 1189 /* Store the output pixels of row 0*/ 1190 _mm_storel_epi64((__m128i *)(pu1_dst), s8_8x16b); 1191 1192 /* ROW 2*/ 1193 s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b); 1194 s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b); 1195 s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b); 1196 1197 /*load 8 pixel values */ 1198 s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (5 * src_strd))); 1199 1200 /*load 8 pixel values */ 1201 s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (6 * src_strd))); 1202 1203 s3_4_16x8b = _mm_unpacklo_epi8(s2_8_16x8b, s2_9_16x8b); 1204 1205 s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b); 1206 1207 s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b); 1208 s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b); 1209 s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b); 1210 1211 s27_8x16b = _mm_add_epi16(s26_8x16b, offset_8x16b); 1212 1213 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 1214 s28_8x16b = _mm_srai_epi16(s27_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 1215 1216 /* i2_tmp = CLIP_U8(i2_tmp);*/ 1217 s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b); 1218 s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (2 * dst_strd))); 1219 s26_8x16b = _mm_and_si128(s25_8x16b, mask_low_32b); 1220 s27_8x16b = _mm_and_si128(s29_8x16b, mask_high_96b); 1221 s28_8x16b = _mm_or_si128(s26_8x16b, s27_8x16b); 1222 /* store 8 8-bit output values */ 1223 /* Store the output pixels of row 2*/ 1224 _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s28_8x16b); 1225 1226 1227 /*ROW 1*/ 1228 s4_0_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b); 1229 1230 s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b); 1231 1232 s4_1_16x8b = _mm_unpacklo_epi8(s2_3_16x8b, s2_4_16x8b); 1233 1234 s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b); 1235 1236 s4_2_16x8b = _mm_unpacklo_epi8(s2_5_16x8b, s2_6_16x8b); 1237 1238 s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b); 1239 1240 s4_3_16x8b = _mm_unpacklo_epi8(s2_7_16x8b, s2_8_16x8b); 1241 1242 s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b); 1243 1244 s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b); 1245 s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b); 1246 s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b); 1247 1248 s17_8x16b = _mm_add_epi16(s16_8x16b, offset_8x16b); 1249 1250 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 1251 s18_8x16b = _mm_srai_epi16(s17_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 1252 1253 /* i2_tmp = CLIP_U8(i2_tmp);*/ 1254 s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b); 1255 s15_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd)); 1256 s16_8x16b = _mm_and_si128(s15_8x16b, mask_low_32b); 1257 s17_8x16b = _mm_and_si128(s19_8x16b, mask_high_96b); 1258 s18_8x16b = _mm_or_si128(s16_8x16b, s17_8x16b); 1259 /* store 8 8-bit output values */ 1260 /* Store the output pixels of row 1*/ 1261 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd)), s18_8x16b); 1262 1263 1264 /* ROW 3*/ 1265 s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b); 1266 s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b); 1267 s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b); 1268 1269 /*load 8 pixel 
values */ 1270 s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (7 * src_strd))); 1271 1272 s4_4_16x8b = _mm_unpacklo_epi8(s2_9_16x8b, s2_10_16x8b); 1273 1274 s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b); 1275 1276 s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b); 1277 s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b); 1278 s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b); 1279 1280 s37_8x16b = _mm_add_epi16(s36_8x16b, offset_8x16b); 1281 1282 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 1283 s38_8x16b = _mm_srai_epi16(s37_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 1284 1285 /* i2_tmp = CLIP_U8(i2_tmp);*/ 1286 s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b); 1287 1288 s35_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (3 * dst_strd))); 1289 s36_8x16b = _mm_and_si128(s35_8x16b, mask_low_32b); 1290 s37_8x16b = _mm_and_si128(s39_8x16b, mask_high_96b); 1291 s38_8x16b = _mm_or_si128(s36_8x16b, s37_8x16b); 1292 1293 /* store 8 8-bit output values */ 1294 /* Store the output pixels of row 2*/ 1295 _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s38_8x16b); 1296 1297 pu1_src += (8 * src_strd); 1298 pu1_dst += (4 * dst_strd); 1299 1300 for(row = 4; row < ht; row += 4) 1301 { 1302 1303 PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0) 1304 PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0) 1305 PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0) 1306 PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0) 1307 1308 1309 s3_0_16x8b = s3_2_16x8b; 1310 s3_1_16x8b = s3_3_16x8b; 1311 s3_2_16x8b = s3_4_16x8b; 1312 1313 s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b); 1314 s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b); 1315 s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b); 1316 1317 /*load 16 pixel values from (cur_row + 4)th row*/ 1318 s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); 1319 1320 s3_3_16x8b = _mm_unpacklo_epi8(s2_10_16x8b, s2_0_16x8b); 1321 s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b); 1322 1323 s4_0_16x8b = s4_2_16x8b; 1324 s4_1_16x8b = s4_3_16x8b; 1325 s4_2_16x8b = s4_4_16x8b; 1326 1327 s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b); 1328 s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b); 1329 s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b); 1330 1331 s7_8x16b = _mm_add_epi16(s6_8x16b, offset_8x16b); 1332 1333 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 1334 s8_8x16b = _mm_srai_epi16(s7_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 1335 1336 /* i2_tmp = CLIP_U8(i2_tmp);*/ 1337 s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b); 1338 1339 s5_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst)); 1340 s6_8x16b = _mm_and_si128(s5_8x16b, mask_low_32b); 1341 s7_8x16b = _mm_and_si128(s9_8x16b, mask_high_96b); 1342 s8_8x16b = _mm_or_si128(s6_8x16b, s7_8x16b); 1343 1344 /* store 8 8-bit output values */ 1345 /* Store the output pixels of row 4*/ 1346 _mm_storel_epi64((__m128i *)(pu1_dst), s8_8x16b); 1347 1348 /* row + 2*/ 1349 s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b); 1350 s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b); 1351 s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b); 1352 1353 /*load 16 pixel values from (cur_row + 5)th row*/ 1354 s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); 1355 1356 /*load 16 pixel values from (cur_row + 6)th row*/ 1357 s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd))); 1358 1359 /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/ 1360 s3_4_16x8b = 
_mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b); 1361 1362 s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b); 1363 1364 s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b); 1365 s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b); 1366 s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b); 1367 1368 s27_8x16b = _mm_add_epi16(s26_8x16b, offset_8x16b); 1369 1370 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 1371 s28_8x16b = _mm_srai_epi16(s27_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 1372 1373 /* i2_tmp = CLIP_U8(i2_tmp);*/ 1374 s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b); 1375 1376 s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (2 * dst_strd))); 1377 s26_8x16b = _mm_and_si128(s25_8x16b, mask_low_32b); 1378 s27_8x16b = _mm_and_si128(s29_8x16b, mask_high_96b); 1379 s28_8x16b = _mm_or_si128(s26_8x16b, s27_8x16b); 1380 1381 /* store 8 8-bit output values */ 1382 /* Store the output pixels of (cur_row+2)*/ 1383 _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s28_8x16b); 1384 1385 1386 /*row + 1*/ 1387 s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b); 1388 s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b); 1389 s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b); 1390 1391 /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/ 1392 s4_3_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b); 1393 s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b); 1394 1395 s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b); 1396 s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b); 1397 s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b); 1398 1399 s17_8x16b = _mm_add_epi16(s16_8x16b, offset_8x16b); 1400 1401 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 1402 s18_8x16b = _mm_srai_epi16(s17_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 1403 1404 /* i2_tmp = CLIP_U8(i2_tmp);*/ 1405 s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b); 1406 1407 s15_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd)); 1408 s16_8x16b = _mm_and_si128(s15_8x16b, mask_low_32b); 1409 s17_8x16b = _mm_and_si128(s19_8x16b, mask_high_96b); 1410 s18_8x16b = _mm_or_si128(s16_8x16b, s17_8x16b); 1411 1412 /* store 8 8-bit output values */ 1413 /* Store the output pixels of (cur_row + 1)*/ 1414 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s18_8x16b); 1415 1416 1417 /* row + 3*/ 1418 s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b); 1419 s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b); 1420 s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b); 1421 1422 /*load 16 pixel values from (cur_row + 7)th row*/ 1423 s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd))); 1424 1425 /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/ 1426 s4_4_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b); 1427 1428 s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b); 1429 1430 s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b); 1431 s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b); 1432 s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b); 1433 1434 s37_8x16b = _mm_add_epi16(s36_8x16b, offset_8x16b); 1435 1436 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 1437 s38_8x16b = _mm_srai_epi16(s37_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 1438 1439 /* i2_tmp = CLIP_U8(i2_tmp);*/ 1440 s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b); 1441 1442 s35_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (3 * dst_strd))); 1443 s36_8x16b = _mm_and_si128(s35_8x16b, mask_low_32b); 1444 s37_8x16b = _mm_and_si128(s39_8x16b, mask_high_96b); 1445 s38_8x16b = 
_mm_or_si128(s36_8x16b, s37_8x16b); 1446 1447 /* store 8 8-bit output values */ 1448 /* Store the output pixels of (cur_row+3)*/ 1449 _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s38_8x16b); 1450 1451 s2_10_16x8b = s2_3_16x8b; 1452 1453 pu1_src += 4 * src_strd; /* pointer update */ 1454 pu1_dst += 4 * dst_strd; /* pointer update */ 1455 } 1456 } 1457 } 1458 } 1459 1460 1461 /** 1462 ******************************************************************************* 1463 * 1464 * @brief 1465 * Interprediction luma filter for copy 16bit output 1466 * 1467 * @par Description: 1468 * Copies the array of width 'wd' and height 'ht' from the location pointed 1469 * by 'src' to the location pointed by 'dst' The output is upshifted by 6 1470 * bits and is used as input for vertical filtering or weighted prediction 1471 * 1472 * @param[in] pu1_src 1473 * UWORD8 pointer to the source 1474 * 1475 * @param[out] pi2_dst 1476 * WORD16 pointer to the destination 1477 * 1478 * @param[in] src_strd 1479 * integer source stride 1480 * 1481 * @param[in] dst_strd 1482 * integer destination stride 1483 * 1484 * @param[in] pi1_coeff 1485 * WORD8 pointer to the filter coefficients 1486 * 1487 * @param[in] ht 1488 * integer height of the array 1489 * 1490 * @param[in] wd 1491 * integer width of the array 1492 * 1493 * @returns 1494 * 1495 * @remarks 1496 * None 1497 * 1498 ******************************************************************************* 1499 */ 1500 1501 void ihevc_inter_pred_luma_copy_w16out_ssse3(UWORD8 *pu1_src, 1502 WORD16 *pi2_dst, 1503 WORD32 src_strd, 1504 WORD32 dst_strd, 1505 WORD8 *pi1_coeff, 1506 WORD32 ht, 1507 WORD32 wd) 1508 { 1509 WORD32 row, col; 1510 __m128i s3, zero_8x16b; 1511 1512 ASSERT(wd % 2 == 0); /* checking assumption*/ 1513 ASSERT(ht % 2 == 0); /* checking assumption*/ 1514 UNUSED(pi1_coeff); 1515 zero_8x16b = _mm_setzero_si128(); 1516 /* outer for loop starts from here */ 1517 if(wd % 8 == 0) /* wd = multiple of 8 case */ 1518 { 1519 for(row = 0; row < ht; row += 2) 1520 { 1521 int offset = 0; 1522 for(col = 0; col < wd; col += 8) 1523 { 1524 /* row =0 */ 1525 /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/ 1526 s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col] */ 1527 s3 = _mm_unpacklo_epi8(s3, zero_8x16b); 1528 1529 s3 = _mm_slli_epi16(s3, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */ 1530 1531 /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */ 1532 _mm_store_si128((__m128i *)(pi2_dst + offset), s3); 1533 1534 /* row =1 */ 1535 /*load 16 pixel values from 271:256 pos. relative to cur. pos.*/ 1536 s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset)); /* pu1_src[col] */ 1537 s3 = _mm_unpacklo_epi8(s3, zero_8x16b); 1538 1539 s3 = _mm_slli_epi16(s3, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */ 1540 1541 /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */ 1542 _mm_store_si128((__m128i *)(pi2_dst + dst_strd + offset), s3); 1543 1544 offset += 8; /* To pointer update */ 1545 } /* inner for loop ends here(8-output values in single iteration) */ 1546 1547 pu1_src += 2 * src_strd; /* pointer update */ 1548 pi2_dst += 2 * dst_strd; /* pointer update */ 1549 } 1550 } 1551 else /* wd = multiple of 4 case */ 1552 { 1553 for(row = 0; row < ht; row += 2) 1554 { 1555 int offset = 0; 1556 for(col = 0; col < wd; col += 4) 1557 { 1558 /* row =0 */ 1559 /*load 16 pixel values from 15:0 pos. relative to cur. 
pos.*/ 1560 s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col] */ 1561 s3 = _mm_unpacklo_epi8(s3, zero_8x16b); 1562 1563 s3 = _mm_slli_epi16(s3, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */ 1564 1565 /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */ 1566 _mm_storel_epi64((__m128i *)(pi2_dst + offset), s3); 1567 1568 /* row =1 */ 1569 /*load 16 pixel values from 271:256 pos. relative to cur. pos.*/ 1570 s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset)); /* pu1_src[col] */ 1571 s3 = _mm_unpacklo_epi8(s3, zero_8x16b); 1572 1573 s3 = _mm_slli_epi16(s3, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH */ 1574 1575 /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */ 1576 _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), s3); 1577 offset += 4; /* To pointer update */ 1578 } /* inner for loop ends here(4-output values in single iteration) */ 1579 1580 pu1_src += 2 * src_strd; /* pointer update */ 1581 pi2_dst += 2 * dst_strd; /* pointer update */ 1582 } 1583 } 1584 1585 } 1586 1587 /** 1588 ******************************************************************************* 1589 * 1590 * @brief 1591 * Interprediction luma filter for horizontal 16bit output 1592 * 1593 * @par Description: 1594 * Applies a horizontal filter with coefficients pointed to by 'pi1_coeff' 1595 * to the elements pointed by 'pu1_src' and writes to the location pointed 1596 * by 'pu1_dst' No downshifting or clipping is done and the output is used 1597 * as an input for vertical filtering or weighted prediction 1598 * 1599 * @param[in] pu1_src 1600 * UWORD8 pointer to the source 1601 * 1602 * @param[out] pi2_dst 1603 * WORD16 pointer to the destination 1604 * 1605 * @param[in] src_strd 1606 * integer source stride 1607 * 1608 * @param[in] dst_strd 1609 * integer destination stride 1610 * 1611 * @param[in] pi1_coeff 1612 * WORD8 pointer to the filter coefficients 1613 * 1614 * @param[in] ht 1615 * integer height of the array 1616 * 1617 * @param[in] wd 1618 * integer width of the array 1619 * 1620 * @returns 1621 * 1622 * @remarks 1623 * None 1624 * 1625 ******************************************************************************* 1626 */ 1627 void ihevc_inter_pred_luma_horz_w16out_ssse3(UWORD8 *pu1_src, 1628 WORD16 *pi2_dst, 1629 WORD32 src_strd, 1630 WORD32 dst_strd, 1631 WORD8 *pi1_coeff, 1632 WORD32 ht, 1633 WORD32 wd) 1634 { 1635 WORD32 row, col; 1636 1637 /* all 128 bit registers are named with a suffix mxnb, where m is the */ 1638 /* number of n bits packed in the register */ 1639 1640 __m128i src_temp1_16x8b, src_temp2_16x8b, src_temp3_16x8b, src_temp4_16x8b, src_temp5_16x8b, src_temp6_16x8b; 1641 __m128i src_temp11_16x8b, src_temp12_16x8b, src_temp13_16x8b, src_temp14_16x8b, src_temp15_16x8b, src_temp16_16x8b; 1642 __m128i res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b, res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b; 1643 __m128i res_temp11_8x16b, res_temp12_8x16b, res_temp13_8x16b, res_temp14_8x16b, res_temp15_8x16b, res_temp16_8x16b; 1644 __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b; 1645 __m128i control_mask_1_8x16b, control_mask_2_8x16b, control_mask_3_8x16b, control_mask_4_8x16b; 1646 1647 ASSERT(wd % 4 == 0); /* checking assumption*/ 1648 1649 PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0) 1650 PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0) 1651 PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0) 1652 PREFETCH((char 
const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0) 1653 PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0) 1654 PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0) 1655 1656 /* load 8 8-bit coefficients and convert 8-bit into 16-bit */ 1657 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)pi1_coeff); 1658 1659 1660 control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */ 1661 control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */ 1662 control_mask_3_8x16b = _mm_set1_epi32(0x05040504); /* Control Mask register */ 1663 control_mask_4_8x16b = _mm_set1_epi32(0x07060706); /* Control Mask register */ 1664 1665 coeff0_1_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_1_8x16b); /* pi1_coeff[4] */ 1666 coeff2_3_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_2_8x16b); /* pi1_coeff[4] */ 1667 1668 coeff4_5_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_3_8x16b); /* pi1_coeff[4] */ 1669 coeff6_7_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_4_8x16b); /* pi1_coeff[4] */ 1670 1671 if(0 == (ht & 1)) /* ht multiple of 2 case */ 1672 { 1673 1674 if(0 == (wd & 7)) /* wd = multiple of 8 case */ 1675 { 1676 for(row = 0; row < ht; row += 2) 1677 { 1678 1679 int offset = 0; 1680 1681 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0) 1682 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0) 1683 1684 1685 1686 for(col = 0; col < wd; col += 8) 1687 { 1688 /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/ 1689 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset)); /* row = 0 */ 1690 src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */ 1691 1692 src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1); /* row = 0 */ 1693 /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 0 */ 1694 src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 1695 res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b); /* row = 0 */ 1696 /* row = 0 */ 1697 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 1698 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 1699 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 1700 src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 1701 res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b); /* row = 0 */ 1702 1703 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 1704 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 1705 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 1706 src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 1707 res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b); /* row = 0 */ 1708 1709 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 1710 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 1711 /* pix. 
|7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 1712 src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 1713 res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b); /* row = 0 */ 1714 1715 res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b); 1716 res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b); 1717 res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b); 1718 1719 src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1); /* row = 1 */ 1720 /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 1 */ 1721 src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */ 1722 res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b); /* row = 1 */ 1723 /* row = 1 */ 1724 src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */ 1725 src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */ 1726 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 1 */ 1727 src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */ 1728 res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b); /* row = 1 */ 1729 1730 src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */ 1731 src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */ 1732 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 1 */ 1733 src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */ 1734 res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b); /* row = 1 */ 1735 1736 src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */ 1737 src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */ 1738 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 1 */ 1739 src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */ 1740 res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b); /* row = 1 */ 1741 1742 res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b); 1743 res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b); 1744 res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b); 1745 1746 /* to store the 1st 4 pixels res. */ 1747 _mm_store_si128((__m128i *)(pi2_dst + offset), res_temp5_8x16b); 1748 _mm_store_si128((__m128i *)(pi2_dst + dst_strd + offset), res_temp15_8x16b); 1749 1750 offset += 8; /* To pointer updates*/ 1751 } 1752 pu1_src += 2 * src_strd; /* pointer updates*/ 1753 pi2_dst += 2 * dst_strd; /* pointer updates*/ 1754 } 1755 } 1756 else /* wd = multiple of 4 case */ 1757 { 1758 for(row = 0; row < ht; row += 2) 1759 { 1760 int offset = 0; 1761 1762 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0) 1763 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0) 1764 1765 1766 for(col = 0; col < wd; col += 4) 1767 { 1768 /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/ 1769 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset)); /* row = 0 */ 1770 src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */ 1771 1772 src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1); /* row = 0 */ 1773 /* pix. 
|5:-2|4:-3| to do two dot-products at same time*/ /* row = 0 */ 1774 src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 1775 res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b); /* row = 0 */ 1776 /* row = 0 */ 1777 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 1778 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 1779 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 1780 src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 1781 res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b); /* row = 0 */ 1782 1783 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 1784 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 1785 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 1786 src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 1787 res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b); /* row = 0 */ 1788 1789 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 1790 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 1791 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 1792 src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 1793 res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b); /* row = 0 */ 1794 1795 res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b); 1796 res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b); 1797 res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b); 1798 1799 src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1); /* row = 1 */ 1800 /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 1 */ 1801 src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */ 1802 res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b); /* row = 1 */ 1803 /* row = 1 */ 1804 src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */ 1805 src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */ 1806 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 1 */ 1807 src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */ 1808 res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b); /* row = 1 */ 1809 1810 src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */ 1811 src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */ 1812 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 1 */ 1813 src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */ 1814 res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b); /* row = 1 */ 1815 1816 src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */ 1817 src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */ 1818 /* pix. 
|7:0|6:-1| to do two dot-products at same time*/ /* row = 1 */ 1819 src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */ 1820 res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b); /* row = 1 */ 1821 1822 res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b); 1823 res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b); 1824 res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b); 1825 1826 /* to store the 1st 4 pixels res. */ 1827 _mm_storel_epi64((__m128i *)(pi2_dst + offset), res_temp5_8x16b); 1828 _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), res_temp15_8x16b); 1829 1830 offset += 4; /* To pointer updates*/ 1831 } 1832 pu1_src += 2 * src_strd; /* Pointer update */ 1833 pi2_dst += 2 * dst_strd; /* Pointer update */ 1834 } 1835 } 1836 } 1837 else /* odd ht */ 1838 { 1839 if(0 == (wd & 7)) /* multiple of 8 case */ 1840 { 1841 for(row = 0; row < ht; row++) 1842 { 1843 int offset = 0; 1844 1845 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0) 1846 1847 1848 for(col = 0; col < wd; col += 8) 1849 { 1850 /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/ 1851 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset)); /* row = 0 */ 1852 1853 src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1); /* row = 0 */ 1854 /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 0 */ 1855 src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 1856 res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b); /* row = 0 */ 1857 /* row = 0 */ 1858 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 1859 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 1860 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 1861 src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 1862 res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b); /* row = 0 */ 1863 1864 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 1865 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 1866 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 1867 src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 1868 res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b); /* row = 0 */ 1869 1870 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 1871 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 1872 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 1873 src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 1874 res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b); /* row = 0 */ 1875 1876 res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b); 1877 res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b); 1878 res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b); 1879 1880 /* to store the 1st 4 pixels res. 
*/ 1881 _mm_store_si128((__m128i *)(pi2_dst + offset), res_temp5_8x16b); 1882 1883 offset += 8; /* To pointer updates*/ 1884 } 1885 pu1_src += src_strd; /* pointer updates*/ 1886 pi2_dst += dst_strd; /* pointer updates*/ 1887 } 1888 } 1889 else /* wd = multiple of 4 case */ 1890 { 1891 for(row = 0; row < (ht - 1); row += 2) 1892 { 1893 int offset = 0; 1894 1895 1896 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0) 1897 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0) 1898 1899 1900 for(col = 0; col < wd; col += 4) 1901 { 1902 /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/ 1903 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset)); /* row = 0 */ 1904 src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd - 3 + offset)); /* row = 1 */ 1905 1906 src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1); /* row = 0 */ 1907 /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 0 */ 1908 src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 1909 res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b); /* row = 0 */ 1910 /* row = 0 */ 1911 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 1912 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 1913 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 1914 src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 1915 res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b); /* row = 0 */ 1916 1917 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 1918 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 1919 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 1920 src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 1921 res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b); /* row = 0 */ 1922 1923 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 1924 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 1925 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 1926 src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 1927 res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b); /* row = 0 */ 1928 1929 res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b); 1930 res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b); 1931 res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b); 1932 1933 src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 1); /* row = 1 */ 1934 /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 1 */ 1935 src_temp13_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */ 1936 res_temp11_8x16b = _mm_maddubs_epi16(src_temp13_16x8b, coeff0_1_8x16b); /* row = 1 */ 1937 /* row = 1 */ 1938 src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */ 1939 src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */ 1940 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 1 */ 1941 src_temp14_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */ 1942 res_temp12_8x16b = _mm_maddubs_epi16(src_temp14_16x8b, coeff2_3_8x16b); /* row = 1 */ 1943 1944 src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */ 1945 src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */ 1946 /* pix. 
|7:0|6:-1| to do two dot-products at same time*/ /* row = 1 */ 1947 src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */ 1948 res_temp13_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff4_5_8x16b); /* row = 1 */ 1949 1950 src_temp11_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); /* row = 1 */ 1951 src_temp12_16x8b = _mm_srli_si128(src_temp12_16x8b, 2); /* row = 1 */ 1952 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 1 */ 1953 src_temp16_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); /* row = 1 */ 1954 res_temp14_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff6_7_8x16b); /* row = 1 */ 1955 1956 res_temp15_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b); 1957 res_temp16_8x16b = _mm_add_epi16(res_temp13_8x16b, res_temp14_8x16b); 1958 res_temp15_8x16b = _mm_add_epi16(res_temp15_8x16b, res_temp16_8x16b); 1959 1960 /* to store the 1st 4 pixels res. */ 1961 _mm_storel_epi64((__m128i *)(pi2_dst + offset), res_temp5_8x16b); 1962 _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), res_temp15_8x16b); 1963 1964 offset += 4; /* To pointer updates*/ 1965 } 1966 pu1_src += 2 * src_strd; /* Pointer update */ 1967 pi2_dst += 2 * dst_strd; /* Pointer update */ 1968 } 1969 { /* last repeat at outside the loop */ 1970 int offset = 0; 1971 for(col = 0; col < wd; col += 4) 1972 { 1973 /*load 16 pixel values from 12:-3 pos. relative to cur. pos.*/ 1974 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 3 + offset)); /* row = 0 */ 1975 1976 src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 1); /* row = 0 */ 1977 /* pix. |5:-2|4:-3| to do two dot-products at same time*/ /* row = 0 */ 1978 src_temp3_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 1979 res_temp1_8x16b = _mm_maddubs_epi16(src_temp3_16x8b, coeff0_1_8x16b); /* row = 0 */ 1980 /* row = 0 */ 1981 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 1982 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 1983 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 1984 src_temp4_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 1985 res_temp2_8x16b = _mm_maddubs_epi16(src_temp4_16x8b, coeff2_3_8x16b); /* row = 0 */ 1986 1987 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 1988 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 1989 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 1990 src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 1991 res_temp3_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff4_5_8x16b); /* row = 0 */ 1992 1993 src_temp1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); /* row = 0 */ 1994 src_temp2_16x8b = _mm_srli_si128(src_temp2_16x8b, 2); /* row = 0 */ 1995 /* pix. |7:0|6:-1| to do two dot-products at same time*/ /* row = 0 */ 1996 src_temp6_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); /* row = 0 */ 1997 res_temp4_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff6_7_8x16b); /* row = 0 */ 1998 1999 res_temp5_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b); 2000 res_temp6_8x16b = _mm_add_epi16(res_temp3_8x16b, res_temp4_8x16b); 2001 res_temp5_8x16b = _mm_add_epi16(res_temp5_8x16b, res_temp6_8x16b); 2002 2003 /* to store the 1st 4 pixels res. 
*/ 2004 _mm_storel_epi64((__m128i *)(pi2_dst + offset), res_temp5_8x16b); 2005 2006 offset += 4; /* To pointer updates*/ 2007 } 2008 } 2009 } 2010 } 2011 } 2012 2013 /** 2014 ******************************************************************************* 2015 * 2016 * @brief 2017 * Interprediction luma filter for vertical 16bit output 2018 * 2019 * @par Description: 2020 * Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to 2021 * the elements pointed by 'pu1_src' and writes to the location pointed by 2022 * 'pu1_dst' No downshifting or clipping is done and the output is used as 2023 * an input for weighted prediction 2024 * 2025 * @param[in] pu1_src 2026 * UWORD8 pointer to the source 2027 * 2028 * @param[out] pi2_dst 2029 * WORD16 pointer to the destination 2030 * 2031 * @param[in] src_strd 2032 * integer source stride 2033 * 2034 * @param[in] dst_strd 2035 * integer destination stride 2036 * 2037 * @param[in] pi1_coeff 2038 * WORD8 pointer to the filter coefficients 2039 * 2040 * @param[in] ht 2041 * integer height of the array 2042 * 2043 * @param[in] wd 2044 * integer width of the array 2045 * 2046 * @returns 2047 * 2048 * @remarks 2049 * None 2050 * 2051 ******************************************************************************* 2052 */ 2053 void ihevc_inter_pred_luma_vert_w16out_ssse3(UWORD8 *pu1_src, 2054 WORD16 *pi2_dst, 2055 WORD32 src_strd, 2056 WORD32 dst_strd, 2057 WORD8 *pi1_coeff, 2058 WORD32 ht, 2059 WORD32 wd) 2060 { 2061 WORD32 row, col; 2062 UWORD8 *pu1_src_copy; 2063 WORD16 *pi2_dst_copy; 2064 __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b; 2065 __m128i s0_8x16b, s1_8x16b, s2_8x16b, s3_8x16b, s4_8x16b, s5_8x16b, s6_8x16b; 2066 __m128i s2_0_16x8b, s2_1_16x8b, s2_2_16x8b, s2_3_16x8b, s2_4_16x8b, s2_5_16x8b, s2_6_16x8b, s2_7_16x8b, s2_8_16x8b, s2_9_16x8b, s2_10_16x8b; 2067 __m128i s3_0_16x8b, s3_1_16x8b, s3_2_16x8b, s3_3_16x8b, s3_4_16x8b; 2068 __m128i s4_0_16x8b, s4_1_16x8b, s4_2_16x8b, s4_3_16x8b, s4_4_16x8b; 2069 __m128i s10_8x16b, s11_8x16b, s12_8x16b, s13_8x16b, s14_8x16b, s15_8x16b, s16_8x16b; 2070 __m128i s20_8x16b, s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b, s26_8x16b; 2071 __m128i s30_8x16b, s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b, s36_8x16b; 2072 2073 2074 __m128i control_mask_1_8x16b, control_mask_2_8x16b, control_mask_3_8x16b, control_mask_4_8x16b; 2075 2076 /* load 8 8-bit coefficients and convert 8-bit into 16-bit */ 2077 s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff); 2078 2079 control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */ 2080 control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */ 2081 control_mask_3_8x16b = _mm_set1_epi32(0x05040504); /* Control Mask register */ 2082 control_mask_4_8x16b = _mm_set1_epi32(0x07060706); /* Control Mask register */ 2083 2084 coeff0_1_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_1_8x16b); /* pi1_coeff[4] */ 2085 coeff2_3_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_2_8x16b); /* pi1_coeff[4] */ 2086 2087 coeff4_5_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_3_8x16b); /* pi1_coeff[4] */ 2088 coeff6_7_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_4_8x16b); /* pi1_coeff[4] */ 2089 2090 2091 /* outer for loop starts from here */ 2092 if((wd % 8) == 0) 2093 { /* wd = multiple of 8 case */ 2094 2095 pu1_src_copy = pu1_src; 2096 pi2_dst_copy = pi2_dst; 2097 2098 for(col = 0; col < wd; col += 8) 2099 { 2100 2101 pu1_src = pu1_src_copy + col; 2102 pi2_dst = pi2_dst_copy + col; 2103 2104 
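/* Illustrative scalar sketch (approximate, assuming the usual 8-tap HEVC
 * luma filter): per output sample the vectorised prologue and loop below
 * compute something equivalent to
 *
 *     WORD32 i, i4_tmp = 0;
 *     for(i = 0; i < 8; i++)
 *         i4_tmp += pi1_coeff[i] * pu1_src[(i - 3) * src_strd];
 *     pi2_dst[0] = (WORD16)i4_tmp;
 *
 * i.e. an 8-tap vertical dot product with no rounding, shift or clipping.
 * For 8-bit input and the HEVC luma coefficients the sum fits in a signed
 * 16-bit value, which is why _mm_maddubs_epi16/_mm_add_epi16 accumulation
 * suffices and the raw sums can be stored as WORD16 for the later weighted
 * prediction stage. */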
PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0) 2105 PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0) 2106 PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0) 2107 PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0) 2108 2109 /*load 8 pixel values */ 2110 s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-3 * src_strd))); 2111 2112 /*load 8 pixel values */ 2113 s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-2 * src_strd))); 2114 2115 s3_0_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b); 2116 2117 s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b); 2118 2119 /*load 8 pixel values */ 2120 s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd))); 2121 2122 /*load 8 pixel values */ 2123 s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd))); 2124 2125 s3_1_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b); 2126 2127 s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b); 2128 2129 /*load 8 pixel values */ 2130 s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd))); 2131 2132 /*load 8 pixel values */ 2133 s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd))); 2134 2135 s3_2_16x8b = _mm_unpacklo_epi8(s2_4_16x8b, s2_5_16x8b); 2136 2137 s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b); 2138 2139 /*load 8 pixel values */ 2140 s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd))); 2141 2142 /*load 8 pixel values */ 2143 s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (4 * src_strd))); 2144 2145 s3_3_16x8b = _mm_unpacklo_epi8(s2_6_16x8b, s2_7_16x8b); 2146 2147 s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b); 2148 2149 s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b); 2150 s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b); 2151 s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b); 2152 2153 /* store 8 8-bit output values */ 2154 /* Store the output pixels of row 0*/ 2155 _mm_store_si128((__m128i *)(pi2_dst), s6_8x16b); 2156 2157 /* ROW 2*/ 2158 s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b); 2159 s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b); 2160 s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b); 2161 2162 /*load 8 pixel values */ 2163 s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (5 * src_strd))); 2164 2165 /*load 8 pixel values */ 2166 s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (6 * src_strd))); 2167 2168 s3_4_16x8b = _mm_unpacklo_epi8(s2_8_16x8b, s2_9_16x8b); 2169 2170 s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b); 2171 2172 s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b); 2173 s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b); 2174 s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b); 2175 2176 /* store 8 8-bit output values */ 2177 /* Store the output pixels of row 2*/ 2178 _mm_store_si128((__m128i *)(pi2_dst + (2 * dst_strd)), s26_8x16b); 2179 2180 2181 /*ROW 1*/ 2182 s4_0_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b); 2183 2184 s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b); 2185 2186 s4_1_16x8b = _mm_unpacklo_epi8(s2_3_16x8b, s2_4_16x8b); 2187 2188 s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b); 2189 2190 s4_2_16x8b = _mm_unpacklo_epi8(s2_5_16x8b, s2_6_16x8b); 2191 2192 s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b); 2193 2194 s4_3_16x8b = _mm_unpacklo_epi8(s2_7_16x8b, s2_8_16x8b); 2195 2196 s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b); 2197 2198 s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b); 2199 s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b); 2200 
s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b); 2201 2202 2203 /* store 8 8-bit output values */ 2204 /* Store the output pixels of row 1*/ 2205 _mm_store_si128((__m128i *)(pi2_dst + (dst_strd)), s16_8x16b); 2206 2207 2208 /* ROW 3*/ 2209 s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b); 2210 s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b); 2211 s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b); 2212 2213 /*load 8 pixel values */ 2214 s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (7 * src_strd))); 2215 2216 s4_4_16x8b = _mm_unpacklo_epi8(s2_9_16x8b, s2_10_16x8b); 2217 2218 s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b); 2219 2220 s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b); 2221 s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b); 2222 s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b); 2223 2224 2225 /* store 8 8-bit output values */ 2226 /* Store the output pixels of row 2*/ 2227 _mm_store_si128((__m128i *)(pi2_dst + (3 * dst_strd)), s36_8x16b); 2228 2229 pu1_src += (8 * src_strd); 2230 pi2_dst += (4 * dst_strd); 2231 2232 for(row = 4; row < ht; row += 4) 2233 { 2234 2235 PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0) 2236 PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0) 2237 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0) 2238 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0) 2239 2240 s3_0_16x8b = s3_2_16x8b; 2241 s3_1_16x8b = s3_3_16x8b; 2242 s3_2_16x8b = s3_4_16x8b; 2243 2244 s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b); 2245 s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b); 2246 s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b); 2247 2248 /*load 8 pixel values from (cur_row + 4)th row*/ 2249 s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); 2250 2251 s3_3_16x8b = _mm_unpacklo_epi8(s2_10_16x8b, s2_0_16x8b); 2252 s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b); 2253 2254 s4_0_16x8b = s4_2_16x8b; 2255 s4_1_16x8b = s4_3_16x8b; 2256 s4_2_16x8b = s4_4_16x8b; 2257 2258 s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b); 2259 s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b); 2260 s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b); 2261 2262 /* store 8 8-bit output values */ 2263 /* Store the output pixels of row 4*/ 2264 _mm_store_si128((__m128i *)(pi2_dst), s6_8x16b); 2265 2266 /* row + 2*/ 2267 s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b); 2268 s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b); 2269 s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b); 2270 2271 /*load 8 pixel values from (cur_row + 5)th row*/ 2272 s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); 2273 2274 /*load 8 pixel values from (cur_row + 6)th row*/ 2275 s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd))); 2276 2277 /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/ 2278 s3_4_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b); 2279 2280 s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b); 2281 2282 s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b); 2283 s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b); 2284 s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b); 2285 2286 /* store 8 8-bit output values */ 2287 /* Store the output pixels of (cur_row+2)*/ 2288 _mm_store_si128((__m128i *)(pi2_dst + (2 * dst_strd)), s26_8x16b); 2289 2290 2291 /*row + 1*/ 2292 s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b); 2293 s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b); 2294 s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, 
coeff4_5_8x16b); 2295 2296 /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/ 2297 s4_3_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b); 2298 s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b); 2299 2300 s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b); 2301 s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b); 2302 s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b); 2303 2304 2305 /* store 8 8-bit output values */ 2306 /* Store the output pixels of (cur_row + 1)*/ 2307 _mm_store_si128((__m128i *)(pi2_dst + dst_strd), s16_8x16b); 2308 2309 2310 /* row + 3*/ 2311 s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b); 2312 s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b); 2313 s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b); 2314 2315 /*load 8 pixel values from (cur_row + 7)th row*/ 2316 s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd))); 2317 2318 /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/ 2319 s4_4_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b); 2320 2321 s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b); 2322 2323 s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b); 2324 s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b); 2325 s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b); 2326 2327 /* store 8 8-bit output values */ 2328 /* Store the output pixels of (cur_row+3)*/ 2329 _mm_store_si128((__m128i *)(pi2_dst + (3 * dst_strd)), s36_8x16b); 2330 2331 s2_10_16x8b = s2_3_16x8b; 2332 2333 2334 pu1_src += 4 * src_strd; /* pointer update */ 2335 pi2_dst += 4 * dst_strd; /* pointer update */ 2336 } 2337 } 2338 } 2339 else /* wd = multiple of 8 case */ 2340 { 2341 2342 pu1_src_copy = pu1_src; 2343 pi2_dst_copy = pi2_dst; 2344 2345 for(col = 0; col < wd; col += 4) 2346 { 2347 2348 pu1_src = pu1_src_copy + col; 2349 pi2_dst = pi2_dst_copy + col; 2350 2351 PREFETCH((char const *)(pu1_src + (8 * src_strd)), _MM_HINT_T0) 2352 PREFETCH((char const *)(pu1_src + (9 * src_strd)), _MM_HINT_T0) 2353 PREFETCH((char const *)(pu1_src + (10 * src_strd)), _MM_HINT_T0) 2354 PREFETCH((char const *)(pu1_src + (11 * src_strd)), _MM_HINT_T0) 2355 2356 /*load 8 pixel values */ 2357 s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-3 * src_strd))); 2358 2359 /*load 8 pixel values */ 2360 s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-2 * src_strd))); 2361 2362 s3_0_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b); 2363 2364 s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b); 2365 2366 /*load 8 pixel values */ 2367 s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd))); 2368 2369 /*load 8 pixel values */ 2370 s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd))); 2371 2372 s3_1_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b); 2373 2374 s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b); 2375 2376 /*load 8 pixel values */ 2377 s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd))); 2378 2379 /*load 8 pixel values */ 2380 s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd))); 2381 2382 s3_2_16x8b = _mm_unpacklo_epi8(s2_4_16x8b, s2_5_16x8b); 2383 2384 s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b); 2385 2386 /*load 8 pixel values */ 2387 s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd))); 2388 2389 /*load 8 pixel values */ 2390 s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (4 * src_strd))); 2391 2392 s3_3_16x8b = _mm_unpacklo_epi8(s2_6_16x8b, s2_7_16x8b); 2393 2394 s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b); 2395 2396 s4_8x16b = 
_mm_add_epi16(s0_8x16b, s1_8x16b); 2397 s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b); 2398 s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b); 2399 2400 /* store 8 8-bit output values */ 2401 /* Store the output pixels of row 0*/ 2402 _mm_storel_epi64((__m128i *)(pi2_dst), s6_8x16b); 2403 2404 /* ROW 2*/ 2405 s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b); 2406 s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b); 2407 s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b); 2408 2409 /*load 8 pixel values */ 2410 s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (5 * src_strd))); 2411 2412 /*load 8 pixel values */ 2413 s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (6 * src_strd))); 2414 2415 s3_4_16x8b = _mm_unpacklo_epi8(s2_8_16x8b, s2_9_16x8b); 2416 2417 s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b); 2418 2419 s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b); 2420 s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b); 2421 s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b); 2422 2423 /* store 8 8-bit output values */ 2424 /* Store the output pixels of row 2*/ 2425 _mm_storel_epi64((__m128i *)(pi2_dst + (2 * dst_strd)), s26_8x16b); 2426 2427 2428 /*ROW 1*/ 2429 s4_0_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b); 2430 2431 s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b); 2432 2433 s4_1_16x8b = _mm_unpacklo_epi8(s2_3_16x8b, s2_4_16x8b); 2434 2435 s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b); 2436 2437 s4_2_16x8b = _mm_unpacklo_epi8(s2_5_16x8b, s2_6_16x8b); 2438 2439 s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b); 2440 2441 s4_3_16x8b = _mm_unpacklo_epi8(s2_7_16x8b, s2_8_16x8b); 2442 2443 s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b); 2444 2445 s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b); 2446 s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b); 2447 s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b); 2448 2449 2450 /* store 8 8-bit output values */ 2451 /* Store the output pixels of row 1*/ 2452 _mm_storel_epi64((__m128i *)(pi2_dst + (dst_strd)), s16_8x16b); 2453 2454 2455 /* ROW 3*/ 2456 s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b); 2457 s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b); 2458 s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b); 2459 2460 /*load 8 pixel values */ 2461 s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (7 * src_strd))); 2462 2463 s4_4_16x8b = _mm_unpacklo_epi8(s2_9_16x8b, s2_10_16x8b); 2464 2465 s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b); 2466 2467 s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b); 2468 s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b); 2469 s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b); 2470 2471 /* store 8 8-bit output values */ 2472 /* Store the output pixels of row 2*/ 2473 _mm_storel_epi64((__m128i *)(pi2_dst + (3 * dst_strd)), s36_8x16b); 2474 2475 pu1_src += (8 * src_strd); 2476 pi2_dst += (4 * dst_strd); 2477 2478 for(row = 4; row < ht; row += 4) 2479 { 2480 2481 PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0) 2482 PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0) 2483 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0) 2484 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0) 2485 2486 s3_0_16x8b = s3_2_16x8b; 2487 s3_1_16x8b = s3_3_16x8b; 2488 s3_2_16x8b = s3_4_16x8b; 2489 2490 s0_8x16b = _mm_maddubs_epi16(s3_0_16x8b, coeff0_1_8x16b); 2491 s1_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff2_3_8x16b); 2492 s2_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff4_5_8x16b); 2493 2494 
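/* Data-flow note: in this steady-state loop most of the byte-interleaved
 * row pairs are carried over from the previous group of four output rows
 * (the s3_0/s3_1/s3_2 copies just above and the matching s4_0/s4_1/s4_2
 * copies further below), roughly
 *
 *     s3_0 <- s3_2,  s3_1 <- s3_3,  s3_2 <- s3_4   (pairs for rows n, n+2)
 *     s4_0 <- s4_2,  s4_1 <- s4_3,  s4_2 <- s4_4   (pairs for rows n+1, n+3)
 *
 * so only the four new source rows (cur_row + 4)..(cur_row + 7) are loaded
 * below to rebuild s3_3/s3_4 and s4_3/s4_4. */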
/*load 8 pixel values from (cur_row + 4)th row*/ 2495 s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src)); 2496 2497 s3_3_16x8b = _mm_unpacklo_epi8(s2_10_16x8b, s2_0_16x8b); 2498 s3_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff6_7_8x16b); 2499 2500 s4_0_16x8b = s4_2_16x8b; 2501 s4_1_16x8b = s4_3_16x8b; 2502 s4_2_16x8b = s4_4_16x8b; 2503 2504 s4_8x16b = _mm_add_epi16(s0_8x16b, s1_8x16b); 2505 s5_8x16b = _mm_add_epi16(s2_8x16b, s3_8x16b); 2506 s6_8x16b = _mm_add_epi16(s4_8x16b, s5_8x16b); 2507 2508 /* store 8 8-bit output values */ 2509 /* Store the output pixels of row 4*/ 2510 _mm_storel_epi64((__m128i *)(pi2_dst), s6_8x16b); 2511 2512 /* row + 2*/ 2513 s20_8x16b = _mm_maddubs_epi16(s3_1_16x8b, coeff0_1_8x16b); 2514 s21_8x16b = _mm_maddubs_epi16(s3_2_16x8b, coeff2_3_8x16b); 2515 s22_8x16b = _mm_maddubs_epi16(s3_3_16x8b, coeff4_5_8x16b); 2516 2517 /*load 8 pixel values from (cur_row + 5)th row*/ 2518 s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd)); 2519 2520 /*load 8 pixel values from (cur_row + 6)th row*/ 2521 s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd))); 2522 2523 /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/ 2524 s3_4_16x8b = _mm_unpacklo_epi8(s2_1_16x8b, s2_2_16x8b); 2525 2526 s23_8x16b = _mm_maddubs_epi16(s3_4_16x8b, coeff6_7_8x16b); 2527 2528 s24_8x16b = _mm_add_epi16(s20_8x16b, s21_8x16b); 2529 s25_8x16b = _mm_add_epi16(s22_8x16b, s23_8x16b); 2530 s26_8x16b = _mm_add_epi16(s24_8x16b, s25_8x16b); 2531 2532 /* store 8 8-bit output values */ 2533 /* Store the output pixels of (cur_row+2)*/ 2534 _mm_storel_epi64((__m128i *)(pi2_dst + (2 * dst_strd)), s26_8x16b); 2535 2536 2537 /*row + 1*/ 2538 s10_8x16b = _mm_maddubs_epi16(s4_0_16x8b, coeff0_1_8x16b); 2539 s11_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff2_3_8x16b); 2540 s12_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff4_5_8x16b); 2541 2542 /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/ 2543 s4_3_16x8b = _mm_unpacklo_epi8(s2_0_16x8b, s2_1_16x8b); 2544 s13_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff6_7_8x16b); 2545 2546 s14_8x16b = _mm_add_epi16(s10_8x16b, s11_8x16b); 2547 s15_8x16b = _mm_add_epi16(s12_8x16b, s13_8x16b); 2548 s16_8x16b = _mm_add_epi16(s14_8x16b, s15_8x16b); 2549 2550 /* store 8 8-bit output values */ 2551 /* Store the output pixels of (cur_row + 1)*/ 2552 _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd), s16_8x16b); 2553 2554 2555 /* row + 3*/ 2556 s30_8x16b = _mm_maddubs_epi16(s4_1_16x8b, coeff0_1_8x16b); 2557 s31_8x16b = _mm_maddubs_epi16(s4_2_16x8b, coeff2_3_8x16b); 2558 s32_8x16b = _mm_maddubs_epi16(s4_3_16x8b, coeff4_5_8x16b); 2559 2560 /*load 8 pixel values from (cur_row + 7)th row*/ 2561 s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd))); 2562 2563 /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/ 2564 s4_4_16x8b = _mm_unpacklo_epi8(s2_2_16x8b, s2_3_16x8b); 2565 2566 s33_8x16b = _mm_maddubs_epi16(s4_4_16x8b, coeff6_7_8x16b); 2567 2568 s34_8x16b = _mm_add_epi16(s30_8x16b, s31_8x16b); 2569 s35_8x16b = _mm_add_epi16(s32_8x16b, s33_8x16b); 2570 s36_8x16b = _mm_add_epi16(s34_8x16b, s35_8x16b); 2571 2572 /* store 8 8-bit output values */ 2573 /* Store the output pixels of (cur_row+3)*/ 2574 _mm_storel_epi64((__m128i *)(pi2_dst + (3 * dst_strd)), s36_8x16b); 2575 2576 s2_10_16x8b = s2_3_16x8b; 2577 2578 pu1_src += 4 * src_strd; /* pointer update */ 2579 pi2_dst += 4 * dst_strd; /* pointer update */ 2580 } 2581 } 2582 } 2583 } 2584 2585 /** 2586 ******************************************************************************* 2587 * 2588 * @brief 
2589 * 2590 * Luma vertical filter for 16bit input. 2591 * 2592 * @par Description: 2593 * Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to 2594 * the elements pointed by 'pu1_src' and writes to the location pointed by 2595 * 'pu1_dst' Input is 16 bits The filter output is downshifted by 12 and 2596 * clipped to lie between 0 and 255 2597 * 2598 * @param[in] pi2_src 2599 * WORD16 pointer to the source 2600 * 2601 * @param[out] pu1_dst 2602 * UWORD8 pointer to the destination 2603 * 2604 * @param[in] src_strd 2605 * integer source stride 2606 * 2607 * @param[in] dst_strd 2608 * integer destination stride 2609 * 2610 * @param[in] pi1_coeff 2611 * WORD8 pointer to the filter coefficients 2612 * 2613 * @param[in] ht 2614 * integer height of the array 2615 * 2616 * @param[in] wd 2617 * integer width of the array 2618 * 2619 * @returns 2620 * 2621 * @remarks 2622 * None 2623 * 2624 ******************************************************************************* 2625 */ 2626 void ihevc_inter_pred_luma_vert_w16inp_ssse3(WORD16 *pi2_src, 2627 UWORD8 *pu1_dst, 2628 WORD32 src_strd, 2629 WORD32 dst_strd, 2630 WORD8 *pi1_coeff, 2631 WORD32 ht, 2632 WORD32 wd) 2633 { 2634 WORD32 row, col; 2635 WORD16 *pi2_src_copy; 2636 UWORD8 *pu1_dst_copy; 2637 __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b; 2638 __m128i s0_8x16b, s1_8x16b, s2_8x16b, s3_8x16b, s4_8x16b, s5_8x16b, s6_8x16b, s8_8x16b, s9_8x16b; 2639 __m128i s2_0_16x8b, s2_1_16x8b, s2_2_16x8b, s2_3_16x8b, s2_4_16x8b, s2_5_16x8b, s2_6_16x8b, s2_7_16x8b, s2_8_16x8b, s2_9_16x8b, s2_10_16x8b; 2640 __m128i s3_0_16x8b, s3_1_16x8b, s3_2_16x8b, s3_3_16x8b, s3_4_16x8b; 2641 __m128i s4_0_16x8b, s4_1_16x8b, s4_2_16x8b, s4_3_16x8b, s4_4_16x8b; 2642 __m128i s10_8x16b, s11_8x16b, s12_8x16b, s13_8x16b, s14_8x16b, s15_8x16b, s16_8x16b, s18_8x16b, s19_8x16b; 2643 __m128i s20_8x16b, s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b, s26_8x16b, s28_8x16b, s29_8x16b; 2644 __m128i s30_8x16b, s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b, s36_8x16b, s38_8x16b, s39_8x16b; 2645 2646 __m128i zero_8x16b, offset_8x16b, mask_low_32b, mask_high_96b, sign_reg; 2647 2648 /* load 8 8-bit coefficients and convert 8-bit into 16-bit */ 2649 s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff); 2650 2651 zero_8x16b = _mm_setzero_si128(); 2652 sign_reg = _mm_cmpgt_epi8(zero_8x16b, s4_8x16b); 2653 s5_8x16b = _mm_unpacklo_epi8(s4_8x16b, sign_reg); 2654 2655 coeff0_1_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(0, 0, 0, 0)); /* pi1_coeff[4] */ 2656 coeff2_3_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(1, 1, 1, 1)); /* pi1_coeff[4] */ 2657 2658 coeff4_5_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(2, 2, 2, 2)); /* pi1_coeff[4] */ 2659 coeff6_7_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(3, 3, 3, 3)); /* pi1_coeff[4] */ 2660 2661 2662 /* seting values in register */ 2663 offset_8x16b = _mm_set1_epi32(OFFSET_14_MINUS_BIT_DEPTH); /* for offset addition */ 2664 mask_low_32b = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000); 2665 mask_high_96b = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF); 2666 2667 2668 pi2_src_copy = pi2_src; 2669 pu1_dst_copy = pu1_dst; 2670 2671 /* outer for loop starts from here */ 2672 for(col = 0; col < wd; col += 4) 2673 { 2674 2675 pi2_src = pi2_src_copy + col; 2676 pu1_dst = pu1_dst_copy + col; 2677 2678 /*load 4 pixel values */ 2679 s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (-3 * src_strd))); 2680 2681 /*load 4 pixel values */ 2682 s2_1_16x8b = 
_mm_loadl_epi64((__m128i *)(pi2_src + (-2 * src_strd))); 2683 2684 s3_0_16x8b = _mm_unpacklo_epi16(s2_0_16x8b, s2_1_16x8b); 2685 2686 s0_8x16b = _mm_madd_epi16(s3_0_16x8b, coeff0_1_8x16b); 2687 2688 /*load 4 pixel values */ 2689 s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (-1 * src_strd))); 2690 2691 /*load 4 pixel values */ 2692 s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (0 * src_strd))); 2693 2694 s3_1_16x8b = _mm_unpacklo_epi16(s2_2_16x8b, s2_3_16x8b); 2695 2696 s1_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff2_3_8x16b); 2697 2698 /*load 4 pixel values */ 2699 s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (1 * src_strd))); 2700 2701 /*load 4 pixel values */ 2702 s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd))); 2703 2704 s3_2_16x8b = _mm_unpacklo_epi16(s2_4_16x8b, s2_5_16x8b); 2705 2706 s2_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff4_5_8x16b); 2707 2708 /*load 4 pixel values */ 2709 s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd))); 2710 2711 /*load 4 pixel values */ 2712 s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (4 * src_strd))); 2713 2714 s3_3_16x8b = _mm_unpacklo_epi16(s2_6_16x8b, s2_7_16x8b); 2715 2716 s3_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff6_7_8x16b); 2717 2718 s4_8x16b = _mm_add_epi32(s0_8x16b, s1_8x16b); 2719 s5_8x16b = _mm_add_epi32(s2_8x16b, s3_8x16b); 2720 s6_8x16b = _mm_add_epi32(s4_8x16b, s5_8x16b); 2721 2722 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 2723 s8_8x16b = _mm_srai_epi32(s6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 2724 2725 /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */ 2726 s9_8x16b = _mm_add_epi32(s8_8x16b, offset_8x16b); 2727 2728 /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 2729 s8_8x16b = _mm_srai_epi32(s9_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 2730 2731 s8_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b); 2732 2733 2734 /* i2_tmp = CLIP_U8(i2_tmp);*/ 2735 s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b); 2736 2737 s4_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst)); 2738 s5_8x16b = _mm_and_si128(s4_8x16b, mask_low_32b); 2739 s6_8x16b = _mm_and_si128(s9_8x16b, mask_high_96b); 2740 s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b); 2741 2742 /* store 8 8-bit output values */ 2743 /* Store the output pixels of row 0*/ 2744 _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b); 2745 2746 /* ROW 2*/ 2747 s20_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff0_1_8x16b); 2748 s21_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff2_3_8x16b); 2749 s22_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff4_5_8x16b); 2750 2751 /*load 4 pixel values */ 2752 s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (5 * src_strd))); 2753 2754 /*load 4 pixel values */ 2755 s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (6 * src_strd))); 2756 2757 s3_4_16x8b = _mm_unpacklo_epi16(s2_8_16x8b, s2_9_16x8b); 2758 2759 s23_8x16b = _mm_madd_epi16(s3_4_16x8b, coeff6_7_8x16b); 2760 2761 s24_8x16b = _mm_add_epi32(s20_8x16b, s21_8x16b); 2762 s25_8x16b = _mm_add_epi32(s22_8x16b, s23_8x16b); 2763 s26_8x16b = _mm_add_epi32(s24_8x16b, s25_8x16b); 2764 2765 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 2766 s28_8x16b = _mm_srai_epi32(s26_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 2767 2768 /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */ 2769 s29_8x16b = _mm_add_epi32(s28_8x16b, offset_8x16b); 2770 2771 /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 2772 s28_8x16b = 
_mm_srai_epi32(s29_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 2773 2774 s28_8x16b = _mm_packs_epi32(s28_8x16b, zero_8x16b); 2775 2776 2777 /* i2_tmp = CLIP_U8(i2_tmp);*/ 2778 s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b); 2779 2780 s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (2 * dst_strd))); 2781 s25_8x16b = _mm_and_si128(s24_8x16b, mask_low_32b); 2782 s26_8x16b = _mm_and_si128(s29_8x16b, mask_high_96b); 2783 s29_8x16b = _mm_or_si128(s25_8x16b, s26_8x16b); 2784 2785 /* store 8 8-bit output values */ 2786 /* Store the output pixels of row 2*/ 2787 _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s29_8x16b); 2788 2789 2790 /*ROW 1*/ 2791 s4_0_16x8b = _mm_unpacklo_epi16(s2_1_16x8b, s2_2_16x8b); 2792 2793 s10_8x16b = _mm_madd_epi16(s4_0_16x8b, coeff0_1_8x16b); 2794 2795 s4_1_16x8b = _mm_unpacklo_epi16(s2_3_16x8b, s2_4_16x8b); 2796 2797 s11_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff2_3_8x16b); 2798 2799 s4_2_16x8b = _mm_unpacklo_epi16(s2_5_16x8b, s2_6_16x8b); 2800 2801 s12_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff4_5_8x16b); 2802 2803 s4_3_16x8b = _mm_unpacklo_epi16(s2_7_16x8b, s2_8_16x8b); 2804 2805 s13_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff6_7_8x16b); 2806 2807 s14_8x16b = _mm_add_epi32(s10_8x16b, s11_8x16b); 2808 s15_8x16b = _mm_add_epi32(s12_8x16b, s13_8x16b); 2809 s16_8x16b = _mm_add_epi32(s14_8x16b, s15_8x16b); 2810 2811 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 2812 s18_8x16b = _mm_srai_epi32(s16_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 2813 2814 /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */ 2815 s19_8x16b = _mm_add_epi32(s18_8x16b, offset_8x16b); 2816 2817 /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 2818 s18_8x16b = _mm_srai_epi32(s19_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 2819 2820 s18_8x16b = _mm_packs_epi32(s18_8x16b, zero_8x16b); 2821 2822 2823 /* i2_tmp = CLIP_U8(i2_tmp);*/ 2824 s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b); 2825 2826 s14_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (dst_strd))); 2827 s15_8x16b = _mm_and_si128(s14_8x16b, mask_low_32b); 2828 s16_8x16b = _mm_and_si128(s19_8x16b, mask_high_96b); 2829 s19_8x16b = _mm_or_si128(s15_8x16b, s16_8x16b); 2830 2831 /* store 8 8-bit output values */ 2832 /* Store the output pixels of row 1*/ 2833 _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd)), s19_8x16b); 2834 2835 2836 /* ROW 3*/ 2837 s30_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff0_1_8x16b); 2838 s31_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff2_3_8x16b); 2839 s32_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff4_5_8x16b); 2840 2841 /*load 4 pixel values */ 2842 s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (7 * src_strd))); 2843 2844 s4_4_16x8b = _mm_unpacklo_epi16(s2_9_16x8b, s2_10_16x8b); 2845 2846 s33_8x16b = _mm_madd_epi16(s4_4_16x8b, coeff6_7_8x16b); 2847 2848 s34_8x16b = _mm_add_epi32(s30_8x16b, s31_8x16b); 2849 s35_8x16b = _mm_add_epi32(s32_8x16b, s33_8x16b); 2850 s36_8x16b = _mm_add_epi32(s34_8x16b, s35_8x16b); 2851 2852 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 2853 s38_8x16b = _mm_srai_epi32(s36_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 2854 2855 2856 /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */ 2857 s39_8x16b = _mm_add_epi32(s38_8x16b, offset_8x16b); 2858 2859 /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 2860 s38_8x16b = _mm_srai_epi32(s39_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 2861 2862 s38_8x16b = _mm_packs_epi32(s38_8x16b, zero_8x16b); 
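                /* Note: the two arithmetic shifts by SHIFT_14_MINUS_BIT_DEPTH, with the
                   rounding offset added in between, implement the two-stage rounding shift;
                   together with the signed pack above and the unsigned pack below this
                   realizes the "downshift by 12 and clip to 0..255" (CLIP_U8) described
                   in the function brief. */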
2863 2864 2865 /* i2_tmp = CLIP_U8(i2_tmp);*/ 2866 s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b); 2867 2868 s34_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (3 * dst_strd))); 2869 s35_8x16b = _mm_and_si128(s34_8x16b, mask_low_32b); 2870 s36_8x16b = _mm_and_si128(s39_8x16b, mask_high_96b); 2871 s39_8x16b = _mm_or_si128(s35_8x16b, s36_8x16b); 2872 2873 /* store 8 8-bit output values */ 2874 /* Store the output pixels of row 2*/ 2875 _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s39_8x16b); 2876 2877 pi2_src += (8 * src_strd); 2878 pu1_dst += (4 * dst_strd); 2879 2880 for(row = 4; row < ht; row += 4) 2881 { 2882 2883 s3_0_16x8b = s3_2_16x8b; 2884 s3_1_16x8b = s3_3_16x8b; 2885 s3_2_16x8b = s3_4_16x8b; 2886 2887 s0_8x16b = _mm_madd_epi16(s3_0_16x8b, coeff0_1_8x16b); 2888 s1_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff2_3_8x16b); 2889 s2_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff4_5_8x16b); 2890 2891 /*load 4 pixel values from (cur_row + 4)th row*/ 2892 s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src)); 2893 2894 s3_3_16x8b = _mm_unpacklo_epi16(s2_10_16x8b, s2_0_16x8b); 2895 s3_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff6_7_8x16b); 2896 2897 s4_0_16x8b = s4_2_16x8b; 2898 s4_1_16x8b = s4_3_16x8b; 2899 s4_2_16x8b = s4_4_16x8b; 2900 2901 s4_8x16b = _mm_add_epi32(s0_8x16b, s1_8x16b); 2902 s5_8x16b = _mm_add_epi32(s2_8x16b, s3_8x16b); 2903 s6_8x16b = _mm_add_epi32(s4_8x16b, s5_8x16b); 2904 2905 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 2906 s8_8x16b = _mm_srai_epi32(s6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 2907 2908 /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */ 2909 s9_8x16b = _mm_add_epi32(s8_8x16b, offset_8x16b); 2910 2911 /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 2912 s8_8x16b = _mm_srai_epi32(s9_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 2913 2914 s8_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b); 2915 2916 2917 /* i2_tmp = CLIP_U8(i2_tmp);*/ 2918 s9_8x16b = _mm_packus_epi16(s8_8x16b, zero_8x16b); 2919 2920 s4_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst)); 2921 s5_8x16b = _mm_and_si128(s4_8x16b, mask_low_32b); 2922 s6_8x16b = _mm_and_si128(s9_8x16b, mask_high_96b); 2923 s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b); 2924 2925 /* store 8 8-bit output values */ 2926 /* Store the output pixels of row 4*/ 2927 _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b); 2928 2929 /* row + 2*/ 2930 s20_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff0_1_8x16b); 2931 s21_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff2_3_8x16b); 2932 s22_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff4_5_8x16b); 2933 2934 /*load 4 pixel values from (cur_row + 5)th row*/ 2935 s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd)); 2936 2937 /*load 4 pixel values from (cur_row + 6)th row*/ 2938 s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd))); 2939 2940 /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/ 2941 s3_4_16x8b = _mm_unpacklo_epi16(s2_1_16x8b, s2_2_16x8b); 2942 2943 s23_8x16b = _mm_madd_epi16(s3_4_16x8b, coeff6_7_8x16b); 2944 2945 s24_8x16b = _mm_add_epi32(s20_8x16b, s21_8x16b); 2946 s25_8x16b = _mm_add_epi32(s22_8x16b, s23_8x16b); 2947 s26_8x16b = _mm_add_epi32(s24_8x16b, s25_8x16b); 2948 2949 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 2950 s28_8x16b = _mm_srai_epi32(s26_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 2951 2952 /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */ 2953 s29_8x16b = _mm_add_epi32(s28_8x16b, offset_8x16b); 2954 2955 /* i4_tmp = ((i4_tmp 
>> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 2956 s28_8x16b = _mm_srai_epi32(s29_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 2957 2958 s28_8x16b = _mm_packs_epi32(s28_8x16b, zero_8x16b); 2959 2960 2961 /* i2_tmp = CLIP_U8(i2_tmp);*/ 2962 s29_8x16b = _mm_packus_epi16(s28_8x16b, zero_8x16b); 2963 2964 s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (2 * dst_strd))); 2965 s25_8x16b = _mm_and_si128(s24_8x16b, mask_low_32b); 2966 s26_8x16b = _mm_and_si128(s29_8x16b, mask_high_96b); 2967 s29_8x16b = _mm_or_si128(s25_8x16b, s26_8x16b); 2968 2969 /* store 8 8-bit output values */ 2970 /* Store the output pixels of (cur_row+2)*/ 2971 _mm_storel_epi64((__m128i *)(pu1_dst + (2 * dst_strd)), s29_8x16b); 2972 2973 2974 /*row + 1*/ 2975 s10_8x16b = _mm_madd_epi16(s4_0_16x8b, coeff0_1_8x16b); 2976 s11_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff2_3_8x16b); 2977 s12_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff4_5_8x16b); 2978 2979 /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/ 2980 s4_3_16x8b = _mm_unpacklo_epi16(s2_0_16x8b, s2_1_16x8b); 2981 s13_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff6_7_8x16b); 2982 2983 s14_8x16b = _mm_add_epi32(s10_8x16b, s11_8x16b); 2984 s15_8x16b = _mm_add_epi32(s12_8x16b, s13_8x16b); 2985 s16_8x16b = _mm_add_epi32(s14_8x16b, s15_8x16b); 2986 2987 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 2988 s18_8x16b = _mm_srai_epi32(s16_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 2989 2990 /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */ 2991 s19_8x16b = _mm_add_epi32(s18_8x16b, offset_8x16b); 2992 2993 /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 2994 s18_8x16b = _mm_srai_epi32(s19_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 2995 2996 s18_8x16b = _mm_packs_epi32(s18_8x16b, zero_8x16b); 2997 2998 /* i2_tmp = CLIP_U8(i2_tmp);*/ 2999 s19_8x16b = _mm_packus_epi16(s18_8x16b, zero_8x16b); 3000 3001 s14_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd)); 3002 s15_8x16b = _mm_and_si128(s14_8x16b, mask_low_32b); 3003 s16_8x16b = _mm_and_si128(s19_8x16b, mask_high_96b); 3004 s19_8x16b = _mm_or_si128(s15_8x16b, s16_8x16b); 3005 3006 /* store 8 8-bit output values */ 3007 /* Store the output pixels of (cur_row + 1)*/ 3008 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s19_8x16b); 3009 3010 3011 /* row + 3*/ 3012 s30_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff0_1_8x16b); 3013 s31_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff2_3_8x16b); 3014 s32_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff4_5_8x16b); 3015 3016 /*load 4 pixel values from (cur_row + 7)th row*/ 3017 s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd))); 3018 3019 /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/ 3020 s4_4_16x8b = _mm_unpacklo_epi16(s2_2_16x8b, s2_3_16x8b); 3021 3022 s33_8x16b = _mm_madd_epi16(s4_4_16x8b, coeff6_7_8x16b); 3023 3024 s34_8x16b = _mm_add_epi32(s30_8x16b, s31_8x16b); 3025 s35_8x16b = _mm_add_epi32(s32_8x16b, s33_8x16b); 3026 s36_8x16b = _mm_add_epi32(s34_8x16b, s35_8x16b); 3027 3028 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 3029 s38_8x16b = _mm_srai_epi32(s36_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 3030 3031 /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */ 3032 s39_8x16b = _mm_add_epi32(s38_8x16b, offset_8x16b); 3033 3034 /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 3035 s38_8x16b = _mm_srai_epi32(s39_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 3036 3037 s38_8x16b = 
_mm_packs_epi32(s38_8x16b, zero_8x16b); 3038 3039 3040 /* i2_tmp = CLIP_U8(i2_tmp);*/ 3041 s39_8x16b = _mm_packus_epi16(s38_8x16b, zero_8x16b); 3042 3043 s34_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + (3 * dst_strd))); 3044 s35_8x16b = _mm_and_si128(s34_8x16b, mask_low_32b); 3045 s36_8x16b = _mm_and_si128(s39_8x16b, mask_high_96b); 3046 s39_8x16b = _mm_or_si128(s35_8x16b, s36_8x16b); 3047 3048 /* store 8 8-bit output values */ 3049 /* Store the output pixels of (cur_row+3)*/ 3050 _mm_storel_epi64((__m128i *)(pu1_dst + (3 * dst_strd)), s39_8x16b); 3051 3052 s2_10_16x8b = s2_3_16x8b; 3053 3054 pi2_src += 4 * src_strd; /* pointer update */ 3055 pu1_dst += 4 * dst_strd; /* pointer update */ 3056 } 3057 } 3058 3059 } 3060 3061 3062 /** 3063 ******************************************************************************* 3064 * 3065 * @brief 3066 * Luma prediction filter for vertical 16bit input & output 3067 * 3068 * @par Description: 3069 * Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to 3070 * the elements pointed by 'pu1_src' and writes to the location pointed by 3071 * 'pu1_dst' Input is 16 bits The filter output is downshifted by 6 and 3072 * 8192 is subtracted to store it as a 16 bit number The output is used as 3073 * a input to weighted prediction 3074 * 3075 * @param[in] pi2_src 3076 * WORD16 pointer to the source 3077 * 3078 * @param[out] pi2_dst 3079 * WORD16 pointer to the destination 3080 * 3081 * @param[in] src_strd 3082 * integer source stride 3083 * 3084 * @param[in] dst_strd 3085 * integer destination stride 3086 * 3087 * @param[in] pi1_coeff 3088 * WORD8 pointer to the filter coefficients 3089 * 3090 * @param[in] ht 3091 * integer height of the array 3092 * 3093 * @param[in] wd 3094 * integer width of the array 3095 * 3096 * @returns 3097 * 3098 * @remarks 3099 * None 3100 * 3101 ******************************************************************************* 3102 */ 3103 void ihevc_inter_pred_luma_vert_w16inp_w16out_ssse3(WORD16 *pi2_src, 3104 WORD16 *pi2_dst, 3105 WORD32 src_strd, 3106 WORD32 dst_strd, 3107 WORD8 *pi1_coeff, 3108 WORD32 ht, 3109 WORD32 wd) 3110 { 3111 WORD32 row, col; 3112 WORD16 *pi2_src_copy; 3113 WORD16 *pi2_dst_copy; 3114 __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b, coeff6_7_8x16b; 3115 __m128i s0_8x16b, s1_8x16b, s2_8x16b, s3_8x16b, s4_8x16b, s5_8x16b, s6_8x16b, s8_8x16b, s9_8x16b; 3116 __m128i s2_0_16x8b, s2_1_16x8b, s2_2_16x8b, s2_3_16x8b, s2_4_16x8b, s2_5_16x8b, s2_6_16x8b, s2_7_16x8b, s2_8_16x8b, s2_9_16x8b, s2_10_16x8b; 3117 __m128i s3_0_16x8b, s3_1_16x8b, s3_2_16x8b, s3_3_16x8b, s3_4_16x8b; 3118 __m128i s4_0_16x8b, s4_1_16x8b, s4_2_16x8b, s4_3_16x8b, s4_4_16x8b; 3119 __m128i s10_8x16b, s11_8x16b, s12_8x16b, s13_8x16b, s14_8x16b, s15_8x16b, s16_8x16b, s18_8x16b, s19_8x16b; 3120 __m128i s20_8x16b, s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b, s26_8x16b, s28_8x16b, s29_8x16b; 3121 __m128i s30_8x16b, s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b, s36_8x16b, s38_8x16b, s39_8x16b; 3122 3123 __m128i zero_8x16b, offset_8x16b, sign_reg; 3124 3125 /* load 8 8-bit coefficients and convert 8-bit into 16-bit */ 3126 s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff); 3127 3128 zero_8x16b = _mm_setzero_si128(); 3129 sign_reg = _mm_cmpgt_epi8(zero_8x16b, s4_8x16b); 3130 s5_8x16b = _mm_unpacklo_epi8(s4_8x16b, sign_reg); 3131 3132 coeff0_1_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(0, 0, 0, 0)); /* pi1_coeff[4] */ 3133 coeff2_3_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(1, 1, 1, 1)); /* pi1_coeff[4] */ 
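    /* As in the function above, the eight 8-bit taps are sign-extended to 16 bits
       and broadcast as four (even, odd) coefficient pairs; each pair is later fed
       to _mm_madd_epi16 together with two vertically interleaved input rows, so a
       single madd accumulates two taps at once. */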
3134 3135 coeff4_5_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(2, 2, 2, 2)); /* pi1_coeff[4] */ 3136 coeff6_7_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(3, 3, 3, 3)); /* pi1_coeff[4] */ 3137 3138 3139 /* seting values in register */ 3140 offset_8x16b = _mm_set1_epi32(OFFSET14); /* for offset addition */ 3141 3142 pi2_src_copy = pi2_src; 3143 pi2_dst_copy = pi2_dst; 3144 3145 /* outer for loop starts from here */ 3146 for(col = 0; col < wd; col += 4) 3147 { 3148 3149 pi2_src = pi2_src_copy + col; 3150 pi2_dst = pi2_dst_copy + col; 3151 3152 /*load 4 pixel values*/ 3153 s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (-3 * src_strd))); 3154 3155 /*load 4 pixel values*/ 3156 s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (-2 * src_strd))); 3157 3158 s3_0_16x8b = _mm_unpacklo_epi16(s2_0_16x8b, s2_1_16x8b); 3159 3160 s0_8x16b = _mm_madd_epi16(s3_0_16x8b, coeff0_1_8x16b); 3161 3162 /*load 4 pixel values*/ 3163 s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (-1 * src_strd))); 3164 3165 /*load 4 pixel values*/ 3166 s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (0 * src_strd))); 3167 3168 s3_1_16x8b = _mm_unpacklo_epi16(s2_2_16x8b, s2_3_16x8b); 3169 3170 s1_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff2_3_8x16b); 3171 3172 /*load 4 pixel values*/ 3173 s2_4_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (1 * src_strd))); 3174 3175 /*load 4 pixel values*/ 3176 s2_5_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd))); 3177 3178 s3_2_16x8b = _mm_unpacklo_epi16(s2_4_16x8b, s2_5_16x8b); 3179 3180 s2_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff4_5_8x16b); 3181 3182 /*load 4 pixel values*/ 3183 s2_6_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd))); 3184 3185 /*load 4 pixel values*/ 3186 s2_7_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (4 * src_strd))); 3187 3188 s3_3_16x8b = _mm_unpacklo_epi16(s2_6_16x8b, s2_7_16x8b); 3189 3190 s3_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff6_7_8x16b); 3191 3192 s4_8x16b = _mm_add_epi32(s0_8x16b, s1_8x16b); 3193 s5_8x16b = _mm_add_epi32(s2_8x16b, s3_8x16b); 3194 s6_8x16b = _mm_add_epi32(s4_8x16b, s5_8x16b); 3195 3196 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 3197 s8_8x16b = _mm_srai_epi32(s6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 3198 3199 /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */ 3200 s9_8x16b = _mm_sub_epi32(s8_8x16b, offset_8x16b); 3201 3202 s8_8x16b = _mm_packs_epi32(s9_8x16b, zero_8x16b); 3203 3204 /* store 8 8-bit output values */ 3205 /* Store the output pixels of row 0*/ 3206 _mm_storel_epi64((__m128i *)(pi2_dst), s8_8x16b); 3207 3208 /* ROW 2*/ 3209 s20_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff0_1_8x16b); 3210 s21_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff2_3_8x16b); 3211 s22_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff4_5_8x16b); 3212 3213 /*load 4 pixel values*/ 3214 s2_8_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (5 * src_strd))); 3215 3216 /*load 4 pixel values*/ 3217 s2_9_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (6 * src_strd))); 3218 3219 s3_4_16x8b = _mm_unpacklo_epi16(s2_8_16x8b, s2_9_16x8b); 3220 3221 s23_8x16b = _mm_madd_epi16(s3_4_16x8b, coeff6_7_8x16b); 3222 3223 s24_8x16b = _mm_add_epi32(s20_8x16b, s21_8x16b); 3224 s25_8x16b = _mm_add_epi32(s22_8x16b, s23_8x16b); 3225 s26_8x16b = _mm_add_epi32(s24_8x16b, s25_8x16b); 3226 3227 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 3228 s28_8x16b = _mm_srai_epi32(s26_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 3229 3230 /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */ 3231 s29_8x16b = 
_mm_sub_epi32(s28_8x16b, offset_8x16b); 3232 3233 s28_8x16b = _mm_packs_epi32(s29_8x16b, zero_8x16b); 3234 3235 /* store 8 8-bit output values */ 3236 /* Store the output pixels of row 2*/ 3237 _mm_storel_epi64((__m128i *)(pi2_dst + (2 * dst_strd)), s28_8x16b); 3238 3239 3240 /*ROW 1*/ 3241 s4_0_16x8b = _mm_unpacklo_epi16(s2_1_16x8b, s2_2_16x8b); 3242 3243 s10_8x16b = _mm_madd_epi16(s4_0_16x8b, coeff0_1_8x16b); 3244 3245 s4_1_16x8b = _mm_unpacklo_epi16(s2_3_16x8b, s2_4_16x8b); 3246 3247 s11_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff2_3_8x16b); 3248 3249 s4_2_16x8b = _mm_unpacklo_epi16(s2_5_16x8b, s2_6_16x8b); 3250 3251 s12_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff4_5_8x16b); 3252 3253 s4_3_16x8b = _mm_unpacklo_epi16(s2_7_16x8b, s2_8_16x8b); 3254 3255 s13_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff6_7_8x16b); 3256 3257 s14_8x16b = _mm_add_epi32(s10_8x16b, s11_8x16b); 3258 s15_8x16b = _mm_add_epi32(s12_8x16b, s13_8x16b); 3259 s16_8x16b = _mm_add_epi32(s14_8x16b, s15_8x16b); 3260 3261 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 3262 s18_8x16b = _mm_srai_epi32(s16_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 3263 3264 /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */ 3265 s19_8x16b = _mm_sub_epi32(s18_8x16b, offset_8x16b); 3266 3267 s18_8x16b = _mm_packs_epi32(s19_8x16b, zero_8x16b); 3268 3269 /* store 8 8-bit output values */ 3270 /* Store the output pixels of row 1*/ 3271 _mm_storel_epi64((__m128i *)(pi2_dst + (dst_strd)), s18_8x16b); 3272 3273 3274 /* ROW 3*/ 3275 s30_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff0_1_8x16b); 3276 s31_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff2_3_8x16b); 3277 s32_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff4_5_8x16b); 3278 3279 /*load 4 pixel values*/ 3280 s2_10_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (7 * src_strd))); 3281 3282 s4_4_16x8b = _mm_unpacklo_epi16(s2_9_16x8b, s2_10_16x8b); 3283 3284 s33_8x16b = _mm_madd_epi16(s4_4_16x8b, coeff6_7_8x16b); 3285 3286 s34_8x16b = _mm_add_epi32(s30_8x16b, s31_8x16b); 3287 s35_8x16b = _mm_add_epi32(s32_8x16b, s33_8x16b); 3288 s36_8x16b = _mm_add_epi32(s34_8x16b, s35_8x16b); 3289 3290 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 3291 s38_8x16b = _mm_srai_epi32(s36_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 3292 3293 3294 /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */ 3295 s39_8x16b = _mm_sub_epi32(s38_8x16b, offset_8x16b); 3296 3297 s38_8x16b = _mm_packs_epi32(s39_8x16b, zero_8x16b); 3298 3299 /* store 8 8-bit output values */ 3300 /* Store the output pixels of row 2*/ 3301 _mm_storel_epi64((__m128i *)(pi2_dst + (3 * dst_strd)), s38_8x16b); 3302 3303 pi2_src += (8 * src_strd); 3304 pi2_dst += (4 * dst_strd); 3305 3306 for(row = 4; row < ht; row += 4) 3307 { 3308 3309 s3_0_16x8b = s3_2_16x8b; 3310 s3_1_16x8b = s3_3_16x8b; 3311 s3_2_16x8b = s3_4_16x8b; 3312 3313 s0_8x16b = _mm_madd_epi16(s3_0_16x8b, coeff0_1_8x16b); 3314 s1_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff2_3_8x16b); 3315 s2_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff4_5_8x16b); 3316 3317 /*load 4 pixel values from (cur_row + 4)th row*/ 3318 s2_0_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src)); 3319 3320 s3_3_16x8b = _mm_unpacklo_epi16(s2_10_16x8b, s2_0_16x8b); 3321 s3_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff6_7_8x16b); 3322 3323 s4_0_16x8b = s4_2_16x8b; 3324 s4_1_16x8b = s4_3_16x8b; 3325 s4_2_16x8b = s4_4_16x8b; 3326 3327 s4_8x16b = _mm_add_epi32(s0_8x16b, s1_8x16b); 3328 s5_8x16b = _mm_add_epi32(s2_8x16b, s3_8x16b); 3329 s6_8x16b = _mm_add_epi32(s4_8x16b, s5_8x16b); 3330 3331 /*(i2_tmp + 
OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 3332 s8_8x16b = _mm_srai_epi32(s6_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 3333 3334 /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */ 3335 s9_8x16b = _mm_sub_epi32(s8_8x16b, offset_8x16b); 3336 3337 s8_8x16b = _mm_packs_epi32(s9_8x16b, zero_8x16b); 3338 3339 /* store 8 8-bit output values */ 3340 /* Store the output pixels of row 4*/ 3341 _mm_storel_epi64((__m128i *)(pi2_dst), s8_8x16b); 3342 3343 /* row + 2*/ 3344 s20_8x16b = _mm_madd_epi16(s3_1_16x8b, coeff0_1_8x16b); 3345 s21_8x16b = _mm_madd_epi16(s3_2_16x8b, coeff2_3_8x16b); 3346 s22_8x16b = _mm_madd_epi16(s3_3_16x8b, coeff4_5_8x16b); 3347 3348 /*load 4 pixel values from (cur_row + 5)th row*/ 3349 s2_1_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + src_strd)); 3350 3351 /*load 4 pixel values from (cur_row + 6)th row*/ 3352 s2_2_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd))); 3353 3354 /*unpacking (cur_row + 5)th row and (cur_row + 6)th row*/ 3355 s3_4_16x8b = _mm_unpacklo_epi16(s2_1_16x8b, s2_2_16x8b); 3356 3357 s23_8x16b = _mm_madd_epi16(s3_4_16x8b, coeff6_7_8x16b); 3358 3359 s24_8x16b = _mm_add_epi32(s20_8x16b, s21_8x16b); 3360 s25_8x16b = _mm_add_epi32(s22_8x16b, s23_8x16b); 3361 s26_8x16b = _mm_add_epi32(s24_8x16b, s25_8x16b); 3362 3363 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 3364 s28_8x16b = _mm_srai_epi32(s26_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 3365 3366 /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */ 3367 s29_8x16b = _mm_sub_epi32(s28_8x16b, offset_8x16b); 3368 3369 s28_8x16b = _mm_packs_epi32(s29_8x16b, zero_8x16b); 3370 3371 /* store 8 8-bit output values */ 3372 /* Store the output pixels of (cur_row+2)*/ 3373 _mm_storel_epi64((__m128i *)(pi2_dst + (2 * dst_strd)), s28_8x16b); 3374 3375 3376 /*row + 1*/ 3377 s10_8x16b = _mm_madd_epi16(s4_0_16x8b, coeff0_1_8x16b); 3378 s11_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff2_3_8x16b); 3379 s12_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff4_5_8x16b); 3380 3381 /*unpacking (cur_row + 4)th row and (cur_row + 5)th row*/ 3382 s4_3_16x8b = _mm_unpacklo_epi16(s2_0_16x8b, s2_1_16x8b); 3383 s13_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff6_7_8x16b); 3384 3385 s14_8x16b = _mm_add_epi32(s10_8x16b, s11_8x16b); 3386 s15_8x16b = _mm_add_epi32(s12_8x16b, s13_8x16b); 3387 s16_8x16b = _mm_add_epi32(s14_8x16b, s15_8x16b); 3388 3389 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 3390 s18_8x16b = _mm_srai_epi32(s16_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 3391 3392 /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */ 3393 s19_8x16b = _mm_sub_epi32(s18_8x16b, offset_8x16b); 3394 3395 s18_8x16b = _mm_packs_epi32(s19_8x16b, zero_8x16b); 3396 3397 /* store 8 8-bit output values */ 3398 /* Store the output pixels of (cur_row + 1)*/ 3399 _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd), s18_8x16b); 3400 3401 3402 /* row + 3*/ 3403 s30_8x16b = _mm_madd_epi16(s4_1_16x8b, coeff0_1_8x16b); 3404 s31_8x16b = _mm_madd_epi16(s4_2_16x8b, coeff2_3_8x16b); 3405 s32_8x16b = _mm_madd_epi16(s4_3_16x8b, coeff4_5_8x16b); 3406 3407 /*load 4 pixel values from (cur_row + 7)th row*/ 3408 s2_3_16x8b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd))); 3409 3410 /*unpacking (cur_row + 6)th row and (cur_row + 7)th row*/ 3411 s4_4_16x8b = _mm_unpacklo_epi16(s2_2_16x8b, s2_3_16x8b); 3412 3413 s33_8x16b = _mm_madd_epi16(s4_4_16x8b, coeff6_7_8x16b); 3414 3415 s34_8x16b = _mm_add_epi32(s30_8x16b, s31_8x16b); 3416 s35_8x16b = _mm_add_epi32(s32_8x16b, s33_8x16b); 3417 
s36_8x16b = _mm_add_epi32(s34_8x16b, s35_8x16b); 3418 3419 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 3420 s38_8x16b = _mm_srai_epi32(s36_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 3421 3422 /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */ 3423 s39_8x16b = _mm_sub_epi32(s38_8x16b, offset_8x16b); 3424 3425 s38_8x16b = _mm_packs_epi32(s39_8x16b, zero_8x16b); 3426 3427 /* store 8 8-bit output values */ 3428 /* Store the output pixels of (cur_row+3)*/ 3429 _mm_storel_epi64((__m128i *)(pi2_dst + (3 * dst_strd)), s38_8x16b); 3430 3431 s2_10_16x8b = s2_3_16x8b; 3432 3433 pi2_src += 4 * src_strd; /* pointer update */ 3434 pi2_dst += 4 * dst_strd; /* pointer update */ 3435 } 3436 } 3437 3438 } 3439 3440 /** 3441 ******************************************************************************* 3442 * 3443 * @brief 3444 * Chroma interprediction filter for copy 3445 * 3446 * @par Description: 3447 * Copies the array of width 'wd' and height 'ht' from the location pointed 3448 * by 'src' to the location pointed by 'dst' 3449 * 3450 * @param[in] pu1_src 3451 * UWORD8 pointer to the source 3452 * 3453 * @param[out] pu1_dst 3454 * UWORD8 pointer to the destination 3455 * 3456 * @param[in] src_strd 3457 * integer source stride 3458 * 3459 * @param[in] dst_strd 3460 * integer destination stride 3461 * 3462 * @param[in] pi1_coeff 3463 * WORD8 pointer to the filter coefficients 3464 * 3465 * @param[in] ht 3466 * integer height of the array 3467 * 3468 * @param[in] wd 3469 * integer width of the array 3470 * 3471 * @returns 3472 * 3473 * @remarks 3474 * None 3475 * 3476 ******************************************************************************* 3477 */ 3478 3479 void ihevc_inter_pred_chroma_copy_ssse3(UWORD8 *pu1_src, 3480 UWORD8 *pu1_dst, 3481 WORD32 src_strd, 3482 WORD32 dst_strd, 3483 WORD8 *pi1_coeff, 3484 WORD32 ht, 3485 WORD32 wd) 3486 { 3487 WORD32 row, col; 3488 __m128i s3, mask_4x32b; 3489 UNUSED(pi1_coeff); 3490 ASSERT(wd % 2 == 0); /* checking assumption*/ 3491 ASSERT(ht % 2 == 0); /* checking assumption*/ 3492 3493 mask_4x32b = _mm_set_epi32(0, 0, 0, 0x80808080); /* Mask register */ 3494 3495 /* for loop starts from here */ 3496 if(wd % 8 == 0) 3497 { 3498 for(row = 0; row < ht; row += 2) 3499 { 3500 int offset = 0; 3501 for(col = 0; col < 2 * wd; col += 16) 3502 { 3503 /* row =0 */ 3504 3505 /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/ 3506 s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col]; */ 3507 /* storing 16 8-bit output values */ 3508 _mm_storeu_si128((__m128i *)(pu1_dst + offset), s3); /* pu1_dst[col] = pu1_src[col]; */ 3509 3510 /* row =1 */ 3511 /*load 16 pixel values from 271:256 pos. relative to cur. pos.*/ 3512 s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset)); /* pu1_src[col]; */ 3513 /* storing 8 8-bit output values */ 3514 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd + offset), s3); /* pu1_dst[col] = pu1_src[col]*/ 3515 3516 offset += 16; /*To pointer update */ 3517 } /* inner for loop ends here(16-output values in single iteration) */ 3518 3519 pu1_src += 2 * src_strd; /* pointer update */ 3520 pu1_dst += 2 * dst_strd; /* pointer update */ 3521 } 3522 } 3523 else if(wd % 4 == 0) 3524 { 3525 for(row = 0; row < ht; row += 2) 3526 { 3527 int offset = 0; 3528 for(col = 0; col < 2 * wd; col += 8) 3529 { 3530 /* row =0 */ 3531 /*load 16 pixel values from 15:0 pos. relative to cur. 
pos.*/ 3532 s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col]; */ 3533 /* storing 8 8-bit output values */ 3534 _mm_storel_epi64((__m128i *)(pu1_dst + offset), s3); /* pu1_dst[col] = pu1_src[col]; */ 3535 /* row =1 */ 3536 /*load 16 pixel values from 271:256 pos. relative to cur. pos.*/ 3537 s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset)); /* pu1_src[col]; */ 3538 /* storing 8 8-bit output values */ 3539 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), s3); /* pu1_dst[col] = pu1_src[col]; */ 3540 3541 offset += 8; /* To pointer update */ 3542 } /* inner for loop ends here(8-output values in single iteration) */ 3543 3544 pu1_src += 2 * src_strd; /* pointer update */ 3545 pu1_dst += 2 * dst_strd; /* pointer update */ 3546 } 3547 } 3548 else 3549 { 3550 for(row = 0; row < ht; row += 2) 3551 { 3552 int offset = 0; 3553 for(col = 0; col < 2 * wd; col += 4) 3554 { 3555 /* row =0 */ 3556 s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col] */ 3557 /* storing four 8-bit output values */ 3558 _mm_maskmoveu_si128(s3, mask_4x32b, (char *)(pu1_dst + offset)); /* pu1_dst[col] = pu1_src[col]; */ 3559 /* row =1 */ 3560 /* pu1_src[col] */ 3561 s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset)); 3562 3563 /* storing four 8-bit output values */ 3564 _mm_maskmoveu_si128(s3, mask_4x32b, (char *)(pu1_dst + dst_strd + offset)); /* pu1_dst[col] = pu1_src[col]; */ 3565 3566 offset += 4; /* To pointer update */ 3567 } /* inner for loop ends here(4-output values in single iteration) */ 3568 3569 pu1_src += 2 * src_strd; /* pointer increment */ 3570 pu1_dst += 2 * dst_strd; /* pointer increment */ 3571 } 3572 } 3573 } 3574 3575 /** 3576 ******************************************************************************* 3577 * 3578 * @brief 3579 * Chroma interprediction filter for horizontal input 3580 * 3581 * @par Description: 3582 * Applies a horizontal filter with coefficients pointed to by 'pi1_coeff' 3583 * to the elements pointed by 'pu1_src' and writes to the location pointed 3584 * by 'pu1_dst' The output is downshifted by 6 and clipped to 8 bits 3585 * 3586 * @param[in] pu1_src 3587 * UWORD8 pointer to the source 3588 * 3589 * @param[out] pu1_dst 3590 * UWORD8 pointer to the destination 3591 * 3592 * @param[in] src_strd 3593 * integer source stride 3594 * 3595 * @param[in] dst_strd 3596 * integer destination stride 3597 * 3598 * @param[in] pi1_coeff 3599 * WORD8 pointer to the filter coefficients 3600 * 3601 * @param[in] ht 3602 * integer height of the array 3603 * 3604 * @param[in] wd 3605 * integer width of the array 3606 * 3607 * @returns 3608 * 3609 * @remarks 3610 * None 3611 * 3612 ******************************************************************************* 3613 */ 3614 void ihevc_inter_pred_chroma_horz_ssse3(UWORD8 *pu1_src, 3615 UWORD8 *pu1_dst, 3616 WORD32 src_strd, 3617 WORD32 dst_strd, 3618 WORD8 *pi1_coeff, 3619 WORD32 ht, 3620 WORD32 wd) 3621 { 3622 WORD32 row, col; 3623 3624 __m128i coeff0_1_8x16b, coeff2_3_8x16b, control_mask_1_8x16b, control_mask_2_8x16b, offset_8x16b, mask_low_32b, mask_high_96b; 3625 __m128i src_temp1_16x8b, src_temp2_16x8b, src_temp3_16x8b, src_temp4_16x8b, src_temp5_16x8b, src_temp6_16x8b; 3626 __m128i src_temp11_16x8b, src_temp12_16x8b, src_temp13_16x8b, src_temp14_16x8b, src_temp15_16x8b, src_temp16_16x8b; 3627 __m128i res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b, res_temp4_8x16b, res_temp5_8x16b, res_temp6_8x16b, res_temp7_8x16b; 3628 __m128i res_temp11_8x16b, res_temp12_8x16b, 
res_temp13_8x16b, res_temp14_8x16b, res_temp15_8x16b, res_temp16_8x16b, res_temp17_8x16b; 3629 3630 PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0) 3631 PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0) 3632 PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0) 3633 PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0) 3634 PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0) 3635 PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0) 3636 3637 ASSERT(wd % 2 == 0); /* checking assumption*/ 3638 3639 /* loading four 8-bit coefficients */ 3640 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)pi1_coeff); 3641 3642 offset_8x16b = _mm_set1_epi16(OFFSET_14_MINUS_BIT_DEPTH); /* for offset addition */ 3643 mask_low_32b = _mm_cmpeq_epi16(offset_8x16b, offset_8x16b); 3644 mask_high_96b = _mm_srli_si128(mask_low_32b, 12); 3645 mask_low_32b = _mm_slli_si128(mask_low_32b, 4); 3646 3647 control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */ 3648 control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */ 3649 3650 coeff0_1_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_1_8x16b); /* pi1_coeff[4] */ 3651 coeff2_3_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_2_8x16b); /* pi1_coeff[4] */ 3652 3653 /* outer for loop starts from here */ 3654 if(wd % 2 == 0 && wd % 4 != 0) 3655 { 3656 3657 for(row = 0; row < ht; row += 2) 3658 { 3659 int offset = 0; 3660 3661 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0) 3662 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0) 3663 3664 3665 for(col = 0; col < 2 * wd; col += 4) 3666 { 3667 3668 3669 /*load 16 pixel values from row 0*/ 3670 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/ 3671 3672 /*load 16 pixel values from row 1*/ 3673 src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + src_strd + offset)); /* pu1_src[col + (i-1) * 2]*/ 3674 3675 /*Derive the source pixels for processing the 2nd pixel*/ 3676 src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); 3677 3678 src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); 3679 3680 /*Derive the source pixels for processing the 3rd pixel*/ 3681 src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4); 3682 3683 /*Derive the source pixels for processing the 4th pixel*/ 3684 src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6); 3685 3686 src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b); 3687 3688 /*Derive the source pixels for processing the 2nd pixel*/ 3689 src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); 3690 3691 src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); 3692 3693 /*Derive the source pixels for processing the 3rd pixel*/ 3694 src_temp13_16x8b = _mm_srli_si128(src_temp11_16x8b, 4); 3695 /*Derive the source pixels for processing the 4th pixel*/ 3696 src_temp14_16x8b = _mm_srli_si128(src_temp11_16x8b, 6); 3697 3698 src_temp16_16x8b = _mm_unpacklo_epi8(src_temp13_16x8b, src_temp14_16x8b); 3699 3700 res_temp1_8x16b = _mm_unpacklo_epi64(src_temp5_16x8b, src_temp15_16x8b); 3701 res_temp2_8x16b = _mm_unpacklo_epi64(src_temp6_16x8b, src_temp16_16x8b); 3702 res_temp11_8x16b = _mm_maddubs_epi16(res_temp1_8x16b, coeff0_1_8x16b); 3703 res_temp12_8x16b = _mm_maddubs_epi16(res_temp2_8x16b, coeff2_3_8x16b); 3704 3705 /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */ 3706 res_temp13_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b); 3707 
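                /* res_temp13_8x16b now holds the four filtered samples of row 0 in its
                   low 64 bits and those of row 1 in its high 64 bits; the offset, shift
                   and unsigned pack below therefore produce the 8-bit outputs for both
                   rows from a single register. */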
3708 res_temp14_8x16b = _mm_adds_epi16(res_temp13_8x16b, offset_8x16b); /* row = 0 */ 3709 res_temp15_8x16b = _mm_srai_epi16(res_temp14_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */ 3710 res_temp13_8x16b = _mm_packus_epi16(res_temp15_8x16b, res_temp15_8x16b); /* row = 0 */ 3711 3712 res_temp3_8x16b = _mm_srli_si128(res_temp13_8x16b, 4); 3713 3714 res_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + offset)); 3715 res_temp5_8x16b = _mm_and_si128(res_temp4_8x16b, mask_low_32b); 3716 res_temp6_8x16b = _mm_and_si128(res_temp13_8x16b, mask_high_96b); 3717 res_temp7_8x16b = _mm_or_si128(res_temp5_8x16b, res_temp6_8x16b); 3718 3719 /* store 4 16-bit values */ 3720 _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp7_8x16b); /* pu1_dst[col] = i2_tmp_u */ 3721 3722 res_temp14_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd + offset)); 3723 res_temp15_8x16b = _mm_and_si128(res_temp14_8x16b, mask_low_32b); 3724 res_temp16_8x16b = _mm_and_si128(res_temp3_8x16b, mask_high_96b); 3725 res_temp17_8x16b = _mm_or_si128(res_temp15_8x16b, res_temp16_8x16b); 3726 3727 /* store 4 16-bit values */ 3728 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), res_temp17_8x16b); /* pu1_dst[col] = i2_tmp_u */ 3729 3730 3731 offset += 4; /* To pointer update*/ 3732 3733 } /* inner loop ends here(8- output values in single iteration)*/ 3734 3735 pu1_src += 2 * src_strd; /*pointer update*/ 3736 pu1_dst += 2 * dst_strd; /*pointer update*/ 3737 } 3738 } 3739 else 3740 { 3741 3742 for(row = 0; row < ht; row += 2) 3743 { 3744 int offset = 0; 3745 3746 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0) 3747 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0) 3748 3749 3750 for(col = 0; col < 2 * wd; col += 8) 3751 { 3752 3753 /*load 16 pixel values from row 0*/ 3754 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/ 3755 3756 /*load 16 pixel values from row 1*/ 3757 src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + src_strd + offset)); /* pu1_src[col + (i-1) * 2]*/ 3758 3759 /*Derive the source pixels for processing the 2nd pixel*/ 3760 src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); 3761 3762 src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); 3763 3764 /*Derive the source pixels for processing the 3rd pixel*/ 3765 src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4); 3766 3767 /*Derive the source pixels for processing the 4th pixel*/ 3768 src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6); 3769 3770 src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b); 3771 3772 res_temp1_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff0_1_8x16b); 3773 res_temp2_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff2_3_8x16b); 3774 3775 /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */ 3776 res_temp3_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b); 3777 3778 res_temp4_8x16b = _mm_adds_epi16(res_temp3_8x16b, offset_8x16b); /* row = 0 */ 3779 res_temp5_8x16b = _mm_srai_epi16(res_temp4_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */ 3780 res_temp6_8x16b = _mm_packus_epi16(res_temp5_8x16b, res_temp5_8x16b); /* row = 0 */ 3781 3782 /* store 4 16-bit values */ 3783 _mm_storel_epi64((__m128i *)(pu1_dst + offset), res_temp6_8x16b); /* pi2_dst[col] = i2_tmp_u */ 3784 3785 /*Derive the source pixels for processing the 2nd pixel of row 1*/ 3786 src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); 3787 3788 src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); 3789 3790 
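                /* Row 1 repeats the row 0 pattern: bytes are interleaved so that each
                   16-bit lane produced by _mm_maddubs_epi16 sums a pair of pixels two
                   bytes apart (the same chroma component) scaled by the corresponding
                   (even, odd) coefficient pair. */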
/*Derive the source pixels for processing the 3rd pixel of row 1*/ 3791 src_temp13_16x8b = _mm_srli_si128(src_temp11_16x8b, 4); 3792 3793 /*Derive the source pixels for processing the 4th pixel of row 1*/ 3794 src_temp14_16x8b = _mm_srli_si128(src_temp11_16x8b, 6); 3795 3796 src_temp16_16x8b = _mm_unpacklo_epi8(src_temp13_16x8b, src_temp14_16x8b); 3797 3798 res_temp11_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff0_1_8x16b); 3799 res_temp12_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff2_3_8x16b); 3800 3801 /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */ 3802 res_temp13_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b); 3803 3804 res_temp14_8x16b = _mm_adds_epi16(res_temp13_8x16b, offset_8x16b); /* row = 0 */ 3805 res_temp15_8x16b = _mm_srai_epi16(res_temp14_8x16b, SHIFT_14_MINUS_BIT_DEPTH); /* row = 0 */ 3806 res_temp16_8x16b = _mm_packus_epi16(res_temp15_8x16b, res_temp15_8x16b); /* row = 0 */ 3807 3808 /* store 4 16-bit values */ 3809 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd + offset), res_temp16_8x16b); /* pu1_dst[col] = i2_tmp_u */ 3810 3811 3812 offset += 8; /* To pointer update*/ 3813 3814 } /* inner loop ends here(8- output values in single iteration)*/ 3815 3816 pu1_src += 2 * src_strd; /*pointer update*/ 3817 pu1_dst += 2 * dst_strd; /*pointer update*/ 3818 } 3819 } 3820 } 3821 3822 /** 3823 ******************************************************************************* 3824 * 3825 * @brief 3826 * Chroma interprediction filter for vertical input 3827 * 3828 * @par Description: 3829 * Applies a vertcal filter with coefficients pointed to by 'pi1_coeff' to 3830 * the elements pointed by 'pu1_src' and writes to the location pointed by 3831 * 'pu1_dst' The output is downshifted by 6 and clipped to 8 bits 3832 * 3833 * 3834 * @param[in] pu1_src 3835 * UWORD8 pointer to the source 3836 * 3837 * @param[out] pu1_dst 3838 * UWORD8 pointer to the destination 3839 * 3840 * @param[in] src_strd 3841 * integer source stride 3842 * 3843 * @param[in] dst_strd 3844 * integer destination stride 3845 * 3846 * @param[in] pi1_coeff 3847 * WORD8 pointer to the filter coefficients 3848 * 3849 * @param[in] ht 3850 * integer height of the array 3851 * 3852 * @param[in] wd 3853 * integer width of the array 3854 * 3855 * @returns 3856 * 3857 * @remarks 3858 * None 3859 * 3860 ******************************************************************************* 3861 */ 3862 void ihevc_inter_pred_chroma_vert_ssse3(UWORD8 *pu1_src, 3863 UWORD8 *pu1_dst, 3864 WORD32 src_strd, 3865 WORD32 dst_strd, 3866 WORD8 *pi1_coeff, 3867 WORD32 ht, 3868 WORD32 wd) 3869 { 3870 WORD32 row, col; 3871 UWORD8 *pu1_src_copy; 3872 UWORD8 *pu1_dst_copy; 3873 __m128i coeff0_1_8x16b, coeff2_3_8x16b; 3874 __m128i s4_8x16b, s5_8x16b, s6_8x16b, s7_8x16b, s8_8x16b, s9_8x16b; 3875 __m128i control_mask_1_8x16b, control_mask_2_8x16b; 3876 __m128i s11_8x16b, s12_8x16b, s15_8x16b, s16_8x16b; 3877 __m128i zero_8x16b, offset_8x16b, mask_low_32b, mask_high_96b; 3878 __m128i s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b; 3879 __m128i s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b; 3880 3881 PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0) 3882 PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0) 3883 PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0) 3884 PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0) 3885 PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0) 3886 PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0) 3887 3888 /* 
load 8 8-bit coefficients and convert 8-bit into 16-bit */ 3889 s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff); 3890 3891 control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */ 3892 control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */ 3893 3894 coeff0_1_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_1_8x16b); /* pi1_coeff[4] */ 3895 coeff2_3_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_2_8x16b); /* pi1_coeff[4] */ 3896 3897 3898 /* seting values in register */ 3899 zero_8x16b = _mm_setzero_si128(); /* for saturated clipping */ 3900 offset_8x16b = _mm_set1_epi16(OFFSET_14_MINUS_BIT_DEPTH); /* for offset addition */ 3901 mask_low_32b = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000); 3902 mask_high_96b = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF); 3903 3904 /* outer for loop starts from here */ 3905 if(wd % 8 == 0) 3906 { /* wd = multiple of 8 case */ 3907 3908 pu1_src_copy = pu1_src; 3909 pu1_dst_copy = pu1_dst; 3910 3911 for(col = 0; col < 2 * wd; col += 16) 3912 { 3913 3914 pu1_src = pu1_src_copy + col; 3915 pu1_dst = pu1_dst_copy + col; 3916 3917 3918 for(row = 0; row < ht; row += 2) 3919 { 3920 3921 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0) 3922 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0) 3923 3924 3925 /*load 8 pixel values from -751:-768 pos. relative to cur. pos.*/ 3926 s21_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (-1 * src_strd))); 3927 3928 /*load 8 pixel values from -495:-512 pos. relative to cur. pos.*/ 3929 s22_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (0 * src_strd))); 3930 3931 3932 /*load 8 pixel values from -239:-256 pos. relative to cur. pos.*/ 3933 s23_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (1 * src_strd))); 3934 3935 /*load 8 pixel values from 15:0 pos. relative to cur. 
pos.*/ 3936 s24_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (2 * src_strd))); 3937 3938 s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b); 3939 3940 s31_8x16b = _mm_unpackhi_epi8(s21_8x16b, s22_8x16b); 3941 3942 s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b); 3943 3944 s33_8x16b = _mm_unpackhi_epi8(s23_8x16b, s24_8x16b); 3945 3946 s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b); 3947 3948 s32_8x16b = _mm_maddubs_epi16(s31_8x16b, coeff0_1_8x16b); 3949 3950 s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b); 3951 3952 s34_8x16b = _mm_maddubs_epi16(s33_8x16b, coeff2_3_8x16b); 3953 3954 s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */ 3955 3956 s35_8x16b = _mm_add_epi16(s32_8x16b, s34_8x16b); 3957 3958 s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b); 3959 3960 s31_8x16b = _mm_add_epi16(s35_8x16b, offset_8x16b); 3961 3962 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 3963 s6_8x16b = _mm_srai_epi16(s5_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 3964 3965 s32_8x16b = _mm_srai_epi16(s31_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 3966 3967 /* i2_tmp = CLIP_U8(i2_tmp);*/ 3968 s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b); 3969 3970 s33_8x16b = _mm_packus_epi16(s32_8x16b, zero_8x16b); 3971 3972 s7_8x16b = _mm_unpacklo_epi64(s7_8x16b, s33_8x16b); 3973 /* store 8 8-bit output values */ 3974 /* pu1_dst[col] = (UWORD8)i2_tmp; */ 3975 _mm_storeu_si128((__m128i *)(pu1_dst), s7_8x16b); 3976 3977 3978 s25_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (3 * src_strd))); 3979 3980 s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b); 3981 3982 s31_8x16b = _mm_unpackhi_epi8(s22_8x16b, s23_8x16b); 3983 3984 s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b); 3985 3986 s32_8x16b = _mm_maddubs_epi16(s31_8x16b, coeff0_1_8x16b); 3987 3988 s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b); 3989 3990 s33_8x16b = _mm_unpackhi_epi8(s24_8x16b, s25_8x16b); 3991 3992 s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b); 3993 3994 s34_8x16b = _mm_maddubs_epi16(s33_8x16b, coeff2_3_8x16b); 3995 3996 s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */ 3997 3998 s35_8x16b = _mm_add_epi16(s32_8x16b, s34_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */ 3999 4000 s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b); 4001 4002 s31_8x16b = _mm_add_epi16(s35_8x16b, offset_8x16b); 4003 4004 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 4005 s6_8x16b = _mm_srai_epi16(s5_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 4006 4007 s32_8x16b = _mm_srai_epi16(s31_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 4008 4009 /* i2_tmp = CLIP_U8(i2_tmp);*/ 4010 s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b); 4011 4012 s33_8x16b = _mm_packus_epi16(s32_8x16b, zero_8x16b); 4013 4014 s7_8x16b = _mm_unpacklo_epi64(s7_8x16b, s33_8x16b); 4015 /* store 8 8-bit output values */ 4016 /* pu1_dst[col] = (UWORD8)i2_tmp; */ 4017 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), s7_8x16b); 4018 4019 pu1_src += 2 * src_strd; 4020 pu1_dst += 2 * dst_strd; 4021 4022 4023 } /* inner for loop ends here(8-output values in single iteration) */ 4024 4025 } 4026 } 4027 else if(wd % 4 == 0) 4028 { /* wd = multiple of 8 case */ 4029 4030 for(row = 0; row < ht; row += 2) 4031 { 4032 pu1_src_copy = pu1_src; 4033 pu1_dst_copy = pu1_dst; 4034 for(col = 0; col < 2 * wd; col += 8) 4035 { 4036 4037 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0) 4038 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0) 4039 4040 4041 /*load 8 pixel values 
from -751:-768 pos. relative to cur. pos.*/ 4042 s21_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd))); 4043 4044 /*load 8 pixel values from -495:-512 pos. relative to cur. pos.*/ 4045 s22_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd))); 4046 4047 s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b); 4048 4049 s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b); 4050 4051 /*load 8 pixel values from -239:-256 pos. relative to cur. pos.*/ 4052 s23_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd))); 4053 4054 /*load 8 pixel values from 15:0 pos. relative to cur. pos.*/ 4055 s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd))); 4056 4057 s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b); 4058 4059 s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b); 4060 4061 s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */ 4062 4063 s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b); 4064 4065 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 4066 s6_8x16b = _mm_srai_epi16(s5_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 4067 4068 /* i2_tmp = CLIP_U8(i2_tmp);*/ 4069 s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b); 4070 4071 /* store 8 8-bit output values */ 4072 /* pu1_dst[col] = (UWORD8)i2_tmp; */ 4073 _mm_storel_epi64((__m128i *)(pu1_dst), s7_8x16b); 4074 4075 s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd))); 4076 4077 s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b); 4078 s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b); 4079 4080 s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b); 4081 s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b); 4082 4083 s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */ 4084 4085 s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b); 4086 4087 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 4088 s6_8x16b = _mm_srai_epi16(s5_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 4089 4090 /* i2_tmp = CLIP_U8(i2_tmp);*/ 4091 s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b); 4092 4093 /* store 8 8-bit output values */ 4094 /* pu1_dst[col] = (UWORD8)i2_tmp; */ 4095 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s7_8x16b); 4096 4097 pu1_src += 8; /* To pointer update */ 4098 pu1_dst += 8; 4099 4100 } /* inner for loop ends here(8-output values in single iteration) */ 4101 4102 pu1_src = pu1_src_copy + 2 * src_strd; /* pointer update */ 4103 pu1_dst = pu1_dst_copy + 2 * dst_strd; /* pointer update */ 4104 } 4105 } 4106 4107 else 4108 { /* wd = multiple of 4 case */ 4109 4110 for(row = 0; row < ht; row += 2) 4111 { 4112 pu1_src_copy = pu1_src; 4113 pu1_dst_copy = pu1_dst; 4114 for(col = 0; col < 2 * wd; col += 4) 4115 { 4116 4117 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0) 4118 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0) 4119 4120 4121 /*load 8 pixel values from -751:-768 pos. relative to cur. pos.*/ 4122 s21_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd))); 4123 4124 /*load 8 pixel values from -495:-512 pos. relative to cur. pos.*/ 4125 s22_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd))); 4126 4127 s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b); 4128 4129 s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b); 4130 4131 /*load 8 pixel values from -239:-256 pos. relative to cur. pos.*/ 4132 s23_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd))); 4133 4134 /*load 8 pixel values from 15:0 pos. relative to cur. 
pos.*/ 4135 s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd))); 4136 4137 s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b); 4138 4139 s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b); 4140 4141 s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */ 4142 4143 s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b); 4144 4145 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 4146 s6_8x16b = _mm_srai_epi16(s5_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 4147 4148 /* i2_tmp = CLIP_U8(i2_tmp);*/ 4149 s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b); 4150 4151 s9_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst)); 4152 s5_8x16b = _mm_and_si128(s9_8x16b, mask_low_32b); 4153 s6_8x16b = _mm_and_si128(s7_8x16b, mask_high_96b); 4154 s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b); 4155 4156 /* store 8 8-bit output values */ 4157 /* pu1_dst[col] = (UWORD8)i2_tmp; */ 4158 _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b); 4159 4160 s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd))); 4161 4162 s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b); 4163 s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b); 4164 4165 s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b); 4166 s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b); 4167 4168 s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */ 4169 4170 s5_8x16b = _mm_add_epi16(s8_8x16b, offset_8x16b); 4171 4172 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 4173 s6_8x16b = _mm_srai_epi16(s5_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 4174 4175 /* i2_tmp = CLIP_U8(i2_tmp);*/ 4176 s7_8x16b = _mm_packus_epi16(s6_8x16b, zero_8x16b); 4177 4178 s9_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd)); 4179 s5_8x16b = _mm_and_si128(s9_8x16b, mask_low_32b); 4180 s6_8x16b = _mm_and_si128(s7_8x16b, mask_high_96b); 4181 s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b); 4182 4183 /* store 8 8-bit output values */ 4184 /* pu1_dst[col] = (UWORD8)i2_tmp; */ 4185 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s9_8x16b); 4186 4187 pu1_src += 4; /* To pointer update */ 4188 pu1_dst += 4; 4189 } /* inner for loop ends here(8-output values in single iteration) */ 4190 4191 pu1_src = pu1_src_copy + 2 * src_strd; /* pointer update */ 4192 pu1_dst = pu1_dst_copy + 2 * dst_strd; /* pointer update */ 4193 } 4194 } 4195 } 4196 4197 /** 4198 ******************************************************************************* 4199 * 4200 * @brief 4201 * chroma interprediction filter for copying 16bit output 4202 * 4203 * @par Description: 4204 * Copies the array of width 'wd' and height 'ht' from the location pointed 4205 * by 'src' to the location pointed by 'dst' The output is upshifted by 6 4206 * bits and is used as input for vertical filtering or weighted prediction 4207 * 4208 * @param[in] pu1_src 4209 * UWORD8 pointer to the source 4210 * 4211 * @param[out] pi2_dst 4212 * WORD16 pointer to the destination 4213 * 4214 * @param[in] src_strd 4215 * integer source stride 4216 * 4217 * @param[in] dst_strd 4218 * integer destination stride 4219 * 4220 * @param[in] pi1_coeff 4221 * WORD8 pointer to the filter coefficients 4222 * 4223 * @param[in] ht 4224 * integer height of the array 4225 * 4226 * @param[in] wd 4227 * integer width of the array 4228 * 4229 * @returns 4230 * 4231 * @remarks 4232 * None 4233 * 4234 ******************************************************************************* 4235 */ 4236 4237 void ihevc_inter_pred_chroma_copy_w16out_ssse3(UWORD8 
*pu1_src, 4238 WORD16 *pi2_dst, 4239 WORD32 src_strd, 4240 WORD32 dst_strd, 4241 WORD8 *pi1_coeff, 4242 WORD32 ht, 4243 WORD32 wd) 4244 { 4245 WORD32 row, col; 4246 __m128i s3, zero_8x16b; 4247 4248 ASSERT(wd % 2 == 0); /* checking assumption*/ 4249 ASSERT(ht % 2 == 0); /* checking assumption*/ 4250 4251 UNUSED(pi1_coeff); 4252 zero_8x16b = _mm_setzero_si128(); 4253 /* outer for loop starts from here */ 4254 if(wd == 2) /* for wd =2 */ 4255 { 4256 for(row = 0; row < ht; row += 2) 4257 { 4258 int offset = 0; 4259 for(col = 0; col < 2 * wd; col += 4) 4260 { 4261 /* row =0 */ 4262 /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/ 4263 s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col] */ 4264 s3 = _mm_unpacklo_epi8(s3, zero_8x16b); 4265 4266 s3 = _mm_slli_epi16(s3, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */ 4267 4268 /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */ 4269 _mm_storel_epi64((__m128i *)(pi2_dst + offset), s3); 4270 4271 /* row =1 */ 4272 /*load 16 pixel values from 271:256 pos. relative to cur. pos.*/ 4273 s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset)); 4274 s3 = _mm_unpacklo_epi8(s3, zero_8x16b); 4275 4276 s3 = _mm_slli_epi16(s3, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */ 4277 4278 _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), s3); 4279 offset += 4; /* To pointer update */ 4280 } /* inner for loop ends here */ 4281 4282 pu1_src += 2 * src_strd; /* pointer update */ 4283 pi2_dst += 2 * dst_strd; /* pointer update */ 4284 } 4285 } 4286 else if(wd % 2 == 0 && wd % 4 != 0) 4287 { 4288 for(row = 0; row < ht / 2; row++) 4289 { 4290 int offset = 0; 4291 int count = (2 * wd) / 8; 4292 for(col = 0; col < count; col++) 4293 { 4294 /* row =0 */ 4295 /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/ 4296 s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col]*/ 4297 s3 = _mm_unpacklo_epi8(s3, zero_8x16b); 4298 4299 /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */ 4300 s3 = _mm_slli_epi16(s3, SHIFT_14_MINUS_BIT_DEPTH); 4301 4302 /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */ 4303 _mm_storeu_si128((__m128i *)(pi2_dst + offset), s3); 4304 4305 /*row=1*/ /*load 16 pixel values from 271:256 pos. relative to cur. pos.*/ 4306 s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset)); 4307 s3 = _mm_unpacklo_epi8(s3, zero_8x16b); 4308 4309 s3 = _mm_slli_epi16(s3, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */ 4310 _mm_storeu_si128((__m128i *)(pi2_dst + dst_strd + offset), s3); 4311 4312 offset += 8; /* To pointer update*/ 4313 } /* inner for loop ends here(8-output values in single iteration) */ 4314 4315 /* finding last four values */ 4316 s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col] */ 4317 s3 = _mm_unpacklo_epi8(s3, zero_8x16b); 4318 4319 s3 = _mm_slli_epi16(s3, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */ 4320 4321 /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */ 4322 _mm_storel_epi64((__m128i *)(pi2_dst + offset), s3); 4323 4324 /*load 16 pixel values from 271:256 pos. relative to cur. 
pos.*/ 4325 s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset)); 4326 s3 = _mm_unpacklo_epi8(s3, zero_8x16b); 4327 4328 s3 = _mm_slli_epi16(s3, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */ 4329 _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), s3); 4330 4331 pu1_src += 2 * src_strd; /* pointer update */ 4332 pi2_dst += 2 * dst_strd; 4333 } 4334 } 4335 else 4336 { 4337 for(row = 0; row < ht / 2; row++) 4338 { 4339 int offset = 0; 4340 for(col = 0; col < 2 * wd / 8; col++) 4341 { 4342 /* row =0 */ 4343 /*load 16 pixel values from 15:0 pos. relative to cur. pos.*/ 4344 s3 = _mm_loadu_si128((__m128i *)(pu1_src + offset)); /* pu1_src[col]*/ 4345 s3 = _mm_unpacklo_epi8(s3, zero_8x16b); 4346 4347 /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */ 4348 s3 = _mm_slli_epi16(s3, SHIFT_14_MINUS_BIT_DEPTH); 4349 4350 /* pi2_dst[col] = (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH); */ 4351 _mm_storeu_si128((__m128i *)(pi2_dst + offset), s3); 4352 4353 /*row=1*/ /*load 16 pixel values from 271:256 pos. relative to cur. pos.*/ 4354 s3 = _mm_loadu_si128((__m128i *)(pu1_src + src_strd + offset)); 4355 s3 = _mm_unpacklo_epi8(s3, zero_8x16b); 4356 4357 s3 = _mm_slli_epi16(s3, SHIFT_14_MINUS_BIT_DEPTH); /* (pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH) */ 4358 _mm_store_si128((__m128i *)(pi2_dst + dst_strd + offset), s3); 4359 4360 offset += 8; /* To pointer update*/ 4361 } /* inner for loop ends here(8-output values in single iteration) */ 4362 4363 pu1_src += 2 * src_strd; /* pointer update */ 4364 pi2_dst += 2 * dst_strd; 4365 } 4366 } 4367 } 4368 4369 /** 4370 ******************************************************************************* 4371 * 4372 * @brief 4373 * chroma interprediction filter to store horizontal 16bit ouput 4374 * 4375 * @par Description: 4376 * Applies a horizontal filter with coefficients pointed to by 'pi1_coeff' 4377 * to the elements pointed by 'pu1_src' and writes to the location pointed 4378 * by 'pu1_dst' No downshifting or clipping is done and the output is used 4379 * as an input for vertical filtering or weighted prediction 4380 * 4381 * @param[in] pu1_src 4382 * UWORD8 pointer to the source 4383 * 4384 * @param[out] pi2_dst 4385 * WORD16 pointer to the destination 4386 * 4387 * @param[in] src_strd 4388 * integer source stride 4389 * 4390 * @param[in] dst_strd 4391 * integer destination stride 4392 * 4393 * @param[in] pi1_coeff 4394 * WORD8 pointer to the filter coefficients 4395 * 4396 * @param[in] ht 4397 * integer height of the array 4398 * 4399 * @param[in] wd 4400 * integer width of the array 4401 * 4402 * @returns 4403 * 4404 * @remarks 4405 * None 4406 * 4407 ******************************************************************************* 4408 */ 4409 void ihevc_inter_pred_chroma_horz_w16out_ssse3(UWORD8 *pu1_src, 4410 WORD16 *pi2_dst, 4411 WORD32 src_strd, 4412 WORD32 dst_strd, 4413 WORD8 *pi1_coeff, 4414 WORD32 ht, 4415 WORD32 wd) 4416 { 4417 WORD32 row, col; 4418 4419 __m128i coeff0_1_8x16b, coeff2_3_8x16b, control_mask_1_8x16b, control_mask_2_8x16b, all_zero; 4420 __m128i src_temp1_16x8b, src_temp2_16x8b, src_temp3_16x8b, src_temp4_16x8b, src_temp5_16x8b, src_temp6_16x8b; 4421 __m128i src_temp11_16x8b, src_temp12_16x8b, src_temp13_16x8b, src_temp14_16x8b, src_temp15_16x8b, src_temp16_16x8b; 4422 __m128i res_temp1_8x16b, res_temp2_8x16b, res_temp3_8x16b; 4423 __m128i res_temp11_8x16b, res_temp12_8x16b, res_temp13_8x16b; 4424 4425 PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0) 4426 PREFETCH((char const 
*)(pu1_src + (1 * src_strd)), _MM_HINT_T0) 4427 PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0) 4428 PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0) 4429 PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0) 4430 PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0) 4431 4432 ASSERT(wd % 2 == 0); /* checking assumption*/ 4433 4434 /* loading four 8-bit coefficients and convert 8-bit into 16-bit */ 4435 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)pi1_coeff); 4436 4437 all_zero = _mm_setzero_si128(); 4438 4439 control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */ 4440 control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */ 4441 4442 coeff0_1_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_1_8x16b); /* pi1_coeff[4] */ 4443 coeff2_3_8x16b = _mm_shuffle_epi8(src_temp1_16x8b, control_mask_2_8x16b); /* pi1_coeff[4] */ 4444 4445 /* outer for loop starts from here */ 4446 if(wd % 2 == 0 && wd % 4 != 0) 4447 { 4448 int offset = 0; 4449 for(row = ht; row >= 2; row -= 2) 4450 { 4451 offset = 0; 4452 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0) 4453 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0) 4454 4455 4456 for(col = 0; col < 2 * wd; col += 4) 4457 { 4458 4459 /*load 16 pixel values of row 0*/ 4460 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/ 4461 4462 /*load 16 pixel values of row 1*/ 4463 src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + src_strd + offset)); /* pu1_src[col + (i-1) * 2]*/ 4464 4465 /*Derive the source pixels for processing the 2nd pixel of row 0*/ 4466 src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); 4467 4468 src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); 4469 4470 /*Derive the source pixels for processing the 3rd pixel of row 0*/ 4471 src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4); 4472 4473 /*Derive the source pixels for processing the 4th pixel of row 0*/ 4474 src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6); 4475 4476 src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b); 4477 4478 /*Derive the source pixels for processing the 2nd pixel of row 1*/ 4479 src_temp12_16x8b = _mm_srli_si128(src_temp11_16x8b, 2); 4480 4481 src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); 4482 4483 /*Derive the source pixels for processing the 3rd pixel of row 1*/ 4484 src_temp13_16x8b = _mm_srli_si128(src_temp11_16x8b, 4); 4485 4486 /*Derive the source pixels for processing the 4th pixel of row 1*/ 4487 src_temp14_16x8b = _mm_srli_si128(src_temp11_16x8b, 6); 4488 4489 src_temp16_16x8b = _mm_unpacklo_epi8(src_temp13_16x8b, src_temp14_16x8b); 4490 4491 res_temp1_8x16b = _mm_unpacklo_epi64(src_temp5_16x8b, src_temp15_16x8b); 4492 res_temp2_8x16b = _mm_unpacklo_epi64(src_temp6_16x8b, src_temp16_16x8b); 4493 res_temp11_8x16b = _mm_maddubs_epi16(res_temp1_8x16b, coeff0_1_8x16b); 4494 res_temp12_8x16b = _mm_maddubs_epi16(res_temp2_8x16b, coeff2_3_8x16b); 4495 4496 /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */ 4497 res_temp13_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b); 4498 4499 res_temp3_8x16b = _mm_srli_si128(res_temp13_8x16b, 8); 4500 4501 /* store 4 16-bit values */ 4502 _mm_storel_epi64((__m128i *)(pi2_dst + offset), res_temp13_8x16b); /* pi2_dst[col] = i2_tmp_u */ 4503 4504 4505 4506 /* store 4 16-bit values */ 4507 _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd + offset), res_temp3_8x16b); 
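                    /* Note on the two 64-bit stores above: rows 0 and 1 were
                     * interleaved into one register with _mm_unpacklo_epi64 so
                     * that a single pair of _mm_maddubs_epi16 calls filters both
                     * rows at once; the low 64 bits of the sum hold the four
                     * 16-bit results for row 0 and the high 64 bits (brought
                     * down with _mm_srli_si128(.., 8)) hold row 1. No rounding
                     * or downshift is applied, since this w16out intermediate
                     * feeds the vertical stage or weighted prediction. */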
/* pi2_dst[col] = i2_tmp_u */ 4508 4509 4510 offset += 4; /* To pointer update*/ 4511 4512 } /* inner loop ends here(8- output values in single iteration)*/ 4513 4514 pu1_src += 2 * src_strd; /*pointer update*/ 4515 pi2_dst += 2 * dst_strd; /*pointer update*/ 4516 } 4517 4518 /*Epilogue to handle ht= odd case*/ 4519 if(row) 4520 { 4521 offset = 0; 4522 for(col = 0; col < 2 * wd; col += 4) 4523 { 4524 4525 /*load 16 pixel values of row 0*/ 4526 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/ 4527 4528 /*Derive the source pixels for processing the 2nd pixel of row 0*/ 4529 src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); 4530 4531 src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); 4532 4533 /*Derive the source pixels for processing the 3rd pixel of row 0*/ 4534 src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4); 4535 4536 /*Derive the source pixels for processing the 4th pixel of row 0*/ 4537 src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6); 4538 4539 src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b); 4540 4541 res_temp1_8x16b = _mm_unpacklo_epi64(src_temp5_16x8b, all_zero); 4542 res_temp2_8x16b = _mm_unpacklo_epi64(src_temp6_16x8b, all_zero); 4543 res_temp11_8x16b = _mm_maddubs_epi16(res_temp1_8x16b, coeff0_1_8x16b); 4544 res_temp12_8x16b = _mm_maddubs_epi16(res_temp2_8x16b, coeff2_3_8x16b); 4545 4546 /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */ 4547 res_temp13_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b); 4548 4549 //res_temp3_8x16b = _mm_srli_si128 (res_temp13_8x16b, 8); 4550 4551 /* store 4 16-bit values */ 4552 _mm_storel_epi64((__m128i *)(pi2_dst + offset), res_temp13_8x16b); /* pi2_dst[col] = i2_tmp_u */ 4553 4554 offset += 4; /* To pointer update*/ 4555 4556 } 4557 } 4558 4559 } 4560 else 4561 { 4562 int offset = 0; 4563 4564 for(row = ht; row >= 2; row -= 2) 4565 { 4566 offset = 0; 4567 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0) 4568 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0) 4569 4570 4571 for(col = 0; col < 2 * wd; col += 8) 4572 { 4573 4574 /*load 16 pixel values of row 0*/ 4575 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/ 4576 4577 /*load 16 pixel values of row 1*/ 4578 src_temp11_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + src_strd + offset)); /* pu1_src[col + (i-1) * 2]*/ 4579 4580 /*Derive the source pixels for processing the 2nd pixel of row 0*/ 4581 src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); 4582 4583 src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); 4584 4585 /*Derive the source pixels for processing the 3rd pixel of row 0*/ 4586 src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4); 4587 4588 /*Derive the source pixels for processing the 4th pixel of row 0*/ 4589 src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6); 4590 4591 src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b); 4592 4593 res_temp1_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff0_1_8x16b); 4594 res_temp2_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff2_3_8x16b); 4595 4596 /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */ 4597 res_temp3_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b); 4598 4599 /* store 8 16-bit values */ 4600 _mm_storeu_si128((__m128i *)(pi2_dst + offset), res_temp3_8x16b); /* pi2_dst[col] = i2_tmp_u */ 4601 4602 /*Derive the source pixels for processing the 2nd pixel of row 1*/ 4603 src_temp12_16x8b 
= _mm_srli_si128(src_temp11_16x8b, 2); 4604 4605 src_temp15_16x8b = _mm_unpacklo_epi8(src_temp11_16x8b, src_temp12_16x8b); 4606 4607 /*Derive the source pixels for processing the 3rd pixel of row 1*/ 4608 src_temp13_16x8b = _mm_srli_si128(src_temp11_16x8b, 4); 4609 4610 /*Derive the source pixels for processing the 4th pixel of row 1*/ 4611 src_temp14_16x8b = _mm_srli_si128(src_temp11_16x8b, 6); 4612 4613 src_temp16_16x8b = _mm_unpacklo_epi8(src_temp13_16x8b, src_temp14_16x8b); 4614 4615 res_temp11_8x16b = _mm_maddubs_epi16(src_temp15_16x8b, coeff0_1_8x16b); 4616 res_temp12_8x16b = _mm_maddubs_epi16(src_temp16_16x8b, coeff2_3_8x16b); 4617 4618 /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */ 4619 res_temp13_8x16b = _mm_add_epi16(res_temp11_8x16b, res_temp12_8x16b); 4620 4621 /* store 8 16-bit values */ 4622 _mm_storeu_si128((__m128i *)(pi2_dst + dst_strd + offset), res_temp13_8x16b); /* pi2_dst[col] = i2_tmp_u */ 4623 4624 4625 offset += 8; /* To pointer update*/ 4626 4627 } /* inner loop ends here(8- output values in single iteration)*/ 4628 4629 pu1_src += 2 * src_strd; /*pointer update*/ 4630 pi2_dst += 2 * dst_strd; /*pointer update*/ 4631 } 4632 4633 /*Epilogue to take care of odd ht*/ 4634 if(row) 4635 { 4636 offset = 0; 4637 for(col = 0; col < 2 * wd; col += 8) 4638 { 4639 4640 /*load 16 pixel values of row 0*/ 4641 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src - 2 + offset)); /* pu1_src[col + (i-1) * 2]*/ 4642 4643 /*Derive the source pixels for processing the 2nd pixel of row 0*/ 4644 src_temp2_16x8b = _mm_srli_si128(src_temp1_16x8b, 2); 4645 4646 src_temp5_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, src_temp2_16x8b); 4647 4648 /*Derive the source pixels for processing the 3rd pixel of row 0*/ 4649 src_temp3_16x8b = _mm_srli_si128(src_temp1_16x8b, 4); 4650 4651 /*Derive the source pixels for processing the 4th pixel of row 0*/ 4652 src_temp4_16x8b = _mm_srli_si128(src_temp1_16x8b, 6); 4653 4654 src_temp6_16x8b = _mm_unpacklo_epi8(src_temp3_16x8b, src_temp4_16x8b); 4655 4656 res_temp1_8x16b = _mm_maddubs_epi16(src_temp5_16x8b, coeff0_1_8x16b); 4657 res_temp2_8x16b = _mm_maddubs_epi16(src_temp6_16x8b, coeff2_3_8x16b); 4658 4659 /* i4_tmp += pi1_coeff[i] * pi2_src[col + (i-1) * 2] */ 4660 res_temp3_8x16b = _mm_add_epi16(res_temp1_8x16b, res_temp2_8x16b); 4661 4662 /* store 8 16-bit values */ 4663 _mm_storeu_si128((__m128i *)(pi2_dst + offset), res_temp3_8x16b); /* pi2_dst[col] = i2_tmp_u */ 4664 4665 offset += 8; /* To pointer update*/ 4666 4667 } 4668 } 4669 4670 } 4671 } 4672 4673 /** 4674 ******************************************************************************* 4675 * 4676 * @brief 4677 * Interprediction chroma filter to store vertical 16bit ouput 4678 * 4679 * @par Description: 4680 * Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to 4681 * the elements pointed by 'pu1_src' and writes to the location pointed by 4682 * 'pu1_dst' No downshifting or clipping is done and the output is used as 4683 * an input for weighted prediction 4684 * 4685 * @param[in] pu1_src 4686 * UWORD8 pointer to the source 4687 * 4688 * @param[out] pi2_dst 4689 * WORD16 pointer to the destination 4690 * 4691 * @param[in] src_strd 4692 * integer source stride 4693 * 4694 * @param[in] dst_strd 4695 * integer destination stride 4696 * 4697 * @param[in] pi1_coeff 4698 * WORD8 pointer to the filter coefficients 4699 * 4700 * @param[in] ht 4701 * integer height of the array 4702 * 4703 * @param[in] wd 4704 * integer width of the array 4705 * 4706 * @returns 4707 * 4708 * 
@remarks 4709 * None 4710 * 4711 ******************************************************************************* 4712 */ 4713 void ihevc_inter_pred_chroma_vert_w16out_ssse3(UWORD8 *pu1_src, 4714 WORD16 *pi2_dst, 4715 WORD32 src_strd, 4716 WORD32 dst_strd, 4717 WORD8 *pi1_coeff, 4718 WORD32 ht, 4719 WORD32 wd) 4720 { 4721 WORD32 row, col; 4722 UWORD8 *pu1_src_copy; 4723 WORD16 *pi2_dst_copy; 4724 __m128i coeff0_1_8x16b, coeff2_3_8x16b; 4725 __m128i s4_8x16b, s5_8x16b, s6_8x16b, s8_8x16b; 4726 __m128i control_mask_1_8x16b, control_mask_2_8x16b; 4727 __m128i s11_8x16b, s12_8x16b, s15_8x16b, s16_8x16b; 4728 __m128i s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b; 4729 __m128i s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b; 4730 4731 4732 PREFETCH((char const *)(pu1_src + (0 * src_strd)), _MM_HINT_T0) 4733 PREFETCH((char const *)(pu1_src + (1 * src_strd)), _MM_HINT_T0) 4734 PREFETCH((char const *)(pu1_src + (2 * src_strd)), _MM_HINT_T0) 4735 PREFETCH((char const *)(pu1_src + (3 * src_strd)), _MM_HINT_T0) 4736 PREFETCH((char const *)(pu1_src + (4 * src_strd)), _MM_HINT_T0) 4737 PREFETCH((char const *)(pu1_src + (5 * src_strd)), _MM_HINT_T0) 4738 4739 /* load 8 8-bit coefficients and convert 8-bit into 16-bit */ 4740 s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff); 4741 4742 control_mask_1_8x16b = _mm_set1_epi32(0x01000100); /* Control Mask register */ 4743 control_mask_2_8x16b = _mm_set1_epi32(0x03020302); /* Control Mask register */ 4744 4745 coeff0_1_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_1_8x16b); /* pi1_coeff[4] */ 4746 coeff2_3_8x16b = _mm_shuffle_epi8(s4_8x16b, control_mask_2_8x16b); /* pi1_coeff[4] */ 4747 4748 4749 4750 /* outer for loop starts from here */ 4751 if(wd % 8 == 0) 4752 { /* wd = multiple of 8 case */ 4753 4754 pu1_src_copy = pu1_src; 4755 pi2_dst_copy = pi2_dst; 4756 4757 for(col = 0; col < 2 * wd; col += 16) 4758 { 4759 4760 pu1_src = pu1_src_copy + col; 4761 pi2_dst = pi2_dst_copy + col; 4762 4763 4764 for(row = 0; row < ht; row += 2) 4765 { 4766 4767 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0) 4768 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0) 4769 4770 4771 /*load 16 pixel values */ 4772 s21_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (-1 * src_strd))); 4773 4774 /*load 16 pixel values */ 4775 s22_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (0 * src_strd))); 4776 4777 4778 /*load 16 pixel values */ 4779 s23_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (1 * src_strd))); 4780 4781 /*load 16 pixel values */ 4782 s24_8x16b = _mm_loadu_si128((__m128i *)(pu1_src + (2 * src_strd))); 4783 4784 s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b); 4785 4786 s31_8x16b = _mm_unpackhi_epi8(s21_8x16b, s22_8x16b); 4787 4788 s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b); 4789 4790 s33_8x16b = _mm_unpackhi_epi8(s23_8x16b, s24_8x16b); 4791 4792 s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b); 4793 4794 s32_8x16b = _mm_maddubs_epi16(s31_8x16b, coeff0_1_8x16b); 4795 4796 s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b); 4797 4798 s34_8x16b = _mm_maddubs_epi16(s33_8x16b, coeff2_3_8x16b); 4799 4800 s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */ 4801 4802 s35_8x16b = _mm_add_epi16(s32_8x16b, s34_8x16b); 4803 4804 /* store 8 8-bit output values */ 4805 /* pi2_dst[col] = (UWORD8)i2_tmp; */ 4806 _mm_storeu_si128((__m128i *)(pi2_dst), s8_8x16b); 4807 4808 _mm_storeu_si128((__m128i *)(pi2_dst + 8), s35_8x16b); 4809 4810 4811 s25_8x16b = _mm_loadu_si128((__m128i 
*)(pu1_src + (3 * src_strd))); 4812 4813 s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b); 4814 4815 s31_8x16b = _mm_unpackhi_epi8(s22_8x16b, s23_8x16b); 4816 4817 s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b); 4818 4819 s32_8x16b = _mm_maddubs_epi16(s31_8x16b, coeff0_1_8x16b); 4820 4821 s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b); 4822 4823 s33_8x16b = _mm_unpackhi_epi8(s24_8x16b, s25_8x16b); 4824 4825 s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b); 4826 4827 s34_8x16b = _mm_maddubs_epi16(s33_8x16b, coeff2_3_8x16b); 4828 4829 s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */ 4830 4831 s35_8x16b = _mm_add_epi16(s32_8x16b, s34_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */ 4832 4833 /* store 8 8-bit output values */ 4834 /* pi2_dst[col] = (UWORD8)i2_tmp; */ 4835 _mm_store_si128((__m128i *)(pi2_dst + dst_strd), s8_8x16b); 4836 4837 _mm_store_si128((__m128i *)(pi2_dst + dst_strd + 8), s35_8x16b); 4838 4839 4840 pu1_src += 2 * src_strd; 4841 pi2_dst += 2 * dst_strd; 4842 4843 4844 } /* inner for loop ends here(8-output values in single iteration) */ 4845 4846 } 4847 } 4848 4849 else if(wd % 4 == 0) 4850 { /* wd = multiple of 8 case */ 4851 4852 for(row = 0; row < ht; row += 2) 4853 { 4854 4855 pu1_src_copy = pu1_src; 4856 pi2_dst_copy = pi2_dst; 4857 4858 for(col = 0; col < 2 * wd; col += 8) 4859 { 4860 4861 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0) 4862 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0) 4863 4864 4865 /*load 8 pixel values */ 4866 s21_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd))); 4867 4868 /*load 8 pixel values */ 4869 s22_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd))); 4870 4871 s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b); 4872 4873 s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b); 4874 4875 /*load 8 pixel values */ 4876 s23_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd))); 4877 4878 /*load 8 pixel values */ 4879 s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd))); 4880 4881 s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b); 4882 4883 s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b); 4884 4885 s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */ 4886 4887 _mm_storeu_si128((__m128i *)(pi2_dst), s8_8x16b); 4888 4889 s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd))); 4890 4891 s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b); 4892 s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b); 4893 4894 s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b); 4895 s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b); 4896 4897 s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */ 4898 4899 _mm_store_si128((__m128i *)(pi2_dst + dst_strd), s8_8x16b); 4900 4901 pu1_src += 8; /* To pointer update */ 4902 pi2_dst += 8; 4903 4904 } /* inner for loop ends here(8-output values in single iteration) */ 4905 4906 pu1_src = pu1_src_copy + 2 * src_strd; /* pointer update */ 4907 pi2_dst = pi2_dst_copy + 2 * dst_strd; /* pointer update */ 4908 } 4909 } 4910 4911 else 4912 { /* wd = multiple of 4 case */ 4913 4914 for(row = 0; row < ht; row += 2) 4915 { 4916 pu1_src_copy = pu1_src; 4917 pi2_dst_copy = pi2_dst; 4918 for(col = 0; col < 2 * wd; col += 4) 4919 { 4920 4921 PREFETCH((char const *)(pu1_src + (6 * src_strd)), _MM_HINT_T0) 4922 PREFETCH((char const *)(pu1_src + (7 * src_strd)), _MM_HINT_T0) 4923 4924 4925 
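                /* 4-tap vertical filter: rows (n-1), n, (n+1) and (n+2) are
                 * loaded below; vertically adjacent rows are interleaved with
                 * _mm_unpacklo_epi8 and multiplied-and-accumulated against the
                 * (c0,c1) and (c2,c3) coefficient pairs using _mm_maddubs_epi16.
                 * The 16-bit sums are stored without any shift or offset, as
                 * required for the w16out intermediate output. */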
/*load 8 pixel values */ 4926 s21_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (-1 * src_strd))); 4927 4928 /*load 8 pixel values */ 4929 s22_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (0 * src_strd))); 4930 4931 s5_8x16b = _mm_unpacklo_epi8(s21_8x16b, s22_8x16b); 4932 4933 s11_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b); 4934 4935 /*load 8 pixel values */ 4936 s23_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (1 * src_strd))); 4937 4938 /*load 8 pixel values */ 4939 s24_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (2 * src_strd))); 4940 4941 s6_8x16b = _mm_unpacklo_epi8(s23_8x16b, s24_8x16b); 4942 4943 s12_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b); 4944 4945 s8_8x16b = _mm_add_epi16(s11_8x16b, s12_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */ 4946 4947 4948 /* store 8 8-bit output values */ 4949 /* pi2_dst[col] = (UWORD8)i2_tmp; */ 4950 _mm_storel_epi64((__m128i *)(pi2_dst), s8_8x16b); 4951 4952 s25_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + (3 * src_strd))); 4953 4954 s5_8x16b = _mm_unpacklo_epi8(s22_8x16b, s23_8x16b); 4955 s15_8x16b = _mm_maddubs_epi16(s5_8x16b, coeff0_1_8x16b); 4956 4957 s6_8x16b = _mm_unpacklo_epi8(s24_8x16b, s25_8x16b); 4958 s16_8x16b = _mm_maddubs_epi16(s6_8x16b, coeff2_3_8x16b); 4959 4960 s8_8x16b = _mm_add_epi16(s15_8x16b, s16_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */ 4961 4962 4963 /* store 8 8-bit output values */ 4964 /* pi2_dst[col] = (UWORD8)i2_tmp; */ 4965 _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd), s8_8x16b); 4966 4967 pu1_src += 4; /* To pointer update */ 4968 pi2_dst += 4; 4969 } /* inner for loop ends here(8-output values in single iteration) */ 4970 4971 pu1_src = pu1_src_copy + 2 * src_strd; /* pointer update */ 4972 pi2_dst = pi2_dst_copy + 2 * dst_strd; /* pointer update */ 4973 } 4974 } 4975 } 4976 4977 /** 4978 ******************************************************************************* 4979 * 4980 * @brief 4981 * chroma interprediction filter for vertical 16bit input 4982 * 4983 * @par Description: 4984 * Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to 4985 * the elements pointed by 'pu1_src' and writes to the location pointed by 4986 * 'pu1_dst' Input is 16 bits The filter output is downshifted by 12 and 4987 * clipped to lie between 0 and 255 4988 * 4989 * @param[in] pi2_src 4990 * WORD16 pointer to the source 4991 * 4992 * @param[out] pu1_dst 4993 * UWORD8 pointer to the destination 4994 * 4995 * @param[in] src_strd 4996 * integer source stride 4997 * 4998 * @param[in] dst_strd 4999 * integer destination stride 5000 * 5001 * @param[in] pi1_coeff 5002 * WORD8 pointer to the filter coefficients 5003 * 5004 * @param[in] ht 5005 * integer height of the array 5006 * 5007 * @param[in] wd 5008 * integer width of the array 5009 * 5010 * @returns 5011 * 5012 * @remarks 5013 * None 5014 * 5015 ******************************************************************************* 5016 */ 5017 void ihevc_inter_pred_chroma_vert_w16inp_ssse3(WORD16 *pi2_src, 5018 UWORD8 *pu1_dst, 5019 WORD32 src_strd, 5020 WORD32 dst_strd, 5021 WORD8 *pi1_coeff, 5022 WORD32 ht, 5023 WORD32 wd) 5024 { 5025 WORD32 row, col; 5026 WORD16 *pi2_src_copy; 5027 UWORD8 *pu1_dst_copy; 5028 __m128i coeff0_1_8x16b, coeff2_3_8x16b; 5029 __m128i s4_8x16b, s5_8x16b, s6_8x16b, s7_8x16b, s8_8x16b, s9_8x16b; 5030 __m128i s11_8x16b, s12_8x16b, s15_8x16b, s16_8x16b; 5031 __m128i zero_8x16b, offset_8x16b, mask_low_32b, mask_high_96b, sign_reg; 5032 __m128i s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b; 5033 __m128i 
s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b; 5034 5035 5036 /* load 8 8-bit coefficients and convert 8-bit into 16-bit */ 5037 s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff); 5038 5039 zero_8x16b = _mm_setzero_si128(); 5040 sign_reg = _mm_cmpgt_epi8(zero_8x16b, s4_8x16b); 5041 s5_8x16b = _mm_unpacklo_epi8(s4_8x16b, sign_reg); 5042 5043 coeff0_1_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(0, 0, 0, 0)); /* pi1_coeff[4] */ 5044 coeff2_3_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(1, 1, 1, 1)); /* pi1_coeff[4] */ 5045 5046 /* seting values in register */ 5047 offset_8x16b = _mm_set1_epi32(OFFSET_14_MINUS_BIT_DEPTH); /* for offset addition */ 5048 mask_low_32b = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000); 5049 mask_high_96b = _mm_set_epi32(0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF); 5050 5051 /* outer for loop starts from here */ 5052 if(wd % 4 == 0) 5053 { /* wd = multiple of 8 case */ 5054 5055 pi2_src_copy = pi2_src; 5056 pu1_dst_copy = pu1_dst; 5057 5058 for(col = 0; col < 2 * wd; col += 8) 5059 { 5060 5061 pi2_src = pi2_src_copy + col; 5062 pu1_dst = pu1_dst_copy + col; 5063 5064 5065 for(row = 0; row < ht; row += 2) 5066 { 5067 5068 /*load 16 pixel values */ 5069 s21_8x16b = _mm_load_si128((__m128i *)(pi2_src + (-1 * src_strd))); 5070 5071 /*load 16 pixel values */ 5072 s22_8x16b = _mm_load_si128((__m128i *)(pi2_src + (0 * src_strd))); 5073 5074 5075 /*load 16 pixel values */ 5076 s23_8x16b = _mm_load_si128((__m128i *)(pi2_src + (1 * src_strd))); 5077 5078 /*load 16 pixel values */ 5079 s24_8x16b = _mm_load_si128((__m128i *)(pi2_src + (2 * src_strd))); 5080 5081 s5_8x16b = _mm_unpacklo_epi16(s21_8x16b, s22_8x16b); 5082 5083 s31_8x16b = _mm_unpackhi_epi16(s21_8x16b, s22_8x16b); 5084 5085 s6_8x16b = _mm_unpacklo_epi16(s23_8x16b, s24_8x16b); 5086 5087 s33_8x16b = _mm_unpackhi_epi16(s23_8x16b, s24_8x16b); 5088 5089 s11_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b); 5090 5091 s32_8x16b = _mm_madd_epi16(s31_8x16b, coeff0_1_8x16b); 5092 5093 s12_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b); 5094 5095 s34_8x16b = _mm_madd_epi16(s33_8x16b, coeff2_3_8x16b); 5096 5097 s8_8x16b = _mm_add_epi32(s11_8x16b, s12_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */ 5098 5099 s35_8x16b = _mm_add_epi32(s32_8x16b, s34_8x16b); 5100 5101 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 5102 s6_8x16b = _mm_srai_epi32(s8_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 5103 5104 s32_8x16b = _mm_srai_epi32(s35_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 5105 5106 5107 /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */ 5108 s7_8x16b = _mm_add_epi32(s6_8x16b, offset_8x16b); 5109 5110 /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 5111 s8_8x16b = _mm_srai_epi32(s7_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 5112 5113 s9_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b); 5114 5115 /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */ 5116 s33_8x16b = _mm_add_epi32(s32_8x16b, offset_8x16b); 5117 5118 /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 5119 s34_8x16b = _mm_srai_epi32(s33_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 5120 5121 s35_8x16b = _mm_packs_epi32(s34_8x16b, zero_8x16b); 5122 5123 5124 /* i2_tmp = CLIP_U8(i2_tmp);*/ 5125 s7_8x16b = _mm_packus_epi16(s9_8x16b, zero_8x16b); 5126 5127 s33_8x16b = _mm_packus_epi16(s35_8x16b, zero_8x16b); 5128 5129 s7_8x16b = _mm_unpacklo_epi32(s7_8x16b, s33_8x16b); 5130 /* store 8 8-bit 
output values */ 5131 /* pu1_dst[col] = (UWORD8)i2_tmp; */ 5132 _mm_storel_epi64((__m128i *)(pu1_dst), s7_8x16b); 5133 5134 5135 s25_8x16b = _mm_load_si128((__m128i *)(pi2_src + (3 * src_strd))); 5136 5137 s5_8x16b = _mm_unpacklo_epi16(s22_8x16b, s23_8x16b); 5138 5139 s31_8x16b = _mm_unpackhi_epi16(s22_8x16b, s23_8x16b); 5140 5141 s15_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b); 5142 5143 s32_8x16b = _mm_madd_epi16(s31_8x16b, coeff0_1_8x16b); 5144 5145 s6_8x16b = _mm_unpacklo_epi16(s24_8x16b, s25_8x16b); 5146 5147 s33_8x16b = _mm_unpackhi_epi16(s24_8x16b, s25_8x16b); 5148 5149 s16_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b); 5150 5151 s34_8x16b = _mm_madd_epi16(s33_8x16b, coeff2_3_8x16b); 5152 5153 s8_8x16b = _mm_add_epi32(s15_8x16b, s16_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */ 5154 5155 s35_8x16b = _mm_add_epi32(s32_8x16b, s34_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */ 5156 5157 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 5158 s6_8x16b = _mm_srai_epi32(s8_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 5159 5160 s32_8x16b = _mm_srai_epi32(s35_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 5161 5162 5163 /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */ 5164 s7_8x16b = _mm_add_epi32(s6_8x16b, offset_8x16b); 5165 5166 /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 5167 s8_8x16b = _mm_srai_epi32(s7_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 5168 5169 s9_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b); 5170 5171 /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */ 5172 s33_8x16b = _mm_add_epi32(s32_8x16b, offset_8x16b); 5173 5174 /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 5175 s34_8x16b = _mm_srai_epi32(s33_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 5176 5177 s35_8x16b = _mm_packs_epi32(s34_8x16b, zero_8x16b); 5178 5179 5180 /* i2_tmp = CLIP_U8(i2_tmp);*/ 5181 s7_8x16b = _mm_packus_epi16(s9_8x16b, zero_8x16b); 5182 5183 s33_8x16b = _mm_packus_epi16(s35_8x16b, zero_8x16b); 5184 5185 s7_8x16b = _mm_unpacklo_epi32(s7_8x16b, s33_8x16b); 5186 /* store 8 8-bit output values */ 5187 /* pu1_dst[col] = (UWORD8)i2_tmp; */ 5188 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s7_8x16b); 5189 5190 pi2_src += 2 * src_strd; 5191 pu1_dst += 2 * dst_strd; 5192 5193 5194 } /* inner for loop ends here(8-output values in single iteration) */ 5195 5196 } 5197 } 5198 else 5199 { /* wd = multiple of 4 case */ 5200 5201 for(row = 0; row < ht; row += 2) 5202 { 5203 pi2_src_copy = pi2_src; 5204 pu1_dst_copy = pu1_dst; 5205 for(col = 0; col < 2 * wd; col += 4) 5206 { 5207 5208 /*load 8 pixel values */ 5209 s21_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (-1 * src_strd))); 5210 5211 /*load 8 pixel values */ 5212 s22_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (0 * src_strd))); 5213 5214 s5_8x16b = _mm_unpacklo_epi16(s21_8x16b, s22_8x16b); 5215 5216 s11_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b); 5217 5218 /*load 8 pixel values */ 5219 s23_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (1 * src_strd))); 5220 5221 /*load 8 pixel values */ 5222 s24_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd))); 5223 5224 s6_8x16b = _mm_unpacklo_epi16(s23_8x16b, s24_8x16b); 5225 5226 s12_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b); 5227 5228 s8_8x16b = _mm_add_epi32(s11_8x16b, s12_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */ 5229 5230 5231 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 5232 s6_8x16b = 
_mm_srai_epi32(s8_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 5233 5234 5235 /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */ 5236 s7_8x16b = _mm_add_epi32(s6_8x16b, offset_8x16b); 5237 5238 /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 5239 s8_8x16b = _mm_srai_epi32(s7_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 5240 5241 s9_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b); 5242 5243 5244 /* i2_tmp = CLIP_U8(i2_tmp);*/ 5245 s7_8x16b = _mm_packus_epi16(s9_8x16b, zero_8x16b); 5246 5247 s9_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst)); 5248 s5_8x16b = _mm_and_si128(s9_8x16b, mask_low_32b); 5249 s6_8x16b = _mm_and_si128(s7_8x16b, mask_high_96b); 5250 s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b); 5251 5252 /* store 8 8-bit output values */ 5253 /* pu1_dst[col] = (UWORD8)i2_tmp; */ 5254 _mm_storel_epi64((__m128i *)(pu1_dst), s9_8x16b); 5255 5256 s25_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd))); 5257 5258 s5_8x16b = _mm_unpacklo_epi16(s22_8x16b, s23_8x16b); 5259 s15_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b); 5260 5261 s6_8x16b = _mm_unpacklo_epi16(s24_8x16b, s25_8x16b); 5262 s16_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b); 5263 5264 s8_8x16b = _mm_add_epi32(s15_8x16b, s16_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */ 5265 5266 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 5267 s6_8x16b = _mm_srai_epi32(s8_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 5268 5269 /* (i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) */ 5270 s7_8x16b = _mm_add_epi32(s6_8x16b, offset_8x16b); 5271 5272 /* i4_tmp = ((i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH) + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 5273 s8_8x16b = _mm_srai_epi32(s7_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 5274 5275 s9_8x16b = _mm_packs_epi32(s8_8x16b, zero_8x16b); 5276 5277 /* i2_tmp = CLIP_U8(i2_tmp);*/ 5278 s7_8x16b = _mm_packus_epi16(s9_8x16b, zero_8x16b); 5279 5280 s9_8x16b = _mm_loadl_epi64((__m128i *)(pu1_dst + dst_strd)); 5281 s5_8x16b = _mm_and_si128(s9_8x16b, mask_low_32b); 5282 s6_8x16b = _mm_and_si128(s7_8x16b, mask_high_96b); 5283 s9_8x16b = _mm_or_si128(s5_8x16b, s6_8x16b); 5284 5285 /* store 8 8-bit output values */ 5286 /* pu1_dst[col] = (UWORD8)i2_tmp; */ 5287 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), s9_8x16b); 5288 5289 pi2_src += 4; /* To pointer update */ 5290 pu1_dst += 4; 5291 } /* inner for loop ends here(8-output values in single iteration) */ 5292 5293 pi2_src = pi2_src_copy + 2 * src_strd; /* pointer update */ 5294 pu1_dst = pu1_dst_copy + 2 * dst_strd; /* pointer update */ 5295 } 5296 } 5297 5298 } 5299 5300 /** 5301 ******************************************************************************* 5302 * 5303 * @brief 5304 * 5305 * Chroma interprediction filter for 16bit vertical input and output. 
5306 * 5307 * @par Description: 5308 * Applies a vertical filter with coefficients pointed to by 'pi1_coeff' to 5309 * the elements pointed by 'pu1_src' and writes to the location pointed by 5310 * 'pu1_dst' Input is 16 bits The filter output is downshifted by 6 and 5311 * 8192 is subtracted to store it as a 16 bit number The output is used as 5312 * a input to weighted prediction 5313 * 5314 * @param[in] pi2_src 5315 * WORD16 pointer to the source 5316 * 5317 * @param[out] pi2_dst 5318 * WORD16 pointer to the destination 5319 * 5320 * @param[in] src_strd 5321 * integer source stride 5322 * 5323 * @param[in] dst_strd 5324 * integer destination stride 5325 * 5326 * @param[in] pi1_coeff 5327 * WORD8 pointer to the filter coefficients 5328 * 5329 * @param[in] ht 5330 * integer height of the array 5331 * 5332 * @param[in] wd 5333 * integer width of the array 5334 * 5335 * @returns 5336 * 5337 * @remarks 5338 * None 5339 * 5340 ******************************************************************************* 5341 */ 5342 void ihevc_inter_pred_chroma_vert_w16inp_w16out_ssse3(WORD16 *pi2_src, 5343 WORD16 *pi2_dst, 5344 WORD32 src_strd, 5345 WORD32 dst_strd, 5346 WORD8 *pi1_coeff, 5347 WORD32 ht, 5348 WORD32 wd) 5349 { 5350 WORD32 row, col; 5351 WORD16 *pi2_src_copy; 5352 WORD16 *pi2_dst_copy; 5353 __m128i coeff0_1_8x16b, coeff2_3_8x16b; 5354 __m128i s4_8x16b, s5_8x16b, s6_8x16b, s7_8x16b, s8_8x16b, s9_8x16b; 5355 __m128i s11_8x16b, s12_8x16b, s15_8x16b, s16_8x16b; 5356 __m128i zero_8x16b, sign_reg; 5357 __m128i s21_8x16b, s22_8x16b, s23_8x16b, s24_8x16b, s25_8x16b; 5358 __m128i s31_8x16b, s32_8x16b, s33_8x16b, s34_8x16b, s35_8x16b; 5359 5360 5361 /* load 8 8-bit coefficients and convert 8-bit into 16-bit */ 5362 s4_8x16b = _mm_loadl_epi64((__m128i *)pi1_coeff); 5363 5364 zero_8x16b = _mm_setzero_si128(); 5365 sign_reg = _mm_cmpgt_epi8(zero_8x16b, s4_8x16b); 5366 s5_8x16b = _mm_unpacklo_epi8(s4_8x16b, sign_reg); 5367 5368 coeff0_1_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(0, 0, 0, 0)); /* pi1_coeff[4] */ 5369 coeff2_3_8x16b = _mm_shuffle_epi32(s5_8x16b, _MM_SHUFFLE(1, 1, 1, 1)); /* pi1_coeff[4] */ 5370 5371 5372 /* outer for loop starts from here */ 5373 if(wd % 4 == 0) 5374 { /* wd = multiple of 8 case */ 5375 5376 pi2_src_copy = pi2_src; 5377 pi2_dst_copy = pi2_dst; 5378 5379 for(col = 0; col < 2 * wd; col += 8) 5380 { 5381 5382 pi2_src = pi2_src_copy + col; 5383 pi2_dst = pi2_dst_copy + col; 5384 5385 5386 for(row = 0; row < ht; row += 2) 5387 { 5388 5389 /*load 16 pixel values */ 5390 s21_8x16b = _mm_load_si128((__m128i *)(pi2_src + (-1 * src_strd))); 5391 5392 /*load 16 pixel values */ 5393 s22_8x16b = _mm_load_si128((__m128i *)(pi2_src + (0 * src_strd))); 5394 5395 5396 /*load 16 pixel values */ 5397 s23_8x16b = _mm_load_si128((__m128i *)(pi2_src + (1 * src_strd))); 5398 5399 /*load 16 pixel values */ 5400 s24_8x16b = _mm_load_si128((__m128i *)(pi2_src + (2 * src_strd))); 5401 5402 s5_8x16b = _mm_unpacklo_epi16(s21_8x16b, s22_8x16b); 5403 5404 s31_8x16b = _mm_unpackhi_epi16(s21_8x16b, s22_8x16b); 5405 5406 s6_8x16b = _mm_unpacklo_epi16(s23_8x16b, s24_8x16b); 5407 5408 s33_8x16b = _mm_unpackhi_epi16(s23_8x16b, s24_8x16b); 5409 5410 s11_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b); 5411 5412 s32_8x16b = _mm_madd_epi16(s31_8x16b, coeff0_1_8x16b); 5413 5414 s12_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b); 5415 5416 s34_8x16b = _mm_madd_epi16(s33_8x16b, coeff2_3_8x16b); 5417 5418 s8_8x16b = _mm_add_epi32(s11_8x16b, s12_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */ 5419 5420 
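                    /* s8_8x16b holds the four low-lane 32-bit accumulators;
                     * s35_8x16b below gathers the four high lanes. Both are
                     * then shifted right by SHIFT_14_MINUS_BIT_DEPTH (6 for
                     * 8-bit content) and packed back to signed 16 bits before
                     * being stored to the WORD16 destination. */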
s35_8x16b = _mm_add_epi32(s32_8x16b, s34_8x16b); 5421 5422 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 5423 s6_8x16b = _mm_srai_epi32(s8_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 5424 5425 s32_8x16b = _mm_srai_epi32(s35_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 5426 5427 s9_8x16b = _mm_packs_epi32(s6_8x16b, zero_8x16b); 5428 5429 s35_8x16b = _mm_packs_epi32(s32_8x16b, zero_8x16b); 5430 5431 s7_8x16b = _mm_unpacklo_epi64(s9_8x16b, s35_8x16b); 5432 /* store 8 8-bit output values */ 5433 /* pi2_dst[col] = (UWORD8)i2_tmp; */ 5434 _mm_store_si128((__m128i *)(pi2_dst), s7_8x16b); 5435 5436 5437 s25_8x16b = _mm_loadu_si128((__m128i *)(pi2_src + (3 * src_strd))); 5438 5439 s5_8x16b = _mm_unpacklo_epi16(s22_8x16b, s23_8x16b); 5440 5441 s31_8x16b = _mm_unpackhi_epi16(s22_8x16b, s23_8x16b); 5442 5443 s15_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b); 5444 5445 s32_8x16b = _mm_madd_epi16(s31_8x16b, coeff0_1_8x16b); 5446 5447 s6_8x16b = _mm_unpacklo_epi16(s24_8x16b, s25_8x16b); 5448 5449 s33_8x16b = _mm_unpackhi_epi16(s24_8x16b, s25_8x16b); 5450 5451 s16_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b); 5452 5453 s34_8x16b = _mm_madd_epi16(s33_8x16b, coeff2_3_8x16b); 5454 5455 s8_8x16b = _mm_add_epi32(s15_8x16b, s16_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */ 5456 5457 s35_8x16b = _mm_add_epi32(s32_8x16b, s34_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */ 5458 5459 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 5460 s6_8x16b = _mm_srai_epi32(s8_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 5461 5462 s32_8x16b = _mm_srai_epi32(s35_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 5463 5464 s9_8x16b = _mm_packs_epi32(s6_8x16b, zero_8x16b); 5465 5466 s35_8x16b = _mm_packs_epi32(s32_8x16b, zero_8x16b); 5467 5468 s7_8x16b = _mm_unpacklo_epi64(s9_8x16b, s35_8x16b); 5469 /* store 8 8-bit output values */ 5470 /* pi2_dst[col] = (UWORD8)i2_tmp; */ 5471 _mm_store_si128((__m128i *)(pi2_dst + dst_strd), s7_8x16b); 5472 5473 pi2_src += 2 * src_strd; 5474 pi2_dst += 2 * dst_strd; 5475 5476 5477 } /* inner for loop ends here(8-output values in single iteration) */ 5478 5479 } 5480 } 5481 else 5482 { /* wd = multiple of 4 case */ 5483 5484 for(row = 0; row < ht; row += 2) 5485 { 5486 pi2_src_copy = pi2_src; 5487 pi2_dst_copy = pi2_dst; 5488 for(col = 0; col < 2 * wd; col += 4) 5489 { 5490 5491 /*load 4 pixel values */ 5492 s21_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (-1 * src_strd))); 5493 5494 /*load 4 pixel values */ 5495 s22_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (0 * src_strd))); 5496 5497 s5_8x16b = _mm_unpacklo_epi16(s21_8x16b, s22_8x16b); 5498 5499 s11_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b); 5500 5501 /*load 4 pixel values */ 5502 s23_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (1 * src_strd))); 5503 5504 /*load 4 pixel values */ 5505 s24_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (2 * src_strd))); 5506 5507 s6_8x16b = _mm_unpacklo_epi16(s23_8x16b, s24_8x16b); 5508 5509 s12_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b); 5510 5511 s8_8x16b = _mm_add_epi32(s11_8x16b, s12_8x16b); /* (i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) */ 5512 5513 /*(i2_tmp + OFFSET_14_MINUS_BIT_DEPTH) >> SHIFT_14_MINUS_BIT_DEPTH */ 5514 s6_8x16b = _mm_srai_epi32(s8_8x16b, SHIFT_14_MINUS_BIT_DEPTH); 5515 5516 s9_8x16b = _mm_packs_epi32(s6_8x16b, zero_8x16b); 5517 5518 /* store 8 8-bit output values */ 5519 /* pi2_dst[col] = (UWORD8)i2_tmp; */ 5520 _mm_storel_epi64((__m128i *)(pi2_dst), s9_8x16b); 5521 5522 s25_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src + (3 * src_strd))); 5523 5524 s5_8x16b = 
_mm_unpacklo_epi16(s22_8x16b, s23_8x16b);
                s15_8x16b = _mm_madd_epi16(s5_8x16b, coeff0_1_8x16b);

                s6_8x16b = _mm_unpacklo_epi16(s24_8x16b, s25_8x16b);
                s16_8x16b = _mm_madd_epi16(s6_8x16b, coeff2_3_8x16b);

                /* 32-bit accumulator for the second output row */
                s8_8x16b = _mm_add_epi32(s15_8x16b, s16_8x16b);

                /* i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH */
                s6_8x16b = _mm_srai_epi32(s8_8x16b, SHIFT_14_MINUS_BIT_DEPTH);

                s9_8x16b = _mm_packs_epi32(s6_8x16b, zero_8x16b);

                /* store 4 16-bit output values */
                /* pi2_dst[col] = (WORD16)i4_tmp; */
                _mm_storel_epi64((__m128i *)(pi2_dst + dst_strd), s9_8x16b);

                pi2_src += 4; /* pointer update */
                pi2_dst += 4;
            } /* inner for loop ends here (4 output values per row in a single iteration) */

            pi2_src = pi2_src_copy + 2 * src_strd; /* pointer update */
            pi2_dst = pi2_dst_copy + 2 * dst_strd; /* pointer update */
        }
    }

}
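
/* Illustrative scalar sketch (not part of the library build): roughly the
 * arithmetic performed by ihevc_inter_pred_chroma_copy_w16out_ssse3() above,
 * assuming the usual argument conventions of this file (UV-interleaved source,
 * so each row holds 2 * wd samples). The _ref name is hypothetical. */
#if 0
static void ihevc_inter_pred_chroma_copy_w16out_ref(UWORD8 *pu1_src,
                                                    WORD16 *pi2_dst,
                                                    WORD32 src_strd,
                                                    WORD32 dst_strd,
                                                    WORD32 ht,
                                                    WORD32 wd)
{
    WORD32 row, col;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < 2 * wd; col++)
        {
            /* copy with a 6-bit upshift so the result can feed the vertical
             * stage or weighted prediction */
            pi2_dst[col] = (WORD16)(pu1_src[col] << SHIFT_14_MINUS_BIT_DEPTH);
        }
        pu1_src += src_strd;
        pi2_dst += dst_strd;
    }
}
#endif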
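
/* Illustrative scalar sketch (not part of the library build): roughly the 4-tap
 * vertical filtering on 16-bit input with 16-bit output performed by
 * ihevc_inter_pred_chroma_vert_w16inp_w16out_ssse3() above (the intrinsic code
 * additionally saturates when packing 32-bit sums to 16 bits). Same assumptions
 * and hypothetical _ref naming as the sketch before it. */
#if 0
static void ihevc_inter_pred_chroma_vert_w16inp_w16out_ref(WORD16 *pi2_src,
                                                           WORD16 *pi2_dst,
                                                           WORD32 src_strd,
                                                           WORD32 dst_strd,
                                                           WORD8 *pi1_coeff,
                                                           WORD32 ht,
                                                           WORD32 wd)
{
    WORD32 row, col, i;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < 2 * wd; col++)
        {
            WORD32 i4_tmp = 0;
            /* taps run over rows (row - 1) .. (row + 2) */
            for(i = 0; i < 4; i++)
                i4_tmp += pi1_coeff[i] * pi2_src[(i - 1) * src_strd + col];
            /* downshift by 6 and store as a 16-bit intermediate */
            pi2_dst[col] = (WORD16)(i4_tmp >> SHIFT_14_MINUS_BIT_DEPTH);
        }
        pi2_src += src_strd;
        pi2_dst += dst_strd;
    }
}
#endif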