/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/*****************************************************************************/
/*                                                                           */
/*  File Name         : ih264_weighted_pred_intr_sse42.c                     */
/*                                                                           */
/*  Description       : Contains function definitions for weighted           */
/*                      prediction functions in x86 sse4 intrinsics          */
/*                                                                           */
/*  List of Functions : ih264_default_weighted_pred_luma_sse42()             */
/*                      ih264_default_weighted_pred_chroma_sse42()           */
/*                      ih264_weighted_pred_luma_sse42()                     */
/*                      ih264_weighted_pred_chroma_sse42()                   */
/*                      ih264_weighted_bi_pred_luma_sse42()                  */
/*                      ih264_weighted_bi_pred_chroma_sse42()                */
/*                                                                           */
/*  Issues / Problems : None                                                 */
/*                                                                           */
/*  Revision History  :                                                      */
/*                                                                           */
/*      DD MM YYYY    Author(s)     Changes                                  */
/*      30 01 2015    Kaushik       Initial version                          */
/*                    Senthoor                                               */
/*                                                                           */
/*****************************************************************************/
/*****************************************************************************/
/*  File Includes                                                            */
/*****************************************************************************/

#include <immintrin.h>
#include "ih264_typedefs.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_weighted_pred.h"

/*****************************************************************************/
/*  Function definitions                                                     */
/*****************************************************************************/
/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_default_weighted_pred_luma_sse42                   */
/*                                                                           */
/*  Description   : This function performs the default weighted prediction   */
/*                  as described in sec 8.4.2.3.1 titled "Default weighted   */
/*                  sample prediction process" for luma. The function gets   */
/*                  two ht x wd blocks, calculates their rounded average and */
/*                  stores it in the destination block. (ht,wd) can be       */
/*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
/*                                                                           */
/*  Inputs        : pu1_src1  - Pointer to source 1                          */
/*                  pu1_src2  - Pointer to source 2                          */
/*                  pu1_dst   - Pointer to destination                       */
/*                  src_strd1 - stride for source 1                          */
/*                  src_strd2 - stride for source 2                          */
/*                  dst_strd  - stride for destination                       */
/*                  ht        - height of the block                          */
/*                  wd        - width of the block                           */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*      DD MM YYYY    Author(s)     Changes                                  */
/*      04 02 2015    Kaushik       Initial Version                          */
/*                    Senthoor                                               */
/*                                                                           */
/*****************************************************************************/
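/*
 * A scalar sketch (illustrative, not part of the library) of the per-pixel
 * operation the intrinsics below vectorize; _mm_avg_epu8 computes exactly
 * this rounded average:
 *
 *     pu1_dst[x] = (pu1_src1[x] + pu1_src2[x] + 1) >> 1
 */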
void ih264_default_weighted_pred_luma_sse42(UWORD8 *pu1_src1,
                                            UWORD8 *pu1_src2,
                                            UWORD8 *pu1_dst,
                                            WORD32 src_strd1,
                                            WORD32 src_strd2,
                                            WORD32 dst_strd,
                                            WORD32 ht,
                                            WORD32 wd)
{
    __m128i y0_0_16x8b, y0_1_16x8b, y0_2_16x8b, y0_3_16x8b;
    __m128i y1_0_16x8b, y1_1_16x8b, y1_2_16x8b, y1_3_16x8b;

    if(wd == 4)
    {
        do
        {
            y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
            y0_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y0_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));

            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
            y1_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));

            y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
            y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
            y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
            y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y0_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y0_1_16x8b);
            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y0_2_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y0_3_16x8b);

            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        do
        {
            y0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
            y0_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y0_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));

            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
            y1_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));

            y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
            y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
            y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
            y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);

            _mm_storel_epi64((__m128i *)pu1_dst, y0_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y0_1_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y0_2_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y0_3_16x8b);

            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        __m128i y0_4_16x8b, y0_5_16x8b, y0_6_16x8b, y0_7_16x8b;
        __m128i y1_4_16x8b, y1_5_16x8b, y1_6_16x8b, y1_7_16x8b;

        do
        {
            y0_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
            y0_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
            y0_2_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y0_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 3));
            y0_4_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src1 + (src_strd1 << 2)));
            y0_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 5));
            y0_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 6));
            y0_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1 * 7));

            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));
            y1_2_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y1_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 3));
            y1_4_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src2 + (src_strd2 << 2)));
            y1_5_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 5));
            y1_6_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 6));
            y1_7_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2 * 7));

            y0_0_16x8b = _mm_avg_epu8(y0_0_16x8b, y1_0_16x8b);
            y0_1_16x8b = _mm_avg_epu8(y0_1_16x8b, y1_1_16x8b);
            y0_2_16x8b = _mm_avg_epu8(y0_2_16x8b, y1_2_16x8b);
            y0_3_16x8b = _mm_avg_epu8(y0_3_16x8b, y1_3_16x8b);
            y0_4_16x8b = _mm_avg_epu8(y0_4_16x8b, y1_4_16x8b);
            y0_5_16x8b = _mm_avg_epu8(y0_5_16x8b, y1_5_16x8b);
            y0_6_16x8b = _mm_avg_epu8(y0_6_16x8b, y1_6_16x8b);
            y0_7_16x8b = _mm_avg_epu8(y0_7_16x8b, y1_7_16x8b);

            _mm_storeu_si128((__m128i *)pu1_dst, y0_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y0_1_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y0_2_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y0_3_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 2)), y0_4_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 5), y0_5_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 6), y0_6_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 7), y0_7_16x8b);

            ht -= 8;
            pu1_src1 += src_strd1 << 3;
            pu1_src2 += src_strd2 << 3;
            pu1_dst += dst_strd << 3;
        }
        while(ht > 0);
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_default_weighted_pred_chroma_sse42                 */
/*                                                                           */
/*  Description   : This function performs the default weighted prediction   */
/*                  as described in sec 8.4.2.3.1 titled "Default weighted   */
/*                  sample prediction process" for chroma. The function gets */
/*                  two ht x wd blocks, calculates their rounded average and */
/*                  stores it in the destination block. (ht,wd) can be       */
/*                  (2,2), (4,2), (2,4), (4,4), (8,4), (4,8) or (8,8).       */
/*                                                                           */
/*  Inputs        : pu1_src1  - Pointer to source 1                          */
/*                  pu1_src2  - Pointer to source 2                          */
/*                  pu1_dst   - Pointer to destination                       */
/*                  src_strd1 - stride for source 1                          */
/*                  src_strd2 - stride for source 2                          */
/*                  dst_strd  - stride for destination                       */
/*                  ht        - height of the block                          */
/*                  wd        - width of the block                           */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*      DD MM YYYY    Author(s)     Changes                                  */
/*      04 02 2015    Kaushik       Initial Version                          */
/*                    Senthoor                                               */
/*                                                                           */
/*****************************************************************************/
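/*
 * A scalar sketch of the per-pixel operation vectorized below. Chroma is
 * stored interleaved (Cb/Cr), so a row of wd chroma pixels spans 2 * wd
 * bytes; the rounded average itself is identical to the luma case:
 *
 *     pu1_dst[x] = (pu1_src1[x] + pu1_src2[x] + 1) >> 1,  0 <= x < 2 * wd
 */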
void ih264_default_weighted_pred_chroma_sse42(UWORD8 *pu1_src1,
                                              UWORD8 *pu1_src2,
                                              UWORD8 *pu1_dst,
                                              WORD32 src_strd1,
                                              WORD32 src_strd2,
                                              WORD32 dst_strd,
                                              WORD32 ht,
                                              WORD32 wd)
{
    __m128i uv0_0_16x8b, uv0_1_16x8b;
    __m128i uv1_0_16x8b, uv1_1_16x8b;

    if(wd == 2)
    {
        do
        {
            uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            uv0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));

            uv1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            uv1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));

            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(uv0_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(uv0_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 4)
    {
        do
        {
            uv0_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            uv0_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));

            uv1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            uv1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));

            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);

            _mm_storel_epi64((__m128i *)pu1_dst, uv0_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), uv0_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 8
    {
        __m128i uv0_2_16x8b, uv0_3_16x8b;
        __m128i uv1_2_16x8b, uv1_3_16x8b;

        do
        {
            uv0_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
            uv0_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
            uv0_2_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            uv0_3_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src1 + src_strd1 * 3));

            uv1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
            uv1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));
            uv1_2_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            uv1_3_16x8b = _mm_loadu_si128(
                            (__m128i *)(pu1_src2 + src_strd2 * 3));

            uv0_0_16x8b = _mm_avg_epu8(uv0_0_16x8b, uv1_0_16x8b);
            uv0_1_16x8b = _mm_avg_epu8(uv0_1_16x8b, uv1_1_16x8b);
            uv0_2_16x8b = _mm_avg_epu8(uv0_2_16x8b, uv1_2_16x8b);
            uv0_3_16x8b = _mm_avg_epu8(uv0_3_16x8b, uv1_3_16x8b);

            _mm_storeu_si128((__m128i *)pu1_dst, uv0_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), uv0_1_16x8b);
            _mm_storeu_si128(
                            (__m128i *)(pu1_dst + (dst_strd << 1)), uv0_2_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), uv0_3_16x8b);

            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_weighted_pred_luma_sse42                           */
/*                                                                           */
/*  Description   : This function performs the weighted prediction as        */
/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
/*                  prediction process" for luma. The function gets one      */
/*                  ht x wd block, weights it, rounds it off, offsets it,    */
/*                  saturates it to unsigned 8-bit and stores it in the      */
/*                  destination block. (ht,wd) can be (4,4), (8,4), (4,8),   */
/*                  (8,8), (16,8), (8,16) or (16,16).                        */
/*                                                                           */
/*  Inputs        : pu1_src  - Pointer to source                             */
/*                  pu1_dst  - Pointer to destination                        */
/*                  src_strd - stride for source                             */
/*                  dst_strd - stride for destination                        */
/*                  log_wd   - number of bits to be rounded off              */
/*                  wt       - weight value                                  */
/*                  ofst     - offset value                                  */
/*                  ht       - height of the block                           */
/*                  wd       - width of the block                            */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*      DD MM YYYY    Author(s)     Changes                                  */
/*      04 02 2015    Kaushik       Initial Version                          */
/*                    Senthoor                                               */
/*                                                                           */
/*****************************************************************************/
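/*
 * A scalar sketch of the per-pixel operation vectorized below, following
 * the setup code where round_val = 1 << (log_wd - 1); clip_0_255() is
 * illustrative notation for the saturation to unsigned 8-bit performed by
 * _mm_packus_epi16:
 *
 *     pu1_dst[x] = clip_0_255(((pu1_src[x] * wt + round_val) >> log_wd)
 *                             + ofst)
 */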
void ih264_weighted_pred_luma_sse42(UWORD8 *pu1_src,
                                    UWORD8 *pu1_dst,
                                    WORD32 src_strd,
                                    WORD32 dst_strd,
                                    WORD32 log_wd,
                                    WORD32 wt,
                                    WORD32 ofst,
                                    WORD32 ht,
                                    WORD32 wd)
{
    __m128i y_0_16x8b, y_1_16x8b, y_2_16x8b, y_3_16x8b;

    __m128i wt_8x16b, round_8x16b, ofst_8x16b;

    WORD32 round_val;

    wt = (WORD16)(wt & 0xffff);
    round_val = 1 << (log_wd - 1);
    ofst = (WORD8)(ofst & 0xff);

    wt_8x16b = _mm_set1_epi16(wt);
    round_8x16b = _mm_set1_epi16(round_val);
    ofst_8x16b = _mm_set1_epi16(ofst);

    if(wd == 4)
    {
        __m128i y_0_8x16b, y_2_8x16b;

        do
        {
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
            y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (src_strd << 1)));
            y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd * 3));

            y_0_16x8b = _mm_unpacklo_epi32(y_0_16x8b, y_1_16x8b);
            y_2_16x8b = _mm_unpacklo_epi32(y_2_16x8b, y_3_16x8b);

            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
            y_2_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);

            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
            y_2_8x16b = _mm_mullo_epi16(y_2_8x16b, wt_8x16b);

            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
            y_2_8x16b = _mm_adds_epi16(round_8x16b, y_2_8x16b);

            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
            y_2_8x16b = _mm_srai_epi16(y_2_8x16b, log_wd);

            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
            y_2_8x16b = _mm_adds_epi16(ofst_8x16b, y_2_8x16b);

            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_2_8x16b);
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4);
            y_2_16x8b = _mm_srli_si128(y_0_16x8b, 8);
            y_3_16x8b = _mm_srli_si128(y_0_16x8b, 12);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y_1_16x8b);
            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y_2_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y_3_16x8b);

            ht -= 4;
            pu1_src += src_strd << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        __m128i y_0_8x16b, y_1_8x16b, y_2_8x16b, y_3_8x16b;

        do
        {
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
            y_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + (src_strd << 1)));
            y_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd * 3));

            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
            y_1_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
            y_2_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
            y_3_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);

            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
            y_1_8x16b = _mm_mullo_epi16(y_1_8x16b, wt_8x16b);
            y_2_8x16b = _mm_mullo_epi16(y_2_8x16b, wt_8x16b);
            y_3_8x16b = _mm_mullo_epi16(y_3_8x16b, wt_8x16b);

            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
            y_1_8x16b = _mm_adds_epi16(round_8x16b, y_1_8x16b);
            y_2_8x16b = _mm_adds_epi16(round_8x16b, y_2_8x16b);
            y_3_8x16b = _mm_adds_epi16(round_8x16b, y_3_8x16b);

            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
            y_1_8x16b = _mm_srai_epi16(y_1_8x16b, log_wd);
            y_2_8x16b = _mm_srai_epi16(y_2_8x16b, log_wd);
            y_3_8x16b = _mm_srai_epi16(y_3_8x16b, log_wd);

            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
            y_1_8x16b = _mm_adds_epi16(ofst_8x16b, y_1_8x16b);
            y_2_8x16b = _mm_adds_epi16(ofst_8x16b, y_2_8x16b);
            y_3_8x16b = _mm_adds_epi16(ofst_8x16b, y_3_8x16b);

            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_1_8x16b);
            y_2_16x8b = _mm_packus_epi16(y_2_8x16b, y_3_8x16b);
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 8);
            y_3_16x8b = _mm_srli_si128(y_2_16x8b, 8);

            _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);

            ht -= 4;
            pu1_src += src_strd << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        __m128i y_0L_8x16b, y_1L_8x16b, y_2L_8x16b, y_3L_8x16b;
        __m128i y_0H_8x16b, y_1H_8x16b, y_2H_8x16b, y_3H_8x16b;

        __m128i zero_16x8b;
        zero_16x8b = _mm_set1_epi8(0);

        do
        {
            y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
            y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (src_strd << 1)));
            y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd * 3));

            y_0L_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
            y_0H_8x16b = _mm_unpackhi_epi8(y_0_16x8b, zero_16x8b);
            y_1L_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
            y_1H_8x16b = _mm_unpackhi_epi8(y_1_16x8b, zero_16x8b);
            y_2L_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
            y_2H_8x16b = _mm_unpackhi_epi8(y_2_16x8b, zero_16x8b);
            y_3L_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);
            y_3H_8x16b = _mm_unpackhi_epi8(y_3_16x8b, zero_16x8b);

            y_0L_8x16b = _mm_mullo_epi16(y_0L_8x16b, wt_8x16b);
            y_0H_8x16b = _mm_mullo_epi16(y_0H_8x16b, wt_8x16b);
            y_1L_8x16b = _mm_mullo_epi16(y_1L_8x16b, wt_8x16b);
            y_1H_8x16b = _mm_mullo_epi16(y_1H_8x16b, wt_8x16b);
            y_2L_8x16b = _mm_mullo_epi16(y_2L_8x16b, wt_8x16b);
            y_2H_8x16b = _mm_mullo_epi16(y_2H_8x16b, wt_8x16b);
            y_3L_8x16b = _mm_mullo_epi16(y_3L_8x16b, wt_8x16b);
            y_3H_8x16b = _mm_mullo_epi16(y_3H_8x16b, wt_8x16b);

            y_0L_8x16b = _mm_adds_epi16(round_8x16b, y_0L_8x16b);
            y_0H_8x16b = _mm_adds_epi16(round_8x16b, y_0H_8x16b);
            y_1L_8x16b = _mm_adds_epi16(round_8x16b, y_1L_8x16b);
            y_1H_8x16b = _mm_adds_epi16(round_8x16b, y_1H_8x16b);
            y_2L_8x16b = _mm_adds_epi16(round_8x16b, y_2L_8x16b);
            y_2H_8x16b = _mm_adds_epi16(round_8x16b, y_2H_8x16b);
            y_3L_8x16b = _mm_adds_epi16(round_8x16b, y_3L_8x16b);
            y_3H_8x16b = _mm_adds_epi16(round_8x16b, y_3H_8x16b);

            y_0L_8x16b = _mm_srai_epi16(y_0L_8x16b, log_wd);
            y_0H_8x16b = _mm_srai_epi16(y_0H_8x16b, log_wd);
            y_1L_8x16b = _mm_srai_epi16(y_1L_8x16b, log_wd);
            y_1H_8x16b = _mm_srai_epi16(y_1H_8x16b, log_wd);
            y_2L_8x16b = _mm_srai_epi16(y_2L_8x16b, log_wd);
            y_2H_8x16b = _mm_srai_epi16(y_2H_8x16b, log_wd);
            y_3L_8x16b = _mm_srai_epi16(y_3L_8x16b, log_wd);
            y_3H_8x16b = _mm_srai_epi16(y_3H_8x16b, log_wd);

            y_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y_0L_8x16b);
            y_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y_0H_8x16b);
            y_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y_1L_8x16b);
            y_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y_1H_8x16b);
            y_2L_8x16b = _mm_adds_epi16(ofst_8x16b, y_2L_8x16b);
            y_2H_8x16b = _mm_adds_epi16(ofst_8x16b, y_2H_8x16b);
            y_3L_8x16b = _mm_adds_epi16(ofst_8x16b, y_3L_8x16b);
            y_3H_8x16b = _mm_adds_epi16(ofst_8x16b, y_3H_8x16b);

            y_0_16x8b = _mm_packus_epi16(y_0L_8x16b, y_0H_8x16b);
            y_1_16x8b = _mm_packus_epi16(y_1L_8x16b, y_1H_8x16b);
            y_2_16x8b = _mm_packus_epi16(y_2L_8x16b, y_2H_8x16b);
            y_3_16x8b = _mm_packus_epi16(y_3L_8x16b, y_3H_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);

            ht -= 4;
            pu1_src += src_strd << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_weighted_pred_chroma_sse42                         */
/*                                                                           */
/*  Description   : This function performs the weighted prediction as        */
/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
/*                  prediction process" for chroma. The function gets one    */
/*                  ht x wd block, weights it, rounds it off, offsets it,    */
/*                  saturates it to unsigned 8-bit and stores it in the      */
/*                  destination block. (ht,wd) can be (2,2), (4,2), (2,4),   */
/*                  (4,4), (8,4), (4,8) or (8,8).                            */
/*                                                                           */
/*  Inputs        : pu1_src  - Pointer to source                             */
/*                  pu1_dst  - Pointer to destination                        */
/*                  src_strd - stride for source                             */
/*                  dst_strd - stride for destination                        */
/*                  log_wd   - number of bits to be rounded off              */
/*                  wt       - weight values for u and v                     */
/*                  ofst     - offset values for u and v                     */
/*                  ht       - height of the block                           */
/*                  wd       - width of the block                            */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*      DD MM YYYY    Author(s)     Changes                                  */
/*      04 02 2015    Kaushik       Initial Version                          */
/*                    Senthoor                                               */
/*                                                                           */
/*****************************************************************************/
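/*
 * A scalar sketch of the per-pixel operation vectorized below. wt and ofst
 * pack the u value in their low halves and the v value above it, so the
 * _mm_set1_epi32 broadcasts apply (wt_u, ofst_u) to even bytes and
 * (wt_v, ofst_v) to odd bytes of the interleaved rows; clip_0_255() is
 * illustrative notation for the unsigned 8-bit saturation:
 *
 *     pu1_dst[x] = clip_0_255(((pu1_src[x] * wt_c + round_val) >> log_wd)
 *                             + ofst_c),  c = u for even x, v for odd x
 */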
void ih264_weighted_pred_chroma_sse42(UWORD8 *pu1_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 log_wd,
                                      WORD32 wt,
                                      WORD32 ofst,
                                      WORD32 ht,
                                      WORD32 wd)
{
    __m128i y_0_16x8b, y_1_16x8b;

    __m128i wt_8x16b, round_8x16b, ofst_8x16b;

    WORD32 ofst_u, ofst_v;
    WORD32 round_val;

    ofst_u = (WORD8)(ofst & 0xff);
    ofst_v = (WORD8)(ofst >> 8);
    round_val = 1 << (log_wd - 1);
    ofst = (ofst_u & 0xffff) | (ofst_v << 16);

    wt_8x16b = _mm_set1_epi32(wt);
    round_8x16b = _mm_set1_epi16(round_val);
    ofst_8x16b = _mm_set1_epi32(ofst);

    if(wd == 2)
    {
        __m128i y_0_8x16b;

        do
        {
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

            y_0_16x8b = _mm_unpacklo_epi32(y_0_16x8b, y_1_16x8b);

            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);

            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);

            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);

            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);

            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);

            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_0_8x16b);
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 4);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y_1_16x8b);

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 4)
    {
        __m128i y_0_8x16b, y_1_8x16b;

        do
        {
            y_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

            y_0_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
            y_1_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);

            y_0_8x16b = _mm_mullo_epi16(y_0_8x16b, wt_8x16b);
            y_1_8x16b = _mm_mullo_epi16(y_1_8x16b, wt_8x16b);

            y_0_8x16b = _mm_adds_epi16(round_8x16b, y_0_8x16b);
            y_1_8x16b = _mm_adds_epi16(round_8x16b, y_1_8x16b);

            y_0_8x16b = _mm_srai_epi16(y_0_8x16b, log_wd);
            y_1_8x16b = _mm_srai_epi16(y_1_8x16b, log_wd);

            y_0_8x16b = _mm_adds_epi16(ofst_8x16b, y_0_8x16b);
            y_1_8x16b = _mm_adds_epi16(ofst_8x16b, y_1_8x16b);

            y_0_16x8b = _mm_packus_epi16(y_0_8x16b, y_1_8x16b);
            y_1_16x8b = _mm_srli_si128(y_0_16x8b, 8);

            _mm_storel_epi64((__m128i *)pu1_dst, y_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);

            ht -= 2;
            pu1_src += src_strd << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 8
    {
        __m128i y_2_16x8b, y_3_16x8b;
        __m128i y_0L_8x16b, y_1L_8x16b, y_2L_8x16b, y_3L_8x16b;
        __m128i y_0H_8x16b, y_1H_8x16b, y_2H_8x16b, y_3H_8x16b;

        __m128i zero_16x8b;
        zero_16x8b = _mm_set1_epi8(0);

        do
        {
            y_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
            y_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd));
            y_2_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (src_strd << 1)));
            y_3_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + src_strd * 3));

            y_0L_8x16b = _mm_cvtepu8_epi16(y_0_16x8b);
            y_0H_8x16b = _mm_unpackhi_epi8(y_0_16x8b, zero_16x8b);
            y_1L_8x16b = _mm_cvtepu8_epi16(y_1_16x8b);
            y_1H_8x16b = _mm_unpackhi_epi8(y_1_16x8b, zero_16x8b);
            y_2L_8x16b = _mm_cvtepu8_epi16(y_2_16x8b);
            y_2H_8x16b = _mm_unpackhi_epi8(y_2_16x8b, zero_16x8b);
            y_3L_8x16b = _mm_cvtepu8_epi16(y_3_16x8b);
            y_3H_8x16b = _mm_unpackhi_epi8(y_3_16x8b, zero_16x8b);

            y_0L_8x16b = _mm_mullo_epi16(y_0L_8x16b, wt_8x16b);
            y_0H_8x16b = _mm_mullo_epi16(y_0H_8x16b, wt_8x16b);
            y_1L_8x16b = _mm_mullo_epi16(y_1L_8x16b, wt_8x16b);
            y_1H_8x16b = _mm_mullo_epi16(y_1H_8x16b, wt_8x16b);
            y_2L_8x16b = _mm_mullo_epi16(y_2L_8x16b, wt_8x16b);
            y_2H_8x16b = _mm_mullo_epi16(y_2H_8x16b, wt_8x16b);
            y_3L_8x16b = _mm_mullo_epi16(y_3L_8x16b, wt_8x16b);
            y_3H_8x16b = _mm_mullo_epi16(y_3H_8x16b, wt_8x16b);

            y_0L_8x16b = _mm_adds_epi16(round_8x16b, y_0L_8x16b);
            y_0H_8x16b = _mm_adds_epi16(round_8x16b, y_0H_8x16b);
            y_1L_8x16b = _mm_adds_epi16(round_8x16b, y_1L_8x16b);
            y_1H_8x16b = _mm_adds_epi16(round_8x16b, y_1H_8x16b);
            y_2L_8x16b = _mm_adds_epi16(round_8x16b, y_2L_8x16b);
            y_2H_8x16b = _mm_adds_epi16(round_8x16b, y_2H_8x16b);
            y_3L_8x16b = _mm_adds_epi16(round_8x16b, y_3L_8x16b);
            y_3H_8x16b = _mm_adds_epi16(round_8x16b, y_3H_8x16b);

            y_0L_8x16b = _mm_srai_epi16(y_0L_8x16b, log_wd);
            y_0H_8x16b = _mm_srai_epi16(y_0H_8x16b, log_wd);
            y_1L_8x16b = _mm_srai_epi16(y_1L_8x16b, log_wd);
            y_1H_8x16b = _mm_srai_epi16(y_1H_8x16b, log_wd);
            y_2L_8x16b = _mm_srai_epi16(y_2L_8x16b, log_wd);
            y_2H_8x16b = _mm_srai_epi16(y_2H_8x16b, log_wd);
            y_3L_8x16b = _mm_srai_epi16(y_3L_8x16b, log_wd);
            y_3H_8x16b = _mm_srai_epi16(y_3H_8x16b, log_wd);

            y_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y_0L_8x16b);
            y_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y_0H_8x16b);
            y_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y_1L_8x16b);
            y_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y_1H_8x16b);
            y_2L_8x16b = _mm_adds_epi16(ofst_8x16b, y_2L_8x16b);
            y_2H_8x16b = _mm_adds_epi16(ofst_8x16b, y_2H_8x16b);
            y_3L_8x16b = _mm_adds_epi16(ofst_8x16b, y_3L_8x16b);
            y_3H_8x16b = _mm_adds_epi16(ofst_8x16b, y_3H_8x16b);

            y_0_16x8b = _mm_packus_epi16(y_0L_8x16b, y_0H_8x16b);
            y_1_16x8b = _mm_packus_epi16(y_1L_8x16b, y_1H_8x16b);
            y_2_16x8b = _mm_packus_epi16(y_2L_8x16b, y_2H_8x16b);
            y_3_16x8b = _mm_packus_epi16(y_3L_8x16b, y_3H_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, y_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y_1_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + (dst_strd << 1)), y_2_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd * 3), y_3_16x8b);

            ht -= 4;
            pu1_src += src_strd << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_weighted_bi_pred_luma_sse42                        */
/*                                                                           */
/*  Description   : This function performs the weighted biprediction as      */
/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
/*                  prediction process" for luma. The function gets two      */
/*                  ht x wd blocks, weights them, adds them, rounds off the  */
/*                  sum, offsets it, saturates it to unsigned 8-bit and      */
/*                  stores it in the destination block. (ht,wd) can be       */
/*                  (4,4), (8,4), (4,8), (8,8), (16,8), (8,16) or (16,16).   */
/*                                                                           */
/*  Inputs        : pu1_src1  - Pointer to source 1                          */
/*                  pu1_src2  - Pointer to source 2                          */
/*                  pu1_dst   - Pointer to destination                       */
/*                  src_strd1 - stride for source 1                          */
/*                  src_strd2 - stride for source 2                          */
/*                  dst_strd  - stride for destination                       */
/*                  log_wd    - number of bits to be rounded off             */
/*                  wt1       - weight value for source 1                    */
/*                  wt2       - weight value for source 2                    */
/*                  ofst1     - offset value for source 1                    */
/*                  ofst2     - offset value for source 2                    */
/*                  ht        - height of the block                          */
/*                  wd        - width of the block                           */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*      DD MM YYYY    Author(s)     Changes                                  */
/*      04 02 2015    Kaushik       Initial Version                          */
/*                    Senthoor                                               */
/*                                                                           */
/*****************************************************************************/
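/*
 * A scalar sketch of the per-pixel operation vectorized below, matching the
 * setup code: the weighted sources are summed, rounded with 1 << log_wd,
 * shifted by log_wd + 1 and offset by the averaged offsets; clip_0_255() is
 * illustrative notation for the unsigned 8-bit saturation:
 *
 *     pu1_dst[x] = clip_0_255(((pu1_src1[x] * wt1 + pu1_src2[x] * wt2
 *                               + (1 << log_wd)) >> (log_wd + 1))
 *                             + ((ofst1 + ofst2 + 1) >> 1))
 */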
void ih264_weighted_bi_pred_luma_sse42(UWORD8 *pu1_src1,
                                       UWORD8 *pu1_src2,
                                       UWORD8 *pu1_dst,
                                       WORD32 src_strd1,
                                       WORD32 src_strd2,
                                       WORD32 dst_strd,
                                       WORD32 log_wd,
                                       WORD32 wt1,
                                       WORD32 wt2,
                                       WORD32 ofst1,
                                       WORD32 ofst2,
                                       WORD32 ht,
                                       WORD32 wd)
{
    __m128i y1_0_16x8b, y1_1_16x8b;
    __m128i y2_0_16x8b, y2_1_16x8b;

    __m128i wt1_8x16b, wt2_8x16b;
    __m128i ofst_8x16b, round_8x16b;

    WORD32 ofst;
    WORD32 round_val, shft;

    wt1 = (WORD16)(wt1 & 0xffff);
    wt2 = (WORD16)(wt2 & 0xffff);
    round_val = 1 << log_wd;
    shft = log_wd + 1;
    ofst1 = (WORD8)(ofst1 & 0xff);
    ofst2 = (WORD8)(ofst2 & 0xff);
    ofst = (ofst1 + ofst2 + 1) >> 1;

    wt1_8x16b = _mm_set1_epi16(wt1);
    wt2_8x16b = _mm_set1_epi16(wt2);
    round_8x16b = _mm_set1_epi16(round_val);
    ofst_8x16b = _mm_set1_epi16(ofst);

    if(wd == 4)
    {
        __m128i y1_2_16x8b, y1_3_16x8b;
        __m128i y2_2_16x8b, y2_3_16x8b;

        __m128i y1_0_8x16b, y1_2_8x16b;
        __m128i y2_0_8x16b, y2_2_8x16b;

        do
        {
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
            y1_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
            y2_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));

            y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
            y1_2_16x8b = _mm_unpacklo_epi32(y1_2_16x8b, y1_3_16x8b);
            y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);
            y2_2_16x8b = _mm_unpacklo_epi32(y2_2_16x8b, y2_3_16x8b);

            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_2_8x16b = _mm_cvtepu8_epi16(y1_2_16x8b);
            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_2_8x16b = _mm_cvtepu8_epi16(y2_2_16x8b);

            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
            y1_2_8x16b = _mm_mullo_epi16(y1_2_8x16b, wt1_8x16b);
            y2_2_8x16b = _mm_mullo_epi16(y2_2_8x16b, wt2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_2_8x16b = _mm_adds_epi16(y1_2_8x16b, y2_2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
            y1_2_8x16b = _mm_adds_epi16(round_8x16b, y1_2_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_2_8x16b = _mm_srai_epi16(y1_2_8x16b, shft);

            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
            y1_2_8x16b = _mm_adds_epi16(ofst_8x16b, y1_2_8x16b);

            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_2_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);
            y1_2_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
            y1_3_16x8b = _mm_srli_si128(y1_0_16x8b, 12);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);
            *((WORD32 *)(pu1_dst + (dst_strd << 1))) = _mm_cvtsi128_si32(y1_2_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd * 3)) = _mm_cvtsi128_si32(y1_3_16x8b);

            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else if(wd == 8)
    {
        __m128i y1_2_16x8b, y1_3_16x8b;
        __m128i y2_2_16x8b, y2_3_16x8b;

        __m128i y1_0_8x16b, y1_1_8x16b, y1_2_8x16b, y1_3_8x16b;
        __m128i y2_0_8x16b, y2_1_8x16b, y2_2_8x16b, y2_3_8x16b;

        do
        {
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));
            y1_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src1 + (src_strd1 << 1)));
            y1_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1 * 3));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));
            y2_2_16x8b = _mm_loadl_epi64(
                            (__m128i *)(pu1_src2 + (src_strd2 << 1)));
            y2_3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2 * 3));

            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
            y1_2_8x16b = _mm_cvtepu8_epi16(y1_2_16x8b);
            y1_3_8x16b = _mm_cvtepu8_epi16(y1_3_16x8b);

            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
            y2_2_8x16b = _mm_cvtepu8_epi16(y2_2_16x8b);
            y2_3_8x16b = _mm_cvtepu8_epi16(y2_3_16x8b);

            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
            y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
            y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);

            y1_2_8x16b = _mm_mullo_epi16(y1_2_8x16b, wt1_8x16b);
            y2_2_8x16b = _mm_mullo_epi16(y2_2_8x16b, wt2_8x16b);
            y1_3_8x16b = _mm_mullo_epi16(y1_3_8x16b, wt1_8x16b);
            y2_3_8x16b = _mm_mullo_epi16(y2_3_8x16b, wt2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);
            y1_2_8x16b = _mm_adds_epi16(y1_2_8x16b, y2_2_8x16b);
            y1_3_8x16b = _mm_adds_epi16(y1_3_8x16b, y2_3_8x16b);

            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);
            y1_2_8x16b = _mm_adds_epi16(round_8x16b, y1_2_8x16b);
            y1_3_8x16b = _mm_adds_epi16(round_8x16b, y1_3_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);
            y1_2_8x16b = _mm_srai_epi16(y1_2_8x16b, shft);
            y1_3_8x16b = _mm_srai_epi16(y1_3_8x16b, shft);

            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);
            y1_2_8x16b = _mm_adds_epi16(ofst_8x16b, y1_2_8x16b);
            y1_3_8x16b = _mm_adds_epi16(ofst_8x16b, y1_3_8x16b);

            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
            y1_2_16x8b = _mm_packus_epi16(y1_2_8x16b, y1_3_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);
            y1_3_16x8b = _mm_srli_si128(y1_2_16x8b, 8);

            _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + (dst_strd << 1)), y1_2_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd * 3), y1_3_16x8b);

            ht -= 4;
            pu1_src1 += src_strd1 << 2;
            pu1_src2 += src_strd2 << 2;
            pu1_dst += dst_strd << 2;
        }
        while(ht > 0);
    }
    else // wd == 16
    {
        __m128i y1_0L_8x16b, y1_0H_8x16b, y1_1L_8x16b, y1_1H_8x16b;
        __m128i y2_0L_8x16b, y2_0H_8x16b, y2_1L_8x16b, y2_1H_8x16b;

        __m128i zero_16x8b;
        zero_16x8b = _mm_set1_epi8(0);

        do
        {
            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
            y2_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));

            y1_0L_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_0H_8x16b = _mm_unpackhi_epi8(y1_0_16x8b, zero_16x8b);
            y1_1L_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
            y1_1H_8x16b = _mm_unpackhi_epi8(y1_1_16x8b, zero_16x8b);

            y2_0L_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_0H_8x16b = _mm_unpackhi_epi8(y2_0_16x8b, zero_16x8b);
            y2_1L_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
            y2_1H_8x16b = _mm_unpackhi_epi8(y2_1_16x8b, zero_16x8b);

            y1_0L_8x16b = _mm_mullo_epi16(y1_0L_8x16b, wt1_8x16b);
            y1_0H_8x16b = _mm_mullo_epi16(y1_0H_8x16b, wt1_8x16b);
            y1_1L_8x16b = _mm_mullo_epi16(y1_1L_8x16b, wt1_8x16b);
            y1_1H_8x16b = _mm_mullo_epi16(y1_1H_8x16b, wt1_8x16b);

            y2_0L_8x16b = _mm_mullo_epi16(y2_0L_8x16b, wt2_8x16b);
            y2_0H_8x16b = _mm_mullo_epi16(y2_0H_8x16b, wt2_8x16b);
            y2_1L_8x16b = _mm_mullo_epi16(y2_1L_8x16b, wt2_8x16b);
            y2_1H_8x16b = _mm_mullo_epi16(y2_1H_8x16b, wt2_8x16b);

            y1_0L_8x16b = _mm_adds_epi16(y1_0L_8x16b, y2_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(y1_0H_8x16b, y2_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(y1_1L_8x16b, y2_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(y1_1H_8x16b, y2_1H_8x16b);

            y1_0L_8x16b = _mm_adds_epi16(round_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(round_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(round_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(round_8x16b, y1_1H_8x16b);

            y1_0L_8x16b = _mm_srai_epi16(y1_0L_8x16b, shft);
            y1_0H_8x16b = _mm_srai_epi16(y1_0H_8x16b, shft);
            y1_1L_8x16b = _mm_srai_epi16(y1_1L_8x16b, shft);
            y1_1H_8x16b = _mm_srai_epi16(y1_1H_8x16b, shft);

            y1_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1H_8x16b);

            y1_0_16x8b = _mm_packus_epi16(y1_0L_8x16b, y1_0H_8x16b);
            y1_1_16x8b = _mm_packus_epi16(y1_1L_8x16b, y1_1H_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
}

/*****************************************************************************/
/*                                                                           */
/*  Function Name : ih264_weighted_bi_pred_chroma_sse42                      */
/*                                                                           */
/*  Description   : This function performs the weighted biprediction as      */
/*                  described in sec 8.4.2.3.2 titled "Weighted sample       */
/*                  prediction process" for chroma. The function gets two    */
/*                  ht x wd blocks, weights them, adds them, rounds off the  */
/*                  sum, offsets it, saturates it to unsigned 8-bit and      */
/*                  stores it in the destination block. (ht,wd) can be       */
/*                  (2,2), (4,2), (2,4), (4,4), (8,4), (4,8) or (8,8).       */
/*                                                                           */
/*  Inputs        : pu1_src1  - Pointer to source 1                          */
/*                  pu1_src2  - Pointer to source 2                          */
/*                  pu1_dst   - Pointer to destination                       */
/*                  src_strd1 - stride for source 1                          */
/*                  src_strd2 - stride for source 2                          */
/*                  dst_strd  - stride for destination                       */
/*                  log_wd    - number of bits to be rounded off             */
/*                  wt1       - weight values for u and v in source 1        */
/*                  wt2       - weight values for u and v in source 2        */
/*                  ofst1     - offset values for u and v in source 1        */
/*                  ofst2     - offset values for u and v in source 2        */
/*                  ht        - height of the block                          */
/*                  wd        - width of the block                           */
/*                                                                           */
/*  Issues        : None                                                     */
/*                                                                           */
/*  Revision History:                                                        */
/*                                                                           */
/*      DD MM YYYY    Author(s)     Changes                                  */
/*      04 02 2015    Kaushik       Initial Version                          */
/*                    Senthoor                                               */
/*                                                                           */
/*****************************************************************************/
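/*
 * A scalar sketch of the per-pixel operation vectorized below; it is the
 * bi-predictive luma formula applied per component, with the u and v weights
 * and offsets unpacked from the low and high halves of the packed arguments:
 *
 *     pu1_dst[x] = clip_0_255(((pu1_src1[x] * wt1_c + pu1_src2[x] * wt2_c
 *                               + (1 << log_wd)) >> (log_wd + 1))
 *                             + ((ofst1_c + ofst2_c + 1) >> 1)),
 *     c = u for even x, v for odd x
 */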
void ih264_weighted_bi_pred_chroma_sse42(UWORD8 *pu1_src1,
                                         UWORD8 *pu1_src2,
                                         UWORD8 *pu1_dst,
                                         WORD32 src_strd1,
                                         WORD32 src_strd2,
                                         WORD32 dst_strd,
                                         WORD32 log_wd,
                                         WORD32 wt1,
                                         WORD32 wt2,
                                         WORD32 ofst1,
                                         WORD32 ofst2,
                                         WORD32 ht,
                                         WORD32 wd)
{
    __m128i y1_0_16x8b, y1_1_16x8b;
    __m128i y2_0_16x8b, y2_1_16x8b;

    __m128i wt1_8x16b, wt2_8x16b;
    __m128i ofst_8x16b, round_8x16b;

    WORD32 ofst1_u, ofst2_u, ofst_u;
    WORD32 ofst1_v, ofst2_v, ofst_v;
    WORD32 round_val, shft, ofst_val;

    round_val = 1 << log_wd;
    shft = log_wd + 1;

    ofst1_u = (WORD8)(ofst1 & 0xff);
    ofst1_v = (WORD8)(ofst1 >> 8);
    ofst2_u = (WORD8)(ofst2 & 0xff);
    ofst2_v = (WORD8)(ofst2 >> 8);

    wt1_8x16b = _mm_set1_epi32(wt1);
    wt2_8x16b = _mm_set1_epi32(wt2);

    ofst_u = (ofst1_u + ofst2_u + 1) >> 1;
    ofst_v = (ofst1_v + ofst2_v + 1) >> 1;
    ofst_val = (ofst_u & 0xffff) | (ofst_v << 16);

    round_8x16b = _mm_set1_epi16(round_val);
    ofst_8x16b = _mm_set1_epi32(ofst_val);

    if(wd == 2)
    {
        __m128i y1_0_8x16b, y2_0_8x16b;

        do
        {
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));

            y1_0_16x8b = _mm_unpacklo_epi32(y1_0_16x8b, y1_1_16x8b);
            y2_0_16x8b = _mm_unpacklo_epi32(y2_0_16x8b, y2_1_16x8b);

            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);

            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);

            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_0_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 4);

            *((WORD32 *)(pu1_dst)) = _mm_cvtsi128_si32(y1_0_16x8b);
            *((WORD32 *)(pu1_dst + dst_strd)) = _mm_cvtsi128_si32(y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else if(wd == 4)
    {
        __m128i y1_0_8x16b, y1_1_8x16b;
        __m128i y2_0_8x16b, y2_1_8x16b;

        do
        {
            y1_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src1 + src_strd1));

            y2_0_16x8b = _mm_loadl_epi64((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src2 + src_strd2));

            y1_0_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_1_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);

            y2_0_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_1_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);

            y1_0_8x16b = _mm_mullo_epi16(y1_0_8x16b, wt1_8x16b);
            y2_0_8x16b = _mm_mullo_epi16(y2_0_8x16b, wt2_8x16b);
            y1_1_8x16b = _mm_mullo_epi16(y1_1_8x16b, wt1_8x16b);
            y2_1_8x16b = _mm_mullo_epi16(y2_1_8x16b, wt2_8x16b);

            y1_0_8x16b = _mm_adds_epi16(y1_0_8x16b, y2_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(y1_1_8x16b, y2_1_8x16b);

            y1_0_8x16b = _mm_adds_epi16(round_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(round_8x16b, y1_1_8x16b);

            y1_0_8x16b = _mm_srai_epi16(y1_0_8x16b, shft);
            y1_1_8x16b = _mm_srai_epi16(y1_1_8x16b, shft);

            y1_0_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0_8x16b);
            y1_1_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1_8x16b);

            y1_0_16x8b = _mm_packus_epi16(y1_0_8x16b, y1_1_8x16b);
            y1_1_16x8b = _mm_srli_si128(y1_0_16x8b, 8);

            _mm_storel_epi64((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
    else // wd == 8
    {
        __m128i y1_0L_8x16b, y1_0H_8x16b, y1_1L_8x16b, y1_1H_8x16b;
        __m128i y2_0L_8x16b, y2_0H_8x16b, y2_1L_8x16b, y2_1H_8x16b;

        __m128i zero_16x8b;
        zero_16x8b = _mm_set1_epi8(0);

        do
        {
            y1_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src1);
            y1_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src1 + src_strd1));
            y2_0_16x8b = _mm_loadu_si128((__m128i *)pu1_src2);
            y2_1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src2 + src_strd2));

            y1_0L_8x16b = _mm_cvtepu8_epi16(y1_0_16x8b);
            y1_0H_8x16b = _mm_unpackhi_epi8(y1_0_16x8b, zero_16x8b);
            y1_1L_8x16b = _mm_cvtepu8_epi16(y1_1_16x8b);
            y1_1H_8x16b = _mm_unpackhi_epi8(y1_1_16x8b, zero_16x8b);

            y2_0L_8x16b = _mm_cvtepu8_epi16(y2_0_16x8b);
            y2_0H_8x16b = _mm_unpackhi_epi8(y2_0_16x8b, zero_16x8b);
            y2_1L_8x16b = _mm_cvtepu8_epi16(y2_1_16x8b);
            y2_1H_8x16b = _mm_unpackhi_epi8(y2_1_16x8b, zero_16x8b);

            y1_0L_8x16b = _mm_mullo_epi16(y1_0L_8x16b, wt1_8x16b);
            y1_0H_8x16b = _mm_mullo_epi16(y1_0H_8x16b, wt1_8x16b);
            y1_1L_8x16b = _mm_mullo_epi16(y1_1L_8x16b, wt1_8x16b);
            y1_1H_8x16b = _mm_mullo_epi16(y1_1H_8x16b, wt1_8x16b);

            y2_0L_8x16b = _mm_mullo_epi16(y2_0L_8x16b, wt2_8x16b);
            y2_0H_8x16b = _mm_mullo_epi16(y2_0H_8x16b, wt2_8x16b);
            y2_1L_8x16b = _mm_mullo_epi16(y2_1L_8x16b, wt2_8x16b);
            y2_1H_8x16b = _mm_mullo_epi16(y2_1H_8x16b, wt2_8x16b);

            y1_0L_8x16b = _mm_adds_epi16(y1_0L_8x16b, y2_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(y1_0H_8x16b, y2_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(y1_1L_8x16b, y2_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(y1_1H_8x16b, y2_1H_8x16b);

            y1_0L_8x16b = _mm_adds_epi16(round_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(round_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(round_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(round_8x16b, y1_1H_8x16b);

            y1_0L_8x16b = _mm_srai_epi16(y1_0L_8x16b, shft);
            y1_0H_8x16b = _mm_srai_epi16(y1_0H_8x16b, shft);
            y1_1L_8x16b = _mm_srai_epi16(y1_1L_8x16b, shft);
            y1_1H_8x16b = _mm_srai_epi16(y1_1H_8x16b, shft);

            y1_0L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0L_8x16b);
            y1_0H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_0H_8x16b);
            y1_1L_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1L_8x16b);
            y1_1H_8x16b = _mm_adds_epi16(ofst_8x16b, y1_1H_8x16b);

            y1_0_16x8b = _mm_packus_epi16(y1_0L_8x16b, y1_0H_8x16b);
            y1_1_16x8b = _mm_packus_epi16(y1_1L_8x16b, y1_1H_8x16b);

            _mm_storeu_si128((__m128i *)pu1_dst, y1_0_16x8b);
            _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), y1_1_16x8b);

            ht -= 2;
            pu1_src1 += src_strd1 << 1;
            pu1_src2 += src_strd2 << 1;
            pu1_dst += dst_strd << 1;
        }
        while(ht > 0);
    }
}