/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_weighted_pred_neon_intr.c
*
* @brief
*  Contains function definitions for weighted prediction used in inter
*  prediction
*
* @author
*  Parthiban V
*
* @par List of Functions:
*  - ihevc_weighted_pred_uni_neonintr()
*  - ihevc_weighted_pred_chroma_uni_neonintr()
*  - ihevc_weighted_pred_bi_neonintr()
*  - ihevc_weighted_pred_chroma_bi_neonintr()
*  - ihevc_weighted_pred_bi_default_neonintr()
*  - ihevc_weighted_pred_chroma_bi_default_neonintr()
*
* @remarks
*  None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include "ihevc_typedefs.h"
#include "ihevc_defs.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_inter_pred.h"
#include "arm_neon.h"


/**
*******************************************************************************
*
* @brief
*  Does uni-weighted prediction on the array pointed to by pi2_src and
*  stores the result at the location pointed to by pu1_dst.
*  Assumptions: the function is optimized assuming that width and height
*  are multiples of 2.
*
* @par Description:
*  dst = ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift +
*  offset
*
* @param[in] pi2_src
*  Pointer to the source
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  weight to be multiplied with the source
*
* @param[in] off0
*  offset to be added after rounding and shifting
*
* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_uni_neonintr(WORD16 *pi2_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 wgt0,
                                      WORD32 off0,
                                      WORD32 shift,
                                      WORD32 lvl_shift,
                                      WORD32 ht,
                                      WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src_val1;
    int16x4_t pi2_src_val2;
    int32x4_t i4_tmp1_t;
    int32x4_t i4_tmp2_t;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp;
    UWORD8 *pu1_dst_tmp;

    WORD32 tmp_lvl_shift = lvl_shift * wgt0 + (off0 << shift);
    tmp_lvl_shift += (1 << (shift - 1));
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    /* i4_tmp1_t & i4_tmp2_t are used to process 2 rows at a time.        */
    /* The height is also unrolled, so 2 rows are processed per iteration */
    /* and the stores take care of both rows.                             */
    /* vcombine_u16 is used because the 16x4 value obtained after         */
    /* narrowing cannot be saturated and narrowed directly.               */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            pi2_src_tmp = pi2_src + src_strd;

            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src_val1 = vld1_s16((int16_t *)pi2_src);
            pi2_src += 4;

            pi2_src_val2 = vld1_s16((int16_t *)pi2_src_tmp);
            i4_tmp1_t = vmull_n_s16(pi2_src_val1, (int16_t)wgt0);

            i4_tmp1_t = vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t);
            i4_tmp2_t = vmull_n_s16(pi2_src_val2, (int16_t)wgt0);

            sto_res_tmp1 = vshlq_s32(i4_tmp1_t, tmp_shift_t);
            i4_tmp2_t = vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src += 2 * src_strd - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_UNI
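
/* Illustrative scalar sketch (not part of the optimized library path): it is */
/* intended to mirror what ihevc_weighted_pred_uni_neonintr() computes per    */
/* pixel, and shows how the constant term                                     */
/* (lvl_shift * wgt0 + (off0 << shift) + rounding) can be folded once outside */
/* the loops, exactly as the NEON routine does with tmp_lvl_shift_t. The      */
/* function name below is a hypothetical helper, not a library entry point.   */
void ihevc_weighted_pred_uni_scalar_sketch(WORD16 *pi2_src,
                                           UWORD8 *pu1_dst,
                                           WORD32 src_strd,
                                           WORD32 dst_strd,
                                           WORD32 wgt0,
                                           WORD32 off0,
                                           WORD32 shift,
                                           WORD32 lvl_shift,
                                           WORD32 ht,
                                           WORD32 wd)
{
    WORD32 row, col;
    /* Constant part of the weighted prediction expression, computed once */
    WORD32 const_term = lvl_shift * wgt0 + (off0 << shift) + (1 << (shift - 1));

    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            WORD32 val = (pi2_src[col] * wgt0 + const_term) >> shift;
            /* Saturate to the 8 bit output range, as vqmovun/vqmovn do */
            if(val < 0) val = 0;
            if(val > 255) val = 255;
            pu1_dst[col] = (UWORD8)val;
        }
        pi2_src += src_strd;
        pu1_dst += dst_strd;
    }
}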

/**
*******************************************************************************
*
* @brief
*  Chroma uni-weighted prediction on the array pointed to by pi2_src and
*  stores the result at the location pointed to by pu1_dst.
*  Assumptions: the function is optimized assuming that width and height
*  are multiples of 2.
*
* @par Description:
*  dst = ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift +
*  offset
*
* @param[in] pi2_src
*  Pointer to the source
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0_cb
*  weight to be multiplied with the Cb source
*
* @param[in] wgt0_cr
*  weight to be multiplied with the Cr source
*
* @param[in] off0_cb
*  Cb offset to be added after rounding and shifting
*
* @param[in] off0_cr
*  Cr offset to be added after rounding and shifting
*
* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_chroma_uni_neonintr(WORD16 *pi2_src,
                                             UWORD8 *pu1_dst,
                                             WORD32 src_strd,
                                             WORD32 dst_strd,
                                             WORD32 wgt0_cb,
                                             WORD32 wgt0_cr,
                                             WORD32 off0_cb,
                                             WORD32 off0_cr,
                                             WORD32 shift,
                                             WORD32 lvl_shift,
                                             WORD32 ht,
                                             WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src_val1;
    int16x4_t pi2_src_val2;
    int32x4_t i4_tmp1_t;
    int32x4_t i4_tmp2_t;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t_u, tmp_lvl_shift_t_v;
    int32x4x2_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    int16x4_t tmp_wgt0_u, tmp_wgt0_v;
    int16x4x2_t wgt0;
    WORD16 *pi2_src_tmp;
    UWORD8 *pu1_dst_tmp;

    WORD32 tmp_lvl_shift = lvl_shift * wgt0_cb + (off0_cb << shift);
    tmp_lvl_shift += (1 << (shift - 1));
    tmp_lvl_shift_t_u = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift = lvl_shift * wgt0_cr + (off0_cr << shift);
    tmp_lvl_shift += (1 << (shift - 1));
    tmp_lvl_shift_t_v = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift_t = vzipq_s32(tmp_lvl_shift_t_u, tmp_lvl_shift_t_v);

    tmp_shift_t = vmovq_n_s32(tmp_shift);

    tmp_wgt0_u = vdup_n_s16(wgt0_cb);
    tmp_wgt0_v = vdup_n_s16(wgt0_cr);
    wgt0 = vzip_s16(tmp_wgt0_u, tmp_wgt0_v);

    /* i4_tmp1_t & i4_tmp2_t are used to process 2 rows at a time.        */
    /* The height is also unrolled, so 2 rows are processed per iteration */
    /* and the stores take care of both rows.                             */
    /* vcombine_u16 is used because the 16x4 value obtained after         */
    /* narrowing cannot be saturated and narrowed directly.               */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src_tmp = pi2_src + src_strd;

            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src_val1 = vld1_s16((int16_t *)pi2_src);
            pi2_src += 4;

            pi2_src_val2 = vld1_s16((int16_t *)pi2_src_tmp);
            i4_tmp1_t = vmull_s16(pi2_src_val1, wgt0.val[0]);

            i4_tmp1_t = vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t.val[0]);
            i4_tmp2_t = vmull_s16(pi2_src_val2, wgt0.val[0]);

            sto_res_tmp1 = vshlq_s32(i4_tmp1_t, tmp_shift_t);
            i4_tmp2_t = vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t.val[0]);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src += 2 * src_strd - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_UNI
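
/* Illustrative scalar sketch, assuming the interleaved (semi-planar) CbCr    */
/* layout that the zipped weight and offset vectors above imply: even columns */
/* carry Cb and odd columns carry Cr, so the per-plane constants alternate    */
/* per sample just like wgt0.val[0] and tmp_lvl_shift_t.val[0]. The function  */
/* name below is a hypothetical helper, not a library entry point.            */
void ihevc_weighted_pred_chroma_uni_scalar_sketch(WORD16 *pi2_src,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 src_strd,
                                                  WORD32 dst_strd,
                                                  WORD32 wgt0_cb,
                                                  WORD32 wgt0_cr,
                                                  WORD32 off0_cb,
                                                  WORD32 off0_cr,
                                                  WORD32 shift,
                                                  WORD32 lvl_shift,
                                                  WORD32 ht,
                                                  WORD32 wd)
{
    WORD32 row, col;
    /* Per-plane constant terms, folded once as in the NEON routine */
    WORD32 const_cb = lvl_shift * wgt0_cb + (off0_cb << shift) + (1 << (shift - 1));
    WORD32 const_cr = lvl_shift * wgt0_cr + (off0_cr << shift) + (1 << (shift - 1));

    for(row = 0; row < ht; row++)
    {
        /* wd is in chroma pixel pairs; the interleaved row has 2 * wd samples */
        for(col = 0; col < 2 * wd; col++)
        {
            WORD32 wgt  = (col & 1) ? wgt0_cr : wgt0_cb;
            WORD32 cnst = (col & 1) ? const_cr : const_cb;
            WORD32 val = (pi2_src[col] * wgt + cnst) >> shift;
            if(val < 0) val = 0;
            if(val > 255) val = 255;
            pu1_dst[col] = (UWORD8)val;
        }
        pi2_src += src_strd;
        pu1_dst += dst_strd;
    }
}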

/**
*******************************************************************************
*
* @brief
*  Does bi-weighted prediction on the arrays pointed to by pi2_src1 and
*  pi2_src2 and stores the result at the location pointed to by pu1_dst.
*  Assumptions: the function is optimized assuming that width and height
*  are multiples of 2.
*
* @par Description:
*  dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 + (off0 +
*  off1 + 1) << (shift - 1) ) >> shift
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  weight to be multiplied with source 1
*
* @param[in] off0
*  offset 0
*
* @param[in] wgt1
*  weight to be multiplied with source 2
*
* @param[in] off1
*  offset 1
*
* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift1
*  added before shift and offset
*
* @param[in] lvl_shift2
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_bi_neonintr(WORD16 *pi2_src1,
                                     WORD16 *pi2_src2,
                                     UWORD8 *pu1_dst,
                                     WORD32 src_strd1,
                                     WORD32 src_strd2,
                                     WORD32 dst_strd,
                                     WORD32 wgt0,
                                     WORD32 off0,
                                     WORD32 wgt1,
                                     WORD32 off1,
                                     WORD32 shift,
                                     WORD32 lvl_shift1,
                                     WORD32 lvl_shift2,
                                     WORD32 ht,
                                     WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;

    WORD32 tmp_lvl_shift = (lvl_shift1 * wgt0) + (lvl_shift2 * wgt1);
    tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1));
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    /* i4_tmp1_t & i4_tmp2_t are used to process 2 rows at a time.        */
    /* The height is also unrolled, so 2 rows are processed per iteration */
    /* and the stores take care of both rows.                             */
    /* vcombine_u16 is used because the 16x4 value obtained after         */
    /* narrowing cannot be saturated and narrowed directly.               */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vmull_n_s16(pi2_src1_val1, (int16_t)wgt0);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vmull_n_s16(pi2_src2_val1, (int16_t)wgt1);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vmull_n_s16(pi2_src1_val2, (int16_t)wgt0);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);

            i4_tmp2_t2 = vmull_n_s16(pi2_src2_val2, (int16_t)wgt1);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src1 += 2 * src_strd1 - wd;
        pi2_src2 += 2 * src_strd2 - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_BI
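
/* Illustrative scalar sketch (not part of the optimized library path): per   */
/* pixel it is intended to match ihevc_weighted_pred_bi_neonintr(), with the  */
/* constant term lvl_shift1 * wgt0 + lvl_shift2 * wgt1 +                      */
/* ((off0 + off1 + 1) << (shift - 1)) folded once, as the NEON code does with */
/* tmp_lvl_shift_t. The function name below is a hypothetical helper.         */
void ihevc_weighted_pred_bi_scalar_sketch(WORD16 *pi2_src1,
                                          WORD16 *pi2_src2,
                                          UWORD8 *pu1_dst,
                                          WORD32 src_strd1,
                                          WORD32 src_strd2,
                                          WORD32 dst_strd,
                                          WORD32 wgt0,
                                          WORD32 off0,
                                          WORD32 wgt1,
                                          WORD32 off1,
                                          WORD32 shift,
                                          WORD32 lvl_shift1,
                                          WORD32 lvl_shift2,
                                          WORD32 ht,
                                          WORD32 wd)
{
    WORD32 row, col;
    /* Constant part of the bi-weighted expression, computed once */
    WORD32 const_term = lvl_shift1 * wgt0 + lvl_shift2 * wgt1
                        + ((off0 + off1 + 1) << (shift - 1));

    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            WORD32 val = (pi2_src1[col] * wgt0 + pi2_src2[col] * wgt1 + const_term) >> shift;
            /* Saturate to the 8 bit output range, as vqmovun/vqmovn do */
            if(val < 0) val = 0;
            if(val > 255) val = 255;
            pu1_dst[col] = (UWORD8)val;
        }
        pi2_src1 += src_strd1;
        pi2_src2 += src_strd2;
        pu1_dst += dst_strd;
    }
}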

/**
*******************************************************************************
*
* @brief
*  Chroma bi-weighted prediction on the arrays pointed to by pi2_src1 and
*  pi2_src2 and stores the result at the location pointed to by pu1_dst.
*  Assumptions: the function is optimized assuming that width and height
*  are multiples of 2.
*
* @par Description:
*  dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 + (off0 +
*  off1 + 1) << (shift - 1) ) >> shift
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0_cb
*  Cb weight to be multiplied with source 1
*
* @param[in] wgt0_cr
*  Cr weight to be multiplied with source 1
*
* @param[in] off0_cb
*  Cb offset 0
*
* @param[in] off0_cr
*  Cr offset 0
*
* @param[in] wgt1_cb
*  Cb weight to be multiplied with source 2
*
* @param[in] wgt1_cr
*  Cr weight to be multiplied with source 2
*
* @param[in] off1_cb
*  Cb offset 1
*
* @param[in] off1_cr
*  Cr offset 1
*
* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift1
*  added before shift and offset
*
* @param[in] lvl_shift2
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_chroma_bi_neonintr(WORD16 *pi2_src1,
                                            WORD16 *pi2_src2,
                                            UWORD8 *pu1_dst,
                                            WORD32 src_strd1,
                                            WORD32 src_strd2,
                                            WORD32 dst_strd,
                                            WORD32 wgt0_cb,
                                            WORD32 wgt0_cr,
                                            WORD32 off0_cb,
                                            WORD32 off0_cr,
                                            WORD32 wgt1_cb,
                                            WORD32 wgt1_cr,
                                            WORD32 off1_cb,
                                            WORD32 off1_cr,
                                            WORD32 shift,
                                            WORD32 lvl_shift1,
                                            WORD32 lvl_shift2,
                                            WORD32 ht,
                                            WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t_u, tmp_lvl_shift_t_v;
    int32x4x2_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    int16x4_t tmp_wgt0_u, tmp_wgt0_v, tmp_wgt1_u, tmp_wgt1_v;
    int16x4x2_t wgt0, wgt1;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;

    WORD32 tmp_lvl_shift = (lvl_shift1 * wgt0_cb) + (lvl_shift2 * wgt1_cb);
    tmp_lvl_shift += ((off0_cb + off1_cb + 1) << (shift - 1));
    tmp_lvl_shift_t_u = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift = (lvl_shift1 * wgt0_cr) + (lvl_shift2 * wgt1_cr);
    tmp_lvl_shift += ((off0_cr + off1_cr + 1) << (shift - 1));
    tmp_lvl_shift_t_v = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift_t = vzipq_s32(tmp_lvl_shift_t_u, tmp_lvl_shift_t_v);

    tmp_shift_t = vmovq_n_s32(tmp_shift);

    tmp_wgt0_u = vdup_n_s16(wgt0_cb);
    tmp_wgt0_v = vdup_n_s16(wgt0_cr);
    wgt0 = vzip_s16(tmp_wgt0_u, tmp_wgt0_v);
    tmp_wgt1_u = vdup_n_s16(wgt1_cb);
    tmp_wgt1_v = vdup_n_s16(wgt1_cr);
    wgt1 = vzip_s16(tmp_wgt1_u, tmp_wgt1_v);

    /* i4_tmp1_t & i4_tmp2_t are used to process 2 rows at a time.        */
    /* The height is also unrolled, so 2 rows are processed per iteration */
    /* and the stores take care of both rows.                             */
    /* vcombine_u16 is used because the 16x4 value obtained after         */
    /* narrowing cannot be saturated and narrowed directly.               */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vmull_s16(pi2_src1_val1, wgt0.val[0]);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vmull_s16(pi2_src2_val1, wgt1.val[0]);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vmull_s16(pi2_src1_val2, wgt0.val[0]);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t.val[0]);

            i4_tmp2_t2 = vmull_s16(pi2_src2_val2, wgt1.val[0]);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t.val[0]);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src1 += 2 * src_strd1 - 2 * wd;
        pi2_src2 += 2 * src_strd2 - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_BI

/**
*******************************************************************************
*
* @brief
*  Does default bi-weighted prediction on the arrays pointed to by pi2_src1
*  and pi2_src2 and stores the result at the location pointed to by pu1_dst.
*  Assumptions: the function is optimized assuming that width and height
*  are multiples of 2.
*
* @par Description:
*  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + 1 << (shift - 1) )
*  >> shift  where shift = 15 - BitDepth
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] lvl_shift1
*  added before shift and offset
*
* @param[in] lvl_shift2
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_bi_default_neonintr(WORD16 *pi2_src1,
                                             WORD16 *pi2_src2,
                                             UWORD8 *pu1_dst,
                                             WORD32 src_strd1,
                                             WORD32 src_strd2,
                                             WORD32 dst_strd,
                                             WORD32 lvl_shift1,
                                             WORD32 lvl_shift2,
                                             WORD32 ht,
                                             WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;
    WORD32 shift;
    WORD32 tmp_shift;
    WORD32 tmp_lvl_shift;
    int16x4_t lvl_shift1_t;
    int16x4_t lvl_shift2_t;

    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    tmp_shift = 0 - shift;
    tmp_lvl_shift = 1 << (shift - 1);
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1);
    lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2);

    /* i4_tmp1_t & i4_tmp2_t are used to process 2 rows at a time.        */
    /* The height is also unrolled, so 2 rows are processed per iteration */
    /* and the stores take care of both rows.                             */
    /* vcombine_u16 is used because the 16x4 value obtained after         */
    /* narrowing cannot be saturated and narrowed directly.               */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vaddl_s16(pi2_src1_val1, lvl_shift1_t);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vaddl_s16(pi2_src2_val1, lvl_shift2_t);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vaddl_s16(pi2_src1_val2, lvl_shift1_t);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);

            i4_tmp2_t2 = vaddl_s16(pi2_src2_val2, lvl_shift2_t);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src1 += 2 * src_strd1 - wd;
        pi2_src2 += 2 * src_strd2 - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_BI_DEFAULT
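
/* Illustrative scalar sketch of the default (equal weight) bi-prediction:    */
/* dst = clip_u8((src1 + lvl_shift1 + src2 + lvl_shift2 + round) >> shift)    */
/* with shift = SHIFT_14_MINUS_BIT_DEPTH + 1, i.e. 15 - bit depth, matching   */
/* what the NEON routine above computes two rows at a time. The function name */
/* below is a hypothetical helper, not a library entry point.                 */
void ihevc_weighted_pred_bi_default_scalar_sketch(WORD16 *pi2_src1,
                                                  WORD16 *pi2_src2,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 src_strd1,
                                                  WORD32 src_strd2,
                                                  WORD32 dst_strd,
                                                  WORD32 lvl_shift1,
                                                  WORD32 lvl_shift2,
                                                  WORD32 ht,
                                                  WORD32 wd)
{
    WORD32 row, col;
    WORD32 shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    /* Constant term: both level shifts plus the rounding offset */
    WORD32 const_term = lvl_shift1 + lvl_shift2 + (1 << (shift - 1));

    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            WORD32 val = (pi2_src1[col] + pi2_src2[col] + const_term) >> shift;
            /* Saturate to the 8 bit output range, as vqmovun/vqmovn do */
            if(val < 0) val = 0;
            if(val > 255) val = 255;
            pu1_dst[col] = (UWORD8)val;
        }
        pi2_src1 += src_strd1;
        pi2_src2 += src_strd2;
        pu1_dst += dst_strd;
    }
}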

/**
*******************************************************************************
*
* @brief
*  Does chroma default bi-weighted prediction on the arrays pointed to by
*  pi2_src1 and pi2_src2 and stores the result at the location pointed to
*  by pu1_dst.
*  Assumptions: the function is optimized assuming that width and height
*  are multiples of 2.
*
* @par Description:
*  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + 1 << (shift - 1) )
*  >> shift  where shift = 15 - BitDepth
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] lvl_shift1
*  added before shift and offset
*
* @param[in] lvl_shift2
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_chroma_bi_default_neonintr(WORD16 *pi2_src1,
                                                    WORD16 *pi2_src2,
                                                    UWORD8 *pu1_dst,
                                                    WORD32 src_strd1,
                                                    WORD32 src_strd2,
                                                    WORD32 dst_strd,
                                                    WORD32 lvl_shift1,
                                                    WORD32 lvl_shift2,
                                                    WORD32 ht,
                                                    WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;
    WORD32 shift;
    WORD32 tmp_shift;
    WORD32 tmp_lvl_shift;
    int16x4_t lvl_shift1_t;
    int16x4_t lvl_shift2_t;

    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    tmp_shift = 0 - shift;
    tmp_lvl_shift = 1 << (shift - 1);
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1);
    lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2);

    /* i4_tmp1_t & i4_tmp2_t are used to process 2 rows at a time.        */
    /* The height is also unrolled, so 2 rows are processed per iteration */
    /* and the stores take care of both rows.                             */
    /* vcombine_u16 is used because the 16x4 value obtained after         */
    /* narrowing cannot be saturated and narrowed directly.               */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vaddl_s16(pi2_src1_val1, lvl_shift1_t);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vaddl_s16(pi2_src2_val1, lvl_shift2_t);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vaddl_s16(pi2_src1_val2, lvl_shift1_t);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);

            i4_tmp2_t2 = vaddl_s16(pi2_src2_val2, lvl_shift2_t);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src1 += 2 * src_strd1 - 2 * wd;
        pi2_src2 += 2 * src_strd2 - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_BI_DEFAULT