/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_weighted_pred_neon_intr.c
*
* @brief
*  Contains function definitions for weighted prediction used in inter
* prediction
*
* @author
*  Parthiban V
*
* @par List of Functions:
*  - ihevc_weighted_pred_uni_neonintr()
*  - ihevc_weighted_pred_chroma_uni_neonintr()
*  - ihevc_weighted_pred_bi_neonintr()
*  - ihevc_weighted_pred_chroma_bi_neonintr()
*  - ihevc_weighted_pred_bi_default_neonintr()
*  - ihevc_weighted_pred_chroma_bi_default_neonintr()
*
* @remarks
*  None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include "ihevc_typedefs.h"
#include "ihevc_defs.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_inter_pred.h"
#include "arm_neon.h"


/**
*******************************************************************************
*
* @brief
*  Does uni-weighted prediction on the array pointed to by pi2_src and stores
* the result at the location pointed to by pu1_dst.
*  Assumptions: the function is optimized assuming the width is a multiple
* of 4 and the height is a multiple of 2.
*
* @par Description:
*  dst = (((src + lvl_shift) * wgt0 + (1 << (shift - 1))) >> shift) + off0
*
* @param[in] pi2_src
*  Pointer to the source
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  Weight to be multiplied with the source
*
* @param[in] off0
*  Offset to be added after rounding and shifting
*
* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  Added to the source before weighting
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_uni_neonintr(WORD16 *pi2_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 wgt0,
                                      WORD32 off0,
                                      WORD32 shift,
                                      WORD32 lvl_shift,
                                      WORD32 ht,
                                      WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src_val1;
    int16x4_t pi2_src_val2;
    int32x4_t i4_tmp1_t;
    int32x4_t i4_tmp2_t;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp;
    UWORD8 *pu1_dst_tmp;

    WORD32 tmp_lvl_shift = lvl_shift * wgt0 + (off0 << shift);
    tmp_lvl_shift += (1 << (shift - 1));
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);
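
    /* Note: the per-pixel computation
     *   (((src + lvl_shift) * wgt0 + (1 << (shift - 1))) >> shift) + off0
     * is algebraically refactored as
     *   (src * wgt0 + tmp_lvl_shift) >> shift
     * with tmp_lvl_shift = lvl_shift * wgt0 + (off0 << shift) + (1 << (shift - 1)),
     * so the inner loop needs only one multiply and one add per lane.        */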

    /* i4_tmp1_t and i4_tmp2_t are used to process two rows at a time.        */
    /* The height loop is unrolled, so two rows are processed per iteration,  */
    /* and the stores are likewise done for two rows.                         */
    /* vcombine_u16 is used because vqmovn_u16 needs a uint16x8_t input;      */
    /* after vqmovun_s32 we only have a uint16x4_t value.                     */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            pi2_src_tmp = pi2_src + src_strd;

            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src_val1 = vld1_s16((int16_t *)pi2_src);
            pi2_src += 4;

            pi2_src_val2 = vld1_s16((int16_t *)pi2_src_tmp);
            i4_tmp1_t = vmull_n_s16(pi2_src_val1, (int16_t)wgt0);

            i4_tmp1_t = vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t);
            i4_tmp2_t = vmull_n_s16(pi2_src_val2, (int16_t)wgt0);

            sto_res_tmp1 = vshlq_s32(i4_tmp1_t, tmp_shift_t);
            i4_tmp2_t = vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src += 2 * src_strd - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_UNI

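/* A minimal scalar sketch (reference only, not compiled) of the per-pixel
 * computation the NEON loop above implements; weighted_pred_uni_pixel is a
 * hypothetical helper, and the clip to [0, 255] matches what the
 * vqmovun_s32/vqmovn_u16 pair provides for an 8-bit destination.             */
#if 0
static UWORD8 weighted_pred_uni_pixel(WORD16 src, WORD32 wgt0, WORD32 off0,
                                      WORD32 shift, WORD32 lvl_shift)
{
    WORD32 val = (src + lvl_shift) * wgt0 + (1 << (shift - 1));
    val = (val >> shift) + off0;
    return (UWORD8)(val < 0 ? 0 : (val > 255 ? 255 : val));
}
#endif
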
/**
*******************************************************************************
*
* @brief
*  Does chroma uni-weighted prediction on the interleaved CbCr array pointed
* to by pi2_src and stores the result at the location pointed to by pu1_dst.
*  Assumptions: the function is optimized assuming the width is a multiple
* of 2 and the height is a multiple of 2.
*
* @par Description:
*  dst = (((src + lvl_shift) * wgt0 + (1 << (shift - 1))) >> shift) + off0,
* applied with the Cb and Cr weights/offsets on alternate samples
*
* @param[in] pi2_src
*  Pointer to the source (interleaved CbCr)
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0_cb
*  Weight to be multiplied with the Cb source
*
* @param[in] wgt0_cr
*  Weight to be multiplied with the Cr source
*
* @param[in] off0_cb
*  Cb offset to be added after rounding and shifting
*
* @param[in] off0_cr
*  Cr offset to be added after rounding and shifting
*
* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  Added to the source before weighting
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source (in CbCr pairs)
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_chroma_uni_neonintr(WORD16 *pi2_src,
                                             UWORD8 *pu1_dst,
                                             WORD32 src_strd,
                                             WORD32 dst_strd,
                                             WORD32 wgt0_cb,
                                             WORD32 wgt0_cr,
                                             WORD32 off0_cb,
                                             WORD32 off0_cr,
                                             WORD32 shift,
                                             WORD32 lvl_shift,
                                             WORD32 ht,
                                             WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src_val1;
    int16x4_t pi2_src_val2;
    int32x4_t i4_tmp1_t;
    int32x4_t i4_tmp2_t;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t_u, tmp_lvl_shift_t_v;
    int32x4x2_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    int16x4_t tmp_wgt0_u, tmp_wgt0_v;
    int16x4x2_t wgt0;
    WORD16 *pi2_src_tmp;
    UWORD8 *pu1_dst_tmp;

    WORD32 tmp_lvl_shift = lvl_shift * wgt0_cb + (off0_cb << shift);
    tmp_lvl_shift += (1 << (shift - 1));
    tmp_lvl_shift_t_u = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift = lvl_shift * wgt0_cr + (off0_cr << shift);
    tmp_lvl_shift += (1 << (shift - 1));
    tmp_lvl_shift_t_v = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift_t = vzipq_s32(tmp_lvl_shift_t_u, tmp_lvl_shift_t_v);

    tmp_shift_t = vmovq_n_s32(tmp_shift);

    tmp_wgt0_u = vdup_n_s16(wgt0_cb);
    tmp_wgt0_v = vdup_n_s16(wgt0_cr);
    wgt0 = vzip_s16(tmp_wgt0_u, tmp_wgt0_v);
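
    /* After the vzip operations above, wgt0.val[0] holds {wgt0_cb, wgt0_cr,
     * wgt0_cb, wgt0_cr} and tmp_lvl_shift_t.val[0] the matching Cb/Cr
     * constants, so each vector op weighs two interleaved CbCr pairs with
     * the correct per-component weight and offset.                           */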

    /* i4_tmp1_t and i4_tmp2_t are used to process two rows at a time.        */
    /* The height loop is unrolled, so two rows are processed per iteration,  */
    /* and the stores are likewise done for two rows.                         */
    /* vcombine_u16 is used because vqmovn_u16 needs a uint16x8_t input;      */
    /* after vqmovun_s32 we only have a uint16x4_t value.                     */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src_tmp = pi2_src + src_strd;

            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src_val1 = vld1_s16((int16_t *)pi2_src);
            pi2_src += 4;

            pi2_src_val2 = vld1_s16((int16_t *)pi2_src_tmp);
            i4_tmp1_t = vmull_s16(pi2_src_val1, wgt0.val[0]);

            i4_tmp1_t = vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t.val[0]);
            i4_tmp2_t = vmull_s16(pi2_src_val2, wgt0.val[0]);

            sto_res_tmp1 = vshlq_s32(i4_tmp1_t, tmp_shift_t);
            i4_tmp2_t = vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t.val[0]);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src += 2 * src_strd - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_UNI

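/* A hypothetical call (illustrative values only) for a block of 4x4 CbCr
 * pairs with unit weights at 8-bit depth, i.e. wgt = 1 << log2_denom with
 * log2_denom = 6 and shift = (14 - 8) + 6:
 *
 *   ihevc_weighted_pred_chroma_uni_neonintr(pi2_pred, pu1_out,
 *                                           pred_strd, out_strd,
 *                                           64, 64,   // wgt0_cb, wgt0_cr
 *                                           0, 0,     // off0_cb, off0_cr
 *                                           12,       // shift
 *                                           0,        // lvl_shift
 *                                           4, 4);    // ht, wd (CbCr pairs)
 */
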
/**
*******************************************************************************
*
* @brief
*  Does bi-weighted prediction on the arrays pointed to by pi2_src1 and
* pi2_src2 and stores the result at the location pointed to by pu1_dst.
*  Assumptions: the function is optimized assuming the width is a multiple
* of 4 and the height is a multiple of 2.
*
* @par Description:
*  dst = ((src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1 +
* ((off0 + off1 + 1) << (shift - 1))) >> shift
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd1
*  Stride of source 1
*
* @param[in] src_strd2
*  Stride of source 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  Weight to be multiplied with source 1
*
* @param[in] off0
*  Offset 0
*
* @param[in] wgt1
*  Weight to be multiplied with source 2
*
* @param[in] off1
*  Offset 1
*
* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift1
*  Added to source 1 before weighting
*
* @param[in] lvl_shift2
*  Added to source 2 before weighting
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_bi_neonintr(WORD16 *pi2_src1,
                                     WORD16 *pi2_src2,
                                     UWORD8 *pu1_dst,
                                     WORD32 src_strd1,
                                     WORD32 src_strd2,
                                     WORD32 dst_strd,
                                     WORD32 wgt0,
                                     WORD32 off0,
                                     WORD32 wgt1,
                                     WORD32 off1,
                                     WORD32 shift,
                                     WORD32 lvl_shift1,
                                     WORD32 lvl_shift2,
                                     WORD32 ht,
                                     WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;

    WORD32 tmp_lvl_shift = (lvl_shift1 * wgt0) + (lvl_shift2 * wgt1);
    tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1));
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);
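
    /* As in the uni case, the constant terms are folded: tmp_lvl_shift
     * collects lvl_shift1 * wgt0 + lvl_shift2 * wgt1 plus the combined
     * rounding/offset term ((off0 + off1 + 1) << (shift - 1)), leaving
     * src1 * wgt0 + src2 * wgt1 + tmp_lvl_shift per lane in the loop.        */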

    /* i4_tmp1_tx and i4_tmp2_tx are used to process two rows at a time.      */
    /* The height loop is unrolled, so two rows are processed per iteration,  */
    /* and the stores are likewise done for two rows.                         */
    /* vcombine_u16 is used because vqmovn_u16 needs a uint16x8_t input;      */
    /* after vqmovun_s32 we only have a uint16x4_t value.                     */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vmull_n_s16(pi2_src1_val1, (int16_t)wgt0);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vmull_n_s16(pi2_src2_val1, (int16_t)wgt1);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vmull_n_s16(pi2_src1_val2, (int16_t)wgt0);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);

            i4_tmp2_t2 = vmull_n_s16(pi2_src2_val2, (int16_t)wgt1);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src1 += 2 * src_strd1 - wd;
        pi2_src2 += 2 * src_strd2 - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_BI

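/* A minimal scalar sketch (reference only, not compiled) of the per-pixel
 * bi-weighted computation above; weighted_pred_bi_pixel is a hypothetical
 * helper, and the clip to [0, 255] assumes an 8-bit destination.             */
#if 0
static UWORD8 weighted_pred_bi_pixel(WORD16 src1, WORD16 src2,
                                     WORD32 wgt0, WORD32 wgt1,
                                     WORD32 off0, WORD32 off1, WORD32 shift,
                                     WORD32 lvl_shift1, WORD32 lvl_shift2)
{
    WORD32 val = (src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1;
    val += (off0 + off1 + 1) << (shift - 1);
    val >>= shift;
    return (UWORD8)(val < 0 ? 0 : (val > 255 ? 255 : val));
}
#endif
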
/**
*******************************************************************************
*
* @brief
*  Does chroma bi-weighted prediction on the interleaved CbCr arrays pointed
* to by pi2_src1 and pi2_src2 and stores the result at the location pointed
* to by pu1_dst.
*  Assumptions: the function is optimized assuming the width is a multiple
* of 2 and the height is a multiple of 2.
*
* @par Description:
*  dst = ((src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1 +
* ((off0 + off1 + 1) << (shift - 1))) >> shift, applied with the Cb and Cr
* weights/offsets on alternate samples
*
* @param[in] pi2_src1
*  Pointer to source 1 (interleaved CbCr)
*
* @param[in] pi2_src2
*  Pointer to source 2 (interleaved CbCr)
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd1
*  Stride of source 1
*
* @param[in] src_strd2
*  Stride of source 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0_cb
*  Cb weight to be multiplied with source 1
*
* @param[in] wgt0_cr
*  Cr weight to be multiplied with source 1
*
* @param[in] off0_cb
*  Cb offset 0
*
* @param[in] off0_cr
*  Cr offset 0
*
* @param[in] wgt1_cb
*  Cb weight to be multiplied with source 2
*
* @param[in] wgt1_cr
*  Cr weight to be multiplied with source 2
*
* @param[in] off1_cb
*  Cb offset 1
*
* @param[in] off1_cr
*  Cr offset 1
*
* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift1
*  Added to source 1 before weighting
*
* @param[in] lvl_shift2
*  Added to source 2 before weighting
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source (in CbCr pairs)
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_chroma_bi_neonintr(WORD16 *pi2_src1,
                                            WORD16 *pi2_src2,
                                            UWORD8 *pu1_dst,
                                            WORD32 src_strd1,
                                            WORD32 src_strd2,
                                            WORD32 dst_strd,
                                            WORD32 wgt0_cb,
                                            WORD32 wgt0_cr,
                                            WORD32 off0_cb,
                                            WORD32 off0_cr,
                                            WORD32 wgt1_cb,
                                            WORD32 wgt1_cr,
                                            WORD32 off1_cb,
                                            WORD32 off1_cr,
                                            WORD32 shift,
                                            WORD32 lvl_shift1,
                                            WORD32 lvl_shift2,
                                            WORD32 ht,
                                            WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t_u, tmp_lvl_shift_t_v;
    int32x4x2_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    int16x4_t tmp_wgt0_u, tmp_wgt0_v, tmp_wgt1_u, tmp_wgt1_v;
    int16x4x2_t wgt0, wgt1;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;

    WORD32 tmp_lvl_shift = (lvl_shift1 * wgt0_cb) + (lvl_shift2 * wgt1_cb);
    tmp_lvl_shift += ((off0_cb + off1_cb + 1) << (shift - 1));
    tmp_lvl_shift_t_u = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift = (lvl_shift1 * wgt0_cr) + (lvl_shift2 * wgt1_cr);
    tmp_lvl_shift += ((off0_cr + off1_cr + 1) << (shift - 1));
    tmp_lvl_shift_t_v = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift_t = vzipq_s32(tmp_lvl_shift_t_u, tmp_lvl_shift_t_v);

    tmp_shift_t = vmovq_n_s32(tmp_shift);

    tmp_wgt0_u = vdup_n_s16(wgt0_cb);
    tmp_wgt0_v = vdup_n_s16(wgt0_cr);
    wgt0 = vzip_s16(tmp_wgt0_u, tmp_wgt0_v);
    tmp_wgt1_u = vdup_n_s16(wgt1_cb);
    tmp_wgt1_v = vdup_n_s16(wgt1_cr);
    wgt1 = vzip_s16(tmp_wgt1_u, tmp_wgt1_v);
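
    /* As in the chroma uni path, wgt0.val[0] and wgt1.val[0] now hold the
     * interleaved {cb, cr, cb, cr} weights, and tmp_lvl_shift_t.val[0] the
     * matching per-component constants.                                      */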

    /* i4_tmp1_tx and i4_tmp2_tx are used to process two rows at a time.      */
    /* The height loop is unrolled, so two rows are processed per iteration,  */
    /* and the stores are likewise done for two rows.                         */
    /* vcombine_u16 is used because vqmovn_u16 needs a uint16x8_t input;      */
    /* after vqmovun_s32 we only have a uint16x4_t value.                     */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vmull_s16(pi2_src1_val1, wgt0.val[0]);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vmull_s16(pi2_src2_val1, wgt1.val[0]);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vmull_s16(pi2_src1_val2, wgt0.val[0]);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t.val[0]);

            i4_tmp2_t2 = vmull_s16(pi2_src2_val2, wgt1.val[0]);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t.val[0]);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src1 += 2 * src_strd1 - 2 * wd;
        pi2_src2 += 2 * src_strd2 - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_BI

/**
*******************************************************************************
*
* @brief
*  Does default bi-weighted prediction on the arrays pointed to by pi2_src1
* and pi2_src2 and stores the result at the location pointed to by pu1_dst.
*  Assumptions: the function is optimized assuming the width is a multiple
* of 4 and the height is a multiple of 2.
*
* @par Description:
*  dst = ((src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)))
* >> shift, where shift = 15 - bit depth
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd1
*  Stride of source 1
*
* @param[in] src_strd2
*  Stride of source 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] lvl_shift1
*  Added to source 1 before shifting
*
* @param[in] lvl_shift2
*  Added to source 2 before shifting
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_bi_default_neonintr(WORD16 *pi2_src1,
                                             WORD16 *pi2_src2,
                                             UWORD8 *pu1_dst,
                                             WORD32 src_strd1,
                                             WORD32 src_strd2,
                                             WORD32 dst_strd,
                                             WORD32 lvl_shift1,
                                             WORD32 lvl_shift2,
                                             WORD32 ht,
                                             WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;
    WORD32 shift;
    WORD32 tmp_shift;
    WORD32 tmp_lvl_shift;
    int16x4_t lvl_shift1_t;
    int16x4_t lvl_shift2_t;

    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    tmp_shift = 0 - shift;
    tmp_lvl_shift = 1 << (shift - 1);
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1);
    lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2);
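
    /* Default bi-prediction is a rounded average: with shift = 15 - bit depth
     * (SHIFT_14_MINUS_BIT_DEPTH + 1), for 8-bit content shift is 7 and the
     * rounding term is 64, i.e. dst = (src1 + src2 + lvl_shifts + 64) >> 7.  */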

    /* i4_tmp1_tx and i4_tmp2_tx are used to process two rows at a time.      */
    /* The height loop is unrolled, so two rows are processed per iteration,  */
    /* and the stores are likewise done for two rows.                         */
    /* vcombine_u16 is used because vqmovn_u16 needs a uint16x8_t input;      */
    /* after vqmovun_s32 we only have a uint16x4_t value.                     */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vaddl_s16(pi2_src1_val1, lvl_shift1_t);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vaddl_s16(pi2_src2_val1, lvl_shift2_t);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vaddl_s16(pi2_src1_val2, lvl_shift1_t);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);

            i4_tmp2_t2 = vaddl_s16(pi2_src2_val2, lvl_shift2_t);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src1 += 2 * src_strd1 - wd;
        pi2_src2 += 2 * src_strd2 - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_BI_DEFAULT

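/* A minimal scalar sketch (reference only, not compiled) of the per-pixel
 * default bi-prediction above; weighted_pred_bi_default_pixel is a
 * hypothetical helper, and the clip to [0, 255] assumes 8-bit content.       */
#if 0
static UWORD8 weighted_pred_bi_default_pixel(WORD16 src1, WORD16 src2,
                                             WORD32 lvl_shift1,
                                             WORD32 lvl_shift2)
{
    WORD32 shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    WORD32 val = (src1 + lvl_shift1) + (src2 + lvl_shift2);
    val = (val + (1 << (shift - 1))) >> shift;
    return (UWORD8)(val < 0 ? 0 : (val > 255 ? 255 : val));
}
#endif
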
/**
*******************************************************************************
*
* @brief
*  Does chroma default bi-weighted prediction on the interleaved CbCr arrays
* pointed to by pi2_src1 and pi2_src2 and stores the result at the location
* pointed to by pu1_dst.
*  Assumptions: the function is optimized assuming the width is a multiple
* of 2 and the height is a multiple of 2.
*
* @par Description:
*  dst = ((src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)))
* >> shift, where shift = 15 - bit depth
*
* @param[in] pi2_src1
*  Pointer to source 1 (interleaved CbCr)
*
* @param[in] pi2_src2
*  Pointer to source 2 (interleaved CbCr)
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd1
*  Stride of source 1
*
* @param[in] src_strd2
*  Stride of source 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] lvl_shift1
*  Added to source 1 before shifting
*
* @param[in] lvl_shift2
*  Added to source 2 before shifting
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source (in CbCr pairs)
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_chroma_bi_default_neonintr(WORD16 *pi2_src1,
                                                    WORD16 *pi2_src2,
                                                    UWORD8 *pu1_dst,
                                                    WORD32 src_strd1,
                                                    WORD32 src_strd2,
                                                    WORD32 dst_strd,
                                                    WORD32 lvl_shift1,
                                                    WORD32 lvl_shift2,
                                                    WORD32 ht,
                                                    WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;
    WORD32 shift;
    WORD32 tmp_shift;
    WORD32 tmp_lvl_shift;
    int16x4_t lvl_shift1_t;
    int16x4_t lvl_shift2_t;
    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    tmp_shift = 0 - shift;
    tmp_lvl_shift = 1 << (shift - 1);
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1);
    lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2);

    /* i4_tmp1_tx and i4_tmp2_tx are used to process two rows at a time.      */
    /* The height loop is unrolled, so two rows are processed per iteration,  */
    /* and the stores are likewise done for two rows.                         */
    /* vcombine_u16 is used because vqmovn_u16 needs a uint16x8_t input;      */
    /* after vqmovun_s32 we only have a uint16x4_t value.                     */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vaddl_s16(pi2_src1_val1, lvl_shift1_t);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vaddl_s16(pi2_src2_val1, lvl_shift2_t);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vaddl_s16(pi2_src1_val2, lvl_shift1_t);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);

            i4_tmp2_t2 = vaddl_s16(pi2_src2_val2, lvl_shift2_t);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src1 += 2 * src_strd1 - 2 * wd;
        pi2_src2 += 2 * src_strd2 - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_BI_DEFAULT