///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* //file
//*  ihevc_weighted_pred_bi.s
//*
//* //brief
//*  contains function definitions for weighted prediction used in inter
//* prediction
//*
//* //author
//*  parthiban v
//*
//* //par list of functions:
//*  - ihevc_weighted_pred_bi()
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* //brief
//*  does bi-weighted prediction on the arrays pointed to by pi2_src1 and
//* pi2_src2 and stores the result at the location pointed to by pu1_dst.
//* assumptions : the function is optimized assuming that width and height
//* are multiples of 2.
//*
//* //par description:
//*  dst = ( (src1 + lvl_shift1)*wgt0 +  (src2 + lvl_shift2)*wgt1 +  (off0 +
//* off1 + 1) << (shift - 1) ) >> shift
//*
//* //param[in] pi2_src1
//*  pointer to source 1
//*
//* //param[in] pi2_src2
//*  pointer to source 2
//*
//* //param[out] pu1_dst
//*  pointer to destination
//*
//* //param[in] src_strd1
//*  source stride 1
//*
//* //param[in] src_strd2
//*  source stride 2
//*
//* //param[in] dst_strd
//*  destination stride
//*
//* //param[in] wgt0
//*  weight to be multiplied to source 1
//*
//* //param[in] off0
//*  offset 0
//*
//* //param[in] wgt1
//*  weight to be multiplied to source 2
//*
//* //param[in] off1
//*  offset 1
//*
//* //param[in] shift
//*  (14 bit depth) + log2_weight_denominator
//*
//* //param[in] lvl_shift1
//*  added before shift and offset
//*
//* //param[in] lvl_shift2
//*  added before shift and offset
//*
//* //param[in] ht
//*  height of the source
//*
//* //param[in] wd
//*  width of the source
//*
//* //returns
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/

//void ihevc_weighted_pred_bi(word16 *pi2_src1,
//                            word16 *pi2_src2,
//                            uword8 *pu1_dst,
//                            word32 src_strd1,
//                            word32 src_strd2,
//                            word32 dst_strd,
//                            word32 wgt0,
//                            word32 off0,
//                            word32 wgt1,
//                            word32 off1,
//                            word32 shift,
//                            word32 lvl_shift1,
//                            word32 lvl_shift2,
//                            word32 ht,
//                            word32 wd)

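//**************c reference (for clarity)***************************************
//    a minimal, hypothetical C sketch of what this routine computes; it is not
//    part of the library API. it assumes <stdint.h> fixed width types and that
//    wd and ht are multiples of 4, matching the 4-sample granularity of the
//    core loop below:
//
//    static void weighted_pred_bi_ref(const int16_t *pi2_src1, const int16_t *pi2_src2,
//                                     uint8_t *pu1_dst, int src_strd1, int src_strd2,
//                                     int dst_strd, int wgt0, int off0, int wgt1,
//                                     int off1, int shift, int lvl_shift1,
//                                     int lvl_shift2, int ht, int wd)
//    {
//        for(int row = 0; row < ht; row++)
//        {
//            for(int col = 0; col < wd; col++)
//            {
//                int32_t tmp = (pi2_src1[col] + lvl_shift1) * wgt0
//                            + (pi2_src2[col] + lvl_shift2) * wgt1
//                            + ((off0 + off1 + 1) << (shift - 1));
//                tmp >>= shift;
//                // clamp to the 8 bit output range, as the sqxtun/uqxtn pair does below
//                pu1_dst[col] = (uint8_t)(tmp < 0 ? 0 : (tmp > 255 ? 255 : tmp));
//            }
//            pi2_src1 += src_strd1;
//            pi2_src2 += src_strd2;
//            pu1_dst  += dst_strd;
//        }
//    }
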
//**************variables vs registers*****************************************
//    x0 => *pi2_src1
//    x1 => *pi2_src2
//    x2 => *pu1_dst
//    x3 =>  src_strd1
//    x4 =>  src_strd2
//    x5 =>  dst_strd
//    x6 =>  wgt0
//    x7 =>  off0
//    x8 =>  wgt1
//    x9 =>  off1
//    x10 =>  shift
//    x11 =>  lvl_shift1
//    x12 =>  lvl_shift2
//    x14 =>  ht
//    x7  =>  wd (x7 is reused for wd once off0 has been folded into tmp_lvl_shift)

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_weighted_pred_bi_av8

.type ihevc_weighted_pred_bi_av8, %function

ihevc_weighted_pred_bi_av8:

    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments

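    // the first eight arguments arrive in x0-x7 (pi2_src1 .. off0); the remaining
    // ones (wgt1, off1, shift, lvl_shift1, lvl_shift2, ht, wd) are fetched from
    // the caller's stack as per the AAPCS64 calling convention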
    ldr         w8,[sp,#0]
    ldr         w9,[sp,#8]
    ldr         w10,[sp,#16]
    ldr         w11,[sp,#24]
    ldr         w12,[sp,#32]
    ldr         w13,[sp,#40]
    ldr         w14,[sp,#48]

    sxtw        x8,w8
    sxtw        x9,w9
    sxtw        x10,w10
    sxtw        x11,w11
    sxtw        x12,w12


    stp         x19, x20,[sp,#-16]!
    stp         x21, x22,[sp,#-16]!
    stp         x23, x24,[sp,#-16]!
    stp         x25, x26,[sp,#-16]!

    mov         x15,x4 // src_strd2 40
    mov         x16,x5 // dst_strd 44
    mov         x17,x6 // wgt0 48
    mov         x19,x7 // off0 52
    mov         x20,x8 // wgt1 56
    mov         x21,x9 // off1 60
    mov         x22,x10 // shift 64
    mov         x23,x11 // lvl_shift1 68
    mov         x24,x12 // lvl_shift2 72
    mov         x25,x13 // ht 76
    mov         x26,x14 // wd 80

    mov         x6,x17                      //load wgt0
    mov         x11,x23                     //load lvl_shift1
    mov         x12,x24                     //load lvl_shift2
    mov         v7.h[0],w6                  //moved for scalar multiplication
    mul         x4, x11 , x6                //lvl_shift1 * wgt0
    mov         x8,x20                      //load wgt1
    mov         x7,x19                      //load off0
    mov         v7.h[1],w8                  //moved for scalar multiplication
    madd        x4,x12,x8,x4                //(lvl_shift1 * wgt0) + (lvl_shift2 * wgt1)
    mov         x9,x21                      //load off1
    add         x5,x7,x9                    //off0 + off1
    mov         x10,x22                     //load shift
    add         x5,x5,#1                    //off0 + off1 + 1
    sub         x14,x10,#1                  //shift - 1
    mov         x7,x26                      //load wd
    lsl         x5,x5,x14                   //((off0 + off1 + 1) << (shift - 1))
    dup         v28.4s,w10                  //vmovq_n_s32(0-shift)
    add         x4,x4,x5                    //tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1))
    dup         v30.4s,w4                   //vmovq_n_s32(tmp_lvl_shift)
    neg         v28.4s, v28.4s
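    // at this point v7.h[0] = wgt0, v7.h[1] = wgt1,
    // v30.4s = tmp_lvl_shift = lvl_shift1*wgt0 + lvl_shift2*wgt1 + ((off0 + off1 + 1) << (shift - 1)),
    // and v28.4s = -shift, so the sshl instructions below perform the final >> shift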
    mov         x4,x15                      //load src_strd2
    lsl         x9,x7,#1                    //2*wd (row width in bytes of the 16 bit sources)
    mov         x5,x16                      //load dst_strd
    lsl         x3,x3,#1                    //2*src_strd1 (byte stride of the 16 bit source 1)
    mov         x14,x25                     //load ht
    lsl         x4,x4,#1                    //2*src_strd2 (byte stride of the 16 bit source 2)

    cmp         x14,#0                      //check ht == 0
    beq         end_loops                   //if equal, then end the function

outer_loop:
    cmp         x7,#0                       //check wd == 0
    beq         end_loops                   //if equal, then end the function

core_loop:
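    // each pass of core_loop produces a 4x4 block: four samples from each of
    // four consecutive rows of both sources are weighted, offset, shifted and
    // saturated down to 8 bits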
    add         x6,x0,x3                    //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi2_src1 is a 16 bit pointer)
    add         x8,x1,x4                    //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
    ld1         {v0.4h},[x0],#8             //load and increment the pi2_src1
    add         x10,x2,x5                   //pu1_dst_tmp = pu1_dst + dst_strd
    ld1         {v1.4h},[x1],#8             //load and increment the pi2_src2
    smull       v4.4s, v0.4h, v7.h[0]       //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0)
    ld1         {v2.4h},[x6],x3             //load and increment the pi2_src_tmp1 ii iteration
    smull       v5.4s, v1.4h, v7.h[1]       //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1)
    ld1         {v3.4h},[x8],x4             //load and increment the pi2_src_tmp2 ii iteration
    add         v4.4s,  v4.4s ,  v5.4s      //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2)

    ld1         {v0.4h},[x6],x3             //load and increment the pi2_src1 iii iteration
    smull       v6.4s, v2.4h, v7.h[0]       //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) ii iteration

    ld1         {v1.4h},[x8],x4             //load and increment the pi2_src2 iii iteration
    add         v4.4s,  v4.4s ,  v30.4s     //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
    smull       v19.4s, v0.4h, v7.h[0]      //vmull_n_s16(pi2_src1_val1, (int16_t) wgt0) iii iteration

    ld1         {v2.4h},[x6],x3             //load and increment the pi2_src_tmp1 iv iteration
    smull       v17.4s, v3.4h, v7.h[1]      //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) ii iteration
    sshl        v4.4s,v4.4s,v28.4s          //vshlq_s32(i4_tmp1_t1, tmp_shift_t)

    ld1         {v3.4h},[x8],x4             //load and increment the pi2_src_tmp2 iv iteration
    add         v6.4s,  v6.4s ,  v17.4s     //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) ii iteration

    sqxtun      v4.4h, v4.4s                //vqmovun_s32(sto_res_tmp1)
    smull       v16.4s, v1.4h, v7.h[1]      //vmull_n_s16(pi2_src2_val1, (int16_t) wgt1) iii iteration

    add         v6.4s,  v6.4s ,  v30.4s     //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) ii iteration
    //mov v5, v4                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2)
    add         v19.4s,  v19.4s ,  v16.4s   //vaddq_s32(i4_tmp1_t1, i4_tmp1_t2) iii iteration

    sshl        v6.4s,v6.4s,v28.4s
    //vshl.s32    q5,q5,q14                    //vshlq_s32(i4_tmp2_t1, tmp_shift_t) ii iteration
    smull       v18.4s, v2.4h, v7.h[0]      //vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) iv iteration
    uqxtn       v4.8b,v4.8h
    //vqmovn.u16    d4,q2                        //vqmovn_u16(sto_res_tmp3)
    add         v19.4s,  v19.4s ,  v30.4s   //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration

    sqxtun      v6.4h, v6.4s                //vqmovun_s32(sto_res_tmp1) ii iteration
    smull       v20.4s, v3.4h, v7.h[1]      //vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) iv iteration

    sshl        v19.4s,v19.4s,v28.4s
    //vshl.s32    q7,q7,q14                    //vshlq_s32(i4_tmp1_t1, tmp_shift_t) iii iteration
    //mov v11, v10                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration

    add         v18.4s,  v18.4s ,  v20.4s   //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
    sqxtun      v19.4h, v19.4s              //vqmovun_s32(sto_res_tmp1) iii iteration

    add         v18.4s,  v18.4s ,  v30.4s   //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteration
    st1         {v4.s}[0],[x2],#4           //store pu1_dst i iteration

    uqxtn       v6.8b,v6.8h
    //vqmovn.u16    d10,q5                        //vqmovn_u16(sto_res_tmp3) ii iteration
    sshl        v18.4s,v18.4s,v28.4s
    //vshl.s32    q9,q9,q14                    //vshlq_s32(i4_tmp2_t1, tmp_shift_t) iv iteration
    st1         {v6.s}[0],[x10],x5          //store pu1_dst ii iteration


    //mov v15, v14                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration
    uqxtn       v19.8b,v19.8h
    //vqmovn.u16    d14,q7                        //vqmovn_u16(sto_res_tmp3) iii iteration
    sqxtun      v18.4h, v18.4s              //vqmovun_s32(sto_res_tmp1) iv iteration
    //mov v19, v18                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2)
    st1         {v19.s}[0],[x10],x5         //store pu1_dst iii iteration
    uqxtn       v18.8b,v18.8h
    //vqmovn.u16    d18,q9                        //vqmovn_u16(sto_res_tmp3) iv iteration
    subs        x7,x7,#4                    //decrement wd by 4 and check for 0
    st1         {v18.s}[0],[x10],x5         //store pu1_dst iv iteration

    bgt         core_loop                   //if greater than 0 repeat the core loop again

end_core_loop:
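    // advance all three pointers to the start of the next 4-row block and
    // restore the column counter (x9 holds 2*wd throughout the core loop)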
    sub         x20,x9,x3,lsl #2            //2*src_strd1 - wd
    neg         x11, x20
    subs        x14,x14,#4                  //decrement the ht by 4
    sub         x20,x9,x4,lsl #2            //2*src_strd2 - wd
    neg         x12, x20
    add         x0,x0,x11                   //pi2_src1 + 4*src_strd1 - 2*wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement)
    asr         x7,x9,#1
    add         x1,x1,x12                   //pi2_src2 + 4*src_strd2 - 2*wd
    sub         x20,x7,x5,lsl #2            //2*dst_strd - wd
    neg         x10, x20
    add         x2,x2,x10                   //pu1_dst + dst_strd - wd
    bgt         core_loop                   //if ht is greater than 0 repeat from core_loop

end_loops:
    // ldmfd sp!,{x4-x12,x15}              //reload the registers from sp
    ldp         x25, x26,[sp],#16
    ldp         x23, x24,[sp],#16
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16

    ret