///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_weighted_pred_uni.s
//*
//* @brief
//*  contains function definitions for weighted prediction used in inter
//* prediction
//*
//* @author
//*  parthiban v
//*
//* @par list of functions:
//*  - ihevc_weighted_pred_uni()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/

///**
//*******************************************************************************
//*
//* @brief
//*  does uni-weighted prediction on the array pointed to by pi2_src and
//* stores the result at the location pointed to by pu1_dst. assumptions: the
//* function is optimized considering the fact that width and height are
//* multiples of 2.
//*
//* @par description:
//*  dst = (((src + lvl_shift) * wgt0 + (1 << (shift - 1))) >> shift) + off0
//*
//* @param[in] pi2_src
//*  pointer to the source
//*
//* @param[out] pu1_dst
//*  pointer to the destination
//*
//* @param[in] src_strd
//*  source stride
//*
//* @param[in] dst_strd
//*  destination stride
//*
//* @param[in] wgt0
//*  weight to be multiplied with the source
//*
//* @param[in] off0
//*  offset to be added after rounding and shifting
//*
//* @param[in] shift
//*  (14 - bit depth) + log2_weight_denominator
//*
//* @param[in] lvl_shift
//*  added before shift and offset
//*
//* @param[in] ht
//*  height of the source
//*
//* @param[in] wd
//*  width of the source
//*
//* @returns
//*  none
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/

//void ihevc_weighted_pred_uni(word16 *pi2_src,
//                             uword8 *pu1_dst,
//                             word32 src_strd,
//                             word32 dst_strd,
//                             word32 wgt0,
//                             word32 off0,
//                             word32 shift,
//                             word32 lvl_shift,
//                             word32 ht,
//                             word32 wd)
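
//a minimal scalar c sketch of the per-pixel operation this routine
//vectorizes is given below for reference only; it assumes CLIP_U8() clamps
//to [0, 255] (not defined in this file) and that src_strd/dst_strd count
//16-bit and 8-bit elements respectively
//
//    for(row = 0; row < ht; row++)
//    {
//        for(col = 0; col < wd; col++)
//        {
//            i4_tmp = (pi2_src[col] + lvl_shift) * wgt0;
//            i4_tmp += (1 << (shift - 1));
//            i4_tmp = (i4_tmp >> shift) + off0;
//            pu1_dst[col] = CLIP_U8(i4_tmp);
//        }
//        pi2_src += src_strd;
//        pu1_dst += dst_strd;
//    }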

//**************variables vs registers*****************************************
//    x0 => *pi2_src
//    x1 => *pu1_dst
//    x2 =>  src_strd
//    x3 =>  dst_strd
//    x4 =>  wgt0
//    x5 =>  off0
//    x6 =>  shift
//    x7 =>  lvl_shift
//    x8 =>  ht
//    x9 =>  wd

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_weighted_pred_uni_av8

.type ihevc_weighted_pred_uni_av8, %function

ihevc_weighted_pred_uni_av8:

    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments

    ldr         w8,[sp,#0]                  //load ht from the stack
    ldr         w9,[sp,#8]                  //load wd from the stack

    stp         x19, x20,[sp,#-16]!         //save the callee-saved registers used below
    stp         x21, x22,[sp,#-16]!

    mov         x15,x4                      //save wgt0
    mov         x16,x5                      //save off0
    mov         x17,x6                      //save shift
    mov         x19,x7                      //save lvl_shift
    mov         x20,x8                      //save ht
    mov         x21,x9                      //save wd

    mov         x4,x15                      //load wgt0
    mov         x7,x19                      //load lvl_shift
    mov         x11,#1                      //constant 1 used for (1 << (shift - 1))
    mov         x5,x16                      //load off0
    mul         x10, x7, x4                 //lvl_shift * wgt0
    mov         x6,x17                      //load shift
    mov         x8,x20                      //load ht
    lsl         x22,x5,x6                   //off0 << shift
    add         x10,x10,x22                 //lvl_shift * wgt0 + (off0 << shift)
    mov         x9,x21                      //load wd
    sub         x12,x6,#1                   //shift - 1
    mov         v0.h[0], w4                 //moved for scalar multiplication
    lsl         x2,x2,#1                    //src_strd in bytes (pi2_src is a 16-bit pointer)
    dup         v28.4s,w6                   //vmovq_n_s32(tmp_shift)
    lsl         x22,x11,x12                 //1 << (shift - 1)
    add         x10,x10,x22                 //tmp_lvl_shift += (1 << (shift - 1))
    dup         v30.4s,w10                  //vmovq_n_s32(tmp_lvl_shift)
    neg         v28.4s, v28.4s              //negate the shift so sshl performs a right shift
    lsl         x4,x9,#1                    //wd in bytes (x4 = 2*wd)
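
    //the scalar constants computed above fold lvl_shift, off0 and the
    //rounding term into a single additive value, so the core loop only needs
    //a multiply, an add, a shift and a saturating narrow per pixel. a hedged
    //c sketch of the equivalent folding (names are illustrative only):
    //
    //    tmp_lvl_shift = lvl_shift * wgt0 + (off0 << shift) + (1 << (shift - 1));
    //    tmp_shift     = -shift;    //used with sshl, i.e. a left shift by a negative amount
    //    //per pixel: pu1_dst[col] = clip_u8((pi2_src[col] * wgt0 + tmp_lvl_shift) >> shift)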

    cmp         x8,#0                       //check ht == 0
    beq         end_loops                   //if equal, then end the function

outer_loop:
    cmp         x9,#0                       //check wd == 0
    beq         end_loops                   //if equal, then end the function

core_loop:
    add         x5,x0,x2                    //pi2_src_tmp = pi2_src + src_strd (x2 is already in bytes since pi2_src is a 16-bit pointer)
    add         x6,x1,x3                    //pu1_dst_tmp = pu1_dst + dst_strd
    ld1         {v1.4h},[x0],#8             //load and increment the pi2_src
    ld1         {v2.4h},[x5],x2             //load and increment the pi2_src_tmp ii iteration
    smull       v4.4s, v1.4h, v0.h[0]       //vmull_n_s16(pi2_src_val1, (int16_t) wgt0)

    add         v4.4s,  v4.4s ,  v30.4s     //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t)
    ld1         {v3.4h},[x5],x2             //load and increment the pi2_src_tmp iii iteration

    smull       v6.4s, v2.4h, v0.h[0]       //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) ii iteration
    ld1         {v5.4h},[x5],x2             //load and increment the pi2_src_tmp iv iteration

    sshl        v4.4s,v4.4s,v28.4s
    //vshl.s32    q2,q2,q14                    //vshlq_s32(i4_tmp1_t, tmp_shift_t)
    add         v6.4s,  v6.4s ,  v30.4s     //vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) ii iteration

    smull       v7.4s, v3.4h, v0.h[0]       //vmull_n_s16(pi2_src_val1, (int16_t) wgt0) iii iteration
    sqxtun      v4.4h, v4.4s                //vqmovun_s32(sto_res_tmp1)

    add         v7.4s,  v7.4s ,  v30.4s     //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t) iii iteration
    //mov v5, v4                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2)

    sshl        v6.4s,v6.4s,v28.4s
    //vshl.s32    q3,q3,q14                    //vshlq_s32(i4_tmp2_t, tmp_shift_t) ii iteration

    smull       v16.4s, v5.4h, v0.h[0]      //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) iv iteration
    uqxtn       v4.8b,  v4.8h               //vqmovn_u16(sto_res_tmp3)

    sshl        v7.4s,v7.4s,v28.4s
    //vshl.s32    q5,q5,q14                    //vshlq_s32(i4_tmp1_t, tmp_shift_t) iii iteration
    sqxtun      v6.4h, v6.4s                //vqmovun_s32(sto_res_tmp1) ii iteration

    add         v16.4s,  v16.4s ,  v30.4s   //vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) iv iteration
    //mov v7, v6                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration

    sqxtun      v7.4h, v7.4s                //vqmovun_s32(sto_res_tmp1) iii iteration

    sshl        v16.4s,v16.4s,v28.4s
    //vshl.s32    q6,q6,q14                    //vshlq_s32(i4_tmp2_t, tmp_shift_t) iv iteration
    st1         {v4.s}[0],[x1],#4           //store pu1_dst i iteration
    //mov v11, v10                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration

    uqxtn       v6.8b,  v6.8h               //vqmovn_u16(sto_res_tmp3) ii iteration
    st1         {v6.s}[0],[x6],x3           //store pu1_dst ii iteration

    uqxtn       v7.8b,  v7.8h               //vqmovn_u16(sto_res_tmp3) iii iteration
    sqxtun      v16.4h, v16.4s              //vqmovun_s32(sto_res_tmp1) iv iteration

    //mov v13, v12                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iv iteration
    st1         {v7.s}[0],[x6],x3           //store pu1_dst iii iteration
    uqxtn       v16.8b,  v16.8h             //vqmovn_u16(sto_res_tmp3) iv iteration

    subs        x9,x9,#4                    //decrement wd by 4 and check for 0
    st1         {v16.s}[0],[x6],x3          //store pu1_dst iv iteration
    bgt         core_loop                   //if greater than 0 repeat the core loop again
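
    //each core_loop iteration above maps to the following neon-intrinsics
    //sketch for one group of four pixels (rows ii-iv are handled the same
    //way from pi2_src_tmp); this is an illustrative translation assuming
    //tmp_shift_t holds -shift and tmp_lvl_shift_t holds the folded constant,
    //not the reference c implementation:
    //
    //    i4_tmp1_t   = vmull_n_s16(vld1_s16(pi2_src), (int16_t)wgt0);
    //    i4_tmp1_t   = vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t);
    //    i4_tmp1_t   = vshlq_s32(i4_tmp1_t, tmp_shift_t);
    //    sto_res_tmp = vqmovun_s32(i4_tmp1_t);
    //    sto_res     = vqmovn_u16(vcombine_u16(sto_res_tmp, sto_res_tmp));
    //    vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);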

end_core_loop:
    sub         x22,x4,x2,lsl #2            //2*wd - 4*(2*src_strd) in bytes (x2 holds 2*src_strd)
    neg         x11, x22                    //4*(2*src_strd) - 2*wd in bytes
    subs        x8,x8,#4                    //decrement the ht by 4
    add         x0,x0,x11                   //pi2_src advances to the start of the next 4-row block (byte offsets are doubled since pi2_src is a 16-bit pointer)
    asr         x9,x4,#1                    //reload wd
    sub         x22,x9,x3,lsl #2            //wd - 4*dst_strd
    neg         x12, x22                    //4*dst_strd - wd
    add         x1,x1,x12                   //pu1_dst advances to the start of the next 4-row block
    bgt         core_loop                   //if ht is greater than 0, process the next 4 rows
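
    //the pointer adjustments above amount to the following, as a hedged c
    //sketch in element units (x0 has already walked wd elements along the
    //first source row and x1 wd bytes along the first destination row):
    //
    //    pi2_src += 4 * src_strd - wd;    //start of the next 4 source rows
    //    pu1_dst += 4 * dst_strd - wd;    //start of the next 4 destination rows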

end_loops:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x21, x22,[sp],#16           //restore the callee-saved registers
    ldp         x19, x20,[sp],#16

    ret
