Home | History | Annotate | Download | only in arm
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @*******************************************************************************
     20 @* @file
     21 @*  ihevc_weighted_pred_uni.s
     22 @*
     23 @* @brief
     24 @*  contains function definitions for weighted prediction used in inter
     25 @* prediction
     26 @*
     27 @* @author
     28 @*  parthiban v
     29 @*
     30 @* @par list of functions:
     31 @*  - ihevc_weighted_pred_uni()
     32 @*
     33 @* @remarks
     34 @*  none
     35 @*
     36 @*******************************************************************************
     37 @*/
     38 
     39 @/**
     40 @*******************************************************************************
     41 @*
     42 @* @brief
     43 @*  does uni-weighted prediction on the array pointed to by pi2_src and stores
     44 @* the result at the location pointed to by pu1_dst. assumption: width and
     45 @* height are multiples of 4 (the loops below process 4x4 blocks).
     46 @*
     47 @* @par description:
     48 @*  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift )
     49 @*        + off0, saturated to the unsigned 8 bit range
     50 @*
     51 @* @param[in] pi2_src
     52 @*  pointer to the source
     53 @*
     54 @* @param[out] pu1_dst
     55 @*  pointer to the destination
     56 @*
     57 @* @param[in] src_strd
     58 @*  source stride
     59 @*
     60 @* @param[in] dst_strd
     61 @*  destination stride
     62 @*
     63 @* @param[in] wgt0
     64 @*  weight to be multiplied to the source
     65 @*
     66 @* @param[in] off0
     67 @*  offset to be added after rounding and shifting
     68 @*
     69 @*
     70 @*
     71 @*
     72 @* @param[in] shift
     73 @*  (14 bit depth) + log2_weight_denominator
     74 @*
     75 @* @param[in] lvl_shift
     76 @*  added before shift and offset
     77 @*
     78 @* @param[in] ht
     79 @*  height of the source
     80 @*
     81 @* @param[in] wd
     82 @*  width of the source
     83 @*
     84 @* @returns
     85 @*
     86 @* @remarks
     87 @*  none
     88 @*
     89 @*******************************************************************************
     90 @*/
     91 
     92 @void ihevc_weighted_pred_uni(word16 *pi2_src,
     93 @                             uword8 *pu1_dst,
     94 @                             word32 src_strd,
     95 @                             word32 dst_strd,
     96 @                             word32 wgt0,
     97 @                             word32 off0,
     98 @                             word32 shift,
     99 @                             word32 lvl_shift,
    100 @                             word32 ht,
    101 @                             word32 wd)
    102 
    103 @**************variables vs registers*****************************************
    104 @   r0 => *pi2_src
    105 @   r1 => *pu1_dst
    106 @   r2 =>  src_strd
    107 @   r3 =>  dst_strd
    108 @   r4 =>  wgt0
    109 @   r5 =>  off0
    110 @   r6 =>  shift
    111 @   r7 =>  lvl_shift
    112 @   r8 =>   ht
    113 @   r9  =>  wd
    114 
    115 .text
    116 .align 4
    117 @ ihevc_weighted_pred_uni_a9q: uni-weighted prediction, one 4x4 block per pass.
    118 @ in  : r0 = pi2_src, r1 = pu1_dst, r2 = src_strd, r3 = dst_strd; wgt0, off0,
    119 @       shift, lvl_shift, ht and wd are read from the caller's stack.
    120 @ neon: writes d0-d2, d4-d7, d16-d21, q14, q15 only; callee-saved d8-d15 are untouched.
    121 .globl ihevc_weighted_pred_uni_a9q
    122 
    123 .type ihevc_weighted_pred_uni_a9q, %function
    124 
    125 ihevc_weighted_pred_uni_a9q:
    126 
    127     stmfd       sp!, {r4-r12, r14}          @save 10 core registers (40 bytes); stack arguments now start at [sp,#40]
    128 
    129     ldr         r4,[sp,#40]                 @load wgt0
    130     ldr         r7,[sp,#52]                 @load lvl_shift
    131     mov         r11,#1                      @constant 1 used to build the rounding term
    132     ldr         r5,[sp,#44]                 @load off0
    133     mul         r10,r7,r4                   @lvl_shift * wgt0
    134     ldr         r6,[sp,#48]                 @load shift
    135     ldr         r8,[sp,#56]                 @load ht
    136     add         r10,r10,r5,lsl r6           @lvl_shift * wgt0 + (off0 << shift)
    137     ldr         r9,[sp,#60]                 @load wd
    138     sub         r12,r6,#1                   @shift - 1
    139     vmov.s16    d0[0],r4                    @wgt0 in lane 0 for the multiply-by-scalar
    140     lsl         r2,r2,#1                    @src_strd in bytes (source samples are 16 bit)
    141     vdup.u32    q14,r6                      @shift in every 32 bit lane
    142     add         r10,r10,r11,lsl r12         @tmp_lvl_shift += (1 << (shift - 1))
    143     vdup.s32    q15,r10                     @tmp_lvl_shift in every 32 bit lane
    144     vneg.s32    q14,q14                     @-shift: vshl by a negative count shifts right
    145     lsl         r4,r9,#1                    @r4 = 2*wd = source bytes consumed per row in one band
    146 
    147     cmp         r8,#0                       @check ht == 0
    148     beq         end_loops                   @if equal, then end the function
    149 
    150 outer_loop:
    151     cmp         r9,#0                       @check wd == 0
    152     beq         end_loops                   @if equal, then end the function
    153 
    154 core_loop:
    155     add         r5,r0,r2                    @r5 = source pointer for row 1 (r2 is the row pitch in bytes)
    156     add         r6,r1,r3                    @r6 = destination pointer for row 1
    157     vld1.s16    {d1},[r0]!                  @load 4 samples of row 0, advance pi2_src by 8 bytes
    158     vld1.s16    {d2},[r5],r2                @load 4 samples of row 1, step to row 2
    159     vmull.s16   q2,d1,d0[0]                 @row 0: src * wgt0 (32 bit products)
    160 
    161     vadd.i32    q2,q2,q15                   @row 0: + tmp_lvl_shift
    162     vld1.s16    {d16},[r5],r2               @load 4 samples of row 2, step to row 3
    163 
    164     vmull.s16   q3,d2,d0[0]                 @row 1: src * wgt0
    165     vld1.s16    {d17},[r5],r2               @load 4 samples of row 3
    166 
    167     vshl.s32    q2,q2,q14                   @row 0: >> shift
    168     vadd.i32    q3,q3,q15                   @row 1: + tmp_lvl_shift
    169 
    170     vmull.s16   q9,d16,d0[0]                @row 2: src * wgt0
    171     vqmovun.s32 d4,q2                       @row 0: saturate to unsigned 16 bit
    172 
    173     vadd.i32    q9,q9,q15                   @row 2: + tmp_lvl_shift
    174     vmov.s32    d5,d4                       @row 0: duplicate so q2 is fully defined for the next narrow
    175 
    176     vshl.s32    q3,q3,q14                   @row 1: >> shift
    177 
    178     vmull.s16   q10,d17,d0[0]               @row 3: src * wgt0
    179     vqmovn.u16  d4,q2                       @row 0: saturate to unsigned 8 bit
    180 
    181     vshl.s32    q9,q9,q14                   @row 2: >> shift
    182     vqmovun.s32 d6,q3                       @row 1: saturate to unsigned 16 bit
    183 
    184     vadd.i32    q10,q10,q15                 @row 3: + tmp_lvl_shift
    185     vmov.s32    d7,d6                       @row 1: duplicate so q3 is fully defined for the next narrow
    186 
    187     vqmovun.s32 d18,q9                      @row 2: saturate to unsigned 16 bit
    188 
    189     vshl.s32    q10,q10,q14                 @row 3: >> shift
    190     vst1.32     {d4[0]},[r1]!               @row 0: store 4 pixels, advance pu1_dst by 4
    191     vmov.s32    d19,d18                     @row 2: duplicate so q9 is fully defined for the next narrow
    192 
    193     vqmovn.u16  d6,q3                       @row 1: saturate to unsigned 8 bit
    194     vst1.32     {d6[0]},[r6],r3             @row 1: store 4 pixels, step to row 2
    195 
    196     vqmovn.u16  d18,q9                      @row 2: saturate to unsigned 8 bit
    197     vqmovun.s32 d20,q10                     @row 3: saturate to unsigned 16 bit
    198 
    199     vmov.s32    d21,d20                     @row 3: duplicate so q10 is fully defined for the next narrow
    200     vst1.32     {d18[0]},[r6],r3            @row 2: store 4 pixels, step to row 3
    201     vqmovn.u16  d20,q10                     @row 3: saturate to unsigned 8 bit
    202 
    203     subs        r9,r9,#4                    @decrement wd by 4 and check for 0
    204     vst1.32     {d20[0]},[r6],r3            @row 3: store 4 pixels (NEON store leaves the flags intact)
    205     bgt         core_loop                   @if greater than 0 repeat the core loop again
    206 
    207 end_core_loop:
    208     rsb         r11,r4,r2,lsl #2            @4 source rows in bytes - 2*wd bytes already consumed
    209     subs        r8,r8,#4                    @decrement the ht by 4; flags are consumed by the bgt below
    210     add         r0,r0,r11                   @pi2_src -> first column of the next 4 rows
    211     asr         r9,r4,#1                    @restore wd (r4 holds 2*wd) for the next band of rows
    212     rsb         r12,r9,r3,lsl #2            @4*dst_strd - wd bytes already written
    213     add         r1,r1,r12                   @pu1_dst -> first column of the next 4 rows
    214     bgt         core_loop                   @rows remain; wd was non-zero on entry so the wd check is skipped
    215 
    216 end_loops:
    217     ldmfd       sp!,{r4-r12,r15}            @restore core registers and return (saved lr is loaded into pc)
    218 
    219 
    220