@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_weighted_pred_bi.s
@*
@* @brief
@*  contains function definitions for weighted prediction used in inter
@* prediction
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*  - ihevc_weighted_pred_bi()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  does bi-weighted prediction on the arrays pointed to by pi2_src1 and
@* pi2_src2 and stores the result at the location pointed to by pu1_dst.
@* assumption: the function is optimized assuming that width and height
@* are multiples of 2.
@*
@* @par description:
@*  dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 +
@*          ((off0 + off1 + 1) << (shift - 1)) ) >> shift
@*  (a commented c reference sketch is given after the function prototype below)
@*
@* @param[in] pi2_src1
@*  pointer to source 1
@*
@* @param[in] pi2_src2
@*  pointer to source 2
@*
@* @param[out] pu1_dst
@*  pointer to destination
@*
@* @param[in] src_strd1
@*  source stride 1
@*
@* @param[in] src_strd2
@*  source stride 2
@*
@* @param[in] dst_strd
@*  destination stride
@*
@* @param[in] wgt0
@*  weight to be multiplied with source 1
@*
@* @param[in] off0
@*  offset 0
@*
@* @param[in] wgt1
@*  weight to be multiplied with source 2
@*
@* @param[in] off1
@*  offset 1
@*
@* @param[in] shift
@*  (14 bit depth) + log2_weight_denominator
@*
@* @param[in] lvl_shift1
@*  added to source 1 before weighting
@*
@* @param[in] lvl_shift2
@*  added to source 2 before weighting
@*
@* @param[in] ht
@*  height of the source
@*
@* @param[in] wd
@*  width of the source
@*
@* @returns
@*  none
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_weighted_pred_bi(word16 *pi2_src1,
@                            word16 *pi2_src2,
@                            uword8 *pu1_dst,
@                            word32 src_strd1,
@                            word32 src_strd2,
@                            word32 dst_strd,
@                            word32 wgt0,
@                            word32 off0,
@                            word32 wgt1,
@                            word32 off1,
@                            word32 shift,
@                            word32 lvl_shift1,
@                            word32 lvl_shift2,
@                            word32 ht,
@                            word32 wd)

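@ the commented c below is a minimal reference sketch of the computation
@ described in the formula above; it is not part of this file's code, the
@ loop variables (row, col, i4_tmp) are illustrative only, and clipping to
@ [0, 255] mirrors the u8 saturation done by the neon narrowing instructions:
@
@ for(row = 0; row < ht; row++)
@ {
@     for(col = 0; col < wd; col++)
@     {
@         word32 i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0
@                         + (pi2_src2[col] + lvl_shift2) * wgt1
@                         + ((off0 + off1 + 1) << (shift - 1));
@         i4_tmp = i4_tmp >> shift;
@         pu1_dst[col] = (uword8)(i4_tmp < 0 ? 0 : (i4_tmp > 255 ? 255 : i4_tmp));
@     }
@     pi2_src1 += src_strd1;
@     pi2_src2 += src_strd2;
@     pu1_dst  += dst_strd;
@ }
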
@**************variables vs registers*****************************************
@   r0 => *pi2_src1
@   r1 => *pi2_src2
@   r2 => *pu1_dst
@   r3 =>  src_strd1
@   r4 =>  src_strd2
@   r5 =>  dst_strd
@   r6 =>  wgt0
@   r7 =>  off0
@   r8 =>  wgt1
@   r9 =>  off1
@   r10 =>  shift
@   r11 =>  lvl_shift1
@   r12 =>  lvl_shift2
@   r14 =>  ht
@   r7  =>  wd (r7 is reused for wd once off0 has been consumed)

.text
.align 4




.globl ihevc_weighted_pred_bi_a9q

.type ihevc_weighted_pred_bi_a9q, %function

ihevc_weighted_pred_bi_a9q:

    stmfd       sp!, {r4-r12, r14}          @push the callee-saved registers r4-r12 and lr
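    @after the stmfd above, 40 bytes of saved registers sit on top of the
    @caller's stacked arguments, so the arguments beyond r0-r3 are read below at:
    @[sp,#40] src_strd2,  [sp,#44] dst_strd,   [sp,#48] wgt0,  [sp,#52] off0,
    @[sp,#56] wgt1,       [sp,#60] off1,       [sp,#64] shift, [sp,#68] lvl_shift1,
    @[sp,#72] lvl_shift2, [sp,#76] ht,         [sp,#80] wd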

    ldr         r6,[sp,#48]                 @load wgt0
    ldr         r11,[sp,#68]                @load lvl_shift1
    ldr         r12,[sp,#72]                @load lvl_shift2
    vmov.s16    d7[0],r6                    @moved for scalar multiplication
    mul         r4,r11,r6                   @lvl_shift1 * wgt0
    ldr         r8,[sp,#56]                 @load wgt1
    ldr         r7,[sp,#52]                 @load off0
    vmov.s16    d7[1],r8                    @moved for scalar multiplication
    mla         r4,r12,r8,r4                @(lvl_shift1 * wgt0) + (lvl_shift2 * wgt1)
    ldr         r9,[sp,#60]                 @load off1
    add         r5,r7,r9                    @off0 + off1
    ldr         r10,[sp,#64]                @load shift
    add         r5,r5,#1                    @off0 + off1 + 1
    sub         r14,r10,#1                  @shift - 1
    ldr         r7,[sp,#80]                 @load wd
    lsl         r5,r5,r14                   @(off0 + off1 + 1) << (shift - 1)
    vdup.u32    q14,r10                     @vmovq_n_s32(shift), negated below
    add         r4,r4,r5                    @tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1))
    vdup.u32    q15,r4                      @vmovq_n_s32(tmp_lvl_shift)
    vneg.s32    q14,q14                     @-shift, so the vshl.s32 by q14 below shifts right by shift
    ldr         r4,[sp,#40]                 @load src_strd2
    lsl         r9,r7,#1                    @2*wd, width of one 16 bit source row in bytes
    ldr         r5,[sp,#44]                 @load dst_strd
    lsl         r3,r3,#1                    @2*src_strd1, source 1 stride in bytes
    ldr         r14,[sp,#76]                @load ht
    lsl         r4,r4,#1                    @2*src_strd2, source 2 stride in bytes

    cmp         r14,#0                      @check ht == 0
    beq         end_loops                   @if equal, then end the function

outer_loop:
    cmp         r7,#0                       @check wd == 0
    beq         end_loops                   @if equal, then end the function

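@each pass through core_loop processes a 4 column x 4 row block: row 0 goes
@through r0 (src1), r1 (src2) and r2 (dst) directly, while rows 1-3 go through
@the temporary pointers r6 (src1), r8 (src2) and r10 (dst), which step down by
@one stride per access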
core_loop:
    add         r6,r0,r3                    @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi2_src1 is a 16 bit pointer)
    add         r8,r1,r4                    @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src2 is a 16 bit pointer)
    vld1.s16    {d0},[r0]!                  @load and increment the pi2_src1
    add         r10,r2,r5                   @pu1_dst_tmp = pu1_dst + dst_strd
    vld1.s16    {d1},[r1]!                  @load and increment the pi2_src2
    vmull.s16   q2,d0,d7[0]                 @vmull_n_s16(pi2_src1_val1, (int16_t) wgt0)
    vld1.s16    {d2},[r6],r3                @load and increment the pi2_src_tmp1 ii iteration
    vmull.s16   q4,d1,d7[1]                 @vmull_n_s16(pi2_src2_val1, (int16_t) wgt1)
    vld1.s16    {d3},[r8],r4                @load and increment the pi2_src_tmp2 ii iteration
    vadd.s32    q2,q2,q4                    @vaddq_s32(i4_tmp1_t1, i4_tmp1_t2)

    vld1.s16    {d0},[r6],r3                @load and increment the pi2_src_tmp1 iii iteration
    vmull.s16   q5,d2,d7[0]                 @vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) ii iteration

    vld1.s16    {d1},[r8],r4                @load and increment the pi2_src_tmp2 iii iteration
    vadd.s32    q2,q2,q15                   @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
    vmull.s16   q7,d0,d7[0]                 @vmull_n_s16(pi2_src1_val1, (int16_t) wgt0) iii iteration

    vld1.s16    {d2},[r6],r3                @load and increment the pi2_src_tmp1 iv iteration
    vmull.s16   q6,d3,d7[1]                 @vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) ii iteration
    vshl.s32    q2,q2,q14                   @vshlq_s32(i4_tmp1_t1, tmp_shift_t)

    vld1.s16    {d3},[r8],r4                @load and increment the pi2_src_tmp2 iv iteration
    vadd.s32    q5,q5,q6                    @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) ii iteration

    vqmovun.s32 d4,q2                       @vqmovun_s32(sto_res_tmp1)
    vmull.s16   q8,d1,d7[1]                 @vmull_n_s16(pi2_src2_val1, (int16_t) wgt1) iii iteration

    vadd.s32    q5,q5,q15                   @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) ii iteration
    vmov.s32    d5,d4                       @vcombine_u16(sto_res_tmp2, sto_res_tmp2)
    vadd.s32    q7,q7,q8                    @vaddq_s32(i4_tmp1_t1, i4_tmp1_t2) iii iteration

    vshl.s32    q5,q5,q14                   @vshlq_s32(i4_tmp2_t1, tmp_shift_t) ii iteration
    vmull.s16   q9,d2,d7[0]                 @vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) iv iteration
    vqmovn.u16  d4,q2                       @vqmovn_u16(sto_res_tmp3)
    vadd.s32    q7,q7,q15                   @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration

    vqmovun.s32 d10,q5                      @vqmovun_s32(sto_res_tmp1) ii iteration
    vmull.s16   q10,d3,d7[1]                @vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) iv iteration

    vshl.s32    q7,q7,q14                   @vshlq_s32(i4_tmp1_t1, tmp_shift_t) iii iteration
    vmov.s32    d11,d10                     @vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration

    vadd.s32    q9,q9,q10                   @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
    vqmovun.s32 d14,q7                      @vqmovun_s32(sto_res_tmp1) iii iteration

    vadd.s32    q9,q9,q15                   @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteration
    vst1.s32    {d4[0]},[r2]!               @store pu1_dst i iteration

    vqmovn.u16  d10,q5                      @vqmovn_u16(sto_res_tmp3) ii iteration
    vshl.s32    q9,q9,q14                   @vshlq_s32(i4_tmp2_t1, tmp_shift_t) iv iteration
    vst1.s32    {d10[0]},[r10],r5           @store pu1_dst ii iteration


    vmov.s32    d15,d14                     @vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration
    vqmovn.u16  d14,q7                      @vqmovn_u16(sto_res_tmp3) iii iteration
    vqmovun.s32 d18,q9                      @vqmovun_s32(sto_res_tmp1) iv iteration
    vmov.s32    d19,d18                     @vcombine_u16(sto_res_tmp2, sto_res_tmp2) iv iteration
    vst1.s32    {d14[0]},[r10],r5           @store pu1_dst iii iteration
    vqmovn.u16  d18,q9                      @vqmovn_u16(sto_res_tmp3) iv iteration
    subs        r7,r7,#4                    @decrement wd by 4 and check for 0
    vst1.s32    {d18[0]},[r10],r5           @store pu1_dst iv iteration

    bgt         core_loop                   @if greater than 0, repeat the core loop again

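@the width loop above leaves r0, r1 and r2 at the end of the first of the four
@rows just processed, so each pointer is advanced below by four strides minus
@the width already covered (byte offsets, hence the doubling for the 16 bit
@sources) to reach the next block of four rows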
end_core_loop:
    rsb         r11,r9,r3,lsl #2            @4*(2*src_strd1) - 2*wd, byte offset to the next 4-row block of src1
    subs        r14,r14,#4                  @decrement ht by 4 and check for 0
    rsb         r12,r9,r4,lsl #2            @4*(2*src_strd2) - 2*wd, byte offset to the next 4-row block of src2
    add         r0,r0,r11                   @pi2_src1 advances by 4*src_strd1 - wd elements (byte offset doubled since pi2_src1 is a 16 bit pointer)
    asr         r7,r9,#1                    @restore wd (r9 holds 2*wd)
    add         r1,r1,r12                   @pi2_src2 advances by 4*src_strd2 - wd elements
    rsb         r10,r7,r5,lsl #2            @4*dst_strd - wd, byte offset to the next 4-row block of dst
    add         r2,r2,r10                   @pu1_dst advances by 4*dst_strd - wd
    bgt         core_loop                   @if ht is greater than 0, repeat the core loop for the next 4 rows
end_loops:
    ldmfd       sp!,{r4-r12,r15}            @restore the saved registers and return (pc loaded from the saved lr)
