Home | History | Annotate | Download | only in arm
      1 @/*****************************************************************************
      2 @*
      3 @* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 @*
      5 @* Licensed under the Apache License, Version 2.0 (the "License");
      6 @* you may not use this file except in compliance with the License.
      7 @* You may obtain a copy of the License at:
      8 @*
      9 @* http://www.apache.org/licenses/LICENSE-2.0
     10 @*
     11 @* Unless required by applicable law or agreed to in writing, software
     12 @* distributed under the License is distributed on an "AS IS" BASIS,
     13 @* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @* See the License for the specific language governing permissions and
     15 @* limitations under the License.
     16 @*
     17 @*****************************************************************************/
     18 @/**
     19 @ *******************************************************************************
     20 @ * @file
     21 @ *  ihevc_itrans_recon_4x4_neon.s
     22 @ *
     23 @ * @brief
     24 @ *  contains function definitions for single stage  inverse transform
     25 @ *
     26 @ * @author
     27 @ *  naveen sr
     28 @ *
     29 @ * @par list of functions:
     30 @ *  - ihevc_itrans_recon_4x4()
     31 @ *
     32 @ * @remarks
     33 @ *  none
     34 @ *
     35 @ *******************************************************************************
     36 @*/
     37 @ /**
     38 @ *******************************************************************************
     39 @ *
     40 @ * @brief
     41 @ *  this function performs inverse transform  and reconstruction for 4x4
     42 @ * input block
     43 @ *
     44 @ * @par description:
     45 @ *  performs inverse transform and adds the prediction  data and clips output
     46 @ * to 8 bit
     47 @ *
     48 @ * @param[in] pi2_src
     49 @ *  input 4x4 coefficients
     50 @ *
     51 @ * @param[in] pi2_tmp
     52 @ *  temporary 4x4 buffer for storing the inverse transform
     53 @ *  1st stage output
     54 @ *  (not referenced by this implementation; the 1st stage output
     55 @ *  stays in NEON registers)
     56 @ *
     57 @ * @param[in] pu1_pred
     58 @ *  prediction 4x4 block
     59 @ *
     60 @ * @param[out] pu1_dst
     61 @ *  output 4x4 block
     62 @ *
     63 @ * @param[in] src_strd
     64 @ *  input stride
     65 @ *
     66 @ * @param[in] pred_strd
     67 @ *  prediction stride
     68 @ *
     69 @ * @param[in] dst_strd
     70 @ *  output stride
     71 @ *
     72 @ * @note shift
     73 @ *  not an argument: the two stage shifts are the fixed constants 7 and 12
     74 @ *
     75 @ * @param[in] zero_cols
     76 @ *  zero columns in pi2_src
     77 @ *
     78 @ * @returns  void
     79 @ *
     80 @ * @remarks
     81 @ *  none
     82 @ *
     83 @ *******************************************************************************
     84 @ */
     85 @void ihevc_itrans_recon_4x4(word16 *pi2_src,
     86 @       word16 *pi2_tmp,
     87 @       uword8 *pu1_pred,
     88 @       uword8 *pu1_dst,
     89 @       word32 src_strd,
     90 @       word32 pred_strd,
     91 @       word32 dst_strd,
     92 @       word32 zero_cols)
     93 @**************variables vs registers*************************
     94 @   r0 => *pi2_src
     95 @   r1 => *pi2_tmp
     96 @   r2 => *pu1_pred
     97 @   r3 => *pu1_dst
     98 @   r4 => src_strd
     99 @   r5 => pred_strd
    100 @   r6 => dst_strd
    101 @   r7 => zero_cols
    102 
    103 
    @ Everything below is emitted into the code section, starting on a
    @ 2^4 = 16-byte boundary.
    .text
    .p2align    4

    @ Rounding right-shift applied when narrowing the 32-bit sums of the
    @ first and of the second 1-D transform pass back to 16 bit.
    .equ        shift_stage1_idct, 7
    .equ        shift_stage2_idct, 12

    @ Exported entry point, and the coefficient table it reads (defined in
    @ another object file).
    .global     ihevc_itrans_recon_4x4_a9q
    .extern     g_ai2_ihevc_trans_4_transpose

    @ Position-independent reference to the coefficient table: this word holds
    @ the distance from the "add r8,r8,pc" at label ulbl1 to the table. The
    @ extra -8 compensates for pc reading as the instruction address + 8
    @ (assumes the function is assembled as ARM, not Thumb, code).
g_ai2_ihevc_trans_4_transpose_addr:
    .word       g_ai2_ihevc_trans_4_transpose - ulbl1 - 8
    119 
    .type       ihevc_itrans_recon_4x4_a9q, %function

@-------------------------------------------------------------------------------
@ void ihevc_itrans_recon_4x4_a9q(word16 *pi2_src, word16 *pi2_tmp,
@                                 uword8 *pu1_pred, uword8 *pu1_dst,
@                                 word32 src_strd, word32 pred_strd,
@                                 word32 dst_strd, word32 zero_cols)
@
@ 4x4 inverse transform + reconstruction. Runs two 1-D butterfly passes over
@ the 4x4 block of 16-bit coefficients (each pass followed by a rounding,
@ saturating narrow and a 4x4 transpose), adds the 4x4 prediction block,
@ saturates to unsigned 8 bit and stores 4 rows of 4 bytes.
@
@ ABI:  AAPCS, ARM state (pc-relative table lookup relies on pc = insn + 8)
@ In:   r0        pi2_src    coefficients, rows src_strd 16-bit elements apart
@       r1        pi2_tmp    not referenced (1st pass output stays in d0-d3)
@       r2        pu1_pred   prediction, rows pred_strd bytes apart
@       r3        pu1_dst    destination, rows dst_strd bytes apart
@       [sp,#0]   src_strd
@       [sp,#4]   pred_strd
@       [sp,#8]   dst_strd
@       [sp,#12]  zero_cols  not read by this implementation
@ Out:  none
@ Clobbers: r0, r2, r3, q0-q3, q8-q15. Only caller-saved NEON registers are
@       written; the callee-saved d8-d15 (q4-q7) are deliberately left
@       untouched so no vpush/vpop is required. r4-r12 are saved/restored.
@-------------------------------------------------------------------------------

    @ Offsets of the stack-passed arguments once the prologue has pushed
    @ 10 core registers (40 bytes).
    .equ        src_strd_offset,  40
    .equ        pred_strd_offset, 44
    .equ        dst_strd_offset,  48

ihevc_itrans_recon_4x4_a9q:

    push        {r4-r12, lr}                @ 10 words; keeps sp 8-byte aligned

    ldr         r8, g_ai2_ihevc_trans_4_transpose_addr
ulbl1:
    add         r8, r8, pc                  @ r8 = &g_ai2_ihevc_trans_4_transpose

    ldr         r4, [sp, #src_strd_offset]
    ldr         r5, [sp, #pred_strd_offset]
    ldr         r6, [sp, #dst_strd_offset]
    add         r4, r4, r4                  @ r4 = source row pitch in bytes

    add         r9, r0, r4                  @ r9  -> source row 1

    @ First 4 table entries. Per the table layout this code was written
    @ against, lanes 0..3 = {64, 83, 64, 36}; only lanes 1 and 3 are used,
    @ the factor 64 is applied as a left shift by 6.
    vld1.16     {d4}, [r8]

    add         r10, r9, r4, lsl #1         @ r10 -> source row 3
    add         r4, r4, r4                  @ r4 = two row pitches in bytes
    vld1.16     {d1}, [r9]                  @ d1 = source row 1
    vld1.16     {d3}, [r10]                 @ d3 = source row 3
    vld1.16     {d0}, [r0], r4              @ d0 = source row 0, r0 -> row 2
    vld1.16     {d2}, [r0]                  @ d2 = source row 2

    @ ---- first pass: each lane is one column of the source block ------------
    @ Prediction rows are loaded in between to overlap with the arithmetic.
    vmull.s16   q3, d1, d4[1]               @ 83 * row1
    vmlal.s16   q3, d3, d4[3]               @ o[0] = 83 * row1 + 36 * row3
    vmull.s16   q12, d1, d4[3]              @ 36 * row1
    vld1.32     {d22[0]}, [r2], r5          @ prediction row 0
    vmlsl.s16   q12, d3, d4[1]              @ o[1] = 36 * row1 - 83 * row3

    vaddl.s16   q13, d0, d2                 @ row0 + row2
    vsubl.s16   q14, d0, d2                 @ row0 - row2
    vshl.s32    q13, q13, #6                @ e[0] = 64 * (row0 + row2)
    vshl.s32    q14, q14, #6                @ e[1] = 64 * (row0 - row2)

    vadd.s32    q15, q13, q3                @ e[0] + o[0]
    vadd.s32    q8, q14, q12                @ e[1] + o[1]
    vsub.s32    q9, q14, q12                @ e[1] - o[1]
    vsub.s32    q10, q13, q3                @ e[0] - o[0]

    @ Round, shift and saturate back to signed 16 bit.
    vqrshrn.s32 d0, q15, #shift_stage1_idct @ out[0] = clip_s16((e[0] + o[0] + add) >> shift)
    vqrshrn.s32 d1, q8, #shift_stage1_idct  @ out[1] = clip_s16((e[1] + o[1] + add) >> shift)
    vqrshrn.s32 d2, q9, #shift_stage1_idct  @ out[2] = clip_s16((e[1] - o[1] + add) >> shift)
    vqrshrn.s32 d3, q10, #shift_stage1_idct @ out[3] = clip_s16((e[0] - o[0] + add) >> shift)

    @ 4x4 transpose of the 16-bit elements held in d0-d3.
    vtrn.16     d0, d1
    vtrn.16     d2, d3
    vtrn.32     d0, d2
    vtrn.32     d1, d3

    @ ---- second pass: same butterfly on the first-pass output in d0-d3 ------
    vmull.s16   q3, d1, d4[1]               @ 83 * d1
    vld1.32     {d22[1]}, [r2], r5          @ prediction row 1
    vmlal.s16   q3, d3, d4[3]               @ o[0] = 83 * d1 + 36 * d3
    vmull.s16   q12, d1, d4[3]              @ 36 * d1
    vmlsl.s16   q12, d3, d4[1]              @ o[1] = 36 * d1 - 83 * d3
    vld1.32     {d23[0]}, [r2], r5          @ prediction row 2

    vaddl.s16   q13, d0, d2                 @ d0 + d2
    vsubl.s16   q14, d0, d2                 @ d0 - d2
    vshl.s32    q13, q13, #6                @ e[0] = 64 * (d0 + d2)
    vshl.s32    q14, q14, #6                @ e[1] = 64 * (d0 - d2)

    vadd.s32    q15, q13, q3                @ e[0] + o[0]
    vadd.s32    q8, q14, q12                @ e[1] + o[1]
    vsub.s32    q9, q14, q12                @ e[1] - o[1]
    vsub.s32    q10, q13, q3                @ e[0] - o[0]

    vqrshrn.s32 d0, q15, #shift_stage2_idct @ out[0] = clip_s16((e[0] + o[0] + add) >> shift)
    vqrshrn.s32 d1, q8, #shift_stage2_idct  @ out[1] = clip_s16((e[1] + o[1] + add) >> shift)
    vqrshrn.s32 d2, q9, #shift_stage2_idct  @ out[2] = clip_s16((e[1] - o[1] + add) >> shift)
    vqrshrn.s32 d3, q10, #shift_stage2_idct @ out[3] = clip_s16((e[0] - o[0] + add) >> shift)
    vld1.32     {d23[1]}, [r2], r5          @ prediction row 3

    @ Transpose back so that d0..d3 hold residual rows 0..3.
    vtrn.16     d0, d1
    vtrn.16     d2, d3
    vtrn.32     d0, d2
    vtrn.32     d1, d3

    @ ---- reconstruction -----------------------------------------------------
    @ q0 = residual rows 0,1 and q1 = rows 2,3; d22/d23 hold the matching
    @ prediction bytes (4 per row, two rows per d register).
    vaddw.u8    q0, q0, d22                 @ residual(16 bit) + pred(8 bit)
    vaddw.u8    q1, q1, d23
    vqmovun.s16 d0, q0                      @ clip to unsigned 8 bit
    vqmovun.s16 d1, q1

    @ One 32-bit lane = one 4-pixel output row.
    vst1.32     {d0[0]}, [r3], r6
    vst1.32     {d0[1]}, [r3], r6
    vst1.32     {d1[0]}, [r3], r6
    vst1.32     {d1[1]}, [r3], r6

    pop         {r4-r12, pc}                @ restore core registers and return
    228 
    229 
    230 
    231 
    232 
    233