Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 // *******************************************************************************
     20 // * @file
     21 // *  ihevc_itrans_recon_4x4_neon.s
     22 // *
     23 // * @brief
     24 // *  contains function definitions for single stage  inverse transform
     25 // *
     26 // * @author
     27 // *     naveen sr
     28 // *
     29 // * @par list of functions:
     30 // *  - ihevc_itrans_recon_4x4()
     31 // *
     32 // * @remarks
     33 // *  none
     34 // *
     35 // *******************************************************************************
     36 //*/
     37 // /**
     38 // *******************************************************************************
     39 // *
     40 // * @brief
     41 // *  this function performs inverse transform  and reconstruction for 4x4
     42 // * input block
     43 // *
     44 // * @par description:
     45 // *  performs inverse transform and adds the prediction  data and clips output
     46 // * to 8 bit
     47 // *
     48 // * @param[in] pi2_src
     49 // *  input 4x4 coefficients
     50 // *
     51 // * @param[in] pi2_tmp
     52 // *  temporary 4x4 buffer for storing inverse
     53 // *
     54 // *  transform
     55 // *  1st stage output
     56 // *
     57 // * @param[in] pu1_pred
     58 // *  prediction 4x4 block
     59 // *
     60 // * @param[out] pu1_dst
     61 // *  output 4x4 block
     62 // *
     63 // * @param[in] src_strd
     64 // *  input stride
     65 // *
     66 // * @param[in] pred_strd
     67 // *  prediction stride
     68 // *
     69 // * @param[in] dst_strd
     70 // *  output stride
     71 // *
     72 // * @param[in] shift
     73 // *  output shift
     74 // *
     75 // * @param[in] zero_cols
     76 // *  zero columns in pi2_src
     77 // *
     78 // * @returns  void
     79 // *
     80 // * @remarks
     81 // *  none
     82 // *
     83 // *******************************************************************************
     84 // */
     85 //void ihevc_itrans_recon_4x4(word16 *pi2_src,
     86 //        word16 *pi2_tmp,
     87 //        uword8 *pu1_pred,
     88 //        uword8 *pu1_dst,
     89 //        word32 src_strd,
     90 //        word32 pred_strd,
     91 //        word32 dst_strd,
     92 //        word32 zero_cols)
     93 //**************variables vs registers*************************
     94 //    x0 => *pi2_src
     95 //    x1 => *pi2_tmp
     96 //    x2 => *pu1_pred
     97 //    x3 => *pu1_dst
     98 //    x4 => src_strd
     99 //    x5 => pred_strd
    100 //    x6 => dst_strd
    101 //    x7 => zero_cols
    102 
    103 .text
    104 .align 4
    105 
    106 .include "ihevc_neon_macros.s"
    107 
    108 .set shift_stage1_idct ,   7
    109 .set shift_stage2_idct ,   12
    110 
    111 
    112 
    113 .globl ihevc_itrans_recon_4x4_av8
    114 
    115 .extern g_ai2_ihevc_trans_4_transpose
    116 
    117 .type ihevc_itrans_recon_4x4_av8, %function
    118 
    119 ihevc_itrans_recon_4x4_av8:
    120 
    121     // stmfd sp!, {x4-x12, x14}                //stack stores the values of the arguments
    122 
    123     stp         x19, x20,[sp,#-16]!
    124 
    125     adrp        x8, :got:g_ai2_ihevc_trans_4_transpose
    126     ldr         x8, [x8, #:got_lo12:g_ai2_ihevc_trans_4_transpose]
    127 
    128     add         x4,x4,x4                    // src_strd in terms of word16
    129     add         x9,x0,x4                    // pi2_src[0] + src_strd
    130 
    131     ld1         {v4.4h},[x8]                //loading first row of g_ai2_ihevc_trans_4_transpose
    132     // d4 = {36,64,83,64}
    133     //index = 3  2  1  0
    134     add         x10,x9,x4, lsl #1           // 3*src_strd
    135     add         x4,x4,x4
    136     ld1         {v1.4h},[x9]                //loading pi2_src 2nd row
    137     ld1         {v3.4h},[x10]               //loading pi2_src 4th row
    138     ld1         {v0.4h},[x0],x4             //loading pi2_src 1st row
    139     ld1         {v2.4h},[x0],x4             //loading pi2_src 3rd row
    140 
    141 
    142     // first stage computation starts
    143     smull       v6.4s, v1.4h, v4.h[1]       //83 * pi2_src[1]
    144     smlal       v6.4s, v3.4h, v4.h[3]       //o[0] = 83 * pi2_src[1] + 36 * pi2_src[3]
    145     smull       v5.4s, v1.4h, v4.h[3]       //36 * pi2_src[1]
    146     ld1         {v22.s}[0],[x2],x5
    147     smlsl       v5.4s, v3.4h, v4.h[1]       //o[1] = 36 * pi2_src[1] - 83 * pi2_src[3]
    148 
    149     saddl       v7.4s, v0.4h, v2.4h         //pi2_src[0] + pi2_src[2]
    150     ssubl       v17.4s, v0.4h, v2.4h        //pi2_src[0] - pi2_src[2]
    151     shl         v7.4s, v7.4s,#6             //e[0] = 64*(pi2_src[0] + pi2_src[2])
    152     shl         v17.4s, v17.4s,#6           //e[1] = 64*(pi2_src[0] - pi2_src[2])
    153 
    154     add         v19.4s,  v7.4s ,  v6.4s     //((e[0] + o[0] )
    155     add         v16.4s,  v17.4s ,  v5.4s    //((e[1] + o[1])
    156     sub         v18.4s,  v17.4s ,  v5.4s    //((e[1] - o[1])
    157     sub         v20.4s,  v7.4s ,  v6.4s     //((e[0] - o[0])
    158 
    159     sqrshrn     v28.4h, v19.4s,#shift_stage1_idct //pi2_out[0] = clip_s16((e[0] + o[0] + add)>>shift) )
    160     sqrshrn     v29.4h, v16.4s,#shift_stage1_idct //pi2_out[1] = clip_s16((e[1] + o[1] + add)>>shift) )
    161     sqrshrn     v30.4h, v18.4s,#shift_stage1_idct //pi2_out[2] = clip_s16((e[0] - o[0] + add)>>shift) )
    162     sqrshrn     v31.4h, v20.4s,#shift_stage1_idct //pi2_out[3] = clip_s16((e[0] - o[0] + add)>>shift) )
    163 
    164     trn1        v24.4h, v28.4h, v29.4h
    165     trn2        v25.4h, v28.4h, v29.4h
    166     trn1        v26.4h, v30.4h, v31.4h
    167     trn2        v27.4h, v30.4h, v31.4h
    168     trn1        v0.2s, v24.2s, v26.2s
    169     trn2        v2.2s, v24.2s, v26.2s
    170     trn1        v1.2s, v25.2s, v27.2s
    171     trn2        v3.2s, v25.2s, v27.2s
    172 
    173     // first stage ends
    174     // output in d0,d1,d2,d3
    175     // second stage starts
    176     smull       v6.4s, v1.4h, v4.h[1]       //83 * pi2_src[1]
    177     ld1         {v22.s}[1],[x2],x5
    178     smlal       v6.4s, v3.4h, v4.h[3]       //o[0] = 83 * pi2_src[1] + 36 * pi2_src[3]
    179     smull       v5.4s, v1.4h, v4.h[3]       //36 * pi2_src[1]
    180     smlsl       v5.4s, v3.4h, v4.h[1]       //o[1] = 36 * pi2_src[1] - 83 * pi2_src[3]
    181     ld1         {v23.s}[0],[x2],x5
    182 
    183     saddl       v7.4s, v0.4h, v2.4h         //pi2_src[0] + pi2_src[2]
    184     ssubl       v17.4s, v0.4h, v2.4h        //pi2_src[0] - pi2_src[2]
    185     shl         v7.4s, v7.4s,#6             //e[0] = 64*(pi2_src[0] + pi2_src[2])
    186     shl         v17.4s, v17.4s,#6           //e[1] = 64*(pi2_src[0] - pi2_src[2])
    187 
    188 
    189     add         v19.4s,  v7.4s ,  v6.4s     //((e[0] + o[0] )
    190     add         v16.4s,  v17.4s ,  v5.4s    //((e[1] + o[1])
    191     sub         v18.4s,  v17.4s ,  v5.4s    //((e[1] - o[1])
    192     sub         v20.4s,  v7.4s ,  v6.4s     //((e[0] - o[0])
    193 
    194     sqrshrn     v28.4h, v19.4s,#shift_stage2_idct //pi2_out[0] = clip_s16((e[0] + o[0] + add)>>shift) )
    195     sqrshrn     v29.4h, v16.4s,#shift_stage2_idct //pi2_out[1] = clip_s16((e[1] + o[1] + add)>>shift) )
    196     sqrshrn     v30.4h, v18.4s,#shift_stage2_idct //pi2_out[2] = clip_s16((e[0] - o[0] + add)>>shift) )
    197     sqrshrn     v31.4h, v20.4s,#shift_stage2_idct //pi2_out[3] = clip_s16((e[0] - o[0] + add)>>shift) )
    198     ld1         {v23.s}[1],[x2],x5
    199 
    200     trn1        v24.4h, v28.4h, v29.4h
    201     trn2        v25.4h, v28.4h, v29.4h
    202     trn1        v26.4h, v30.4h, v31.4h
    203     trn2        v27.4h, v30.4h, v31.4h
    204     trn1        v0.2s, v24.2s, v26.2s
    205     trn2        v2.2s, v24.2s, v26.2s
    206     trn1        v1.2s, v25.2s, v27.2s
    207     trn2        v3.2s, v25.2s, v27.2s
    208     // second stage ends
    209     // output in d0,d1,d2,d3
    210     // second stage computation ends
    211 
    212     // loading pred
    213 
    214     mov         v0.d[1],v1.d[0]
    215     mov         v2.d[1],v3.d[0]
    216 
    217     uaddw       v0.8h,  v0.8h ,  v22.8b     // pi2_out(16bit) + pu1_pred(8bit)
    218     uaddw       v2.8h,  v2.8h ,  v23.8b     // pi2_out(16bit) + pu1_pred(8bit)
    219     sqxtun      v0.8b, v0.8h                // clip_u8(pi2_out(16bit) + pu1_pred(8bit))
    220     sqxtun      v1.8b, v2.8h                // clip_u8(pi2_out(16bit) + pu1_pred(8bit))
    221 
    222     // storing destination
    223     st1         {v0.s}[0],[x3],x6
    224     st1         {v0.s}[1],[x3],x6
    225     st1         {v1.s}[0],[x3],x6
    226     st1         {v1.s}[1],[x3],x6
    227 
    228 
    229     // ldmfd sp!,{x4-x12,x15}                //reload the registers from sp
    230     ldp         x19, x20,[sp],#16
    231 
    232     ret
    233 
    234 
    235 
    236 
    237 
    238