Home | History | Annotate | Download | only in arm64
      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 // *******************************************************************************
     20 // * @file
     21 // *  ihevc_itrans_recon_4x4_ttype1.s
     22 // *
     23 // * @brief
     24 // *  contains function definitions for inverse transform  and reconstruction
     25 // *
     26 // *
     27 // * @author
     28 // *  naveen sr
     29 // *
     30 // * @par list of functions:
     31 // *  - ihevc_itrans_recon_4x4_ttype1()
     32 // *
     33 // * @remarks
     34 // *  none
     35 // *
     36 // *******************************************************************************
     37 // */
     38 
     39 ///* all the functions here are replicated from ihevc_itrans.c and modified to */
     40 ///* include reconstruction */
     41 //
     42 ///**
     43 // *******************************************************************************
     44 // *
     45 // * @brief
     46 // *  this function performs inverse transform type 1 (dst)  and reconstruction
     47 // * for 4x4 input block
     48 // *
     49 // * @par description:
     50 // *  performs inverse transform and adds the prediction  data and clips output
     51 // * to 8 bit
     52 // *
     53 // * @param[in] pi2_src
     54 // *  input 4x4 coefficients
     55 // *
     56 // * @param[in] pi2_tmp
     57 // *  temporary 4x4 buffer for storing inverse
     58 // *
     59 // *  transform
     60 // *  1st stage output
     61 // *
     62 // * @param[in] pu1_pred
     63 // *  prediction 4x4 block
     64 // *
     65 // * @param[out] pu1_dst
     66 // *  output 4x4 block
     67 // *
     68 // * @param[in] src_strd
     69 // *  input stride
     70 // *
     71 // * @param[in] pred_strd
     72 // *  prediction stride
     73 // *
     74 // * @param[in] dst_strd
     75 // *  output stride
     76 // *
     77 // * @param[in] zero_cols
     78 // *  zero columns in pi2_src
     79 // *
     80 // * @returns  void
     81 // *
     82 // * @remarks
     83 // *  none
     84 // *
     85 // *******************************************************************************
     86 // */
     87 //void ihevc_itrans_recon_4x4_ttype1(word16 *pi2_src,
     88 //        word16 *pi2_tmp,
     89 //        uword8 *pu1_pred,
     90 //        uword8 *pu1_dst,
     91 //        word32 src_strd,
     92 //        word32 pred_strd,
     93 //        word32 dst_strd,
     94 //        word32 zero_cols)
     95 
     96 //**************variables vs registers*************************
     97 //    x0 => *pi2_src
     98 //    x1 => *pi2_tmp
     99 //    x2 => *pu1_pred
    100 //    x3 => *pu1_dst
    101 //    x4 => src_strd
    102 //    x5 => pred_strd
    103 //    x6 => dst_strd
    104 //    x7 => zero_cols
    105 
    106 .text
    107 .align 4
    108 
    109 .include "ihevc_neon_macros.s"
    110 
    111 .set shift_stage1_idct ,   7
    112 .set shift_stage2_idct ,   12
    113 
    114 .globl ihevc_itrans_recon_4x4_ttype1_av8
    115 
    116 .type ihevc_itrans_recon_4x4_ttype1_av8, %function
    117 
    118 ihevc_itrans_recon_4x4_ttype1_av8:
    119 
    120     // stmfd sp!, {x4-x12, x14}    //stack stores the values of the arguments
    121 
    122     stp         x19, x20,[sp,#-16]!
    123 
    124     add         x4,x4,x4                    // src_strd in terms of word16
    125 
    126     mov         x8,#29
    127     mov         x9,#55
    128     mov         x10,#74
    129     mov         x11,#84
    130     mov         v4.4h[0], w8
    131     ld1         {v0.4h},[x0],x4             //loading pi2_src 1st row
    132     mov         v4.4h[1], w9
    133     ld1         {v1.4h},[x0],x4             //loading pi2_src 2nd row
    134     mov         v4.4h[2], w10
    135     ld1         {v2.4h},[x0],x4             //loading pi2_src 3rd row
    136     mov         v4.4h[3], w11
    137     ld1         {v3.4h},[x0],x4             //loading pi2_src 4th row
    138 
    139     // first stage computation starts
    140     smull       v6.4s, v1.4h, v4.4h[2]      //74 * pi2_src[1]
    141     smlal       v6.4s, v0.4h, v4.4h[0]      //74 * pi2_src[1] + 29 * pi2_src[0]
    142     smlal       v6.4s, v3.4h, v4.4h[1]      //74 * pi2_src[1] + 29 * pi2_src[0] + 55 * pi2_src[3]
    143     smlal       v6.4s, v2.4h, v4.4h[3]      //pi2_out[0] = 29* pi2_src[0] + 74 * pi2_src[1] + 84* pi2_src[2] + 55 * pi2_src[3]
    144 
    145     smull       v5.4s, v1.4h, v4.4h[2]      //74 * pi2_src[1]
    146     smlal       v5.4s, v0.4h, v4.4h[1]      //74 * pi2_src[1] + 55 * pi2_src[0]
    147     smlsl       v5.4s, v2.4h, v4.4h[0]      //74 * pi2_src[1] + 55 * pi2_src[0] -  29 * pi2_src[2]
    148     smlsl       v5.4s, v3.4h, v4.4h[3]      //pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] -  29 * pi2_src[2] - 84 * pi2_src[3])
    149 
    150     smull       v7.4s, v0.4h, v4.4h[2]      // 74 * pi2_src[0]
    151     smlsl       v7.4s, v2.4h, v4.4h[2]      // 74 * pi2_src[0] - 74 * pi2_src[2]
    152     smlal       v7.4s, v3.4h, v4.4h[2]      //pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3]
    153 
    154     smull       v20.4s, v2.4h, v4.4h[1]     // 55 * pi2_src[2]
    155     smlsl       v20.4s, v1.4h, v4.4h[2]     // 55 * pi2_src[2] - 74 * pi2_src[1]
    156     smlsl       v20.4s, v3.4h, v4.4h[0]     // - 74 * pi2_src[1] +   55 * pi2_src[2]    - 29 * pi2_src[3]
    157     smlal       v20.4s, v0.4h, v4.4h[3]     //pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
    158 
    159     sqrshrn     v28.4h, v6.4s,#shift_stage1_idct // (pi2_out[0] + rounding ) >> shift_stage1_idct
    160     sqrshrn     v29.4h, v5.4s,#shift_stage1_idct // (pi2_out[1] + rounding ) >> shift_stage1_idct
    161     sqrshrn     v30.4h, v7.4s,#shift_stage1_idct // (pi2_out[2] + rounding ) >> shift_stage1_idct
    162     sqrshrn     v31.4h, v20.4s,#shift_stage1_idct // (pi2_out[3] + rounding ) >> shift_stage1_idct
    163     ld1         {v18.s}[0],[x2],x5
    164 
    165     trn1        v24.4h, v28.4h, v29.4h
    166     trn2        v25.4h, v28.4h, v29.4h
    167     trn1        v26.4h, v30.4h, v31.4h
    168     trn2        v27.4h, v30.4h, v31.4h
    169     trn1        v21.2s, v24.2s, v26.2s
    170     trn2        v16.2s, v24.2s, v26.2s
    171     trn1        v22.2s, v25.2s, v27.2s
    172     trn2        v17.2s, v25.2s, v27.2s
    173     // output in d14,d15,d16,d17
    174     // first stage computation ends
    175 
    176     // second stage computation starts  :  copy pasting 1st stage
    177     // register changes
    178     // d14 - d0
    179     // d15 - d1
    180     // d16 - d2
    181     // d17 - d3
    182     ld1         {v18.s}[1],[x2],x5
    183     smull       v6.4s, v22.4h, v4.4h[2]     //74 * pi2_src[1]
    184     smlal       v6.4s, v21.4h, v4.4h[0]     //74 * pi2_src[1] + 29 * pi2_src[0]
    185     smlal       v6.4s, v17.4h, v4.4h[1]     //74 * pi2_src[1] + 29 * pi2_src[0] + 55 * pi2_src[3]
    186     smlal       v6.4s, v16.4h, v4.4h[3]     //pi2_out[0] = 29* pi2_src[0] + 74 * pi2_src[1] + 84* pi2_src[2] + 55 * pi2_src[3]
    187 
    188     smull       v5.4s, v22.4h, v4.4h[2]     //74 * pi2_src[1]
    189     smlal       v5.4s, v21.4h, v4.4h[1]     //74 * pi2_src[1] + 55 * pi2_src[0]
    190     smlsl       v5.4s, v16.4h, v4.4h[0]     //74 * pi2_src[1] + 55 * pi2_src[0] -  29 * pi2_src[2]
    191     smlsl       v5.4s, v17.4h, v4.4h[3]     //pi2_out[1] = 74 * pi2_src[1] + 55 * pi2_src[0] -  29 * pi2_src[2] - 84 * pi2_src[3])
    192 
    193     smull       v7.4s, v21.4h, v4.4h[2]     // 74 * pi2_src[0]
    194     smlsl       v7.4s, v16.4h, v4.4h[2]     // 74 * pi2_src[0] - 74 * pi2_src[2]
    195     smlal       v7.4s, v17.4h, v4.4h[2]     //pi2_out[2] = 74 * pi2_src[0] - 74 * pi2_src[2] + 74 * pi2_src[3]
    196     ld1         {v19.s}[0],[x2],x5
    197 
    198     smull       v20.4s, v16.4h, v4.4h[1]    // 55 * pi2_src[2]
    199     smlsl       v20.4s, v22.4h, v4.4h[2]    //  - 74 * pi2_src[1] +   55 * pi2_src[2]
    200     smlsl       v20.4s, v17.4h, v4.4h[0]    // - 74 * pi2_src[1] +   55 * pi2_src[2]    - 29 * pi2_src[3]
    201     smlal       v20.4s, v21.4h, v4.4h[3]    //pi2_out[3] = 84 * pi2_src[0] - 74 * pi2_src[1] + 55 * pi2_src[2] - 29 * pi2_src[3]
    202 
    203     sqrshrn     v28.4h, v6.4s,#shift_stage2_idct // (pi2_out[0] + rounding ) >> shift_stage1_idct
    204     sqrshrn     v29.4h, v5.4s,#shift_stage2_idct // (pi2_out[1] + rounding ) >> shift_stage1_idct
    205     sqrshrn     v30.4h, v7.4s,#shift_stage2_idct // (pi2_out[2] + rounding ) >> shift_stage1_idct
    206     sqrshrn     v31.4h, v20.4s,#shift_stage2_idct // (pi2_out[3] + rounding ) >> shift_stage1_idct
    207     ld1         {v19.s}[1],[x2],x5
    208     trn1        v24.4h, v28.4h, v29.4h
    209     trn2        v25.4h, v28.4h, v29.4h
    210     trn1        v26.4h, v30.4h, v31.4h
    211     trn2        v27.4h, v30.4h, v31.4h
    212     trn1        v0.2s, v24.2s, v26.2s
    213     trn2        v2.2s, v24.2s, v26.2s
    214     trn1        v1.2s, v25.2s, v27.2s
    215     trn2        v3.2s, v25.2s, v27.2s
    216     // output in d0,d1,d2,d3
    217     // second stage computation ends
    218 
    219     // loading pred
    220     mov         v0.d[1],v1.d[0]
    221     mov         v2.d[1],v3.d[0]
    222 
    223     uaddw       v0.8h,  v0.8h ,  v18.8b     // pi2_out(16bit) + pu1_pred(8bit)
    224     sqxtun      v0.8b, v0.8h                // clip_u8(pi2_out(16bit) + pu1_pred(8bit))
    225     uaddw       v2.8h,  v2.8h ,  v19.8b     // pi2_out(16bit) + pu1_pred(8bit)
    226     sqxtun      v1.8b, v2.8h                // clip_u8(pi2_out(16bit) + pu1_pred(8bit))
    227 
    228     // storing destination
    229     st1         {v0.s}[0],[x3],x6
    230     st1         {v0.s}[1],[x3],x6
    231     st1         {v1.s}[0],[x3],x6
    232     st1         {v1.s}[1],[x3],x6
    233 
    234     // ldmfd sp!,{x4-x12,x15}            //reload the registers from sp
    235     ldp         x19, x20,[sp],#16
    236 
    237     ret
    238 
    239 
    240 
    241 
    242 
    243 
    244 
    245 
    246 
    247