Home | History | Annotate | Download | only in armv8
      1 //******************************************************************************
      2 //*
      3 //* Copyright (C) 2015 The Android Open Source Project
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************
     18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 ///**
     21 // *******************************************************************************
     22 // * @file
     23 // *  ih264_ihadamard_scaling_av8.s
     24 // *
     25 // * @brief
     26 // *  Contains function definitions for inverse hadamard transform on 4x4 DC outputs
     27 // *  of 16x16 intra-prediction
     28 // *
     29 // * @author
     30 // *  Mohit
     31 // *
     32 // * @par List of Functions:
     33 // *  - ih264_ihadamard_scaling_4x4_av8()
     34 // *
     35 // * @remarks
     36 // *  None
     37 // *
     38 .include "ih264_neon_macros.s"
     39 
     40 // *******************************************************************************
     41 // */
     42 // * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
     43 // * of a 16x16 intra prediction macroblock, and then performs scaling.
     44 // * prediction buffer
     45 // *
     46 // * @par Description:
     47 // *  The DC coefficients pass through a 2-stage inverse hadamard transform.
     48 // *  The inverse-transformed content is then scaled based on the Qp value.
     49 // *
     50 // * @param[in] pi2_src
     51 // *  input 4x4 block of DC coefficients
     52 // *
     53 // * @param[out] pi2_out
     54 // *  output 4x4 block
     55 // *
     56 // * @param[in] pu2_iscal_mat
     57 // *  pointer to scaling list
     58 // *
     59 // * @param[in] pu2_weigh_mat
     60 // *  pointer to weight matrix
     61 // *
     62 // * @param[in] u4_qp_div_6
     63 // *  Floor (qp/6)
     64 // *
     65 // * @param[in] pi4_tmp
     66 // * temporary buffer of size 1*16
     67 // *
     68 // * @returns none
     69 // *
     70 // * @remarks none
     71 // *
     72 // *******************************************************************************
     73 // */
     74 // *
     75 // *******************************************************************************
     76 // */
     77 // void ih264_ihadamard_scaling_4x4(word16* pi2_src,
     78 //        word16* pi2_out,
     79 //        const uword16 *pu2_iscal_mat,
     80 //        const uword16 *pu2_weigh_mat,
     81 //        uword32 u4_qp_div_6,
     82 //        word32* pi4_tmp)
     83 //**************variables vs registers*****************************************
     84 //x0 => *pi2_src
     85 //x1 => *pi2_out
     86 //x2 => *pu2_iscal_mat
     87 //x3 => *pu2_weigh_mat
     88 //x4=>   u4_qp_div_6
     89 
     90 .text
     91 .p2align 2
     92 
     93     .global ih264_ihadamard_scaling_4x4_av8
     94 ih264_ihadamard_scaling_4x4_av8:
     95 
     96 //only one shift is done in horizontal inverse because,
     97 //if u4_qp_div_6 is lesser than 4 then shift value will be neagative and do negative left shift, in this case rnd_factor has value
     98 //if u4_qp_div_6 is greater than 4 then shift value will be positive and do left shift, here rnd_factor is 0
     99     push_v_regs
    100 
    101 //=======================inverse hadamard transform================================
    102 
    103     ld4       {v0.4h-v3.4h}, [x0]       //load x4,x5,x6,x7
    104 
    105     dup       v14.4s, w4                // populate the u4_qp_div_6
    106     ld1       {v15.h}[0], [x3]          // pu2_weigh_mat
    107     ld1       {v16.h}[0], [x2]          //pu2_iscal_mat
    108 
    109     saddl     v4.4s, v0.4h, v3.4h       //x0 = x4 + x7
    110     saddl     v5.4s, v1.4h, v2.4h       //x1 = x5 + x6
    111     ssubl     v6.4s, v1.4h, v2.4h       //x2 = x5 - x6
    112     ssubl     v7.4s, v0.4h, v3.4h       //x3 = x4 - x7
    113 
    114     add       v0.4s, v4.4s, v5.4s       //pi4_tmp_ptr[0] = x0 + x1
    115     add       v1.4s, v7.4s, v6.4s       //pi4_tmp_ptr[1] = x3 + x2
    116     sub       v2.4s, v4.4s, v5.4s       //pi4_tmp_ptr[2] = x0 - x1
    117     sub       v3.4s, v7.4s, v6.4s       //pi4_tmp_ptr[3] = x3 - x2
    118 
    119     umull     v15.4s, v15.4h, v16.4h
    120     dup       v15.4s, v15.s[0]          //pu2_weigh_mat[0]*pu2_iscal_mat[0]
    121 
    122     //transpose
    123     trn1      v4.4s, v0.4s, v1.4s
    124     trn2      v5.4s, v0.4s, v1.4s
    125     trn1      v6.4s, v2.4s, v3.4s
    126     trn2      v7.4s, v2.4s, v3.4s
    127 
    128     trn1      v0.2d, v4.2d, v6.2d
    129     trn2      v2.2d, v4.2d, v6.2d
    130     trn1      v1.2d, v5.2d, v7.2d
    131     trn2      v3.2d, v5.2d, v7.2d
    132     //end transpose
    133 
    134     add       v4.4s, v0.4s, v3.4s       //x0 = x4+x7
    135     add       v5.4s, v1.4s, v2.4s       //x1 = x5+x6
    136     sub       v6.4s, v1.4s, v2.4s       //x2 = x5-x6
    137     sub       v7.4s, v0.4s, v3.4s       //x3 = x4-x7
    138 
    139     add       v0.4s, v4.4s, v5.4s       //pi4_tmp_ptr[0] = x0 + x1
    140     add       v1.4s, v7.4s, v6.4s       //pi4_tmp_ptr[1] = x3 + x2
    141     sub       v2.4s, v4.4s, v5.4s       //pi4_tmp_ptr[2] = x0 - x1
    142     sub       v3.4s, v7.4s, v6.4s       //pi4_tmp_ptr[3] = x3 - x2
    143 
    144     mul       v0.4s, v0.4s, v15.4s      // q0  = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
    145     mul       v1.4s, v1.4s, v15.4s      // q1  = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
    146     mul       v2.4s, v2.4s, v15.4s      // q2  = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
    147     mul       v3.4s, v3.4s, v15.4s      // q3  = p[i] = (x[i] * trns_coeff[i]) where i = 12..15
    148 
    149     sshl      v0.4s, v0.4s, v14.4s      // q0  = q[i] = (p[i] << (qp/6)) where i = 0..3
    150     sshl      v1.4s, v1.4s, v14.4s      // q1  = q[i] = (p[i] << (qp/6)) where i = 4..7
    151     sshl      v2.4s, v2.4s, v14.4s      // q2  = q[i] = (p[i] << (qp/6)) where i = 8..11
    152     sshl      v3.4s, v3.4s, v14.4s      // q3  = q[i] = (p[i] << (qp/6)) where i = 12..15
    153 
    154     sqrshrn   v0.4h, v0.4s, #6          // d0  = c[i] = ((q[i] + 32) >> 4) where i = 0..3
    155     sqrshrn   v1.4h, v1.4s, #6          // d1  = c[i] = ((q[i] + 32) >> 4) where i = 4..7
    156     sqrshrn   v2.4h, v2.4s, #6          // d2  = c[i] = ((q[i] + 32) >> 4) where i = 8..11
    157     sqrshrn   v3.4h, v3.4s, #6          // d3  = c[i] = ((q[i] + 32) >> 4) where i = 12..15
    158 
    159     st1       {v0.4h-v3.4h}, [x1]       //store the result
    160 
    161     pop_v_regs
    162     ret
    163 
    164 
    165 // *******************************************************************************
    166 // */
    167 // * @brief This function performs a 2x2 inverse hadamard transform for chroma block
    168 // *
    169 // * @par Description:
    170 // *  The DC coefficients pass through a 2-stage inverse hadamard transform.
    171 // *  The inverse-transformed content is then scaled based on the Qp value.
    172 // *  The DC blocks of both the U and V planes are processed.
    173 // *
    174 // * @param[in] pi2_src
    175 // *  input 1x8 block of coefficients. The first 4 are from U and the next 4 from V
    176 // *
    177 // * @param[out] pi2_out
    178 // *  output 1x8 block
    179 // *
    180 // * @param[in] pu2_iscal_mat
    181 // *  pointer to scaling list
    182 // *
    183 // * @param[in] pu2_weigh_mat
    184 // *  pointer to weight matrix
    185 // *
    186 // * @param[in] u4_qp_div_6
    187 // *  Floor (qp/6)
    188 // *
    189 // * @returns none
    190 // *
    191 // * @remarks none
    192 // *
    193 // *******************************************************************************
    194 // */
    195 // *
    196 // *******************************************************************************
    197 // */
    198 // void ih264_ihadamard_scaling_2x2_uv(WORD16* pi2_src,
    199 //                                  WORD16* pi2_out,
    200 //                                  const UWORD16 *pu2_iscal_mat,
    201 //                                  const UWORD16 *pu2_weigh_mat,
    202 //                                  UWORD32 u4_qp_div_6,
    203 
    .global ih264_ihadamard_scaling_2x2_uv_av8
ih264_ihadamard_scaling_2x2_uv_av8:

// 2x2 inverse Hadamard transform plus scaling of the chroma DC coefficients.
// The four U and the four V coefficients are processed together in one pass.
//
// Register arguments on entry:
//   x0 : pi2_src        - 8 x 16-bit inputs: x4 x5 x6 x7 (U) then y4 y5 y6 y7 (V)
//   x1 : pi2_out        - 8 x 16-bit outputs, same U-then-V layout (16 bytes)
//   x2 : pu2_iscal_mat  - only element [0] is read
//   x3 : pu2_weigh_mat  - only element [0] is read
//   w4 : u4_qp_div_6    - overwritten with (u4_qp_div_6 - 5)
//
// Per element, with t = butterfly output:
//   out = low 16 bits of ((t * iscal[0] * weigh[0]) shifted by (u4_qp_div_6 - 5))
// A negative shift count makes sshl perform an arithmetic right shift
// (no rounding); the final narrowing (xtn) does not saturate.
//
// NOTE(review): only v0-v4, v26-v28 and v30 are written here, none of which is
// callee-saved under AAPCS64, so push_v_regs / pop_v_regs (defined in
// ih264_neon_macros.s, not visible here) look unnecessary. They are kept as a
// balanced pair to leave stack behaviour untouched.

    push_v_regs

    ld1       {v26.h}[0], [x2]          // v26.h[0] = pu2_iscal_mat[0]
    ld1       {v27.h}[0], [x3]          // v27.h[0] = pu2_weigh_mat[0]

    sub       w4, w4, #5                // net shift = u4_qp_div_6 - 5
    dup       v28.4s, w4                // same shift count in every 32-bit lane

    ld2       {v0.4h, v1.4h}, [x0]      // de-interleave the 8 DC coefficients:
                                        //   v0 = i2_x4, i2_x6, i2_y4, i2_y6
                                        //   v1 = i2_x5, i2_x7, i2_y5, i2_y7

    saddl     v2.4s, v0.4h, v1.4h       // v2 = i4_x0, i4_x2, i4_y0, i4_y2  (x0 = x4 + x5, x2 = x6 + x7)
    ssubl     v4.4s, v0.4h, v1.4h       // v4 = i4_x1, i4_x3, i4_y1, i4_y3  (x1 = x4 - x5, x3 = x6 - x7)

    umull     v30.4s, v26.4h, v27.4h    // lane 0 = pu2_iscal_mat[0] * pu2_weigh_mat[0]
    dup       v30.4s, v30.s[0]          // broadcast; lanes 1..3 of the product were don't-care

    trn1      v0.4s, v2.4s, v4.4s       // v0 = i4_x0, i4_x1, i4_y0, i4_y1
    trn2      v1.4s, v2.4s, v4.4s       // v1 = i4_x2, i4_x3, i4_y2, i4_y3

    add       v2.4s, v0.4s, v1.4s       // v2 = x4, x5, y4, y5  (x4 = x0 + x2, x5 = x1 + x3)
    sub       v3.4s, v0.4s, v1.4s       // v3 = x6, x7, y6, y7  (x6 = x0 - x2, x7 = x1 - x3)

    mul       v2.4s, v2.4s, v30.4s      // apply iscal[0] * weigh[0]
    mul       v3.4s, v3.4s, v30.4s

    sshl      v2.4s, v2.4s, v28.4s      // shift by (u4_qp_div_6 - 5)
    sshl      v3.4s, v3.4s, v28.4s

    xtn       v0.4h, v2.4s              // v0.4h = x4 x5 y4 y5; upper 64 bits of v0 become zero
    xtn       v1.4h, v3.4s              // v1.4h = x6 x7 y6 y7; upper 64 bits of v1 become zero

    // Interleave 32-bit pairs: (x4 x5)(x6 x7)(y4 y5)(y6 y7).
    // The .2s arrangement stores exactly 16 bytes, i.e. the 8 output halfwords.
    // A .4s arrangement would store 32 bytes and write 16 bytes of zeros past
    // the end of the 1x8 output block.
    st2       {v0.2s, v1.2s}, [x1]

    pop_v_regs
    ret
    248 
    249 
    250 
    251