Home | History | Annotate | Download | only in armv8
      1 //******************************************************************************
      2 //*
      3 //* Copyright (C) 2015 The Android Open Source Project
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************
     18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 ///**
     21 // *******************************************************************************
     22 // * @file
     23 // *  ih264_iquant_itrans_recon_dc_av8.s
     24 // *
     25 // * @brief
     26 // *  Contains function definitions for single stage  inverse transform
     27 // *
     28 // * @author
     29 // *  Mohit
     30 // *
     31 // * @par List of Functions:
     32 // *  - ih264_iquant_itrans_recon_4x4_dc_av8()
     33 // *     - ih264_iquant_itrans_recon_8x8_dc_av8()
     34 // *  - ih264_iquant_itrans_recon_chroma_4x4_dc_av8()
     35 // *
     36 // * @remarks
     37 // *  None
     38 // *
     39 // *******************************************************************************
     40 //*/
     41 
     42 
     43 .include "ih264_neon_macros.s"
     44 
     45 
     46 ///**
     47 // *******************************************************************************
     48 // *
     49 // * @brief
// *  This function performs inverse quant and inverse transform type Ci4 for a
// *  4x4 block for the DC-only input pattern, i.e. only the (0,0) element of the
// *  input 4x4 block is non-zero. For the complete function, refer
// *  ih264_iquant_itrans_recon_a9.s
     53 // *
     54 // * @par Description:
     55 // *  Performs inverse transform Ci4 and adds the residue to get the
     56 // *  reconstructed block
     57 // *
     58 // * @param[in] pi2_src
     59 // *  Input 4x4 coefficients
     60 // *
     61 // * @param[in] pu1_pred
     62 // *  Prediction 4x4 block
     63 // *
     64 // * @param[out] pu1_out
     65 // *  Output 4x4 block
     66 // *
// * @param[in] u4_qp_div_6
// *  QP / 6; applied as the shift count after the coefficient is scaled
     69 // *
     70 // * @param[in] pu2_weigh_mat
     71 // * Pointer to weight matrix
     72 // *
     73 // * @param[in] pred_strd,
     74 // *  Prediction stride
     75 // *
     76 // * @param[in] out_strd
     77 // *  Output Stride
     78 // *
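// * @param[in] pi4_tmp
// *  Temporary buffer of size 1*16 (not accessed by this DC-only routine)
// *
// * @param[in] pu2_iscal_mat
// *  Pointer to the inverse quantization matrix
// *
// * @param[in] iq_start_idx
// *  When equal to 1, the DC value is taken as-is from pi2_dc_ld_addr and the
// *  inverse quantization of pi2_src[0] is skipped
// *
// * @param[in] pi2_dc_ld_addr
// *  Address of the alternate DC value; read only when iq_start_idx is 1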
     84 // *
     85 // * @returns  Void
     86 // *
     87 // * @remarks
     88 // *  None
     89 // *
     90 // *******************************************************************************
     91 // */
     92 //void ih264_iquant_itrans_recon_4x4_dc(WORD16 *pi2_src,
     93 //                                    UWORD8 *pu1_pred,
     94 //                                    UWORD8 *pu1_out,
     95 //                                    WORD32 pred_strd,
     96 //                                    WORD32 out_strd,
     97 //                                    const UWORD16 *pu2_iscal_mat,
     98 //                                    const UWORD16 *pu2_weigh_mat,
     99 //                                    UWORD32 u4_qp_div_6,
    100 //                                    WORD32 *pi4_tmp,
//                                    WORD32 iq_start_idx,
    102 //                                   WORD16 *pi2_dc_ld_addr)
    103 //**************Variables Vs Registers*****************************************
    104 //x0 => *pi2_src
    105 //x1 => *pu1_pred
    106 //x2 => *pu1_out
    107 //x3 =>  pred_strd
    108 //x4 =>  out_strd
    109 //x5 => *pu2_iscal_mat
    110 //x6 => *pu2_weigh_mat
    111 //x7 =>  u4_qp_div_6
//[sp, #0] (presumed) =>  pi4_tmp (never read by this routine)
//[sp, #8]            =>  iq_start_idx
//[sp, #16]           => *pi2_dc_ld_addr
    115 
    116 .text
    117 .p2align 2
    118 
    .global ih264_iquant_itrans_recon_4x4_dc_av8
ih264_iquant_itrans_recon_4x4_dc_av8:

    // Both stack-passed arguments are fetched before push_v_regs, i.e. while
    // sp still has its value at function entry.
    ldr       x10, [sp, #16]            // x10 = pi2_dc_ld_addr
    ldr       w8, [sp, #8]              // w8  = iq_start_idx
    cmp       w8, #1                    // Z set <=> iq_start_idx == 1; the flags
                                        // are consumed after push_v_regs/dup
    push_v_regs
    dup       v30.4s, w7                // every lane = u4_qp_div_6 (shift count)

    b.ne      .Lluma_4x4_dc_dequant

    // iq_start_idx == 1: take the DC value directly from pi2_dc_ld_addr.
    ld1       {v0.h}[0], [x10]
    b         .Lluma_4x4_dc_recon

.Lluma_4x4_dc_dequant:
    // iq_start_idx != 1: dequantise pi2_src[0]. Only lane 0 of each vector
    // carries meaningful data; the remaining lanes are don't-care.
    ld1       {v0.h}[0], [x0]           // pi2_src[0]
    ld1       {v1.h}[0], [x5]           // pu2_iscal_mat[0]
    ld1       {v2.h}[0], [x6]           // pu2_weigh_mat[0]
    mul       v1.4h, v1.4h, v2.4h       // iscal * weigh (16-bit product)
    smull     v0.4s, v0.4h, v1.4h       // widen to 32 bit: src * (iscal * weigh)
    sshl      v0.4s, v0.4s, v30.4s      // shift by u4_qp_div_6
    sqrshrn   v0.4h, v0.4s, #4          // rounding >> 4, saturate to 16 bit

.Lluma_4x4_dc_recon:
    dup       v0.8h, v0.h[0]            // broadcast the DC term to all 8 lanes
    srshr     v0.8h, v0.8h, #6          // residue = (dc + 32) >> 6

    // Gather the four 4-byte prediction rows: rows 0/1 in v1, rows 2/3 in v2.
    ld1       {v1.s}[0], [x1], x3
    ld1       {v1.s}[1], [x1], x3
    ld1       {v2.s}[0], [x1], x3
    ld1       {v2.s}[1], [x1]

    // residue + zero-extended prediction, in 16-bit lanes
    uaddw     v1.8h, v0.8h, v1.8b
    uaddw     v2.8h, v0.8h, v2.8b

    // saturate to unsigned 8 bit
    sqxtun    v1.8b, v1.8h
    sqxtun    v2.8b, v2.8h

    // Scatter the four reconstructed rows to pu1_out.
    st1       {v1.s}[0], [x2], x4
    st1       {v1.s}[1], [x2], x4
    st1       {v2.s}[0], [x2], x4
    st1       {v2.s}[1], [x2]

    pop_v_regs
    ret
    168 
    169 // /*
    170 // ********************************************************************************
    171 // *
// * @brief This function reconstructs a 4x4 sub block from quantized residue and
    173 // * prediction buffer if only dc value is present for residue
    174 // *
    175 // * @par Description:
    176 // *  The quantized residue is first inverse quantized,
    177 // *  This inverse quantized content is added to the prediction buffer to recon-
    178 // *  struct the end output
    179 // *
    180 // * @param[in] pi2_src
// *  quantized dc coefficient
    182 // *
    183 // * @param[in] pu1_pred
    184 // *  prediction 4x4 block in interleaved format
    185 // *
    186 // * @param[in] pred_strd,
    187 // *  Prediction buffer stride in interleaved format
    188 // *
    189 // * @param[in] out_strd
    190 // *  recon buffer Stride
    191 // *
    192 // * @returns none
    193 // *
    194 // * @remarks none
    195 // *
    196 // *******************************************************************************
    197 // */
    198 // void ih264_iquant_itrans_recon_chroma_4x4_dc(WORD16 *pi2_src,
    199 //                                             UWORD8 *pu1_pred,
    200 //                                             UWORD8 *pu1_out,
    201 //                                             WORD32 pred_strd,
    202 //                                             WORD32 out_strd,
    203 //                                             const UWORD16 *pu2_iscal_mat,
    204 //                                             const UWORD16 *pu2_weigh_mat,
    205 //                                             UWORD32 u4_qp_div_6,
    206 //                                             WORD16 *pi2_tmp,
    207 //                                             WORD16 *pi2_dc_src)
    208 // Register Usage
    209 // x0 : pi2_src
    210 // x1 : pu1_pred
    211 // x2 : pu1_out
    212 // x3 : pred_strd
    213 // x4 : out_strd
    214 // x5 : pu2_iscal_mat
    215 // x6 : pu2_weigh_mat
    216 // x7 : u4_qp_div_6
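// [sp, #0] (presumed) : pi2_tmp (never read by this routine)
// [sp, #8]            : pi2_dc_src
// NEON registers touched include v0-v2, v11, v12 and v31
// v11 and v12 lie in the AAPCS64 callee-saved range (v8-v15), so the body is
// bracketed by push_v_regs / pop_v_regs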
    221 
    222 
    .global ih264_iquant_itrans_recon_chroma_4x4_dc_av8
ih264_iquant_itrans_recon_chroma_4x4_dc_av8:

    // DC-only reconstruction for one plane of an interleaved chroma block.
    // Each 8-byte row holds four 2-byte pairs; this routine updates the
    // even-indexed byte of every pair in pu1_out and leaves the odd-indexed
    // byte as it was.
    //   x1 = pu1_pred, x2 = pu1_out, x3 = pred_strd, x4 = out_strd
    //   [sp, #8] = pi2_dc_src
    // The incoming x0 (pi2_src) and x5-x7 are not read.

    ldr       x0, [sp, #8]              // x0 = pi2_dc_src (fetched before push_v_regs)
    push_v_regs
    ld1       {v0.h}[0], [x0]           // DC value
    dup       v0.8h, v0.h[0]            // broadcast to all 8 lanes
    srshr     v0.8h, v0.8h, #6          // residue = (dc + 32) >> 6

    // x2 is advanced while the current output rows are read below, so keep
    // the row-0 address in x0 for the stores at the end.
    mov       x0, x2

    movi      v31.8h, #0x00ff           // selects the low byte of every halfword

    // Prediction: rows 0/1 in v1, rows 2/3 in v2 (8 bytes per row).
    ld1       {v1.d}[0], [x1], x3
    ld1       {v1.d}[1], [x1], x3
    ld1       {v2.d}[0], [x1], x3
    ld1       {v2.d}[1], [x1]           // last row: no post-increment needed

    // Current output rows; their odd-indexed bytes must survive.
    ld1       {v11.d}[0], [x2], x4
    ld1       {v11.d}[1], [x2], x4
    ld1       {v12.d}[0], [x2], x4
    ld1       {v12.d}[1], [x2]

    // Keep the even-indexed bytes (low byte of each halfword). xtn selects
    // the same bytes as a uzp1 would, but needs only one source register, so
    // no undefined register is read.
    xtn       v1.8b, v1.8h
    xtn       v2.8b, v2.8h

    // residue + zero-extended prediction, in 16-bit lanes
    uaddw     v1.8h, v0.8h, v1.8b
    uaddw     v2.8h, v0.8h, v2.8b

    // saturate to unsigned 8 bit
    sqxtun    v1.8b, v1.8h
    sqxtun    v2.8b, v2.8h

    // Widen again so each result sits in the low byte of a halfword, lined up
    // with the byte positions it came from.
    uxtl      v1.8h, v1.8b
    uxtl      v2.8h, v2.8b

    // Insert only the bits selected by the mask into the output rows.
    bit       v11.16b, v1.16b, v31.16b
    bit       v12.16b, v2.16b, v31.16b

    st1       {v11.d}[0], [x0], x4
    st1       {v11.d}[1], [x0], x4
    st1       {v12.d}[0], [x0], x4
    st1       {v12.d}[1], [x0]

    pop_v_regs
    ret
    270 
    271 ///*
    272 // *******************************************************************************
    273 // *
// * @brief
// *  This function performs inverse quant and inverse transform type Ci8 for an
// *  8x8 block [only for DC coeff]
// *
// * @par Description:
// *  Performs inverse transform Ci8 and adds the residue to get the
// *  reconstructed block
// *
// * @param[in] pi2_src
// *  Input 8x8 coefficients
// *
// * @param[in] pu1_pred
// *  Prediction 8x8 block
// *
// * @param[out] pu1_out
// *  Output 8x8 block
// *
// * @param[in] u4_qp_div_6
// *  QP / 6; applied as the shift count after the coefficient is scaled
// *
// * @param[in] pu2_weigh_mat
// *  Pointer to weight matrix
// *
// * @param[in] pred_strd,
// *  Prediction stride
// *
// * @param[in] out_strd
// *  Output Stride
// *
// * @param[in] pi4_tmp
// *  Temporary buffer of size 1*64 (not accessed by this DC-only routine)
// *
// * @param[in] pu2_iscal_mat
// *  Pointer to the inverse quantization matrix
// *
// * @returns  Void
// *
// * @remarks
// *  None
    312 // *
    313 // *******************************************************************************
    314 // */
    315 //void ih264_iquant_itrans_recon_dc_8x8(WORD16 *pi2_src,
    316 //                                   UWORD8 *pu1_pred,
    317 //                                   UWORD8 *pu1_out,
    318 //                                   WORD32 pred_strd,
    319 //                                   WORD32 out_strd,
    320 //                                   const UWORD16 *pu2_iscal_mat,
    321 //                                   const UWORD16 *pu2_weigh_mat,
    322 //                                   UWORD32 u4_qp_div_6,
    323 //                                   WORD32 *pi4_tmp,
//                                   WORD32 iq_start_idx,
    325 //                                   WORD16 *pi2_dc_ld_addr)
    326 //**************Variables Vs Registers*****************************************
    327 //x0       => *pi2_src
    328 //x1       => *pu1_pred
    329 //x2       => *pu1_out
    330 //x3       =>  pred_strd
    331 //x4       =>  out_strd
    332 //x5       =>  *pu2_iscal_mat
    333 //x6       =>  *pu2_weigh_mat
    334 //x7       =>  u4_qp_div_6
    335 //NOT USED =>  pi4_tmp
    336 //NOT USED =>  iq_start_idx
    337 //NOT USED =>  pi2_dc_ld_addr
    338 
    .global ih264_iquant_itrans_recon_8x8_dc_av8
ih264_iquant_itrans_recon_8x8_dc_av8:

    // DC-only 8x8 reconstruction:
    //   x0 = pi2_src, x1 = pu1_pred, x2 = pu1_out, x3 = pred_strd,
    //   x4 = out_strd, x5 = pu2_iscal_mat, x6 = pu2_weigh_mat, w7 = u4_qp_div_6
    // No stack-passed argument is read.

    push_v_regs                         // v8-v12 serve as scratch further down

    // ---- Dequantise the DC coefficient (only lane 0 carries data) ----------
    dup       v3.4s, w7                 // every lane = u4_qp_div_6 (shift count)
    ld1       {v0.h}[0], [x0]           // pi2_src[0]
    ld1       {v1.h}[0], [x5]           // pu2_iscal_mat[0]
    ld1       {v2.h}[0], [x6]           // pu2_weigh_mat[0]
    mul       v1.4h, v2.4h, v1.4h       // iscal * weigh (16-bit product)
    smull     v0.4s, v1.4h, v0.4h       // widen to 32 bit: src * (iscal * weigh)
    sshl      v0.4s, v0.4s, v3.4s       // shift by u4_qp_div_6
    sqrshrn   v0.4h, v0.4s, #6          // rounding >> 6, saturate to 16 bit

    dup       v0.8h, v0.h[0]            // broadcast the DC term to all 8 lanes
    srshr     v0.8h, v0.8h, #6          // residue = (dc + 32) >> 6

    // ---- Fetch all eight prediction rows before anything is stored ---------
    ld1       {v22.8b}, [x1], x3
    ld1       {v23.8b}, [x1], x3
    ld1       {v24.8b}, [x1], x3
    ld1       {v25.8b}, [x1], x3
    ld1       {v26.8b}, [x1], x3
    ld1       {v27.8b}, [x1], x3
    ld1       {v28.8b}, [x1], x3
    ld1       {v29.8b}, [x1]

    // ---- Rows 0-3: add the residue, then saturate to unsigned 8 bit --------
    uaddw     v1.8h, v0.8h, v22.8b
    uaddw     v2.8h, v0.8h, v23.8b
    uaddw     v3.8h, v0.8h, v24.8b
    uaddw     v8.8h, v0.8h, v25.8b
    sqxtun    v1.8b, v1.8h
    sqxtun    v2.8b, v2.8h
    sqxtun    v3.8b, v3.8h
    sqxtun    v8.8b, v8.8h

    // ---- Rows 4-7: same treatment ------------------------------------------
    uaddw     v9.8h, v0.8h, v26.8b
    uaddw     v10.8h, v0.8h, v27.8b
    uaddw     v11.8h, v0.8h, v28.8b
    uaddw     v12.8h, v0.8h, v29.8b
    sqxtun    v9.8b, v9.8h
    sqxtun    v10.8b, v10.8h
    sqxtun    v11.8b, v11.8h
    sqxtun    v12.8b, v12.8h

    // ---- Write the eight reconstructed rows --------------------------------
    st1       {v1.8b}, [x2], x4
    st1       {v2.8b}, [x2], x4
    st1       {v3.8b}, [x2], x4
    st1       {v8.8b}, [x2], x4
    st1       {v9.8b}, [x2], x4
    st1       {v10.8b}, [x2], x4
    st1       {v11.8b}, [x2], x4
    st1       {v12.8b}, [x2]

    pop_v_regs
    ret
    396 
    397 
    398