Home | History | Annotate | Download | only in armv8
      1 //******************************************************************************
      2 //*
      3 //* Copyright (C) 2015 The Android Open Source Project
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************
     18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 ///**
     21 // *******************************************************************************
     22 // * @file
     23 // *  ih264_iquant_itrans_recon_dc_av8.s
     24 // *
     25 // * @brief
     26 // *  Contains function definitions for single stage  inverse transform
     27 // *
     28 // * @author
     29 // *  Mohit
     30 // *
     31 // * @par List of Functions:
     32 // *  - ih264_iquant_itrans_recon_4x4_dc_av8()
     33 // *     - ih264_iquant_itrans_recon_8x8_dc_av8()
     34 // *  - ih264_iquant_itrans_recon_chroma_4x4_dc_av8()
     35 // *
     36 // * @remarks
     37 // *  None
     38 // *
     39 // *******************************************************************************
     40 //*/
     41 
     42 
     43 .include "ih264_neon_macros.s"
     44 
     45 
     46 ///**
     47 // *******************************************************************************
     48 // *
     49 // * @brief
     50 // *  This function performs inverse quant and Inverse transform type Ci4 for 4*4 block
     51 // *     for dc input pattern only, i.e. only the (0,0) element of the input 4x4 block is
     52 // *  non-zero. For complete function, refer ih264_iquant_itrans_recon_a9.s
     53 // *
     54 // * @par Description:
     55 // *  Performs inverse transform Ci4 and adds the residue to get the
     56 // *  reconstructed block
     57 // *
     58 // * @param[in] pi2_src
     59 // *  Input 4x4 coefficients
     60 // *
     61 // * @param[in] pu1_pred
     62 // *  Prediction 4x4 block
     63 // *
     64 // * @param[out] pu1_out
     65 // *  Output 4x4 block
     66 // *
     67 // * @param[in] u4_qp_div_6
     68 // *     QP
     69 // *
     70 // * @param[in] pu2_weigh_mat
     71 // * Pointer to weight matrix
     72 // *
     73 // * @param[in] pred_strd,
     74 // *  Prediction stride
     75 // *
     76 // * @param[in] out_strd
     77 // *  Output Stride
     78 // *
// * @param[in] pi4_tmp
// * temporary buffer of size 1*16 (not accessed by this routine)
     81 // *
     82 // * @param[in] pu2_iscal_mat
     83 // * Pointer to the inverse quantization matrix
     84 // *
     85 // * @returns  Void
     86 // *
     87 // * @remarks
     88 // *  None
     89 // *
     90 // *******************************************************************************
     91 // */
     92 //void ih264_iquant_itrans_recon_4x4_dc(WORD16 *pi2_src,
     93 //                                    UWORD8 *pu1_pred,
     94 //                                    UWORD8 *pu1_out,
     95 //                                    WORD32 pred_strd,
     96 //                                    WORD32 out_strd,
     97 //                                    const UWORD16 *pu2_iscal_mat,
     98 //                                    const UWORD16 *pu2_weigh_mat,
     99 //                                    UWORD32 u4_qp_div_6,
    100 //                                    WORD32 *pi4_tmp,
//                                    WORD32 iq_start_idx,
//                                    WORD16 *pi2_dc_ld_addr)
    103 //**************Variables Vs Registers*****************************************
    104 //x0 => *pi2_src
    105 //x1 => *pu1_pred
    106 //x2 => *pu1_out
    107 //w3 =>  pred_strd
    108 //w4 =>  out_strd
    109 //x5 => *pu2_iscal_mat
    110 //x6 => *pu2_weigh_mat
    111 //w7 =>  u4_qp_div_6
//[sp, #0]  =>  pi4_tmp (not read)
//[sp, #8]  =>  iq_start_idx
//[sp, #16] => *pi2_dc_ld_addr
    115 
    116 .text
    117 .p2align 2
    118 
    .global ih264_iquant_itrans_recon_4x4_dc_av8
ih264_iquant_itrans_recon_4x4_dc_av8:

    // Register roles inside this routine:
    //   x0  pi2_src           x1  pu1_pred         x2  pu1_out
    //   x3  pred_strd         x4  out_strd         x5  pu2_iscal_mat
    //   x6  pu2_weigh_mat     w7  u4_qp_div_6
    //   w8  iq_start_idx - 1  x10 pi2_dc_ld_addr
    //   v0  dc term           v1, v2 pixel rows    v30 per-lane shift count

    // The two stack arguments are read while sp still has its entry value,
    // i.e. ahead of push_v_regs.
    ldr       x10, [sp, #16]            // x10 = pi2_dc_ld_addr
    ldr       w8, [sp, #8]              // w8  = iq_start_idx
    sxtw      x4, w4                    // widen the signed 32-bit strides so they
    sxtw      x3, w3                    // can be used as 64-bit post-increments
    subs      w8, w8, #1                // Z set <=> iq_start_idx == 1

    // The conditional branch below consumes the flags written by subs, so
    // nothing from here to that branch may modify NZCV.
    // NOTE(review): this routine only touches v0-v2 and v30, which AAPCS64
    // lists as caller-saved; push_v_regs/pop_v_regs presumably save d8-d15 and
    // may be unnecessary here - confirm against ih264_neon_macros.s.
    push_v_regs
    dup       v30.4s, w7                // v30 = u4_qp_div_6 in every 32-bit lane

    b.ne      luma_4x4_dc_dequant

    // iq_start_idx == 1: the dc term is taken unmodified from pi2_dc_ld_addr.
    ld1       {v0.h}[0], [x10]
    b         luma_4x4_dc_recon

luma_4x4_dc_dequant:
    // iq_start_idx != 1: rebuild the dc term from the first coefficient and the
    // first entries of the two scaling tables.
    ld1       {v2.h}[0], [x0]           // pi2_src[0]
    ld1       {v1.h}[0], [x6]           // pu2_weigh_mat[0]
    ld1       {v0.h}[0], [x5]           // pu2_iscal_mat[0]
    mul       v0.4h, v0.4h, v1.4h       // 16-bit product iscal * weigh
    smull     v0.4s, v2.4h, v0.4h       // widen to 32 bits: coeff * (iscal * weigh)
    sshl      v0.4s, v0.4s, v30.4s      // shift each lane by the count in v30
    sqrshrn   v0.4h, v0.4s, #4          // rounding, saturating narrow by 4 bits

luma_4x4_dc_recon:
    // Only lane 0 is meaningful on either path; replicate it and apply the
    // final rounding shift by 6.
    dup       v0.8h, v0.h[0]
    srshr     v0.8h, v0.8h, #6

    // Four prediction rows of four bytes each, packed two rows per register.
    ld1       {v1.s}[0], [x1], x3
    ld1       {v1.s}[1], [x1], x3
    ld1       {v2.s}[0], [x1], x3
    ld1       {v2.s}[1], [x1]

    // residue + zero-extended prediction, then clamp to unsigned 8 bits.
    uaddw     v1.8h, v0.8h, v1.8b
    uaddw     v2.8h, v0.8h, v2.8b
    sqxtun    v1.8b, v1.8h
    sqxtun    v2.8b, v2.8h

    // Write the four reconstructed rows; every load above has already completed.
    st1       {v1.s}[0], [x2], x4
    st1       {v1.s}[1], [x2], x4
    st1       {v2.s}[0], [x2], x4
    st1       {v2.s}[1], [x2]

    pop_v_regs
    ret
    170 
    171 // /*
    172 // ********************************************************************************
    173 // *
// * @brief This function reconstructs a 4x4 sub block from quantized residue and
// * prediction buffer if only dc value is present for residue
    176 // *
    177 // * @par Description:
    178 // *  The quantized residue is first inverse quantized,
    179 // *  This inverse quantized content is added to the prediction buffer to recon-
    180 // *  struct the end output
    181 // *
    182 // * @param[in] pi2_src
// *  quantized dc coefficient
    184 // *
    185 // * @param[in] pu1_pred
    186 // *  prediction 4x4 block in interleaved format
    187 // *
    188 // * @param[in] pred_strd,
    189 // *  Prediction buffer stride in interleaved format
    190 // *
    191 // * @param[in] out_strd
    192 // *  recon buffer Stride
    193 // *
    194 // * @returns none
    195 // *
    196 // * @remarks none
    197 // *
    198 // *******************************************************************************
    199 // */
    200 // void ih264_iquant_itrans_recon_chroma_4x4_dc(WORD16 *pi2_src,
    201 //                                             UWORD8 *pu1_pred,
    202 //                                             UWORD8 *pu1_out,
    203 //                                             WORD32 pred_strd,
    204 //                                             WORD32 out_strd,
    205 //                                             const UWORD16 *pu2_iscal_mat,
    206 //                                             const UWORD16 *pu2_weigh_mat,
    207 //                                             UWORD32 u4_qp_div_6,
    208 //                                             WORD16 *pi2_tmp,
    209 //                                             WORD16 *pi2_dc_src)
    210 // Register Usage
    211 // x0 : pi2_src
    212 // x1 : pu1_pred
    213 // x2 : pu1_out
    214 // w3 : pred_strd
    215 // w4 : out_strd
    216 // x5 : pu2_iscal_mat
    217 // x6 : pu2_weigh_mat
    218 // w7 : u4_qp_div_6
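// [sp, #0] : pi2_tmp (not read)
// [sp, #8] : pi2_dc_src
// Neon registers v0-v2, v11, v12 and v31 are modified
// push_v_regs/pop_v_regs bracket the body; v11 and v12 lie in the v8-v15 range
// that AAPCS64 requires a callee to preserve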
    223 
    224 
    .global ih264_iquant_itrans_recon_chroma_4x4_dc_av8
ih264_iquant_itrans_recon_chroma_4x4_dc_av8:

    // pi2_src (incoming x0) is not used; x0 is reloaded with the tenth
    // argument, read from the stack ahead of push_v_regs.
    ldr       x0, [sp, #8]              // x0 = pi2_dc_src
    sxtw      x4, w4                    // widen the signed 32-bit strides so they
    sxtw      x3, w3                    // can be used as 64-bit post-increments
    push_v_regs

    // Broadcast the dc term to all eight 16-bit lanes and apply the final
    // rounding shift by 6.
    ld1r      {v0.8h}, [x0]
    srshr     v0.8h, v0.8h, #6

    movi      v31.8h, #0x00ff           // selects the low byte of every halfword
    mov       x0, x2                    // x0: read cursor over pu1_out, x2: write cursor

    // Four prediction rows of eight bytes each, two rows per register.
    ld1       {v1.d}[0], [x1], x3
    ld1       {v1.d}[1], [x1], x3
    ld1       {v2.d}[0], [x1], x3
    ld1       {v2.d}[1], [x1], x3

    // Current contents of the four output rows; the odd bytes are written back
    // unchanged below.
    ld1       {v11.d}[0], [x0], x4
    ld1       {v11.d}[1], [x0], x4
    ld1       {v12.d}[0], [x0], x4
    ld1       {v12.d}[1], [x0]

    // Keep only the even-numbered prediction bytes (low byte of each halfword).
    xtn       v1.8b, v1.8h
    xtn       v2.8b, v2.8h

    // residue + zero-extended prediction, clamped to unsigned 8 bits.
    uaddw     v1.8h, v0.8h, v1.8b
    uaddw     v2.8h, v0.8h, v2.8b
    sqxtun    v1.8b, v1.8h
    sqxtun    v2.8b, v2.8h

    // Spread the results back to one byte per halfword so they line up with
    // the mask in v31.
    uxtl      v1.8h, v1.8b
    uxtl      v2.8h, v2.8b

    // Insert the new bytes where the mask is set; the other byte of each pair
    // keeps the value that was loaded from pu1_out.
    bit       v11.16b, v1.16b, v31.16b
    bit       v12.16b, v2.16b, v31.16b

    st1       {v11.d}[0], [x2], x4
    st1       {v11.d}[1], [x2], x4
    st1       {v12.d}[0], [x2], x4
    st1       {v12.d}[1], [x2]

    pop_v_regs
    ret
    274 
    275 ///*
    276 // *******************************************************************************
    277 // *
    278 // * //brief
// *  This function performs inverse quant and Inverse transform type Ci8 for 8*8 block
    280 // *   [Only for Dc coeff]
    281 // * //par Description:
    282 // *  Performs inverse transform Ci8 and adds the residue to get the
    283 // *  reconstructed block
    284 // *
// * //param[in] pi2_src
// *  Input 8x8 coefficients
// *
// * //param[in] pu1_pred
// *  Prediction 8x8 block
// *
// * //param[out] pu1_out
// *  Output 8x8 block
    293 // *
    294 // * //param[in] u4_qp_div_6
    295 // *     QP
    296 // *
    297 // * //param[in] pu2_weigh_mat
    298 // * Pointer to weight matrix
    299 // *
    300 // * //param[in] pred_strd,
    301 // *  Prediction stride
    302 // *
    303 // * //param[in] out_strd
    304 // *  Output Stride
    305 // *
    306 // *//param[in] pi2_tmp
    307 // * temporary buffer of size 1*64
    308 // *
    309 // * //param[in] pu2_iscal_mat
    310 // * Pointer to the inverse quantization matrix
    311 // *
    312 // * //returns  Void
    313 // *
    314 // * //remarks
    315 // *  None
    316 // *
    317 // *******************************************************************************
    318 // */
//void ih264_iquant_itrans_recon_8x8_dc(WORD16 *pi2_src,
//                                      UWORD8 *pu1_pred,
//                                      UWORD8 *pu1_out,
//                                      WORD32 pred_strd,
//                                      WORD32 out_strd,
//                                      const UWORD16 *pu2_iscal_mat,
//                                      const UWORD16 *pu2_weigh_mat,
//                                      UWORD32 u4_qp_div_6,
//                                      WORD32 *pi4_tmp,
//                                      WORD32 iq_start_idx,
//                                      WORD16 *pi2_dc_ld_addr)
    330 //**************Variables Vs Registers*****************************************
    331 //x0       => *pi2_src
    332 //x1       => *pu1_pred
    333 //x2       => *pu1_out
    334 //w3       =>  pred_strd
    335 //w4       =>  out_strd
    336 //x5       =>  *pu2_iscal_mat
    337 //x6       =>  *pu2_weigh_mat
    338 //w7       =>  u4_qp_div_6
    339 //NOT USED =>  pi4_tmp
    340 //NOT USED =>  iq_start_idx
    341 //NOT USED =>  pi2_dc_ld_addr
    342 
    .global ih264_iquant_itrans_recon_8x8_dc_av8
ih264_iquant_itrans_recon_8x8_dc_av8:

    // v8-v12 are written below and fall in the v8-v15 range that AAPCS64
    // requires a callee to preserve, hence the push_v_regs/pop_v_regs pair.
    push_v_regs
    sxtw      x4, w4                    // widen the signed 32-bit strides so they
    sxtw      x3, w3                    // can be used as 64-bit post-increments

    // Eight prediction rows of eight bytes each.
    ld1       {v22.8b}, [x1], x3
    ld1       {v23.8b}, [x1], x3
    ld1       {v24.8b}, [x1], x3
    ld1       {v25.8b}, [x1], x3
    ld1       {v26.8b}, [x1], x3
    ld1       {v27.8b}, [x1], x3
    ld1       {v28.8b}, [x1], x3
    ld1       {v29.8b}, [x1]

    // Inverse-quantise the dc coefficient; only lane 0 carries meaningful data.
    dup       v3.4s, w7                 // v3 = u4_qp_div_6 in every 32-bit lane
    ld1       {v0.h}[0], [x0]           // pi2_src[0]
    ld1       {v2.h}[0], [x6]           // pu2_weigh_mat[0]
    ld1       {v1.h}[0], [x5]           // pu2_iscal_mat[0]

    mul       v1.8h, v2.8h, v1.8h       // 16-bit product iscal * weigh
    smull     v0.4s, v1.4h, v0.4h       // widen to 32 bits: coeff * (iscal * weigh)
    sshl      v0.4s, v0.4s, v3.4s       // shift each lane by the count in v3
    sqrshrn   v0.4h, v0.4s, #6          // rounding, saturating narrow by 6 bits

    // Replicate lane 0 and apply the final rounding shift by 6.
    dup       v0.8h, v0.h[0]
    srshr     v0.8h, v0.8h, #6

    // Per row: residue + zero-extended prediction, then clamp to unsigned 8 bits.
    uaddw     v1.8h, v0.8h, v22.8b
    sqxtun    v1.8b, v1.8h
    uaddw     v2.8h, v0.8h, v23.8b
    sqxtun    v2.8b, v2.8h
    uaddw     v3.8h, v0.8h, v24.8b      // v3 is free again: the shift count is no longer needed
    sqxtun    v3.8b, v3.8h
    uaddw     v8.8h, v0.8h, v25.8b
    sqxtun    v8.8b, v8.8h
    uaddw     v9.8h, v0.8h, v26.8b
    sqxtun    v9.8b, v9.8h
    uaddw     v10.8h, v0.8h, v27.8b
    sqxtun    v10.8b, v10.8h
    uaddw     v11.8h, v0.8h, v28.8b
    sqxtun    v11.8b, v11.8h
    uaddw     v12.8h, v0.8h, v29.8b
    sqxtun    v12.8b, v12.8h

    // Write the eight reconstructed rows; every load above has already completed.
    st1       {v1.8b}, [x2], x4
    st1       {v2.8b}, [x2], x4
    st1       {v3.8b}, [x2], x4
    st1       {v8.8b}, [x2], x4
    st1       {v9.8b}, [x2], x4
    st1       {v10.8b}, [x2], x4
    st1       {v11.8b}, [x2], x4
    st1       {v12.8b}, [x2]

    pop_v_regs
    ret
    402 
    403 
    404