Home | History | Annotate | Download | only in arm
      1 @/******************************************************************************
      2 @ *
      3 @ * Copyright (C) 2015 The Android Open Source Project
      4 @ *
      5 @ * Licensed under the Apache License, Version 2.0 (the "License");
      6 @ * you may not use this file except in compliance with the License.
      7 @ * You may obtain a copy of the License at:
      8 @ *
      9 @ * http://www.apache.org/licenses/LICENSE-2.0
     10 @ *
     11 @ * Unless required by applicable law or agreed to in writing, software
     12 @ * distributed under the License is distributed on an "AS IS" BASIS,
     13 @ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @ * See the License for the specific language governing permissions and
     15 @ * limitations under the License.
     16 @ *
     17 @ *****************************************************************************
     18 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 @*/
     20 @**
     21 @ *******************************************************************************
     22 @ * @file
     23 @ *  ih264_iquant_itrans_recon_dc_a9.s
     24 @ *
     25 @ * @brief
     26 @ *  Contains function definitions for single stage  inverse transform
     27 @ *
     28 @ * @author
     29 @ *  Mohit
     30 @ *
     31 @ * @par List of Functions:
     32 @ *  - ih264_iquant_itrans_recon_4x4_dc_a9()
     33 @ *  - ih264_iquant_itrans_recon_8x8_dc_a9()
     34 @ *  - ih264_iquant_itrans_recon_chroma_4x4_dc_a9()
     35 @ *
     36 @ * @remarks
     37 @ *  None
     38 @ *
     39 @ *******************************************************************************
     40 @*
     41 @**
     42 @ *******************************************************************************
     43 @ *
     44 @ * @brief
     45 @ *  This function performs inverse quant and Inverse transform type Ci4 for 4*4 block
     46 @ *  for dc input pattern only, i.e. only the (0,0) element of the input 4x4 block is
     47 @ *  non-zero. For complete function, refer ih264_iquant_itrans_recon_a9.s
     48 @ *
     49 @ * @par Description:
     50 @ *  Performs inverse transform Ci4 and adds the residue to get the
     51 @ *  reconstructed block
     52 @ *
     53 @ * @param[in] pi2_src
     54 @ *  Input 4x4 coefficients
     55 @ *
     56 @ * @param[in] pu1_pred
     57 @ *  Prediction 4x4 block
     58 @ *
     59 @ * @param[out] pu1_out
     60 @ *  Output 4x4 block
     61 @ *
     62 @ * @param[in] u4_qp_div_6
     63 @ *     QP
     64 @ *
     65 @ * @param[in] pu2_weigh_mat
     66 @ * Pointer to weight matrix
     67 @ *
     68 @ * @param[in] pred_strd,
     69 @ *  Prediction stride
     70 @ *
     71 @ * @param[in] out_strd
     72 @ *  Output Stride
     73 @ *
     74 @ *@param[in] pi2_tmp
     75 @ * temporary buffer of size 1*16
     76 @ *
     77 @ * @param[in] pu2_iscal_mat
     78 @ * Pointer to the inverse quantization matrix
     79 @ *
     80 @ * @returns  Void
     81 @ *
     82 @ * @remarks
     83 @ *  None
     84 @ *
     85 @ *******************************************************************************
     86 @ *
     87 @void ih264_iquant_itrans_recon_4x4_dc(WORD16 *pi2_src,
     88 @                                   UWORD8 *pu1_pred,
     89 @                                   UWORD8 *pu1_out,
     90 @                                   WORD32 pred_strd,
     91 @                                   WORD32 out_strd,
     92 @                                   const UWORD16 *pu2_iscal_mat,
     93 @                                   const UWORD16 *pu2_weigh_mat,
     94 @                                   UWORD32 u4_qp_div_6,
     95 @                                   WORD32 *pi4_tmp,
     96 @                                   WORD32 iq_start_idx
     97 @                                   WORD16 *pi2_dc_ld_addr)
     98 @**************Variables Vs Registers*****************************************
     99 @r0 => *pi2_src
    100 @r1 => *pu1_pred
    101 @r2 => *pu1_out
    102 @r3 =>  pred_strd
    103 @r4 =>  out_strd
    104 @r5 =>  *pu2_iscal_mat
    105 @r6 =>  *pu2_weigh_mat
    106 @r7 =>  u4_qp_div_6
    107 @r9 =>  iq_start_idx
    108 @unused =>  pi2_dc_ld_addr
    109 
    110 .text
    111 .syntax unified
    112 .p2align 2
    113 
    .global ih264_iquant_itrans_recon_4x4_dc_a9

@ Byte offsets of the stack-passed arguments, measured from sp after the
@ prologue has pushed eight core registers (8 * 4 = 32 bytes).
    .equ          DC4_OFF_OUT_STRD,  32
    .equ          DC4_OFF_ISCAL_MAT, 36
    .equ          DC4_OFF_WEIGH_MAT, 40
    .equ          DC4_OFF_QP_DIV_6,  44
    .equ          DC4_OFF_IQ_START,  52

ih264_iquant_itrans_recon_4x4_dc_a9:

@ Register roles inside this routine:
@   r0 = pi2_src, r1 = pu1_pred (post-incremented), r2 = pu1_out (post-incremented)
@   r3 = pred_strd, r4 = out_strd, r5 = pu2_iscal_mat[0], r6 = running DC value
@   r7 = u4_qp_div_6, r8 = pi2_src[0], r9 = iq_start_idx
@
@ The dequantised value is computed as ((coef * iscal * weigh) << u4_qp_div_6) + 8,
@ arithmetically shifted right by 4. Doing the left shift first lets a single
@ code path serve every u4_qp_div_6: below 4 the net effect is a rounded right
@ shift, at 4 and above the low four bits are zero so the +8 is discarded and
@ the net effect is a plain left shift by (u4_qp_div_6 - 4).

    push          {r4-r10, lr}                      @ same register list as the matching pop below

    @ Fetch every stack argument up front; none of these loads depend on each other.
    ldr           r6, [sp, #DC4_OFF_WEIGH_MAT]      @ r6 = pu2_weigh_mat
    ldr           r5, [sp, #DC4_OFF_ISCAL_MAT]      @ r5 = pu2_iscal_mat
    ldr           r7, [sp, #DC4_OFF_QP_DIV_6]       @ r7 = u4_qp_div_6
    ldr           r9, [sp, #DC4_OFF_IQ_START]       @ r9 = iq_start_idx
    ldr           r4, [sp, #DC4_OFF_OUT_STRD]       @ r4 = out_strd

    ldrsh         r8, [r0]                          @ r8 = pi2_src[0], sign-extended
    ldrh          r6, [r6]                          @ r6 = pu2_weigh_mat[0], zero-extended
    ldrh          r5, [r5]                          @ r5 = pu2_iscal_mat[0], zero-extended

@----------------------- inverse quantisation of the DC term -----------------
    mul           r6, r6, r5                        @ weigh[0] * iscal[0]
    mul           r6, r6, r8                        @ ... * pi2_src[0]
    lsl           r6, r6, r7                        @ ... << u4_qp_div_6
    add           r6, r6, #8                        @ rounding term for the >> 4 below
    asr           r6, r6, #4                        @ q0 = dequantised DC

    @ iq_start_idx == 1: discard the dequantised value and use the raw
    @ pi2_src[0] (still held in r8) instead.
    @ NOTE(review): the 11th argument pi2_dc_ld_addr is never read here;
    @ the raw DC is taken from pi2_src[0] - confirm callers rely on that.
    cmp           r9, #1
    moveq         r6, r8

@----------------------- DC-only inverse transform ---------------------------
    add           r6, r6, #32
    asr           r6, r6, #6                        @ i_macro = (q0 + 32) >> 6
    vdup.16       q0, r6                            @ broadcast i_macro to 8 x 16-bit lanes

@----------------------- add prediction and store ----------------------------
    vld1.32       {d30[0]}, [r1], r3                @ pred row 0 (4 bytes)
    vld1.32       {d30[1]}, [r1], r3                @ pred row 1
    vld1.32       {d31[0]}, [r1], r3                @ pred row 2
    vld1.32       {d31[1]}, [r1], r3                @ pred row 3

    vaddw.u8      q10, q0, d30                      @ rows 0-1: i_macro + pred, 16-bit
    vaddw.u8      q11, q0, d31                      @ rows 2-3: i_macro + pred, 16-bit

    @ q0 is no longer needed after both adds, so d0/d1 can take the results.
    vqmovun.s16   d0, q10                           @ saturate rows 0-1 to 0..255
    vqmovun.s16   d1, q11                           @ saturate rows 2-3 to 0..255

    vst1.32       {d0[0]}, [r2], r4                 @ out row 0
    vst1.32       {d0[1]}, [r2], r4                 @ out row 1
    vst1.32       {d1[0]}, [r2], r4                 @ out row 2
    vst1.32       {d1[1]}, [r2]                     @ out row 3

    pop           {r4-r10, pc}                      @ restore saved registers and return
    167 
    168 
    169 
    170 
    171 @*
    172 @ *******************************************************************************
    173 @ *
    174 @ * @brief
    175 @ *  This function performs inverse quant and Inverse transform type Ci4 for 8*8 block
    176 @ *  for dc input pattern only, i.e. only the (0,0) element of the input 8x8 block is
    177 @ *  non-zero. For complete function, refer ih264_iquant_itrans_recon_a9.s
    178 @ *
    179 @ * @par Description:
    180 @ *  Performs inverse transform Ci8 and adds the residue to get the
    181 @ *  reconstructed block
    182 @ *
    183 @ * @param[in] pi2_src
    184 @ *  Input 8x8 coefficients
    185 @ *
    186 @ * @param[in] pu1_pred
    187 @ *  Prediction 8x8 block
    188 @ *
    189 @ * @param[out] pu1_out
    190 @ *  Output 8x8 block
    191 @ *
    192 @ * @param[in] u4_qp_div_6
    193 @ *     QP
    194 @ *
    195 @ * @param[in] pu2_weigh_mat
    196 @ * Pointer to weight matrix
    197 @ *
    198 @ * @param[in] pred_strd,
    199 @ *  Prediction stride
    200 @ *
    201 @ * @param[in] out_strd
    202 @ *  Output Stride
    203 @ *
    204 @ *@param[in] pi2_tmp
    205 @ * temporary buffer of size 1*64
    206 @ *
    207 @ * @param[in] pu2_iscal_mat
    208 @ * Pointer to the inverse quantization matrix
    209 @ *
    210 @ * @returns  Void
    211 @ *
    212 @ * @remarks
    213 @ *  None
    214 @ *
    215 @ *******************************************************************************
    216 @ *
    217 @void ih264_iquant_itrans_recon_8x8_dc(WORD16 *pi2_src,
    218 @                                   UWORD8 *pu1_pred,
    219 @                                   UWORD8 *pu1_out,
    220 @                                   WORD32 pred_strd,
    221 @                                   WORD32 out_strd,
    222 @                                   const UWORD16 *pu2_iscal_mat,
    223 @                                   const UWORD16 *pu2_weigh_mat,
    224 @                                   UWORD32 u4_qp_div_6,
    225 @                                   WORD32 *pi4_tmp,
    226 @                                   WORD32 iq_start_idx)
    227 @**************Variables Vs Registers*****************************************
    228 @r0 => *pi2_src
    229 @r1 => *pu1_pred
    230 @r2 => *pu1_out
    231 @r3 =>  pred_strd
    232 @r4 =>  out_strd
    233 @r5 =>  *pu2_iscal_mat
    234 @r6 =>  *pu2_weigh_mat
    235 @r7 =>  u4_qp_div_6
    236 
    237 
    .global ih264_iquant_itrans_recon_8x8_dc_a9

@ Byte offsets of the stack-passed arguments, measured from sp after the
@ prologue has pushed six core registers (6 * 4 = 24 bytes) and BEFORE the
@ NEON registers are pushed.
    .equ          DC8_OFF_OUT_STRD,  24
    .equ          DC8_OFF_ISCAL_MAT, 28
    .equ          DC8_OFF_WEIGH_MAT, 32
    .equ          DC8_OFF_QP_DIV_6,  36

ih264_iquant_itrans_recon_8x8_dc_a9:

@ Register roles inside this routine:
@   r0 = pi2_src, r1 = pu1_pred (post-incremented), r2 = pu1_out (post-incremented)
@   r3 = pred_strd, r4 = out_strd, r5 = pu2_iscal_mat[0], r6 = running DC value
@   r7 = u4_qp_div_6, r8 = pi2_src[0]
@   q8 = broadcast DC, d24-d31 = eight prediction rows, q0-q7 = 16-bit sums

    push          {r4-r8, lr}                       @ same register list as the matching pop below

    @ All stack arguments are read here, while sp still matches the offsets above.
    ldr           r6, [sp, #DC8_OFF_WEIGH_MAT]      @ r6 = pu2_weigh_mat
    ldr           r5, [sp, #DC8_OFF_ISCAL_MAT]      @ r5 = pu2_iscal_mat
    ldr           r7, [sp, #DC8_OFF_QP_DIV_6]       @ r7 = u4_qp_div_6
    ldr           r4, [sp, #DC8_OFF_OUT_STRD]       @ r4 = out_strd

    ldrsh         r8, [r0]                          @ r8 = pi2_src[0], sign-extended
    ldrh          r6, [r6]                          @ r6 = pu2_weigh_mat[0], zero-extended
    ldrh          r5, [r5]                          @ r5 = pu2_iscal_mat[0], zero-extended

    vpush         {d8-d15}                          @ q4-q7 are written below, so save d8-d15

@----------------------- inverse quantisation of the DC term -----------------
    mul           r6, r6, r5                        @ weigh[0] * iscal[0]
    mul           r6, r6, r8                        @ ... * pi2_src[0]
    lsl           r6, r6, r7                        @ ... << u4_qp_div_6
    add           r6, r6, #32                       @ rounding term for the >> 6 below
    asr           r6, r6, #6                        @ q0 = dequantised DC

@----------------------- DC-only inverse transform ---------------------------
    add           r6, r6, #32
    asr           r6, r6, #6                        @ i_macro = (q0 + 32) >> 6
    vdup.16       q8, r6                            @ broadcast i_macro to 8 x 16-bit lanes

@----------------------- add prediction --------------------------------------
@ Each load fetches one 8-byte prediction row; the widening adds are
@ interleaved with the remaining loads.
    vld1.32       {d24}, [r1], r3                   @ pred row 0
    vld1.32       {d25}, [r1], r3                   @ pred row 1
    vld1.32       {d26}, [r1], r3                   @ pred row 2
    vaddw.u8      q0, q8, d24                       @ row 0: i_macro + pred
    vld1.32       {d27}, [r1], r3                   @ pred row 3
    vaddw.u8      q1, q8, d25                       @ row 1
    vld1.32       {d28}, [r1], r3                   @ pred row 4
    vaddw.u8      q2, q8, d26                       @ row 2
    vld1.32       {d29}, [r1], r3                   @ pred row 5
    vaddw.u8      q3, q8, d27                       @ row 3
    vld1.32       {d30}, [r1], r3                   @ pred row 6
    vaddw.u8      q4, q8, d28                       @ row 4
    vld1.32       {d31}, [r1], r3                   @ pred row 7

@----------------------- saturate to 8 bits and store ------------------------
@ Narrowing row k writes d<k>. Every narrow below targets a d register whose
@ 16-bit contents have already been consumed (or are being consumed by that
@ same instruction), so the order of these instructions must be kept.
    vqmovun.s16   d0, q0                            @ row 0 -> d0
    vaddw.u8      q5, q8, d29                       @ row 5
    vqmovun.s16   d1, q1                            @ row 1 -> d1
    vaddw.u8      q6, q8, d30                       @ row 6
    vqmovun.s16   d2, q2                            @ row 2 -> d2
    vqmovun.s16   d3, q3                            @ row 3 -> d3
    vaddw.u8      q7, q8, d31                       @ row 7
    vqmovun.s16   d4, q4                            @ row 4 -> d4
    vqmovun.s16   d5, q5                            @ row 5 -> d5
    vst1.32       {d0}, [r2], r4                    @ out row 0
    vqmovun.s16   d6, q6                            @ row 6 -> d6
    vst1.32       {d1}, [r2], r4                    @ out row 1
    vqmovun.s16   d7, q7                            @ row 7 -> d7
    vst1.32       {d2}, [r2], r4                    @ out row 2
    vst1.32       {d3}, [r2], r4                    @ out row 3
    vst1.32       {d4}, [r2], r4                    @ out row 4
    vst1.32       {d5}, [r2], r4                    @ out row 5
    vst1.32       {d6}, [r2], r4                    @ out row 6
    vst1.32       {d7}, [r2], r4                    @ out row 7

    vpop          {d8-d15}                          @ restore the saved NEON registers
    pop           {r4-r8, pc}                       @ restore core registers and return
    302 
    303 
    304 @ *
    305 @ ********************************************************************************
    306 @ *
    307 @ * @brief This function reconstructs a 4x4 sub block from quantized residue and
    308 @ * prediction buffer if only dc value is present for residue
    309 @ *
    310 @ * @par Description:
    311 @ *  The quantized residue is first inverse quantized,
    312 @ *  This inverse quantized content is added to the prediction buffer to recon-
    313 @ *  struct the end output
    314 @ *
    315 @ * @param[in] pi2_src
    316 @ *  quantized dc coefficient
    317 @ *
    318 @ * @param[in] pu1_pred
    319 @ *  prediction 4x4 block in interleaved format
    320 @ *
    321 @ * @param[in] pred_strd,
    322 @ *  Prediction buffer stride in interleaved format
    323 @ *
    324 @ * @param[in] out_strd
    325 @ *  recon buffer Stride
    326 @ *
    327 @ * @returns none
    328 @ *
    329 @ * @remarks none
    330 @ *
    331 @ *******************************************************************************
    332 @ *
    333 @ void ih264_iquant_itrans_recon_chroma_4x4_dc(WORD16 *pi2_src,
    334 @                                             UWORD8 *pu1_pred,
    335 @                                             UWORD8 *pu1_out,
    336 @                                             WORD32 pred_strd,
    337 @                                             WORD32 out_strd,
    338 @                                             const UWORD16 *pu2_iscal_mat,
    339 @                                             const UWORD16 *pu2_weigh_mat,
    340 @                                             UWORD32 u4_qp_div_6,
    341 @                                             WORD16 *pi2_tmp,
    342 @                                             WORD16 *pi2_dc_src)
    343 @ Register Usage
    344 @ r0 : pi2_src
    345 @ r1 : pu1_pred
    346 @ r2 : pu1_out
    347 @ r3 : pred_strd
    348 @ Only caller-saved Neon registers (within d0-d7 and d16-d31) are used
    349 @ No need for pushing  arm and neon registers
    .global ih264_iquant_itrans_recon_chroma_4x4_dc_a9

@ Byte offsets of the stack-passed arguments. Nothing is pushed in this
@ routine, so they are measured from the caller's sp at entry.
    .equ          CDC4_OFF_OUT_STRD, 0
    .equ          CDC4_OFF_DC_SRC,   20

ih264_iquant_itrans_recon_chroma_4x4_dc_a9:

@ Reconstructs one plane of a 4x4 chroma block stored in interleaved form:
@ every row is 8 bytes, and only the bytes at even offsets (0, 2, 4, 6) belong
@ to the plane handled by this call. Those bytes of pu1_out are replaced by
@ saturate_u8(pred + ((pi2_dc_src[0] + 32) >> 6)); the bytes at odd offsets of
@ pu1_out are read and written back unchanged.
@
@ Register roles:
@   r0 = scratch (pi2_dc_src, then out_strd); the incoming pi2_src is not used
@   r1 = pu1_pred while loading, then the store pointer into pu1_out
@   r2 = pu1_out load pointer, r3 = pred_strd
@   q0 = broadcast DC, q1/q2 = working rows 0-1 / 2-3
@   q9/q10 = existing output rows 0-1 / 2-3, q15 = 0x00ff lane mask
@ Only r0-r3 and caller-saved NEON registers are written, so nothing is saved.
@
@ Memory accesses are limited to what is consumed: one halfword of
@ pi2_dc_src and 8 bytes per row of pu1_pred / pu1_out.

    ldr           r0, [sp, #CDC4_OFF_DC_SRC]        @ r0 = pi2_dc_src
    vld1.16       {d0[]}, [r0]                      @ every lane of d0 = pi2_dc_src[0] (2-byte read)
    ldr           r0, [sp, #CDC4_OFF_OUT_STRD]      @ r0 = out_strd

    vld1.8        {d2}, [r1], r3                    @ pred row 0, both planes interleaved
    vld1.8        {d3}, [r1], r3                    @ pred row 1
    vrshr.s16     d0, d0, #6                        @ i_macro = (dc + 32) >> 6
    vld1.8        {d4}, [r1], r3                    @ pred row 2
    vld1.8        {d5}, [r1], r3                    @ pred row 3

    vdup.16       q0, d0[0]                         @ broadcast i_macro to 8 x 16-bit lanes
    vmov.i16      q15, #0x00ff                      @ selects the even-offset byte of each 16-bit lane
    mov           r1, r2                            @ keep the start of pu1_out for the stores

    @ Viewing each row as four 16-bit lanes, the low byte of a lane is the
    @ sample of the plane being reconstructed. Masking therefore both
    @ extracts and zero-extends those samples in a single step.
    vand          q1, q1, q15                       @ rows 0-1: pred samples as 16-bit
    vand          q2, q2, q15                       @ rows 2-3: pred samples as 16-bit

    vld1.8        {d18}, [r2], r0                   @ existing out row 0
    vadd.i16      q1, q1, q0                        @ rows 0-1: pred + i_macro
    vld1.8        {d19}, [r2], r0                   @ existing out row 1
    vadd.i16      q2, q2, q0                        @ rows 2-3: pred + i_macro
    vld1.8        {d20}, [r2], r0                   @ existing out row 2
    vld1.8        {d21}, [r2], r0                   @ existing out row 3

    @ Clamp to 0..255 by narrowing with saturation, then widen back so the
    @ result again sits in the low byte of each 16-bit lane.
    vqmovun.s16   d2, q1
    vqmovun.s16   d4, q2
    vmovl.u8      q1, d2
    vmovl.u8      q2, d4

    @ Insert the new samples into the even-offset bytes only; the odd-offset
    @ bytes (the other plane) keep the values just loaded from pu1_out.
    vbit          q9, q1, q15
    vbit          q10, q2, q15

    vst1.8        {d18}, [r1], r0                   @ out row 0
    vst1.8        {d19}, [r1], r0                   @ out row 1
    vst1.8        {d20}, [r1], r0                   @ out row 2
    vst1.8        {d21}, [r1], r0                   @ out row 3

    bx            lr
    395 
    396 
    397 
    398 
    399 
    400 
    401 
    402