Home | History | Annotate | Download | only in arm
      1 @/******************************************************************************
      2 @ *
      3 @ * Copyright (C) 2015 The Android Open Source Project
      4 @ *
      5 @ * Licensed under the Apache License, Version 2.0 (the "License");
      6 @ * you may not use this file except in compliance with the License.
      7 @ * You may obtain a copy of the License at:
      8 @ *
      9 @ * http://www.apache.org/licenses/LICENSE-2.0
     10 @ *
     11 @ * Unless required by applicable law or agreed to in writing, software
     12 @ * distributed under the License is distributed on an "AS IS" BASIS,
     13 @ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 @ * See the License for the specific language governing permissions and
     15 @ * limitations under the License.
     16 @ *
     17 @ *****************************************************************************
     18 @ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 @*/
     20 @**
     21 @ *******************************************************************************
     22 @ * @file
     23 @ *  ih264_ihadamard_scaling_a9.s
     24 @ *
     25 @ * @brief
     26 @ *  Contains function definitions for inverse hadamard transform on 4x4 DC outputs
     27 @ *  of 16x16 intra-prediction
     28 @ *
     29 @ * @author
     30 @ *  Mohit
     31 @ *
     32 @ * @par List of Functions:
     33 @ *  - ih264_ihadamard_scaling_4x4_a9()
     34 @ *  - ih264_ihadamard_scaling_2x2_uv_a9()
     35 @ *
     36 @ * @remarks
     37 @ *  None
     38 @ *
     39 @ *******************************************************************************
     40 @ *
     41 @ * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
     42 @ * of a 16x16 intra prediction macroblock, and then performs scaling.
     43 @ * prediction buffer
     44 @ *
     45 @ * @par Description:
     46 @ *  The DC coefficients pass through a 2-stage inverse hadamard transform.
     47 @ *  This inverse transformed content is scaled based on the Qp value.
     48 @ *
     49 @ * @param[in] pi2_src
     50 @ *  input 4x4 block of DC coefficients
     51 @ *
     52 @ * @param[out] pi2_out
     53 @ *  output 4x4 block
     54 @ *
     55 @ * @param[in] pu2_iscal_mat
     56 @ *  pointer to scaling list
     57 @ *
     58 @ * @param[in] pu2_weigh_mat
     59 @ *  pointer to weight matrix
     60 @ *
     61 @ * @param[in] u4_qp_div_6
     62 @ *  Floor (qp/6)
     63 @ *
     64 @ * @param[in] pi4_tmp
     65 @ * temporary buffer of size 1*16
     66 @ *
     67 @ * @returns none
     68 @ *
     69 @ * @remarks none
     70 @ *
     71 @ *******************************************************************************
     72 @ *
     73 @ *
     74 @ *******************************************************************************
     75 @ *
     76 @ void ih264_ihadamard_scaling_4x4(WORD16* pi2_src,
     77 @       WORD16* pi2_out,
     78 @       const UWORD16 *pu2_iscal_mat,
     79 @       const UWORD16 *pu2_weigh_mat,
     80 @       UWORD32 u4_qp_div_6,
     81 @       WORD32* pi4_tmp)
     82 @**************Variables Vs Registers*****************************************
     83 @r0 => *pi2_src
     84 @r1 => *pi2_out
     85 @r2 =>  *pu2_iscal_mat
     86 @r3 =>  *pu2_weigh_mat
     87 @r4 =>  u4_qp_div_6
     88 
     89 .text
     90 .p2align 2
     91 
     92     .global ih264_ihadamard_scaling_4x4_a9
     93 @ 4x4 inverse Hadamard transform of the DC coefficients followed by scaling; writes 16 halfwords to pi2_out.
     94 ih264_ihadamard_scaling_4x4_a9:
     95 @ In: r0 = pi2_src, r1 = pi2_out, r2 = pu2_iscal_mat, r3 = pu2_weigh_mat, [sp] = u4_qp_div_6. pi4_tmp is never read here.
     96 @VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4
     97 @If the macro value changes, the instruction needs to change accordingly.
     98 @Only one shift is done in horizontal inverse because,
     99 @if u4_qp_div_6 is less than 4 then the shift value will be negative and a negative left shift is done; in this case rnd_factor has a value
    100 @if u4_qp_div_6 is greater than 4 then the shift value will be positive and a left shift is done; here rnd_factor is 0
    101 @ NOTE(review): the code below always shifts left by u4_qp_div_6 and then does a rounding right shift by 6; the note above may be stale - confirm.
    102     stmfd         sp!, {r4-r12, r14}    @ save r4-r12 and lr (10 words = 40 bytes)
    103     ldr           r4, [sp, #40]         @ r4 = u4_qp_div_6, the word just above the 40-byte save area
    104     vdup.s32      q10, r4               @ q10 = u4_qp_div_6 in every 32-bit lane (shift count)
    105     ldrh          r6, [r3]              @ r6 = pu2_weigh_mat[0] (unsigned halfword load)
    106     ldrh          r7, [r2]              @ r7 = pu2_iscal_mat[0] (unsigned halfword load)
    107     mul           r6, r6, r7            @ r6 = pu2_iscal_mat[0] * pu2_weigh_mat[0]
    108     vdup.s32      q9, r6                @ q9 = that product in every 32-bit lane (scale factor)
    109     vpush         {d8-d15}              @ save d8-d15; q4 and q5 (d8-d11) are written below
    110 @=======================INVERSE HADAMARD TRANSFORM================================
    111 
    112     vld4.s16      {d0, d1, d2, d3}, [r0] @ de-interleaving load: dN = src[N], src[N+4], src[N+8], src[N+12] (x4,x5,x6,x7)
    113     vaddl.s16     q12, d0, d3           @x0 = x4 + x7 (result widened to 32 bits)
    114     vaddl.s16     q13, d1, d2           @x1 = x5 + x6
    115     vsubl.s16     q14, d1, d2           @x2 = x5 - x6
    116     vsubl.s16     q15, d0, d3           @x3 = x4 - x7
    117 
    118     vadd.s32      q2, q12, q13          @pi4_tmp_ptr[0] = x0 + x1
    119     vadd.s32      q3, q15, q14          @pi4_tmp_ptr[1] = x3 + x2
    120     vsub.s32      q4, q12, q13          @pi4_tmp_ptr[2] = x0 - x1
    121     vsub.s32      q5, q15, q14          @pi4_tmp_ptr[3] = x3 - x2
    122 
    123     vtrn.32       q2, q3                @Transpose the registers for the vertical transform (step 1: 32-bit lanes)
    124     vtrn.32       q4, q5
    125 
    126     vswp          d5, d8                @Transpose step 2 (swap 64-bit halves): Q2 = x4, Q4 = x6
    127     vswp          d7, d10               @Q3 = x5, Q5 = x7
    128 
    129 
    130     vadd.s32      q12, q2, q5           @x0 = x4+x7
    131     vadd.s32      q13, q3, q4           @x1 = x5+x6
    132     vsub.s32      q14, q3, q4           @x2 = x5-x6
    133     vsub.s32      q15, q2, q5           @x3 = x4-x7
    134 
    135     vadd.s32      q0, q12, q13          @pi4_tmp_ptr[0] = x0 + x1
    136     vadd.s32      q1, q15, q14          @pi4_tmp_ptr[1] = x3 + x2
    137     vsub.s32      q2, q12, q13          @pi4_tmp_ptr[2] = x0 - x1
    138     vsub.s32      q3, q15, q14          @pi4_tmp_ptr[3] = x3 - x2
    139 
    140 
    141     vmul.s32      q0, q0, q9            @ Q0  = p[i] = x[i] * (pu2_iscal_mat[0] * pu2_weigh_mat[0]) where i = 0..3
    142     vmul.s32      q1, q1, q9            @ Q1  = p[i] = x[i] * (pu2_iscal_mat[0] * pu2_weigh_mat[0]) where i = 4..7
    143     vmul.s32      q2, q2, q9            @ Q2  = p[i] = x[i] * (pu2_iscal_mat[0] * pu2_weigh_mat[0]) where i = 8..11
    144     vmul.s32      q3, q3, q9            @ Q3  = p[i] = x[i] * (pu2_iscal_mat[0] * pu2_weigh_mat[0]) where i = 12..15
    145 
    146     vshl.s32      q0, q0, q10           @ Q0  = q[i] = (p[i] << (qP/6)) where i = 0..3
    147     vshl.s32      q1, q1, q10           @ Q1  = q[i] = (p[i] << (qP/6)) where i = 4..7
    148     vshl.s32      q2, q2, q10           @ Q2  = q[i] = (p[i] << (qP/6)) where i = 8..11
    149     vshl.s32      q3, q3, q10           @ Q3  = q[i] = (p[i] << (qP/6)) where i = 12..15
    150 
    151     vqrshrn.s32   d0, q0, #0x6          @ D0  = c[i] = sat16((q[i] + 32) >> 6) where i = 0..3
    152     vqrshrn.s32   d1, q1, #0x6          @ D1  = c[i] = sat16((q[i] + 32) >> 6) where i = 4..7
    153     vqrshrn.s32   d2, q2, #0x6          @ D2  = c[i] = sat16((q[i] + 32) >> 6) where i = 8..11
    154     vqrshrn.s32   d3, q3, #0x6          @ D3  = c[i] = sat16((q[i] + 32) >> 6) where i = 12..15
    155 
    156     vst1.s16      {d0, d1, d2, d3}, [r1] @ store all 16 results c[0..15] contiguously to pi2_out
    157 
    158     vpop          {d8-d15}              @ restore d8-d15
    159     ldmfd         sp!, {r4-r12, r15}    @ restore r4-r12 and return by loading the saved lr into pc
    160 
    161 
    162 
    163 @ *******************************************************************************
    164 @ *
    165 @ * @brief This function performs a 2x2 inverse hadamard transform for chroma block
    166 @ *
    167 @ * @par Description:
    168 @ *  The DC coefficients pass through a 2-stage inverse hadamard transform.
    169 @ *  This inverse transformed content is scaled based on the Qp value.
    170 @ *  The DC blocks of both U and V are processed
    171 @ *
    172 @ * @param[in] pi2_src
    173 @ *  input 1x8 block of coeffs. The first 4 are from U and the next 4 are from V
    174 @ *
    175 @ * @param[out] pi2_out
    176 @ *  output 1x8 block
    177 @ *
    178 @ * @param[in] pu2_iscal_mat
    179 @ *  pointer to scaling list
    180 @ *
    181 @ * @param[in] pu2_weigh_mat
    182 @ *  pointer to weight matrix
    183 @ *
    184 @ * @param[in] u4_qp_div_6
    185 @ *  Floor (qp/6)
    186 @ *
    187 @ * @returns none
    188 @ *
    189 @ * @remarks none
    190 @ *
    191 @ *******************************************************************************
    192 @ *
    193 @ *
    194 @ *******************************************************************************
    195 @ *
    196 @ void ih264_ihadamard_scaling_2x2_uv(WORD16* pi2_src,
    197 @                                  WORD16* pi2_out,
    198 @                                  const UWORD16 *pu2_iscal_mat,
    199 @                                  const UWORD16 *pu2_weigh_mat,
    200 @                                  UWORD32 u4_qp_div_6,
    201 
    202     .global ih264_ihadamard_scaling_2x2_uv_a9
    203 ih264_ihadamard_scaling_2x2_uv_a9:
    204 @ 2x2 inverse Hadamard transform plus scaling on 8 chroma DC coefficients; writes 8 halfwords to pi2_out.
    205 @Registers used
    206 @   r0 : *pi2_src
    207 @   r1 : *pi2_out
    208 @   r2 : *pu2_iscal_mat
    209 @   r3 : *pu2_weigh_mat
    210 @   [sp] : u4_qp_div_6 (only the halfword at [sp] is loaded)
    211     vld1.u16      d26[0], [r2]          @ d26[0] = pu2_iscal_mat[0]
    212     vld1.u16      d27[0], [r3]          @ d27[0] = pu2_weigh_mat[0]
    213     vmull.u16     q15, d26, d27         @pu2_iscal_mat[0] *  pu2_weigh_mat[0] (32-bit product in lane 0)
    214     vdup.u32      q15, d30[0]           @ broadcast the 32-bit product to all four lanes of q15
    215 
    216     vld1.u16      d28[0], [sp]          @load qp/6 (one halfword) from the stack, before vpush moves sp
    217 
    218     vpush         {d8-d15}              @ save d8-d15; q5-q7 (d10-d15) are written below
    219 
    220     vmov.u16      d29, #5               @ d29 = 5 in every halfword lane
    221     vsubl.u16     q14, d28, d29         @qp/6 - 5, widened to 32 bits (only lane 0 was loaded with valid data)
    222     vdup.s32      q14, d28[0]           @ broadcast (qp/6 - 5) to all four lanes of q14: the shift count
    223 
    224     vld2.s16      {d0, d1}, [r0]        @load 8 dc coeffs, de-interleaved into even/odd elements
    225                                         @i2_x4,i2_x6,i2_y4,i2_y6 -> d0
    226                                         @i2_x5,i2_x7,i2_y5,i2_y7 -> d1
    227 
    228     vaddl.s16     q1, d0, d1            @  i4_x0 = i4_x4 + i4_x5;...x2
    229     vsubl.s16     q2, d0, d1            @  i4_x1 = i4_x4 - i4_x5;...x3
    230 
    231     vtrn.s32      q1, q2                @i4_x0 i4_x1 i4_y0 i4_y1 -> q1, i4_x2 i4_x3 i4_y2 i4_y3 -> q2
    232 
    233     vadd.s32      q3, q1, q2            @i4_x4 = i4_x0+i4_x2;.. i4_x5
    234     vsub.s32      q1, q1, q2            @i4_x6 = i4_x0-i4_x2;.. i4_x7
    235 
    236     vmul.s32      q5, q3, q15           @ scale by pu2_iscal_mat[0] * pu2_weigh_mat[0]
    237     vmul.s32      q6, q1, q15
    238 
    239     vshl.s32      q7, q5, q14           @ shift by (qp/6 - 5); a negative count shifts right
    240     vshl.s32      q8, q6, q14
    241 
    242     vmovn.s32     d18, q7               @i4_x4 i4_x5 i4_y4 i4_y5 (narrowed to 16 bits, no saturation)
    243     vmovn.s32     d19, q8               @i4_x6 i4_x7 i4_y6 i4_y7 (narrowed to 16 bits, no saturation)
    244 
    245     vst2.s32      {d18-d19}, [r1]       @ interleave 32-bit pairs on store: x4 x5 x6 x7 y4 y5 y6 y7
    246 
    247     vpop          {d8-d15}              @ restore d8-d15
    248     bx            lr                    @ return to caller
    249 
    250 
    251