@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@ *******************************************************************************
@ * @file
@ *  ih264_iquant_itrans_recon_a9.s
@ *
@ * @brief
@ *  Contains function definitions for single stage inverse transform
@ *
@ * @author
@ *  Mohit
@ *  Harinarayanaan
@ *
@ * @par List of Functions:
@ *  - ih264_iquant_itrans_recon_4x4_a9()
@ *  - ih264_iquant_itrans_recon_8x8_a9()
@ *  - ih264_iquant_itrans_recon_chroma_4x4_a9()
@ *
@ * @remarks
@ *  None
@ *
@ *******************************************************************************
@*
@**
@ *******************************************************************************
@ *
@ * @brief
@ *  This function performs inverse quant and inverse transform of type Ci4 on a 4x4 block
@ *
@ * @par Description:
@ *  Performs the inverse transform Ci4 and adds the residue to the prediction
@ *  to get the reconstructed block
@ *
@ * @param[in] pi2_src
@ *  Input 4x4 coefficients
@ *
@ * @param[in] pu1_pred
@ *  Prediction 4x4 block
@ *
@ * @param[out] pu1_out
@ *  Output 4x4 block
@ *
@ * @param[in] u4_qp_div_6
@ *  QP divided by 6
@ *
@ * @param[in] pu2_weigh_mat
@ *  Pointer to weight matrix
@ *
@ * @param[in] pred_strd
@ *  Prediction stride
@ *
@ * @param[in] out_strd
@ *  Output stride
@ *
@ * @param[in] pi2_tmp
@ *  Temporary buffer of size 1*16
@ *
@ * @param[in] pu2_iscal_mat
@ *  Pointer to the inverse quantization matrix
@ *
@ * @param[in] iq_start_idx
@ *  Flag indicating the intra case (1 => restore the dc value from pi2_dc_ld_addr)
@ *
@ * @param[in] pi2_dc_ld_addr
@ *  Pointer to the alternate dc value
@ *
@ * @returns  Void
@ *
@ * @remarks
@ *  None
@ *
@ *******************************************************************************
@ *
@void ih264_iquant_itrans_recon_4x4(WORD16 *pi2_src,
@                                   UWORD8 *pu1_pred,
@                                   UWORD8 *pu1_out,
@                                   WORD32 pred_strd,
@                                   WORD32 out_strd,
@                                   const UWORD16 *pu2_iscal_mat,
@                                   const UWORD16 *pu2_weigh_mat,
@                                   UWORD32 u4_qp_div_6,
@                                   WORD32 *pi4_tmp,
@                                   WORD32 iq_start_idx,
@                                   WORD16 *pi2_dc_ld_addr)
@**************Variables Vs Registers*****************************************
@r0 => *pi2_src
@r1 => *pu1_pred
@r2 => *pu1_out
@r3 =>  pred_strd
@r4 =>  out_strd
@r5 =>  *pu2_iscal_mat
@r6 =>  *pu2_weigh_mat
@r7 =>  u4_qp_div_6
@r8 =>  iq_start_idx
@r10=>  pi2_dc_ld_addr
.text
.syntax unified
.p2align 2

    .global ih264_iquant_itrans_recon_4x4_a9

ih264_iquant_itrans_recon_4x4_a9:

@VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4.
@If the macro value changes, the instruction needs to change accordingly.
@Only one shift is done in the horizontal inverse transform because:
@if u4_qp_div_6 is less than 4, the shift value will be negative, so a right shift is done; in this case rnd_factor is non-zero;
@if u4_qp_div_6 is greater than or equal to 4, the shift value will be positive, so a left shift is done; here rnd_factor is 0.
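@ For reference, a minimal C sketch of the dequant step implemented below
@ (illustrative only; pi2_coeff is a hypothetical output array, and the
@ rounding constant 8 matches the VQRSHRN #4 rounding shift used here):
@
@ for(i = 0; i < 16; i++)
@ {
@     WORD32 q = (pi2_src[i] * pu2_iscal_mat[i] * pu2_weigh_mat[i])
@                    << u4_qp_div_6;
@     pi2_coeff[i] = (WORD16)((q + 8) >> 4);    /* c[i] in the comments */
@ }
@ if(iq_start_idx == 1)                         /* intra: restore the DC term */
@     pi2_coeff[0] = pi2_dc_ld_addr[0];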

    stmfd         sp!, {r4-r12, r14}    @stack stores the values of the arguments
    ldr           r7, [sp, #52]         @Loads u4_qp_div_6
    ldr           r4, [sp, #40]         @Loads out_strd
    vdup.s32      q15, r7               @Populate the u4_qp_div_6 in Q15
    ldr           r5, [sp, #44]         @Loads *pu2_iscal_mat

    ldr           r6, [sp, #48]         @Loads *pu2_weigh_mat

    ldr           r8, [sp, #60]         @Loads iq_start_idx

    ldr           r10, [sp, #64]        @Loads the alternate dc address

    vpush         {d8-d15}
@=======================DEQUANT FROM HERE===================================

    vld4.s16      {d20, d21, d22, d23}, [r5] @Load pu2_iscal_mat[i], i = 0..15
    vld4.s16      {d26, d27, d28, d29}, [r6] @pu2_weigh_mat[i], i = 0..15
    vmul.s16      q10, q10, q13         @x[i]=(scale[i] * dequant[i]) where i = 0..7
    vld4.s16      {d16, d17, d18, d19}, [r0] @pi2_src_tmp[i], i = 0..15

    vmul.s16      q11, q11, q14         @x[i]=(scale[i] * dequant[i]) where i = 8..15

    subs          r8, r8, #1            @ if r8 == 1 => intra case, so the result of the subtraction is zero and the Z flag is set
    ldrsheq       r9, [r10]             @ Loads signed halfword pi2_dc_ld_addr[0], if r8 == 1

    vmull.s16     q0, d16, d20          @ Q0  = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
    vmull.s16     q1, d17, d21          @ Q1  = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
    vmull.s16     q2, d18, d22          @ Q2  = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
    vmull.s16     q3, d19, d23          @ Q3  = p[i] = (x[i] * trns_coeff[i]) where i = 12..15

    vshl.s32      q0, q0, q15           @ Q0  = q[i] = (p[i] << (qP/6)) where i = 0..3
    vshl.s32      q1, q1, q15           @ Q1  = q[i] = (p[i] << (qP/6)) where i = 4..7
    vshl.s32      q2, q2, q15           @ Q2  = q[i] = (p[i] << (qP/6)) where i = 8..11
    vshl.s32      q3, q3, q15           @ Q3  = q[i] = (p[i] << (qP/6)) where i = 12..15

    vqrshrn.s32   d0, q0, #0x4          @ D0  = c[i] = ((q[i] + 8) >> 4) where i = 0..3
    vqrshrn.s32   d1, q1, #0x4          @ D1  = c[i] = ((q[i] + 8) >> 4) where i = 4..7
    vqrshrn.s32   d2, q2, #0x4          @ D2  = c[i] = ((q[i] + 8) >> 4) where i = 8..11
    vqrshrn.s32   d3, q3, #0x4          @ D3  = c[i] = ((q[i] + 8) >> 4) where i = 12..15

    vmoveq.16     d0[0], r9             @ Restore dc value in case of intra, i.e. r8 == 1

@========= PROCESS IDCT FROM HERE =======
@Steps for Stage 1:
@------------------
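@ One pass of the Ci4 butterfly, as a rough C sketch (in0..in3 are one
@ row/column of the c[] values above; x0..x3 match the comments below, and
@ the same pass runs again after the transpose for the second stage):
@
@ x0 = in0 + in2;
@ x1 = in0 - in2;
@ x2 = (in1 >> 1) - in3;
@ x3 = in1 + (in3 >> 1);
@ out0 = x0 + x3;    out1 = x1 + x2;
@ out2 = x1 - x2;    out3 = x0 - x3;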
    vld1.32       d30[0], [r1], r3      @I row Load pu1_pred buffer
    vadd.s16      d4, d0, d2            @x0 = q0 + q1;

    vsub.s16      d5, d0, d2            @x1 = q0 - q1;

    vshr.s16      d8, d1, #1            @q0>>1
    vshr.s16      d9, d3, #1            @q1>>1

    vsub.s16      d6, d8, d3            @x2 = (q0 >> 1) - q1;
    vadd.s16      d7, d1, d9            @x3 = q0 + (q1 >> 1);
    vld1.32       d30[1], [r1], r3      @II row Load pu1_pred buffer

    vswp          d6, d7                @Reverse positions of x2 and x3

    vsub.s16      q6, q2, q3            @x0-x3 and x1-x2 combined
    vadd.s16      q5, q2, q3            @x0+x3 and x1+x2 combined

    vld1.32       d31[0], [r1], r3      @III row Load pu1_pred buffer

    vswp          d12, d13
@Steps for Stage 2:
@------------------
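@ The VTRN.16/VTRN.32 pairs below transpose the 4x4 result of stage 1 so
@ that the same butterfly can be reused for the vertical pass.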
    vtrn.16       d10, d11
    vtrn.16       d12, d13
    vtrn.32       d10, d12
    vtrn.32       d11, d13
    vadd.s16      d14, d10, d12         @x0 = q0 + q1;

    vsub.s16      d15, d10, d12         @x1 = q0 - q1;

    vshr.s16      d18, d11, #1          @q0>>1
    vshr.s16      d19, d13, #1          @q1>>1

    vsub.s16      d16, d18, d13         @x2 = (q0 >> 1) - q1;
    vadd.s16      d17, d11, d19         @x3 = q0 + (q1 >> 1);

    vld1.32       d31[1], [r1], r3      @IV row Load pu1_pred buffer
    vswp          d16, d17              @Reverse positions of x2 and x3

    vsub.s16      q11, q7, q8           @x0-x3 and x1-x2 combined
    vadd.s16      q10, q7, q8           @x0+x3 and x1+x2 combined

    vswp          d22, d23

    vrshr.s16     q10, q10, #6          @Round the residue rows by 6 bits
    vrshr.s16     q11, q11, #6

    vaddw.u8      q10, q10, d30         @Add prediction rows I and II
    vaddw.u8      q11, q11, d31         @Add prediction rows III and IV

    vqmovun.s16   d0, q10               @Saturate to 8 bits
    vqmovun.s16   d1, q11

    vst1.32       d0[0], [r2], r4       @I row store the value
    vst1.32       d0[1], [r2], r4       @II row store the value
    vst1.32       d1[0], [r2], r4       @III row store the value
    vst1.32       d1[1], [r2]           @IV row store the value
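@ Net effect of the round/add/saturate/store sequence above, as a C sketch
@ (w is the transformed residue; CLIP_U8 is a hypothetical clamp to [0, 255]):
@
@ pu1_out[i * out_strd + j] =
@     CLIP_U8(((w[i][j] + 32) >> 6) + pu1_pred[i * pred_strd + j]);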

    vpop          {d8-d15}
    ldmfd         sp!, {r4-r12, r15}    @Reload the registers from SP


@**
@ *******************************************************************************
@ *
@ * @brief
@ *  This function performs inverse quant and inverse transform of type Ci4 on a 4x4 chroma block
@ *
@ * @par Description:
@ *  Performs the inverse transform Ci4 and adds the residue to the prediction
@ *  to get the reconstructed block
@ *
@ * @param[in] pi2_src
@ *  Input 4x4 coefficients
@ *
@ * @param[in] pu1_pred
@ *  Prediction 4x4 block
@ *
@ * @param[out] pu1_out
@ *  Output 4x4 block
@ *
@ * @param[in] u4_qp_div_6
@ *  QP divided by 6
@ *
@ * @param[in] pu2_weigh_mat
@ *  Pointer to weight matrix
@ *
@ * @param[in] pred_strd
@ *  Prediction stride
@ *
@ * @param[in] out_strd
@ *  Output stride
@ *
@ * @param[in] pi2_tmp
@ *  Temporary buffer of size 1*16
@ *
@ * @param[in] pu2_iscal_mat
@ *  Pointer to the inverse quantization matrix
@ *
@ * @param[in] pi2_dc_src
@ *  Pointer to the dc value for this block
@ *
@ * @returns  Void
@ *
@ * @remarks
@ *  None
@ *
@ *******************************************************************************
@ *
@void ih264_iquant_itrans_recon_chroma_4x4(WORD16 *pi2_src,
@                                   UWORD8 *pu1_pred,
@                                   UWORD8 *pu1_out,
@                                   WORD32 pred_strd,
@                                   WORD32 out_strd,
@                                   const UWORD16 *pu2_iscal_mat,
@                                   const UWORD16 *pu2_weigh_mat,
@                                   UWORD32 u4_qp_div_6,
@                                   WORD32 *pi4_tmp,
@                                   WORD16 *pi2_dc_src)
@**************Variables Vs Registers*****************************************
@r0 => *pi2_src
@r1 => *pu1_pred
@r2 => *pu1_out
@r3 =>  pred_strd
@r4 =>  out_strd
@r5 =>  *pu2_iscal_mat
@r6 =>  *pu2_weigh_mat
@r7 =>  u4_qp_div_6
@r8 =>  *pi2_dc_src

    .global ih264_iquant_itrans_recon_chroma_4x4_a9
ih264_iquant_itrans_recon_chroma_4x4_a9:

@VLD4.S16 is used because the pointer is incremented by SUB_BLK_WIDTH_4x4.
@If the macro value changes, the instruction needs to change accordingly.
@Only one shift is done in the horizontal inverse transform because:
@if u4_qp_div_6 is less than 4, the shift value will be negative, so a right shift is done; in this case rnd_factor is non-zero;
@if u4_qp_div_6 is greater than or equal to 4, the shift value will be positive, so a left shift is done; here rnd_factor is 0.
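@ The dequant math here is the same as in the luma 4x4 sketch above; only
@ the DC handling differs: the DC term is always taken from pi2_dc_src
@ (see the VMOV below), since chroma DC is dequantized by a separate
@ 2x2 transform path.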

    stmfd         sp!, {r4-r12, r14}    @stack stores the values of the arguments
    ldr           r7, [sp, #52]         @Loads u4_qp_div_6
    ldr           r4, [sp, #40]         @Loads out_strd
    vdup.s32      q15, r7               @Populate the u4_qp_div_6 in Q15
    ldr           r5, [sp, #44]         @Loads *pu2_iscal_mat
    ldr           r6, [sp, #48]         @Loads *pu2_weigh_mat
    ldr           r8, [sp, #60]         @Loads *pi2_dc_src

    vpush         {d8-d15}
@=======================DEQUANT FROM HERE===================================

    vld4.s16      {d20, d21, d22, d23}, [r5] @Load pu2_iscal_mat[i], i = 0..15
    vld4.s16      {d26, d27, d28, d29}, [r6] @pu2_weigh_mat[i], i = 0..15
    vmul.s16      q10, q10, q13         @x[i]=(scale[i] * dequant[i]) where i = 0..7
    vld4.s16      {d16, d17, d18, d19}, [r0] @pi2_src_tmp[i], i = 0..15

    vmul.s16      q11, q11, q14         @x[i]=(scale[i] * dequant[i]) where i = 8..15

    vmull.s16     q0, d16, d20          @ Q0  = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
    vmull.s16     q1, d17, d21          @ Q1  = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
    vmull.s16     q2, d18, d22          @ Q2  = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
    vmull.s16     q3, d19, d23          @ Q3  = p[i] = (x[i] * trns_coeff[i]) where i = 12..15

    vshl.s32      q0, q0, q15           @ Q0  = q[i] = (p[i] << (qP/6)) where i = 0..3
    vshl.s32      q1, q1, q15           @ Q1  = q[i] = (p[i] << (qP/6)) where i = 4..7
    vshl.s32      q2, q2, q15           @ Q2  = q[i] = (p[i] << (qP/6)) where i = 8..11
    vshl.s32      q3, q3, q15           @ Q3  = q[i] = (p[i] << (qP/6)) where i = 12..15

    vqrshrn.s32   d0, q0, #0x4          @ D0  = c[i] = ((q[i] + 8) >> 4) where i = 0..3
    vqrshrn.s32   d1, q1, #0x4          @ D1  = c[i] = ((q[i] + 8) >> 4) where i = 4..7
    vqrshrn.s32   d2, q2, #0x4          @ D2  = c[i] = ((q[i] + 8) >> 4) where i = 8..11
    vqrshrn.s32   d3, q3, #0x4          @ D3  = c[i] = ((q[i] + 8) >> 4) where i = 12..15

    ldrsh         r9, [r8]              @ Loads signed halfword pi2_dc_src[0]
    vmov.16       d0[0], r9             @ Restore dc value since it's the chroma iq-it path

@========= PROCESS IDCT FROM HERE =======
@Steps for Stage 1:
@------------------
    vld2.8        {d28, d29}, [r1], r3  @I row Load pu1_pred buffer
    vadd.s16      d4, d0, d2            @x0 = q0 + q1;

    vsub.s16      d5, d0, d2            @x1 = q0 - q1;

    vshr.s16      d8, d1, #1            @q0>>1
    vshr.s16      d9, d3, #1            @q1>>1

    vsub.s16      d6, d8, d3            @x2 = (q0 >> 1) - q1;
    vadd.s16      d7, d1, d9            @x3 = q0 + (q1 >> 1);
    vld2.8        {d29, d30}, [r1], r3  @II row Load pu1_pred buffer

    vswp          d6, d7                @Reverse positions of x2 and x3

    vsub.s16      q6, q2, q3            @x0-x3 and x1-x2 combined
    vtrn.32       d28, d29              @ D28 -- rows I and II of pu1_pred buffer
    vadd.s16      q5, q2, q3            @x0+x3 and x1+x2 combined

    vld2.8        {d29, d30}, [r1], r3  @III row Load pu1_pred buffer

    vswp          d12, d13
@Steps for Stage 2:
@------------------
    vtrn.16       d10, d11
    vtrn.16       d12, d13
    vtrn.32       d10, d12
    vtrn.32       d11, d13
    vadd.s16      d14, d10, d12         @x0 = q0 + q1;

    vsub.s16      d15, d10, d12         @x1 = q0 - q1;

    vshr.s16      d18, d11, #1          @q0>>1
    vshr.s16      d19, d13, #1          @q1>>1

    vsub.s16      d16, d18, d13         @x2 = (q0 >> 1) - q1;
    vadd.s16      d17, d11, d19         @x3 = q0 + (q1 >> 1);

    vld2.8        {d30, d31}, [r1], r3  @IV row Load pu1_pred buffer
    vswp          d16, d17              @Reverse positions of x2 and x3

    vsub.s16      q11, q7, q8           @x0-x3 and x1-x2 combined
    vtrn.32       d29, d30              @ D29 -- rows III and IV of pu1_pred buffer
    vadd.s16      q10, q7, q8           @x0+x3 and x1+x2 combined

    vswp          d22, d23

    vrshr.s16     q10, q10, #6          @Round the residue rows by 6 bits
    vrshr.s16     q11, q11, #6

    vaddw.u8      q10, q10, d28         @Add prediction rows I and II
    vaddw.u8      q11, q11, d29         @Add prediction rows III and IV

    vld1.u8       d0, [r2], r4          @Load 4 rows of the out buffer, 8 bytes each
    vld1.u8       d1, [r2], r4
    vld1.u8       d2, [r2], r4
    vld1.u8       d3, [r2], r4

    sub           r2, r2, r4, lsl #2    @Point r2 back to the first output row

    vqmovun.s16   d20, q10              @Saturate the recon values to 8 bits
    vqmovun.s16   d22, q11

    vmovl.u8      q10, d20              @Widen the coeffs back to 16 bits
    vmovl.u8      q11, d22              @so that we can use vbit to copy

    vmov.u16      q14, #0x00ff          @Mask to copy the lsb of each halfword of the quantized (long) coeffs

    vbit.u8       q0, q10, q14          @Insert the recon bytes into alternate
    vbit.u8       q1, q11, q14          @byte lanes of the interleaved out rows

    vst1.u8       d0, [r2], r4
    vst1.u8       d1, [r2], r4
    vst1.u8       d2, [r2], r4
    vst1.u8       d3, [r2]
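@ Net effect of the interleaved store above, as a C sketch: the chroma
@ plane holds interleaved U/V samples, so only every other byte of each
@ output row belongs to this block (u1_recon is a hypothetical 4x4 array
@ of reconstructed samples):
@
@ for(i = 0; i < 4; i++)
@     for(j = 0; j < 4; j++)
@         pu1_out[i * out_strd + 2 * j] = u1_recon[i][j]; /* other bytes untouched */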

    vpop          {d8-d15}
    ldmfd         sp!, {r4-r12, r15}    @Reload the registers from SP


@*
@ *******************************************************************************
@ *
@ * @brief
@ *  This function performs inverse quant and inverse transform of type Ci8 on an 8*8 block
@ *
@ * @par Description:
@ *  Performs the inverse transform Ci8 and adds the residue to the prediction
@ *  to get the reconstructed block
@ *
@ * @param[in] pi2_src
@ *  Input 8x8 coefficients
@ *
@ * @param[in] pu1_pred
@ *  Prediction 8x8 block
@ *
@ * @param[out] pu1_out
@ *  Output 8x8 block
@ *
@ * @param[in] u4_qp_div_6
@ *  QP divided by 6
@ *
@ * @param[in] pu2_weigh_mat
@ *  Pointer to weight matrix
@ *
@ * @param[in] pred_strd
@ *  Prediction stride
@ *
@ * @param[in] out_strd
@ *  Output stride
@ *
@ * @param[in] pi2_tmp
@ *  Temporary buffer of size 1*64
@ *
@ * @param[in] pu2_iscal_mat
@ *  Pointer to the inverse quantization matrix
@ *
@ * @returns  Void
@ *
@ * @remarks
@ *  None
@ *
@ *******************************************************************************
@ *
@void ih264_iquant_itrans_recon_8x8(WORD16 *pi2_src,
@                                   UWORD8 *pu1_pred,
@                                   UWORD8 *pu1_out,
@                                   WORD32 pred_strd,
@                                   WORD32 out_strd,
@                                   const UWORD16 *pu2_iscal_mat,
@                                   const UWORD16 *pu2_weigh_mat,
@                                   UWORD32 u4_qp_div_6,
@                                   WORD32 *pi4_tmp,
@                                   WORD32 iq_start_idx)
@**************Variables Vs Registers*****************************************
@r0 => *pi2_src
@r1 => *pu1_pred
@r2 => *pu1_out
@r3 =>  pred_strd
@r4 =>  out_strd
@r5 =>  *pu2_iscal_mat
@r6 =>  *pu2_weigh_mat
@r7 =>  u4_qp_div_6
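@ For orientation, a C sketch of one pass of the 8-point inverse transform
@ computed below (w0..w7 are one row/column of dequantized coefficients;
@ the y*/z*/x* names match the register comments in the code):
@
@ y0 = w0 + w4;                      y2 = w0 - w4;
@ y4 = (w2 >> 1) - w6;               y6 = w2 + (w6 >> 1);
@ y1 = -w3 + w5 - w7 - (w7 >> 1);    y3 = w1 + w7 - w3 - (w3 >> 1);
@ y5 = -w1 + w7 + w5 + (w5 >> 1);    y7 = w3 + w5 + w1 + (w1 >> 1);
@
@ z0 = y0 + y6;          z6 = y0 - y6;
@ z2 = y2 + y4;          z4 = y2 - y4;
@ z1 = y1 + (y7 >> 2);   z7 = y7 - (y1 >> 2);
@ z3 = y3 + (y5 >> 2);   z5 = (y3 >> 2) - y5;
@
@ x0 = z0 + z7;  x1 = z2 + z5;  x2 = z4 + z3;  x3 = z6 + z1;
@ x4 = z6 - z1;  x5 = z4 - z3;  x6 = z2 - z5;  x7 = z0 - z7;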

    .global ih264_iquant_itrans_recon_8x8_a9
ih264_iquant_itrans_recon_8x8_a9:

    stmfd         sp!, {r4-r12, r14}    @stack stores the values of the arguments
    ldr           r7, [sp, #52]         @Loads u4_qp_div_6
    ldr           r4, [sp, #40]         @Loads out_strd

    ldr           r5, [sp, #44]         @Loads *pu2_iscal_mat
    ldr           r6, [sp, #48]         @Loads *pu2_weigh_mat
    vdup.s32      q15, r7               @Populate the u4_qp_div_6 in Q15
    vpush         {d8-d15}

idct_8x8_begin:

@========= DEQUANT FROM HERE ===========
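@ The dequant step below follows the same pattern as the 4x4 case, but with
@ a rounding shift of 6 (matching VQRSHRN #6); a rough C sketch, with
@ pi2_coeff again a hypothetical output array:
@
@ for(i = 0; i < 64; i++)
@ {
@     WORD32 q = (pi2_src[i] * pu2_iscal_mat[i] * pu2_weigh_mat[i])
@                    << u4_qp_div_6;
@     pi2_coeff[i] = (WORD16)((q + 32) >> 6);   /* c[i] in the comments */
@ }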

    vld1.32       {q13}, [r5]!          @ Q13 = dequant values row 0
    vld1.32       {q10}, [r6]!          @ Q10 = scaling factors row 0
    vld1.32       {q14}, [r5]!          @ Q14 = dequant values row 1
    vmul.s16      q10, q10, q13         @ Q10 = x[i] = (scale[i] * dequant[i]) where i = 0..7
    vld1.32       {q11}, [r6]!          @ Q11 = scaling factors row 1
    vld1.32       {q8}, [r0]!           @ Q8  = Source row 0
    vmul.s16      q11, q11, q14         @ Q11 = x[i] = (scale[i] * dequant[i]) where i = 8..15
    vmull.s16     q0, d16, d20          @ Q0  = p[i] = (x[i] * trns_coeff[i]) where i = 0..3
    vld1.32       {q9}, [r0]!           @ Q9  = Source row 1
    vmull.s16     q1, d17, d21          @ Q1  = p[i] = (x[i] * trns_coeff[i]) where i = 4..7
    vmull.s16     q2, d18, d22          @ Q2  = p[i] = (x[i] * trns_coeff[i]) where i = 8..11
    vld1.32       {q13}, [r6]!          @ Scaling factors row 2
    vmull.s16     q3, d19, d23          @ Q3  = p[i] = (x[i] * trns_coeff[i]) where i = 12..15
    vld1.32       {q14}, [r6]!          @ Scaling factors row 3
    vshl.s32      q0, q0, q15           @ Q0  = q[i] = (p[i] << (qP/6)) where i = 0..3
    vld1.32       {q10}, [r5]!          @ Q10 = Dequant values row 2
    vshl.s32      q1, q1, q15           @ Q1  = q[i] = (p[i] << (qP/6)) where i = 4..7
    vld1.32       {q8}, [r0]!           @ Source row 2
    vshl.s32      q2, q2, q15           @ Q2  = q[i] = (p[i] << (qP/6)) where i = 8..11
    vld1.32       {q11}, [r5]!          @ Q11 = Dequant values row 3
    vshl.s32      q3, q3, q15           @ Q3  = q[i] = (p[i] << (qP/6)) where i = 12..15
    vld1.32       {q9}, [r0]!           @ Source row 3
    vmul.s16      q10, q10, q13         @ Dequant row 2 * scale matrix row 2
    vmul.s16      q11, q11, q14         @ Dequant row 3 * scale matrix row 3
    vld1.32       {q4}, [r6]!           @ Scaling factors row 4
    vqrshrn.s32   d0, q0, #0x6          @ D0  = c[i] = ((q[i] + 32) >> 6) where i = 0..3
    vqrshrn.s32   d1, q1, #0x6          @ D1  = c[i] = ((q[i] + 32) >> 6) where i = 4..7
    vld1.32       {q5}, [r6]!           @ Scaling factors row 5
    vqrshrn.s32   d2, q2, #0x6          @ D2  = c[i] = ((q[i] + 32) >> 6) where i = 8..11
    vqrshrn.s32   d3, q3, #0x6          @ D3  = c[i] = ((q[i] + 32) >> 6) where i = 12..15
    vld1.32       {q13}, [r5]!          @ Q13 = Dequant values row 4
    vmull.s16     q2, d16, d20          @ p[i] = (x[i] * trns_coeff[i]) where i = 16..19
    vmull.s16     q3, d17, d21          @ p[i] = (x[i] * trns_coeff[i]) where i = 20..23
    vld1.32       {q12}, [r5]!          @ Q12 = Dequant values row 5
    vmull.s16     q6, d18, d22          @ p[i] = (x[i] * trns_coeff[i]) where i = 24..27
    vmull.s16     q7, d19, d23          @ p[i] = (x[i] * trns_coeff[i]) where i = 28..31

    vld1.32       {q14}, [r0]!          @ Source row 4
    vmul.s16      q10, q4, q13          @ Dequant row 4 * scale matrix row 4
    vmul.s16      q11, q5, q12          @ Dequant row 5 * scale matrix row 5
    vld1.32       {q9}, [r0]!           @ Source row 5
    vshl.s32      q2, q2, q15           @
    vshl.s32      q3, q3, q15           @
    vld1.32       {q13}, [r6]!          @ Scaling factors row 6
    vshl.s32      q6, q6, q15           @
    vshl.s32      q7, q7, q15           @
    vmull.s16     q4, d28, d20          @ i = 32..35
    vqrshrn.s32   d4, q2, #0x6          @ D4  = c[i] = ((q[i] + 32) >> 6) where i = 16..19
    vqrshrn.s32   d5, q3, #0x6          @ D5  = c[i] = ((q[i] + 32) >> 6) where i = 20..23
    vmull.s16     q5, d29, d21          @ i = 36..39
    vld1.32       {q10}, [r5]!          @ Dequant values row 6
    vqrshrn.s32   d6, q6, #0x6          @ D6  = c[i] = ((q[i] + 32) >> 6) where i = 24..27
    vqrshrn.s32   d7, q7, #0x6          @ D7  = c[i] = ((q[i] + 32) >> 6) where i = 28..31
    vld1.32       {q14}, [r6]!          @ Scaling factors row 7
    vmull.s16     q6, d18, d22          @
    vld1.32       {q8}, [r0]!           @ Source row 6
    vmull.s16     q7, d19, d23          @
    vld1.32       {q11}, [r5]!          @ Dequant values row 7
    vshl.s32      q4, q4, q15           @
    vld1.32       {q9}, [r0]!           @ Source row 7
    vshl.s32      q5, q5, q15           @

    vshl.s32      q6, q6, q15           @
    vshl.s32      q7, q7, q15           @
    vmul.s16      q10, q10, q13         @ Dequant * scaling row 6
    vmul.s16      q11, q11, q14         @ Dequant * scaling row 7
    vqrshrn.s32   d8, q4, #0x6          @ D8  = c[i] = ((q[i] + 32) >> 6) where i = 32..35
    vqrshrn.s32   d9, q5, #0x6          @ D9  = c[i] = ((q[i] + 32) >> 6) where i = 36..39
    vqrshrn.s32   d10, q6, #0x6         @ D10 = c[i] = ((q[i] + 32) >> 6) where i = 40..43
    vqrshrn.s32   d11, q7, #0x6         @ D11 = c[i] = ((q[i] + 32) >> 6) where i = 44..47
    vmull.s16     q6, d16, d20          @ i = 48..51
    vmull.s16     q7, d17, d21          @ i = 52..55
    vmull.s16     q8, d18, d22          @ i = 56..59
    vmull.s16     q9, d19, d23          @ i = 60..63
    vshl.s32      q6, q6, q15           @
    vzip.s16      q0, q1                @Transpose
    vshl.s32      q7, q7, q15           @
    vshl.s32      q8, q8, q15           @
    vzip.s16      q2, q3                @
    vshl.s32      q9, q9, q15           @
    vqrshrn.s32   d12, q6, #0x6         @ D12 = c[i] = ((q[i] + 32) >> 6) where i = 48..51
    vzip.s16      q4, q5                @Transpose
    vqrshrn.s32   d13, q7, #0x6         @ D13 = c[i] = ((q[i] + 32) >> 6) where i = 52..55
    vqrshrn.s32   d14, q8, #0x6         @ D14 = c[i] = ((q[i] + 32) >> 6) where i = 56..59
    vzip.s32      q0, q2                @Transpose
    vqrshrn.s32   d15, q9, #0x6         @ D15 = c[i] = ((q[i] + 32) >> 6) where i = 60..63

@========= PROCESS IDCT FROM HERE =======

@Steps for Stage 2:
@------------------

@   TRANSPOSE 8x8 coeffs to actual order
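@ The VZIP.16/VZIP.32/VSWP sequence (partly interleaved with the dequant
@ above) is an in-register 8x8 transpose: after it, Q0..Q7 hold the rows
@ of the coefficient block in order, so each transform pass can operate on
@ full-width vectors.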

    vzip.s16      q6, q7                @

    vzip.s32      q1, q3                @
    vzip.s32      q4, q6                @
    vzip.s32      q5, q7                @

    vswp          d1, d8                @ Q0/Q1 = Row order x0/x1
    vswp          d3, d10               @ Q2/Q3 = Row order x2/x3
    vswp          d5, d12               @ Q4/Q5 = Row order x4/x5
    vswp          d7, d14               @ Q6/Q7 = Row order x6/x7

    vswp          q1, q4                @
    vshr.s16      q10, q2, #0x1         @
    vswp          q3, q6                @

@Steps for Stage 1:
@------------------

    vadd.s16      q8, q0, q4            @ Q8 = y0
    vsub.s16      q9, q0, q4            @ Q9 = y2

    vsra.s16      q2, q6, #0x1          @ Q2 = y6
    vsub.s16      q6, q10, q6           @ Q6 = y4

    vaddl.s16     q12, d14, d2          @ y3 (0-3) 1+7
    vaddl.s16     q13, d15, d3          @ y3 (4-7) 1+7

    vsubl.s16     q10, d14, d2          @ y5 (0-3) 7-1
    vsubl.s16     q11, d15, d3          @ y5 (4-7) 7-1

    vadd.s16      q0, q8, q2            @ Q0 = z0
    vsub.s16      q4, q8, q2            @ Q4 = z6

    vadd.s16      q8, q9, q6            @ Q8 = z2
    vsub.s16      q2, q9, q6            @ Q2 = z4

    vsubw.s16     q12, q12, d6          @ y3 (0-3) 1+7-3
    vsubw.s16     q13, q13, d7          @ y3 (4-7) 1+7-3

    vshr.s16      q6, q3, #0x1          @

    vaddw.s16     q10, q10, d10         @
    vaddw.s16     q11, q11, d11         @

    vshr.s16      q9, q5, #0x1          @

    vsubw.s16     q12, q12, d12         @
    vsubw.s16     q13, q13, d13         @

    vaddw.s16     q10, q10, d18         @
    vaddw.s16     q11, q11, d19         @

    vqmovn.s32    d12, q12              @
    vaddl.s16     q12, d10, d6          @
    vqmovn.s32    d13, q13              @ Q6 = y3
    vaddl.s16     q13, d11, d7          @
    vqmovn.s32    d18, q10              @
    vsubl.s16     q10, d10, d6          @
    vqmovn.s32    d19, q11              @ Q9 = y5
    vsubl.s16     q11, d11, d7          @

    vshr.s16      q3, q6, #0x2          @

    vsra.s16      q6, q9, #0x2          @ Q6 = z3

    vaddw.s16     q12, q12, d2          @
    vaddw.s16     q13, q13, d3          @

    vshr.s16      q1, #0x1              @

    vsub.s16      q5, q3, q9            @ Q5 = z5

    vsubw.s16     q10, q10, d14         @
    vsubw.s16     q11, q11, d15         @

    vshr.s16      q7, #0x1              @

    vaddw.s16     q12, q12, d2          @
    vaddw.s16     q13, q13, d3          @

    vsubw.s16     q10, q10, d14         @
    vsubw.s16     q11, q11, d15         @

    vqmovn.s32    d14, q12              @
    vadd.s16      q1, q8, q5            @ Q1 = x1
    vqmovn.s32    d15, q13              @ Q7 = y7
    vsub.s16      q3, q8, q5            @ Q3 = x6
    vqmovn.s32    d18, q10              @
    vsub.s16      q5, q2, q6            @ Q5 = x5
    vqmovn.s32    d19, q11              @ Q9 = y1
    vadd.s16      q2, q2, q6            @ Q2 = x2

    vshr.s16      q12, q9, #0x2         @
    vsra.s16      q9, q7, #0x2          @ Q9 = z1

    vsub.s16      q11, q7, q12          @ Q11 = z7

    vadd.s16      q6, q4, q9            @ Q6 = x3
    vsub.s16      q4, q4, q9            @ Q4 = x4

    vsub.s16      q7, q0, q11           @ Q7 = x7
    vadd.s16      q0, q0, q11           @ Q0 = x0

    vswp.s16      q3, q6                @ Q3 = x3, Q6 = x6


@Steps for Stage 2:
@------------------

@   TRANSPOSE 8x8 coeffs to actual order

    vzip.s16      q0, q1                @
    vzip.s16      q2, q3                @
    vzip.s16      q4, q5                @
    vzip.s16      q6, q7                @

    vzip.s32      q0, q2                @
    vzip.s32      q1, q3                @
    vzip.s32      q4, q6                @
    vzip.s32      q5, q7                @

    vswp          d1, d8                @ Q0/Q1 = Row order x0/x1
    vswp          d3, d10               @ Q2/Q3 = Row order x2/x3
    vswp          d5, d12               @ Q4/Q5 = Row order x4/x5
    vswp          d7, d14               @ Q6/Q7 = Row order x6/x7

    vswp          q1, q4                @
    vshr.s16      q10, q2, #0x1         @
    vswp          q3, q6                @

@Steps for Stage 3:
@------------------

@Repeat stage 1 again for vertical transform

    vadd.s16      q8, q0, q4            @ Q8 = y0
    vld1.32       d28, [r1], r3         @ Load row 0 of pu1_pred
    vsub.s16      q9, q0, q4            @ Q9 = y2

    vsra.s16      q2, q6, #0x1          @ Q2 = y6
    vsub.s16      q6, q10, q6           @ Q6 = y4

    vaddl.s16     q12, d14, d2          @
    vld1.32       d29, [r1], r3         @ Load row 1 of pu1_pred
    vaddl.s16     q13, d15, d3          @

    vsubl.s16     q10, d14, d2          @
    vld1.32       d30, [r1], r3         @ Load row 2 of pu1_pred
    vsubl.s16     q11, d15, d3          @

    vadd.s16      q0, q8, q2            @ Q0 = z0
    vld1.32       d31, [r1], r3         @ Load row 3 of pu1_pred
    vsub.s16      q4, q8, q2            @ Q4 = z6

    vadd.s16      q8, q9, q6            @ Q8 = z2
    vsub.s16      q2, q9, q6            @ Q2 = z4

    vsubw.s16     q12, q12, d6          @
    vsubw.s16     q13, q13, d7          @

    vshr.s16      q6, q3, #0x1          @

    vaddw.s16     q10, q10, d10         @
    vaddw.s16     q11, q11, d11         @

    vshr.s16      q9, q5, #0x1          @

    vsubw.s16     q12, q12, d12         @
    vsubw.s16     q13, q13, d13         @

    vaddw.s16     q10, q10, d18         @
    vaddw.s16     q11, q11, d19         @

    vqmovn.s32    d12, q12              @
    vaddl.s16     q12, d10, d6          @
    vqmovn.s32    d13, q13              @ Q6 = y3
    vaddl.s16     q13, d11, d7          @
    vqmovn.s32    d18, q10              @
    vsubl.s16     q10, d10, d6          @
    vqmovn.s32    d19, q11              @ Q9 = y5
    vsubl.s16     q11, d11, d7          @

    vshr.s16      q3, q6, #0x2          @

    vsra.s16      q6, q9, #0x2          @ Q6 = z3

    vaddw.s16     q12, q12, d2          @
    vaddw.s16     q13, q13, d3          @

    vshr.s16      q1, #0x1              @

    vsub.s16      q5, q3, q9            @ Q5 = z5

    vsubw.s16     q10, q10, d14         @
    vsubw.s16     q11, q11, d15         @

    vshr.s16      q7, #0x1              @

    vaddw.s16     q12, q12, d2          @
    vaddw.s16     q13, q13, d3          @

    vsubw.s16     q10, q10, d14         @
    vsubw.s16     q11, q11, d15         @

    vqmovn.s32    d14, q12              @
    vadd.s16      q1, q8, q5            @ Q1 = x1
    vqmovn.s32    d15, q13              @ Q7 = y7
    vsub.s16      q3, q8, q5            @ Q3 = x6
    vqmovn.s32    d18, q10              @
    vsub.s16      q5, q2, q6            @ Q5 = x5
    vqmovn.s32    d19, q11              @ Q9 = y1
    vadd.s16      q2, q2, q6            @ Q2 = x2

    vshr.s16      q12, q9, #0x2         @
    vsra.s16      q9, q7, #0x2          @ Q9 = z1

    vsub.s16      q11, q7, q12          @ Q11 = z7

    vadd.s16      q6, q4, q9            @ Q6 = x3
    vsub.s16      q4, q4, q9            @ Q4 = x4

    vsub.s16      q7, q0, q11           @ Q7 = x7
    vadd.s16      q0, q0, q11           @ Q0 = x0

    vswp.s16      q3, q6                @ Q3 <-> Q6

    vrshr.s16     q1, q1, #6            @Round the residue rows by 6 bits
    vld1.32       d16, [r1], r3         @ Load row 4 of pu1_pred
    vrshr.s16     q2, q2, #6            @
    vrshr.s16     q4, q4, #6            @
    vld1.32       d17, [r1], r3         @ Load row 5 of pu1_pred
    vrshr.s16     q5, q5, #6            @
    vrshr.s16     q7, q7, #6            @
    vld1.32       d18, [r1], r3         @ Load row 6 of pu1_pred
    vrshr.s16     q0, q0, #6            @
    vrshr.s16     q3, q3, #6            @
    vld1.32       d19, [r1], r3         @ Load row 7 of pu1_pred
    vrshr.s16     q6, q6, #6            @

@ Reconstruction: add the prediction rows, saturate to 8 bits and store

    vaddw.u8      q0, q0, d28           @ Add pred row 0
    vaddw.u8      q1, q1, d29           @ Add pred row 1
    vaddw.u8      q2, q2, d30           @ Add pred row 2
    vaddw.u8      q3, q3, d31           @ Add pred row 3
    vqmovun.s16   d0, q0
    vaddw.u8      q4, q4, d16           @ Add pred row 4
    vqmovun.s16   d1, q1
    vaddw.u8      q5, q5, d17           @ Add pred row 5
    vqmovun.s16   d2, q2
    vaddw.u8      q6, q6, d18           @ Add pred row 6
    vqmovun.s16   d3, q3
    vaddw.u8      q7, q7, d19           @ Add pred row 7

    vqmovun.s16   d4, q4
    vst1.32       d0, [r2], r4          @ Store row 0 of the reconstructed 8x8 block
    vqmovun.s16   d5, q5
    vst1.32       d1, [r2], r4          @ Store row 1
    vqmovun.s16   d6, q6
    vst1.32       d2, [r2], r4          @ Store row 2
    vqmovun.s16   d7, q7
    vst1.32       d3, [r2], r4          @ Store row 3
    vst1.32       d4, [r2], r4          @ Store row 4
    vst1.32       d5, [r2], r4          @ Store row 5
    vst1.32       d6, [r2], r4          @ Store row 6
    vst1.32       d7, [r2], r4          @ Store row 7

idct_8x8_end:

    vpop          {d8-d15}
    ldmfd         sp!, {r4-r12, r15}    @Reload the registers from SP