Home | History | Annotate | Download | only in armv8
      1 //******************************************************************************
      2 //*
      3 //* Copyright (C) 2015 The Android Open Source Project
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************
     18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 ///**
     21 ///*******************************************************************************
     22 // * //file
     23 // *  ih264_iquant_itrans_recon_a9.s
     24 // *
     25 // * //brief
     26 // *  Contains function definitions for single stage  inverse transform
     27 // *
     28 // * //author
     29 // *  Parthiban V
     30 // *     Mohit
     31 // *  Harinarayanaan
     32 // *
     33 // * //par List of Functions:
     34 // *  - ih264_iquant_itrans_recon_4x4_av8()
     35 // *     - ih264_iquant_itrans_recon_8x8_av8()
     36 // *     - ih264_iquant_itrans_recon_chroma_4x4_av8()
     37 // *
     38 // * //remarks
     39 // *  None
     40 // *
     41 // *******************************************************************************
     42 
     43 .text                           // place the routines below in the code section
     44 .p2align 2                      // align to 2^2 = 4 bytes
     45 .include "ih264_neon_macros.s"  // presumably supplies the push_v_regs / pop_v_regs macros used below -- TODO confirm
     46 
     47 ///*
     48 // *******************************************************************************
     49 // *
     50 // * //brief
     51 // *  This function performs inverse quant and Inverse transform type Ci4 for 4*4 block
     52 // *
     53 // * //par Description:
     54 // *  Performs inverse transform Ci4 and adds the residue to get the
     55 // *  reconstructed block
     56 // *
     57 // * //param[in] pi2_src
     58 // *  Input 4x4 coefficients
     59 // *
     60 // * //param[in] pu1_pred
     61 // *  Prediction 4x4 block
     62 // *
     63 // * //param[out] pu1_out
     64 // *  Output 4x4 block
     65 // *
     66 // * //param[in] u4_qp_div_6
     67 // *     QP
     68 // *
     69 // * //param[in] pu2_weigh_mat
     70 // * Pointer to weight matrix
     71 // *
     72 // * //param[in] pred_strd,
     73 // *  Prediction stride
     74 // *
     75 // * //param[in] out_strd
     76 // *  Output Stride
     77 // *
     78 // * //param[in] pi4_tmp
     79 // * temporary buffer of size 1*16 (never accessed by this assembly implementation)
     80 // *
     81 // * //param[in] pu2_iscal_mat
     82 // * Pointer to the inverse quantization matrix
     83 // *
     84 // * //returns  Void
     85 // *
     86 // * //remarks
     87 // *  None
     88 // *
     89 // *******************************************************************************
     90 // */
     91 //void ih264_iquant_itrans_recon_4x4(WORD16 *pi2_src,
     92 //                                    UWORD8 *pu1_pred,
     93 //                                    UWORD8 *pu1_out,
     94 //                                    WORD32 pred_strd,
     95 //                                    WORD32 out_strd,
     96 //                                    const UWORD16 *pu2_iscal_mat,
     97 //                                    const UWORD16 *pu2_weigh_mat,
     98 //                                    UWORD32 u4_qp_div_6,
     99 //                                    WORD32 *pi4_tmp,
    100 //                                    WORD32 iq_start_idx
    101 //                                    WORD16 *pi2_dc_ld_addr)
    102 //**************Variables Vs Registers*****************************************
    103 //x0 => *pi2_src
    104 //x1 => *pu1_pred
    105 //x2 => *pu1_out
    106 //w3 =>  pred_strd
    107 //w4 =>  out_strd
    108 //x5 => *pu2_iscal_mat
    109 //x6 => *pu2_weigh_mat
    110 //w7 =>  u4_qp_div_6
    111 //[sp, #0]  =>  pi4_tmp         (stack offsets are at function entry; pi4_tmp is never read here)
    112 //[sp, #8]  =>  iq_start_idx    (when it is 1, element 0 is replaced by the value at pi2_dc_ld_addr)
    113 //[sp, #16] => *pi2_dc_ld_addr
    114 //Only one shift is done in horizontal inverse because,
    115 //if u4_qp_div_6 is less than 4 the shift value is negative, i.e. a negative left shift is done; in this case rnd_factor has a value
    116 //if u4_qp_div_6 is greater than 4 the shift value is positive and a left shift is done; here rnd_factor is 0
    117 
    118     .global ih264_iquant_itrans_recon_4x4_av8
    119 ih264_iquant_itrans_recon_4x4_av8:
    120 // Flow: dequantise 16 coefficients -> 4-point butterfly -> 4x4 transpose -> 4-point butterfly -> round, add prediction, clamp, store 4 rows of 4 bytes.
    121     push_v_regs                         // macro defined outside this file; the notes in the chroma routine below say it lowers sp by 64
    122     sxtw      x3, w3                    // pred_strd -> 64 bits, used as the post-index step for x1
    123     sxtw      x4, w4                    // out_strd  -> 64 bits, used as the post-index step for x2
    124 
    125     dup       v30.4s, w7                // v30 = u4_qp_div_6 in every 32-bit lane (shift count for the sshl group below)
    126 
    127     ldr       w8, [sp, #72]             // w8 = iq_start_idx (stack argument; offset is relative to the lowered sp)
    128     sxtw      x8, w8
    129 
    130     ldr       x10, [sp, #80]            // x10 = pi2_dc_ld_addr, the alternate dc address
    131 
    132     subs      x8, x8, #1                // Z is set iff iq_start_idx == 1 (noted as the intra case); consumed by the bne after dequant, no instruction in between writes the flags
    133 
    134 
    135 //=======================DEQUANT FROM HERE===================================
    136 // ld4 de-interleaves with a stride of 4: the k-th register of each group receives elements k, k+4, k+8, k+12.
    137     ld4       {v20.4h - v23.4h}, [x5]   // load pu2_iscal_mat[i], i =0..15
    138     ld4       {v26.4h - v29.4h}, [x6]   // pu2_weigh_mat[i], i =0..15
    139     ld4       {v16.4h - v19.4h}, [x0]   // pi2_src[i], i =0..15
    140 
    141 
    142     mul       v20.4h, v20.4h, v26.4h    // x[i]=(iscal[i] * weigh[i]), 16-bit product, where i = 0,4,8,12
    143     mul       v21.4h, v21.4h, v27.4h    // x[i]=(iscal[i] * weigh[i]), 16-bit product, where i = 1,5,9,13
    144     mul       v22.4h, v22.4h, v28.4h    // x[i]=(iscal[i] * weigh[i]), 16-bit product, where i = 2,6,10,14
    145     mul       v23.4h, v23.4h, v29.4h    // x[i]=(iscal[i] * weigh[i]), 16-bit product, where i = 3,7,11,15
    146 
    147     smull     v0.4s, v16.4h, v20.4h     // p[i] = (x[i] * src[i]) widened to 32 bits, where i = 0,4,8,12
    148     smull     v2.4s, v17.4h, v21.4h     // p[i] = (x[i] * src[i]) widened to 32 bits, where i = 1,5,9,13
    149     smull     v4.4s, v18.4h, v22.4h     // p[i] = (x[i] * src[i]) widened to 32 bits, where i = 2,6,10,14
    150     smull     v6.4s, v19.4h, v23.4h     // p[i] = (x[i] * src[i]) widened to 32 bits, where i = 3,7,11,15
    151 
    152     sshl      v0.4s, v0.4s, v30.4s      // q[i] = (p[i] << u4_qp_div_6) where i = 0,4,8,12
    153     sshl      v2.4s, v2.4s, v30.4s      // q[i] = (p[i] << u4_qp_div_6) where i = 1,5,9,13
    154     sshl      v4.4s, v4.4s, v30.4s      // q[i] = (p[i] << u4_qp_div_6) where i = 2,6,10,14
    155     sshl      v6.4s, v6.4s, v30.4s      // q[i] = (p[i] << u4_qp_div_6) where i = 3,7,11,15
    156 
    157     sqrshrn   v0.4h, v0.4s, #0x4        // d0 = c[i] = ((q[i] + 8) >> 4) saturated to 16 bits, where i = 0,4,8,12
    158     sqrshrn   v1.4h, v2.4s, #0x4        // d1 = c[i] = ((q[i] + 8) >> 4) saturated to 16 bits, where i = 1,5,9,13
    159     sqrshrn   v2.4h, v4.4s, #0x4        // d2 = c[i] = ((q[i] + 8) >> 4) saturated to 16 bits, where i = 2,6,10,14
    160     sqrshrn   v3.4h, v6.4s, #0x4        // d3 = c[i] = ((q[i] + 8) >> 4) saturated to 16 bits, where i = 3,7,11,15
    161 
    162     bne       skip_loading_luma_dc_src  // iq_start_idx != 1: keep the dequantised element 0
    163     ld1       {v0.h}[0], [x10]          // iq_start_idx == 1: element 0 is replaced by the signed halfword pi2_dc_ld_addr[0]
    164 skip_loading_luma_dc_src:
    165 
    166     //========= PROCESS IDCT FROM HERE =======
    167     //Steps for Stage 1:
    168     //------------------
    169     ld1       {v30.s}[0], [x1], x3      // i row load pu1_pred buffer (4 bytes; the shift count in v30 is no longer needed)
    170 
    171     sshr      v8.4h, v1.4h, #1          // d1>>1
    172     sshr      v9.4h, v3.4h, #1          // d3>>1
    173 
    174     add       v4.4h, v0.4h, v2.4h       // x0 = d0 + d2//
    175     sub       v5.4h, v0.4h, v2.4h       // x1 = d0 - d2//
    176     sub       v6.4h, v8.4h, v3.4h       // x2 = (d1 >> 1) -  d3//
    177     add       v7.4h, v1.4h, v9.4h       // x3 =  d1  + (d3 >>  1)//
    178 
    179     ld1       {v30.s}[1], [x1], x3      // ii row load pu1_pred buffer
    180 
    181     add       v10.4h, v4.4h , v7.4h     // x0+x3
    182     add       v11.4h, v5.4h , v6.4h     // x1+x2
    183     sub       v12.4h, v5.4h , v6.4h     // x1-x2
    184     sub       v13.4h, v4.4h , v7.4h     // x0-x3
    185 
    186     ld1       {v31.s}[0], [x1], x3      // iii row load pu1_pred buf
    187 
    188 
    189     //Steps for Stage 2:
    190     //transpose the 4x4 block held in v10..v13
    191     trn1      v4.4h, v10.4h, v11.4h
    192     trn2      v5.4h, v10.4h, v11.4h
    193     trn1      v6.4h, v12.4h, v13.4h
    194     trn2      v7.4h, v12.4h, v13.4h
    195 
    196     trn1      v10.2s, v4.2s, v6.2s      // q0 = lane 0 of each stage-1 result register
    197     trn1      v11.2s, v5.2s, v7.2s      // q1 = lane 1 of each stage-1 result register
    198     trn2      v12.2s, v4.2s, v6.2s      // q2 = lane 2 of each stage-1 result register
    199     trn2      v13.2s, v5.2s, v7.2s      // q3 = lane 3 of each stage-1 result register
    200     //end transpose
    201 
    202     sshr      v18.4h, v11.4h, #1        // q1>>1
    203     sshr      v19.4h, v13.4h, #1        // q3>>1
    204 
    205     add       v14.4h, v10.4h, v12.4h    // x0 = q0 + q2//
    206     sub       v15.4h, v10.4h, v12.4h    // x1 = q0 - q2//
    207     sub       v16.4h, v18.4h, v13.4h    // x2 = (q1 >> 1) -  q3//
    208     add       v17.4h, v11.4h, v19.4h    // x3 = q1 + (q3 >> 1)//
    209 
    210 
    211     ld1       {v31.s}[1], [x1], x3      // iv row load pu1_pred buffer
    212 
    213     add       v20.4h, v14.4h, v17.4h    // x0 + x3 (output row i)
    214     add       v21.4h, v15.4h, v16.4h    // x1 + x2 (output row ii)
    215     sub       v22.4h, v15.4h, v16.4h    // x1 - x2 (output row iii)
    216     sub       v23.4h, v14.4h, v17.4h    // x0 - x3 (output row iv)
    217 
    218     mov       v20.d[1], v21.d[0]        // v20 = rows i and ii packed, matching the layout of v30
    219     mov       v22.d[1], v23.d[0]        // v22 = rows iii and iv packed, matching the layout of v31
    220 
    221     srshr     v20.8h, v20.8h, #6        // (x + 32) >> 6
    222     srshr     v22.8h, v22.8h, #6        // (x + 32) >> 6
    223 
    224     uaddw     v20.8h, v20.8h , v30.8b   // add the zero-extended prediction bytes of rows i, ii
    225     uaddw     v22.8h, v22.8h , v31.8b   // add the zero-extended prediction bytes of rows iii, iv
    226 
    227     sqxtun    v0.8b, v20.8h             // saturate to unsigned 8 bits
    228     sqxtun    v1.8b, v22.8h             // saturate to unsigned 8 bits
    229 
    230     st1       {v0.s}[0], [x2], x4       //i row store the value
    231     st1       {v0.s}[1], [x2], x4       //ii row store the value
    232     st1       {v1.s}[0], [x2], x4       //iii row store the value
    233     st1       {v1.s}[1], [x2]           //iv row store the value
    234 
    235     pop_v_regs                          // counterpart of push_v_regs (macro defined outside this file)
    236     ret
    237 
    238 
    239 ///**
    240 // *******************************************************************************
    241 // *
    242 // * @brief
    243 // *  This function performs inverse quant and Inverse transform type Ci4 for 4*4 block
    244 // *
    245 // * @par Description:
    246 // *  Performs inverse transform Ci4 and adds the residue to get the
    247 // *  reconstructed block
    248 // *
    249 // * @param[in] pi2_src
    250 // *  Input 4x4 coefficients
    251 // *
    252 // * @param[in] pu1_pred
    253 // *  Prediction 4x4 block
    254 // *
    255 // * @param[out] pu1_out
    256 // *  Output 4x4 block
    257 // *
    258 // * @param[in] u4_qp_div_6
    259 // *     QP
    260 // *
    261 // * @param[in] pu2_weigh_mat
    262 // * Pointer to weight matrix
    263 // *
    264 // * @param[in] pred_strd,
    265 // *  Prediction stride
    266 // *
    267 // * @param[in] out_strd
    268 // *  Output Stride
    269 // *
    270 // *@param[in] pi2_tmp
    271 // * temporary buffer of size 1*16
    272 // *
    273 // * @param[in] pu2_iscal_mat
    274 // * Pointer to the inverse quantization matrix
    275 // *
    276 // * @returns  Void
    277 // *
    278 // * @remarks
    279 // *  None
    280 // *
    281 // *******************************************************************************
    282 // */
    283 //void ih264_iquant_itrans_recon_chroma_4x4(WORD16 *pi2_src,
    284 //                                          UWORD8 *pu1_pred,
    285 //                                          UWORD8 *pu1_out,
    286 //                                          WORD32 pred_strd,
    287 //                                          WORD32 out_strd,
    288 //                                          const UWORD16 *pu2_iscal_mat,
    289 //                                          const UWORD16 *pu2_weigh_mat,
    290 //                                          UWORD32 u4_qp_div_6,
    291 //                                          WORD32 *pi4_tmp
    292 //                                          WORD16 *pi2_dc_src)
    293 //**************Variables Vs Registers*****************************************
    294 //x0 => *pi2_src
    295 //x1 => *pu1_pred
    296 //x2 => *pu1_out
    297 //w3 =>  pred_strd
    298 //w4 =>  out_strd
    299 //x5 => *pu2_iscal_mat
    300 //x6 => *pu2_weigh_mat
    301 //w7 =>  u4_qp_div_6
    302 //sp =>  pi4_tmp
    303 //sp#8 => *pi2_dc_src
    304 
    305     .global ih264_iquant_itrans_recon_chroma_4x4_av8
    306 ih264_iquant_itrans_recon_chroma_4x4_av8:
    307 // Flow: dequantise 16 coefficients, take element 0 from pi2_dc_src, run the two 4-point butterfly passes, add the even-indexed prediction bytes and write the result into the even-indexed bytes of 4 output rows.
    308 //VLD4.S16 (ld4 in this AArch64 version) is used because the pointer is incremented by SUB_BLK_WIDTH_4x4
    309 //If the macro value changes, the instruction needs to be changed accordingly.
    310 //Only one shift is done in horizontal inverse because,
    311 //if u4_qp_div_6 is less than 4 the shift value is negative, i.e. a negative left shift is done; in this case rnd_factor has a value
    312 //if u4_qp_div_6 is greater than 4 the shift value is positive and a left shift is done; here rnd_factor is 0
    313 
    314 //at the end of the function, we could have moved 64 bits into the higher 64 bits of a register and done further processing
    315 //but that seems to reduce the number of instructions by only 1. [Since a15 we saw add and sub to be very high throughput,
    316 //all instructions were taken as equal]
    317 
    318     //reduce sp by 64
    319     push_v_regs
    320     sxtw      x3, w3                    // pred_strd -> 64 bits, used as the post-index step for x1
    321     sxtw      x4, w4                    // out_strd  -> 64 bits, used as the post-index step for x2 and x0
    322 
    323     dup       v30.4s, w7                // v30 = u4_qp_div_6 in every 32-bit lane (shift count for the sshl group below)
    324 
    325     //was at sp + 8, hence now at sp+64+8 = sp+72
    326     ldr       x10, [sp, #72]            //Load alternate dc address (pi2_dc_src)
    327 
    328 //=======================DEQUANT FROM HERE===================================
    329 // ld4 de-interleaves with a stride of 4: the k-th register of each group receives elements k, k+4, k+8, k+12.
    330     ld4       {v20.4h - v23.4h}, [x5]   // load pu2_iscal_mat[i], i =0..15
    331     ld4       {v26.4h - v29.4h}, [x6]   // pu2_weigh_mat[i], i =0..15
    332     ld4       {v16.4h - v19.4h}, [x0]   // pi2_src[i], i =0..15
    333 
    334 
    335     mul       v20.4h, v20.4h, v26.4h    // x[i]=(iscal[i] * weigh[i]), 16-bit product, where i = 0,4,8,12
    336     mul       v21.4h, v21.4h, v27.4h    // x[i]=(iscal[i] * weigh[i]), 16-bit product, where i = 1,5,9,13
    337     mul       v22.4h, v22.4h, v28.4h    // x[i]=(iscal[i] * weigh[i]), 16-bit product, where i = 2,6,10,14
    338     mul       v23.4h, v23.4h, v29.4h    // x[i]=(iscal[i] * weigh[i]), 16-bit product, where i = 3,7,11,15
    339 
    340     smull     v0.4s, v16.4h, v20.4h     // p[i] = (x[i] * src[i]) widened to 32 bits, where i = 0,4,8,12
    341     smull     v2.4s, v17.4h, v21.4h     // p[i] = (x[i] * src[i]) widened to 32 bits, where i = 1,5,9,13
    342     smull     v4.4s, v18.4h, v22.4h     // p[i] = (x[i] * src[i]) widened to 32 bits, where i = 2,6,10,14
    343     smull     v6.4s, v19.4h, v23.4h     // p[i] = (x[i] * src[i]) widened to 32 bits, where i = 3,7,11,15
    344 
    345     sshl      v0.4s, v0.4s, v30.4s      // q[i] = (p[i] << u4_qp_div_6) where i = 0,4,8,12
    346     sshl      v2.4s, v2.4s, v30.4s      // q[i] = (p[i] << u4_qp_div_6) where i = 1,5,9,13
    347     sshl      v4.4s, v4.4s, v30.4s      // q[i] = (p[i] << u4_qp_div_6) where i = 2,6,10,14
    348     sshl      v6.4s, v6.4s, v30.4s      // q[i] = (p[i] << u4_qp_div_6) where i = 3,7,11,15
    349 
    350     sqrshrn   v0.4h, v0.4s, #0x4        // d0 = c[i] = ((q[i] + 8) >> 4) saturated to 16 bits, where i = 0,4,8,12
    351     sqrshrn   v1.4h, v2.4s, #0x4        // d1 = c[i] = ((q[i] + 8) >> 4) saturated to 16 bits, where i = 1,5,9,13
    352     sqrshrn   v2.4h, v4.4s, #0x4        // d2 = c[i] = ((q[i] + 8) >> 4) saturated to 16 bits, where i = 2,6,10,14
    353     sqrshrn   v3.4h, v6.4s, #0x4        // d3 = c[i] = ((q[i] + 8) >> 4) saturated to 16 bits, where i = 3,7,11,15
    354 
    355     ld1       {v0.h}[0], [x10]          // element 0 is always replaced by the signed halfword pi2_dc_src[0]
    356 
    357     //========= PROCESS IDCT FROM HERE =======
    358     //Steps for Stage 1:
    359     //------------------
    360 
    361     sshr      v8.4h, v1.4h, #1          // d1>>1
    362     sshr      v9.4h, v3.4h, #1          // d3>>1
    363 
    364     add       v4.4h, v0.4h, v2.4h       // x0 = d0 + d2//
    365     sub       v5.4h, v0.4h, v2.4h       // x1 = d0 - d2//
    366     sub       v6.4h, v8.4h, v3.4h       // x2 = (d1 >> 1) -  d3//
    367     add       v7.4h, v1.4h, v9.4h       // x3 =  d1  + (d3 >>  1)//
    368 
    369 
    370     add       v10.4h, v4.4h , v7.4h     // x0+x3
    371     add       v11.4h, v5.4h , v6.4h     // x1+x2
    372     sub       v12.4h, v5.4h , v6.4h     // x1-x2
    373     sub       v13.4h, v4.4h , v7.4h     // x0-x3
    374 
    375     ld1       {v26.8b}, [x1], x3        // i row load pu1_pred buffer (8 bytes per row)
    376     ld1       {v27.8b}, [x1], x3        // ii row load pu1_pred buffer
    377     ld1       {v28.8b}, [x1], x3        // iii row load pu1_pred buf
    378     ld1       {v29.8b}, [x1], x3        // iv row load pu1_pred buffer
    379 
    380     //Steps for Stage 2:
    381     //transpose the 4x4 block held in v10..v13
    382     trn1      v4.4h, v10.4h, v11.4h
    383     trn2      v5.4h, v10.4h, v11.4h
    384     trn1      v6.4h, v12.4h, v13.4h
    385     trn2      v7.4h, v12.4h, v13.4h
    386 
    387     trn1      v10.2s, v4.2s, v6.2s      // q0 = lane 0 of each stage-1 result register
    388     trn1      v11.2s, v5.2s, v7.2s      // q1 = lane 1 of each stage-1 result register
    389     trn2      v12.2s, v4.2s, v6.2s      // q2 = lane 2 of each stage-1 result register
    390     trn2      v13.2s, v5.2s, v7.2s      // q3 = lane 3 of each stage-1 result register
    391     //end transpose
    392 
    393     sshr      v18.4h, v11.4h, #1        // q1>>1
    394     sshr      v19.4h, v13.4h, #1        // q3>>1
    395 
    396     add       v14.4h, v10.4h, v12.4h    // x0 = q0 + q2//
    397     sub       v15.4h, v10.4h, v12.4h    // x1 = q0 - q2//
    398     sub       v16.4h, v18.4h, v13.4h    // x2 = (q1 >> 1) -  q3//
    399     add       v17.4h, v11.4h, v19.4h    // x3 = q1 + (q3 >> 1)//
    400 
    401     //Backup the output addr (x2 is advanced by the loads below; x0 is free because pi2_src has already been read)
    402     mov       x0, x2
    403 
    404     //load output buffer for interleaving (the odd-indexed bytes are written back unchanged)
    405     ld1       {v10.8b}, [x2], x4
    406     ld1       {v11.8b}, [x2], x4
    407     ld1       {v12.8b}, [x2], x4
    408     ld1       {v13.8b}, [x2]
    409 
    410     add       v20.4h, v14.4h, v17.4h    // x0 + x3 (output row i)
    411     add       v21.4h, v15.4h, v16.4h    // x1 + x2 (output row ii)
    412     sub       v22.4h, v15.4h, v16.4h    // x1 - x2 (output row iii)
    413     sub       v23.4h, v14.4h, v17.4h    // x0 - x3 (output row iv)
    414 
    415     srshr     v20.4h, v20.4h, #6        // (x + 32) >> 6
    416     srshr     v21.4h, v21.4h, #6        // (x + 32) >> 6
    417     srshr     v22.4h, v22.4h, #6        // (x + 32) >> 6
    418     srshr     v23.4h, v23.4h, #6        // (x + 32) >> 6
    419 
    420     //nop       v30.8b                            //dummy for deinterleaving
    421     movi      v31.4h, #0x00ff           //mask for interleaving [copy lower 8 bits of every halfword, i.e. the even-indexed bytes]
    422 
    423     //Extract u/v plane from interleaved data: bytes 0..3 = even-indexed prediction bytes, bytes 4..7 = filler taken from v30 (never stored)
    424     uzp1      v26.8b, v26.8b, v30.8b
    425     uzp1      v27.8b, v27.8b, v30.8b
    426     uzp1      v28.8b, v28.8b, v30.8b
    427     uzp1      v29.8b, v29.8b, v30.8b
    428 
    429     uaddw     v20.8h, v20.8h, v26.8b    // lanes 0..3 = residual + prediction; lanes 4..7 are don't-care
    430     uaddw     v21.8h, v21.8h, v27.8b
    431     uaddw     v22.8h, v22.8h, v28.8b
    432     uaddw     v23.8h, v23.8h, v29.8b
    433 
    434     sqxtun    v0.8b, v20.8h             // saturate to unsigned 8 bits
    435     sqxtun    v1.8b, v21.8h
    436     sqxtun    v2.8b, v22.8h
    437     sqxtun    v3.8b, v23.8h
    438 
    439     //widen the output so that each halfword has 0 in its msb and the value in its lsb
    440     uxtl      v6.8h, v0.8b
    441     uxtl      v7.8h, v1.8b
    442     uxtl      v8.8h, v2.8b
    443     uxtl      v9.8h, v3.8b
    444 
    445     //select lsbs from processed data and msbs from pu1_out loaded data (bit copies only where the mask in v31 is set)
    446     bit       v10.8b, v6.8b, v31.8b
    447     bit       v11.8b, v7.8b, v31.8b
    448     bit       v12.8b, v8.8b, v31.8b
    449     bit       v13.8b, v9.8b, v31.8b
    450 
    451     //store the interleaved result through the saved output address
    452     st1       {v10.8b}, [x0], x4
    453     st1       {v11.8b}, [x0], x4
    454     st1       {v12.8b}, [x0], x4
    455     st1       {v13.8b}, [x0]
    456 
    457     pop_v_regs                          // counterpart of push_v_regs (macro defined outside this file)
    458     ret
    459 
    460 ///*
    461 // *******************************************************************************
    462 // *
    463 // * //brief
    464 // *  This function performs inverse quant and Inverse transform type Ci8 for 8*8 block
    465 // *
    466 // * //par Description:
    467 // *  Performs inverse transform Ci8 and adds the residue to get the
    468 // *  reconstructed block
    469 // *
    470 // * //param[in] pi2_src
    471 // *  Input 8x8 coefficients
    472 // *
    473 // * //param[in] pu1_pred
    474 // *  Prediction 8x8 block
    475 // *
    476 // * //param[out] pu1_out
    477 // *  Output 8x8 block
    478 // *
    479 // * //param[in] u4_qp_div_6
    480 // *     QP
    481 // *
    482 // * //param[in] pu2_weigh_mat
    483 // * Pointer to weight matrix
    484 // *
    485 // * //param[in] pred_strd,
    486 // *  Prediction stride
    487 // *
    488 // * //param[in] out_strd
    489 // *  Output Stride
    490 // *
    491 // *//param[in] pi2_tmp
    492 // * temporary buffer of size 1*64
    493 // *
    494 // * //param[in] pu2_iscal_mat
    495 // * Pointer to the inverse quantization matrix
    496 // *
    497 // * //returns  Void
    498 // *
    499 // * //remarks
    500 // *  None
    501 // *
    502 // *******************************************************************************
    503 // */
    504 //void ih264_iquant_itrans_recon_8x8(WORD16 *pi2_src,
    505 //                                   UWORD8 *pu1_pred,
    506 //                                   UWORD8 *pu1_out,
    507 //                                   WORD32 pred_strd,
    508 //                                   WORD32 out_strd,
    509 //                                   const UWORD16 *pu2_iscal_mat,
    510 //                                   const UWORD16 *pu2_weigh_mat,
    511 //                                   UWORD32 u4_qp_div_6,
    512 //                                   WORD32 *pi4_tmp,
    513 //                                   WORD32 iq_start_idx
    514 //                                   WORD16 *pi2_dc_ld_addr)
    515 //**************Variables Vs Registers*****************************************
    516 //x0       => *pi2_src
    517 //x1       => *pu1_pred
    518 //x2       => *pu1_out
    519 //w3       =>  pred_strd
    520 //w4       =>  out_strd
    521 //x5       =>  *pu2_iscal_mat
    522 //x6       =>  *pu2_weigh_mat
    523 //w7       =>  u4_qp_div_6
    524 //NOT USED =>  pi4_tmp
    525 //NOT USED =>  iq_start_idx
    526 //NOT USED =>  pi2_dc_ld_addr
    527 
    528     .global ih264_iquant_itrans_recon_8x8_av8
    529 ih264_iquant_itrans_recon_8x8_av8:
    530 // Flow: dequantise 64 coefficients -> two passes of (8x8 transpose + 8-point butterfly) -> round, add prediction, clamp, store 8 rows of 8 bytes. No stack argument is read.
    531     push_v_regs                         // macro defined outside this file (v8-v15 are overwritten below)
    532     sxtw      x3, w3                    // pred_strd -> 64 bits, used as the post-index step for x1
    533     sxtw      x4, w4                    // out_strd  -> 64 bits, used as the post-index step for x2
    534 
    535     ld1       {v8.8h -v11.8h}, [x5], #64     // pu2_iscal_mat[0..31]
    536     ld1       {v12.8h-v15.8h}, [x5]          // pu2_iscal_mat[32..63]
    537 
    538     ld1       {v16.8h -v19.8h}, [x6], #64    // pu2_weigh_mat[0..31]
    539     ld1       {v20.8h -v23.8h}, [x6]         // pu2_weigh_mat[32..63]
    540 
    541     mov       x8, #16                   // byte step between loads: 8 halfwords
    542     ld1       {v0.8h}, [x0], x8         // pi2_src[0..7]
    543     ld1       {v1.8h}, [x0], x8         // pi2_src[8..15]
    544     ld1       {v2.8h}, [x0], x8         // pi2_src[16..23]
    545     ld1       {v3.8h}, [x0], x8         // pi2_src[24..31]
    546     ld1       {v4.8h}, [x0], x8         // pi2_src[32..39]
    547     ld1       {v5.8h}, [x0], x8         // pi2_src[40..47]
    548     ld1       {v6.8h}, [x0], x8         // pi2_src[48..55]
    549     ld1       {v7.8h}, [x0]             // pi2_src[56..63]
    550 
    551     mul       v8.8h, v8.8h, v16.8h      // x[i] = iscal[i] * weigh[i], 16-bit product, i = 0..7
    552     mul       v9.8h, v9.8h, v17.8h      // i = 8..15
    553     mul       v10.8h, v10.8h, v18.8h    // i = 16..23
    554     mul       v11.8h, v11.8h, v19.8h    // i = 24..31
    555     mul       v12.8h, v12.8h, v20.8h    // i = 32..39
    556     mul       v13.8h, v13.8h, v21.8h    // i = 40..47
    557     mul       v14.8h, v14.8h, v22.8h    // i = 48..55
    558     mul       v15.8h, v15.8h, v23.8h    // i = 56..63
    559 
    560     smull     v16.4s, v0.4h, v8.4h      // p[i] = src[i] * x[i] widened to 32 bits; smull = low 4 lanes, smull2 = high 4 lanes
    561     smull2    v17.4s, v0.8h, v8.8h
    562     smull     v18.4s, v1.4h, v9.4h
    563     smull2    v19.4s, v1.8h, v9.8h
    564     smull     v20.4s, v2.4h, v10.4h
    565     smull2    v21.4s, v2.8h, v10.8h
    566     smull     v22.4s, v3.4h, v11.4h
    567     smull2    v23.4s, v3.8h, v11.8h
    568     smull     v24.4s, v4.4h, v12.4h
    569     smull2    v25.4s, v4.8h, v12.8h
    570     smull     v26.4s, v5.4h, v13.4h
    571     smull2    v27.4s, v5.8h, v13.8h
    572     smull     v28.4s, v6.4h, v14.4h
    573     smull2    v29.4s, v6.8h, v14.8h
    574     smull     v30.4s, v7.4h, v15.4h
    575     smull2    v31.4s, v7.8h, v15.8h
    576 
    577     dup       v0.4s, w7                 // v0 = u4_qp_div_6 in every 32-bit lane (the coefficients in v0 were already consumed above)
    578 
    579     sshl      v16.4s, v16.4s, v0.4s     // q[i] = p[i] << u4_qp_div_6
    580     sshl      v17.4s, v17.4s, v0.4s
    581     sshl      v18.4s, v18.4s, v0.4s
    582     sshl      v19.4s, v19.4s, v0.4s
    583     sshl      v20.4s, v20.4s, v0.4s
    584     sshl      v21.4s, v21.4s, v0.4s
    585     sshl      v22.4s, v22.4s, v0.4s
    586     sshl      v23.4s, v23.4s, v0.4s
    587     sshl      v24.4s, v24.4s, v0.4s
    588     sshl      v25.4s, v25.4s, v0.4s
    589     sshl      v26.4s, v26.4s, v0.4s
    590     sshl      v27.4s, v27.4s, v0.4s
    591     sshl      v28.4s, v28.4s, v0.4s
    592     sshl      v29.4s, v29.4s, v0.4s
    593     sshl      v30.4s, v30.4s, v0.4s
    594     sshl      v31.4s, v31.4s, v0.4s
    595 
    596     sqrshrn   v0.4h, v16.4s, #6         // c[i] = (q[i] + 32) >> 6 saturated to 16 bits; v0..v7 again hold 8 coefficients each
    597     sqrshrn2  v0.8h, v17.4s, #6
    598     sqrshrn   v1.4h, v18.4s, #6
    599     sqrshrn2  v1.8h, v19.4s, #6
    600     sqrshrn   v2.4h, v20.4s, #6
    601     sqrshrn2  v2.8h, v21.4s, #6
    602     sqrshrn   v3.4h, v22.4s, #6
    603     sqrshrn2  v3.8h, v23.4s, #6
    604     sqrshrn   v4.4h, v24.4s, #6
    605     sqrshrn2  v4.8h, v25.4s, #6
    606     sqrshrn   v5.4h, v26.4s, #6
    607     sqrshrn2  v5.8h, v27.4s, #6
    608     sqrshrn   v6.4h, v28.4s, #6
    609     sqrshrn2  v6.8h, v29.4s, #6
    610     sqrshrn   v7.4h, v30.4s, #6
    611     sqrshrn2  v7.8h, v31.4s, #6
    612 
    613     //loop counter: the transpose + butterfly body runs twice, so v0..v7 leave the loop in the same register/lane orientation they entered with
    614     mov       x8, #2
    615 //1x8 transform
    616 trans_1x8_1d:
    617 
    618     //transpose 8x8 (v0..v7 -> v8..v15) in three steps: 16-bit, 32-bit and 64-bit element interleaves
    619     trn1      v8.8h, v0.8h, v1.8h
    620     trn2      v9.8h, v0.8h, v1.8h
    621     trn1      v10.8h, v2.8h, v3.8h
    622     trn2      v11.8h, v2.8h, v3.8h
    623     trn1      v12.8h, v4.8h, v5.8h
    624     trn2      v13.8h, v4.8h, v5.8h
    625     trn1      v14.8h, v6.8h, v7.8h
    626     trn2      v15.8h, v6.8h, v7.8h
    627 
    628     trn1      v0.4s, v8.4s, v10.4s
    629     trn2      v2.4s, v8.4s, v10.4s
    630     trn1      v1.4s, v9.4s, v11.4s
    631     trn2      v3.4s, v9.4s, v11.4s
    632     trn1      v4.4s, v12.4s, v14.4s
    633     trn2      v6.4s, v12.4s, v14.4s
    634     trn1      v5.4s, v13.4s, v15.4s
    635     trn2      v7.4s, v13.4s, v15.4s
    636 
    637     trn1      v8.2d, v0.2d, v4.2d       //w0 = lane 0 of every input register (pi2_tmp_ptr[0] in the notes below)
    638     trn2      v12.2d, v0.2d, v4.2d      //w4 = lane 4
    639     trn1      v9.2d, v1.2d, v5.2d       //w1 = lane 1
    640     trn2      v13.2d, v1.2d, v5.2d      //w5 = lane 5
    641     trn1      v10.2d, v2.2d, v6.2d      //w2 = lane 2
    642     trn2      v14.2d, v2.2d, v6.2d      //w6 = lane 6
    643     trn1      v11.2d, v3.2d, v7.2d      //w3 = lane 3
    644     trn2      v15.2d, v3.2d, v7.2d      //w7 = lane 7
    645 
    646     // halves (>> 1) of w1, w2, w3, w5, w6, w7
    647     sshr      v16.8h, v9.8h, #1         //(pi2_tmp_ptr[1] >> 1)
    648     sshr      v17.8h, v10.8h, #1        //(pi2_tmp_ptr[2] >> 1)
    649     sshr      v18.8h, v11.8h, #1        //(pi2_tmp_ptr[3] >> 1)
    650     sshr      v19.8h, v13.8h, #1        //(pi2_tmp_ptr[5] >> 1)
    651     sshr      v20.8h, v14.8h, #1        //(pi2_tmp_ptr[6] >> 1)
    652     sshr      v21.8h, v15.8h, #1        //(pi2_tmp_ptr[7] >> 1)
    653 
    654     add       v0.8h, v8.8h, v12.8h      // i_y0 = (pi2_tmp_ptr[0] + pi2_tmp_ptr[4] );
    655     sub       v2.8h, v8.8h, v12.8h      // i_y2 = (pi2_tmp_ptr[0] - pi2_tmp_ptr[4] );
    656 
    657     sub       v4.8h, v17.8h, v14.8h     //i_y4 = ((pi2_tmp_ptr[2] >> 1) - pi2_tmp_ptr[6] );
    658     add       v6.8h, v10.8h, v20.8h     //i_y6 = (pi2_tmp_ptr[2] + (pi2_tmp_ptr[6] >> 1));
    659     // the odd-indexed terms below are accumulated in 32-bit lanes and narrowed back to 16 bits by the xtn group
    660     //-w3 + w5
    661     ssubl     v22.4s, v13.4h, v11.4h
    662     ssubl2    v23.4s, v13.8h, v11.8h
    663     //w3 + w5
    664     saddl     v24.4s, v13.4h, v11.4h
    665     saddl2    v25.4s, v13.8h, v11.8h
    666     //-w1 + w7
    667     ssubl     v26.4s, v15.4h, v9.4h
    668     ssubl2    v27.4s, v15.8h, v9.8h
    669     //w1 + w7
    670     saddl     v28.4s, v15.4h, v9.4h
    671     saddl2    v29.4s, v15.8h, v9.8h
    672 
    673     //-w3 + w5 - w7
    674     ssubw     v22.4s, v22.4s, v15.4h
    675     ssubw2    v23.4s, v23.4s, v15.8h
    676     //w3 + w5 + w1
    677     saddw     v24.4s, v24.4s, v9.4h
    678     saddw2    v25.4s, v25.4s, v9.8h
    679     //-w1 + w7 + w5
    680     saddw     v26.4s, v26.4s, v13.4h
    681     saddw2    v27.4s, v27.4s, v13.8h
    682     //w1 + w7 - w3
    683     ssubw     v28.4s, v28.4s, v11.4h
    684     ssubw2    v29.4s, v29.4s, v11.8h
    685 
    686     //-w3 + w5 - w7 - (w7 >> 1)
    687     ssubw     v22.4s, v22.4s, v21.4h
    688     ssubw2    v23.4s, v23.4s, v21.8h
    689     //w3 + w5 + w1 + (w1 >> 1)
    690     saddw     v24.4s, v24.4s, v16.4h
    691     saddw2    v25.4s, v25.4s, v16.8h
    692     //-w1 + w7 + w5 + (w5 >> 1)
    693     saddw     v26.4s, v26.4s, v19.4h
    694     saddw2    v27.4s, v27.4s, v19.8h
    695     //w1 + w7 - w3 - (w3 >> 1)
    696     ssubw     v28.4s, v28.4s, v18.4h
    697     ssubw2    v29.4s, v29.4s, v18.8h
    698 
    699     xtn       v1.4h, v22.4s             // y1 = -w3 + w5 - w7 - (w7 >> 1), low 16 bits of each lane
    700     xtn2      v1.8h, v23.4s
    701     xtn       v3.4h, v28.4s             // y3 = w1 + w7 - w3 - (w3 >> 1)
    702     xtn2      v3.8h, v29.4s
    703     xtn       v5.4h, v26.4s             // y5 = -w1 + w7 + w5 + (w5 >> 1)
    704     xtn2      v5.8h, v27.4s
    705     xtn       v7.4h, v24.4s             // y7 = w3 + w5 + w1 + (w1 >> 1)
    706     xtn2      v7.8h, v25.4s
    707 
    708     sshr      v16.8h, v1.8h, #2         //(y1 >> 2)
    709     sshr      v17.8h, v3.8h, #2         //(y3 >> 2)
    710     sshr      v18.8h, v5.8h, #2         //(y5 >> 2)
    711     sshr      v19.8h, v7.8h, #2         //(y7 >> 2)
    712 
    713     add       v8.8h, v0.8h, v6.8h       // z0 = y0 + y6
    714     add       v9.8h, v1.8h, v19.8h      // z1 = y1 + (y7 >> 2)
    715     add       v10.8h, v2.8h, v4.8h      // z2 = y2 + y4
    716     add       v11.8h, v3.8h, v18.8h     // z3 = y3 + (y5 >> 2)
    717     sub       v12.8h, v2.8h, v4.8h      // z4 = y2 - y4
    718     sub       v13.8h, v17.8h, v5.8h     // z5 = (y3 >> 2) - y5
    719     sub       v14.8h, v0.8h, v6.8h      // z6 = y0 - y6
    720     sub       v15.8h, v7.8h, v16.8h     // z7 = y7 - (y1 >> 2)
    721 
    722     add       v0.8h, v8.8h, v15.8h      // out0 = z0 + z7
    723     add       v1.8h, v10.8h, v13.8h     // out1 = z2 + z5
    724     add       v2.8h, v12.8h, v11.8h     // out2 = z4 + z3
    725     add       v3.8h, v14.8h, v9.8h      // out3 = z6 + z1
    726     sub       v4.8h, v14.8h, v9.8h      // out4 = z6 - z1
    727     sub       v5.8h, v12.8h, v11.8h     // out5 = z4 - z3
    728     sub       v6.8h, v10.8h, v13.8h     // out6 = z2 - z5
    729     sub       v7.8h, v8.8h, v15.8h      // out7 = z0 - z7
    730 
    731     subs      x8, x8, #1                // one pass done
    732     bne       trans_1x8_1d              // repeat once more for the other direction
    733 
    734     ld1       {v22.8b}, [x1], x3        // pu1_pred rows 0..7, 8 bytes each
    735     ld1       {v23.8b}, [x1], x3
    736     ld1       {v24.8b}, [x1], x3
    737     ld1       {v25.8b}, [x1], x3
    738     ld1       {v26.8b}, [x1], x3
    739     ld1       {v27.8b}, [x1], x3
    740     ld1       {v28.8b}, [x1], x3
    741     ld1       {v29.8b}, [x1]
    742 
    743     srshr     v0.8h, v0.8h, #6          // (x + 32) >> 6
    744     srshr     v1.8h, v1.8h, #6
    745     srshr     v2.8h, v2.8h, #6
    746     srshr     v3.8h, v3.8h, #6
    747     srshr     v4.8h, v4.8h, #6
    748     srshr     v5.8h, v5.8h, #6
    749     srshr     v6.8h, v6.8h, #6
    750     srshr     v7.8h, v7.8h, #6
    751 
    752     uaddw     v0.8h, v0.8h, v22.8b      // add the zero-extended prediction bytes of the matching row
    753     uaddw     v1.8h, v1.8h, v23.8b
    754     uaddw     v2.8h, v2.8h, v24.8b
    755     uaddw     v3.8h, v3.8h, v25.8b
    756     uaddw     v4.8h, v4.8h, v26.8b
    757     uaddw     v5.8h, v5.8h, v27.8b
    758     uaddw     v6.8h, v6.8h, v28.8b
    759     uaddw     v7.8h, v7.8h, v29.8b
    760 
    761     sqxtun    v0.8b, v0.8h              // saturate to unsigned 8 bits
    762     sqxtun    v1.8b, v1.8h
    763     sqxtun    v2.8b, v2.8h
    764     sqxtun    v3.8b, v3.8h
    765     sqxtun    v4.8b, v4.8h
    766     sqxtun    v5.8b, v5.8h
    767     sqxtun    v6.8b, v6.8h
    768     sqxtun    v7.8b, v7.8h
    769 
    770     st1       {v0.8b}, [x2], x4         // pu1_out rows 0..7, 8 bytes each
    771     st1       {v1.8b}, [x2], x4
    772     st1       {v2.8b}, [x2], x4
    773     st1       {v3.8b}, [x2], x4
    774     st1       {v4.8b}, [x2], x4
    775     st1       {v5.8b}, [x2], x4
    776     st1       {v6.8b}, [x2], x4
    777     st1       {v7.8b}, [x2]
    778 
    779     pop_v_regs                          // counterpart of push_v_regs (macro defined outside this file)
    780     ret
    781 
    782 
    783 
    784 
    785