      1 ///*****************************************************************************
      2 //*
      3 //* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************/
     18 ///**
     19 // *******************************************************************************
     20 // * @file
     21 // *  ihevc_itrans_recon_8x8_neon.s
     22 // *
     23 // * @brief
     24 // *  contains function definitions for single stage  inverse transform
     25 // *
     26 // * @author
     27 // *  anand s
     28 // *
     29 // * @par list of functions:
     30 // *  - ihevc_itrans_recon_8x8()
     31 // *
     32 // * @remarks
     33 // *  none
     34 // *
     35 // *******************************************************************************
     36 //*/
     37 
     38 ///**
     39 // *******************************************************************************
     40 // *
     41 // * @brief
     42 // *  this function performs inverse transform  and reconstruction for 8x8
     43 // * input block
     44 // *
     45 // * @par description:
     46 // *  performs inverse transform and adds the prediction  data and clips output
     47 // * to 8 bit
     48 // *
     49 // * @param[in] pi2_src
     50 // *  input 8x8 coefficients
     51 // *
     52 // * @param[in] pi2_tmp
     53 // *  temporary 8x8 buffer for storing
     54 // *  the inverse transform
     55 // *  first stage output
     56 // *
     57 // *
     58 // * @param[in] pu1_pred
     59 // *  prediction 8x8 block
     60 // *
     61 // * @param[out] pu1_dst
     62 // *  output 8x8 block
     63 // *
     64 // * @param[in] src_strd
     65 // *  input stride
     66 // *
     67 // * @param[in] pred_strd
     68 // *  prediction stride
     69 // *
     70 // * @param[in] dst_strd
     71 // *  output stride
     72 // *
     73 // * @param[in] shift
     74 // *  output shift
     75 // *
     76 // * @param[in] zero_cols
     77 // *  zero columns in pi2_src
     78 // *
     79 // * @returns  void
     80 // *
     81 // * @remarks
     82 // *  none
     83 // *
     84 // *******************************************************************************
     85 // */
     86 
     87 //void ihevc_itrans_recon_8x8(word16 *pi2_src,
     88 //                            word16 *pi2_tmp,
     89 //                            uword8 *pu1_pred,
     90 //                            uword8 *pu1_dst,
     91 //                            word32 src_strd,
     92 //                            word32 pred_strd,
     93 //                            word32 dst_strd,
     94 //                            word32 zero_cols,
     95 //                            word32 zero_rows)
     96 
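        //// for reference, a rough scalar sketch of what this routine computes
        //// (illustration only, not part of the build; clip_s16/clip_u8 and the
        //// itrans_coeff[8][8] table are hypothetical stand-ins, not the actual
        //// g_ai2_ihevc_trans_8 data layout):
        //
        //    #include <stdint.h>
        //
        //    static int16_t clip_s16(int32_t x) { return x < -32768 ? -32768 : (x > 32767 ? 32767 : (int16_t)x); }
        //    static uint8_t clip_u8(int32_t x)  { return x < 0 ? 0 : (x > 255 ? 255 : (uint8_t)x); }
        //
        //    static void itrans_recon_8x8_sketch(const int16_t *pi2_src, int16_t *pi2_tmp,
        //                                        const uint8_t *pu1_pred, uint8_t *pu1_dst,
        //                                        int src_strd, int pred_strd, int dst_strd,
        //                                        const int16_t itrans_coeff[8][8])
        //    {
        //        /* stage 1: 8-point inverse transform down each column, rounded shift by 7 */
        //        for (int j = 0; j < 8; j++)
        //            for (int i = 0; i < 8; i++) {
        //                int32_t acc = 0;
        //                for (int k = 0; k < 8; k++)
        //                    acc += itrans_coeff[k][i] * pi2_src[k * src_strd + j];
        //                pi2_tmp[i * 8 + j] = clip_s16((acc + 64) >> 7);
        //            }
        //        /* stage 2: 8-point inverse transform along each row, rounded shift by 12,
        //         * then add the prediction block and clip the result to 8 bit */
        //        for (int i = 0; i < 8; i++)
        //            for (int j = 0; j < 8; j++) {
        //                int32_t acc = 0;
        //                for (int k = 0; k < 8; k++)
        //                    acc += itrans_coeff[k][j] * pi2_tmp[i * 8 + k];
        //                pu1_dst[i * dst_strd + j] =
        //                    clip_u8(pu1_pred[i * pred_strd + j] + ((acc + 2048) >> 12));
        //            }
        //    }
        //
        //// the assembly below keeps everything in registers and additionally uses
        //// zero_rows/zero_cols to skip groups of four rows/columns that are all zero.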
     97 //**************variables vs registers*************************
     98 //    x0 => *pi2_src
     99 //    x1 => *pi2_tmp
    100 //    x2 => *pu1_pred
    101 //    x3 => *pu1_dst
    102 //    src_strd
    103 //    pred_strd
    104 //    dst_strd
    105 //    zero_cols
    106 
    107 
    108 
    109 .text
    110 .align 4
    111 .include "ihevc_neon_macros.s"
    112 
    113 
    114 
    115 .set width_x_size_x5 ,   40
    116 .set width_x_size_x2 ,   32
    117 .set shift_stage1_idct ,   7
    118 .set shift_stage2_idct ,   12
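        // both stages round and saturate with sqrshrn, i.e.
        //     out = sat16((acc + (1 << (shift - 1))) >> shift)
        // so stage 1 adds 64 before >> 7 and stage 2 adds 2048 before >> 12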
    119 
    120 .globl ihevc_itrans_recon_8x8_av8
    121 
    122 .extern g_ai2_ihevc_trans_8_transpose
    123 
    124 .type ihevc_itrans_recon_8x8_av8, %function
    125 
    126 ihevc_itrans_recon_8x8_av8:
    127 //// register usage - loading and first stage (column) idct
    128 ////    cosine constants     -     d0
    129 ////    sine constants         -     d1
    130 ////    row 0 first half     -     d2        -    y0
    131 ////    row 1 first half     -     d6        -    y1
    132 ////    row 2 first half     -     d3        -    y2
    133 ////    row 3 first half     -     d7        -    y3
    134 ////    row 4 first half     -     d10        -    y4
    135 ////    row 5 first half     -     d14        -    y5
    136 ////    row 6 first half     -     d11        -    y6
    137 ////    row 7 first half     -     d15        -    y7
    138 
    139 ////    row 0 second half    -     d4        -    y0
    140 ////    row 1 second half    -     d8      -    y1
    141 ////    row 2 second half    -     d5      -    y2
    142 ////    row 3 second half    -     d9      -    y3
    143 ////    row 4 second half    -     d12     -    y4
    144 ////    row 5 second half    -     d16     -    y5
    145 ////    row 6 second half    -     d13     -    y6
    146 ////    row 7 second half    -     d17     -    y7
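        //// the 1-D 8-point butterfly realised by the smull/smlal/add/sub sequences
        //// below (this only restates the per-instruction comments in one place):
        ////    c0 = cos4*(y0 + y4)              c1 = cos4*(y0 - y4)
        ////    d0 = cos2*y2 + sin2*y6           d1 = sin2*y2 - cos2*y6
        ////    a0 = c0 + d0    a1 = c1 + d1     a2 = c1 - d1    a3 = c0 - d0
        ////    b0 = cos1*y1 + cos3*y3 + sin3*y5 + sin1*y7
        ////    b1 = cos3*y1 - sin1*y3 - cos1*y5 - sin3*y7
        ////    b2 = sin3*y1 - cos1*y3 + sin1*y5 + cos3*y7
        ////    b3 = sin1*y1 - sin3*y3 + cos3*y5 - cos1*y7
        ////    x0 = a0 + b0    x7 = a0 - b0     x1 = a1 + b1    x6 = a1 - b1
        ////    x2 = a2 + b2    x5 = a2 - b2     x3 = a3 + b3    x4 = a3 - b3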
    147 
    148     //// copy the input pointer to another register
    149     //// step 1 : load all constants
    150     // stmfd sp!,{x4-x12,x14}
    151 
    152     ldr         w11, [sp]                   // zero rows
    153 
    154     push_v_regs
    155     stp         x19, x20,[sp,#-16]!
    156 
    157     mov         x12, x7 // zero columns
    158     mov         x8, x5 // prediction stride
    159     mov         x7, x6 // destination stride
    160     mov         x6, x4 // src stride
    161     lsl         x6, x6, #1                  // x sizeof(word16)
    162     add         x9,x0,x6, lsl #1            // 2 rows
    163 
    164     add         x10,x6,x6, lsl #1           // 3 rows
    165 
    166     sub         x10,x10, #8                 // - 4 cols * sizeof(word16)
    167     sub         x5,x6, #8                   // src_strd - 4 cols * sizeof(word16)
    168 
    169     adrp        x14, :got:g_ai2_ihevc_trans_8_transpose
    170     ldr         x14, [x14, #:got_lo12:g_ai2_ihevc_trans_8_transpose]
    171 
    172     ld1         {v0.4h, v1.4h},[x14]        ////d0,d1 are used for storing the constant data
    173 
    174     ////step 2 load all the input data
    175     ////step 3 operate on the first 4 columns at a time
    176 
    177     and         x11,x11,#0xff
    178     and         x12,x12,#0xff
    179 
    180     cmp         x11,#0xf0
    181     bge         skip_last4_rows
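        //// zero_rows/zero_cols carry one flag bit per row/column of pi2_src, a set
        //// bit marking an all-zero row/column; if bits 4-7 are all set (masked value
        //// >= 0xf0) rows 4-7 contribute nothing, so skip_last4_rows loads only rows
        //// 0-3 (the same test on zero_cols at last4_cols skips the last four columns)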
    182 
    183 
    184     ld1         {v2.4h},[x0],#8
    185     ld1         {v3.4h},[x9],#8
    186     ld1         {v4.4h},[x0],x5
    187     smull       v20.4s, v2.4h, v0.h[0]      //// y0 * cos4(part of c0 and c1)
    188     ld1         {v5.4h},[x9],x5
    189     smull       v18.4s, v3.4h, v1.h[2]      //// y2 * sin2 (q3 is freed by this time)(part of d1)
    190     ld1         {v6.4h},[x0],#8
    191     ld1         {v7.4h},[x9],#8
    192     smull       v24.4s, v6.4h, v0.h[1]      //// y1 * cos1(part of b0)
    193     ld1         {v8.4h},[x0],x10
    194     smull       v26.4s, v6.4h, v0.h[3]      //// y1 * cos3(part of b1)
    195     ld1         {v9.4h},[x9],x10
    196     smull       v28.4s, v6.4h, v1.h[1]      //// y1 * sin3(part of b2)
    197     ld1         {v10.4h},[x0],#8
    198     smull       v30.4s, v6.4h, v1.h[3]      //// y1 * sin1(part of b3)
    199     ld1         {v11.4h},[x9],#8
    200     smlal       v24.4s, v7.4h, v0.h[3]      //// y1 * cos1 + y3 * cos3(part of b0)
    201     ld1         {v12.4h},[x0],x5
    202     smlsl       v26.4s, v7.4h, v1.h[3]      //// y1 * cos3 - y3 * sin1(part of b1)
    203     ld1         {v13.4h},[x9],x5
    204     smlsl       v28.4s, v7.4h, v0.h[1]      //// y1 * sin3 - y3 * cos1(part of b2)
    205     ld1         {v14.4h},[x0],#8
    206     smlsl       v30.4s, v7.4h, v1.h[1]      //// y1 * sin1 - y3 * sin3(part of b3)
    207     ld1         {v15.4h},[x9],#8
    208     smull       v22.4s, v10.4h, v0.h[0]     //// y4 * cos4(part of c0 and c1)
    209     ld1         {v16.4h},[x0],x10
    210     smull       v6.4s, v3.4h, v0.h[2]       //// y2 * cos2(part of d0)
    211     ld1         {v17.4h},[x9],x10
    212 
    213     //// the following loads were used when the input was not guaranteed to be aligned
    214 ////    vld1.16        d2,[x0]!
    215 ////    vld1.16        d3,[x2]!
    216 ////    vld1.16        d4,[x0]!
    217 ////    vld1.16        d5,[x2]!
    218 ////    vld1.16        d6,[x0]!
    219 ////    vld1.16        d7,[x2]!
    220 ////    vld1.16        d8,[x0],x3
    221 ////    vld1.16        d9,[x2],x3
    222 ////    vld1.16        d10,[x0]!
    223 ////    vld1.16        d11,[x2]!
    224 ////    vld1.16        d12,[x0]!
    225 ////    vld1.16        d13,[x2]!
    226 ////    vld1.16        d14,[x0]!
    227 ////    vld1.16        d15,[x2]!
    228 ////    vld1.16        d16,[x0],x3
    229 ////    vld1.16        d17,[x2],x3
    230 
    231 
    232 
    233 
    234     smlal       v24.4s, v14.4h, v1.h[1]     //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    235     smlsl       v26.4s, v14.4h, v0.h[1]     //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    236     smlal       v28.4s, v14.4h, v1.h[3]     //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    237     smlal       v30.4s, v14.4h, v0.h[3]     //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
    238 
    239     smlsl       v18.4s, v11.4h, v0.h[2]     //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    240     smlal       v6.4s, v11.4h, v1.h[2]      //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
    241 
    242     add         v10.4s,  v20.4s ,  v22.4s   //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    243     sub         v20.4s,  v20.4s ,  v22.4s   //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
    244 
    245     smlal       v24.4s, v15.4h, v1.h[3]     //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
    246     smlsl       v26.4s, v15.4h, v1.h[1]     //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
    247     smlal       v28.4s, v15.4h, v0.h[3]     //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
    248     smlsl       v30.4s, v15.4h, v0.h[1]     //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
    249 
    250     add         v14.4s,  v10.4s ,  v6.4s    ////    a0 = c0 + d0(part of x0,x7)
    251     sub         v10.4s,  v10.4s ,  v6.4s    //// a3 = c0 - d0(part of x3,x4)
    252     sub         v22.4s,  v20.4s ,  v18.4s   //// a2 = c1 - d1(part of x2,x5)
    253     add         v18.4s,  v20.4s ,  v18.4s   //// a1 = c1 + d1(part of x1,x6)
    254 
    255     add         v20.4s,  v14.4s ,  v24.4s   //// a0 + b0(part of x0)
    256     sub         v6.4s,  v14.4s ,  v24.4s    //// a0 - b0(part of x7)
    257 
    258     add         v24.4s,  v22.4s ,  v28.4s   //// a2 + b2(part of x2)
    259     sub         v22.4s,  v22.4s ,  v28.4s   //// a2 - b2(part of x5)
    260 
    261     add         v28.4s,  v18.4s ,  v26.4s   //// a1 + b1(part of x1)
    262     sub         v18.4s,  v18.4s ,  v26.4s   //// a1 - b1(part of x6)
    263 
    264     add         v26.4s,  v10.4s ,  v30.4s   //// a3 + b3(part of x3)
    265     sub         v30.4s,  v10.4s ,  v30.4s   //// a3 - b3(part of x4)
    266 
    267     sqrshrn     v2.4h, v20.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
    268     sqrshrn     v15.4h, v6.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
    269     sqrshrn     v3.4h, v24.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
    270     sqrshrn     v14.4h, v22.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
    271     sqrshrn     v6.4h, v28.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
    272     sqrshrn     v11.4h, v18.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
    273     sqrshrn     v7.4h, v26.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
    274     sqrshrn     v10.4h, v30.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
    275 
    276 
    277     b           last4_cols
    278 
    279 
    280 
    281 skip_last4_rows:
    282 
    283 
    284 
    285     ld1         {v2.4h},[x0],#8
    286     ld1         {v3.4h},[x9],#8
    287     ld1         {v4.4h},[x0],x5
    288     ld1         {v5.4h},[x9],x5
    289     ld1         {v6.4h},[x0],#8
    290     ld1         {v7.4h},[x9],#8
    291     ld1         {v8.4h},[x0],x10
    292     ld1         {v9.4h},[x9],x10
    293 
    294 
    295 
    296     movi        v12.4h, #0
    297     movi        v13.4h, #0
    298     movi        v16.4h, #0
    299     movi        v17.4h, #0
    300 
    301 
    302 
    303 
    304     smull       v24.4s, v6.4h, v0.h[1]      //// y1 * cos1(part of b0)
    305     smull       v26.4s, v6.4h, v0.h[3]      //// y1 * cos3(part of b1)
    306     smull       v28.4s, v6.4h, v1.h[1]      //// y1 * sin3(part of b2)
    307     smull       v30.4s, v6.4h, v1.h[3]      //// y1 * sin1(part of b3)
    308 
    309     smlal       v24.4s, v7.4h, v0.h[3]      //// y1 * cos1 + y3 * cos3(part of b0)
    310     smlsl       v26.4s, v7.4h, v1.h[3]      //// y1 * cos3 - y3 * sin1(part of b1)
    311     smlsl       v28.4s, v7.4h, v0.h[1]      //// y1 * sin3 - y3 * cos1(part of b2)
    312     smlsl       v30.4s, v7.4h, v1.h[1]      //// y1 * sin1 - y3 * sin3(part of b3)
    313 
    314     smull       v18.4s, v3.4h, v1.h[2]      //// y2 * sin2 (q3 is freed by this time)(part of d1)
    315     smull       v6.4s, v3.4h, v0.h[2]       //// y2 * cos2(part of d0)
    316 
    317     smull       v20.4s, v2.4h, v0.h[0]      //// y0 * cos4(part of c0 and c1)
    318 
    319 
    320     add         v14.4s,  v20.4s ,  v6.4s    ////    a0 = c0 + d0(part of x0,x7)
    321     sub         v10.4s,  v20.4s ,  v6.4s    //// a3 = c0 - d0(part of x3,x4)
    322     sub         v22.4s,  v20.4s ,  v18.4s   //// a2 = c1 - d1(part of x2,x5)
    323     add         v18.4s,  v20.4s ,  v18.4s   //// a1 = c1 + d1(part of x1,x6)
    324 
    325     add         v20.4s,  v14.4s ,  v24.4s   //// a0 + b0(part of x0)
    326     sub         v6.4s,  v14.4s ,  v24.4s    //// a0 - b0(part of x7)
    327 
    328     add         v24.4s,  v22.4s ,  v28.4s   //// a2 + b2(part of x2)
    329     sub         v22.4s,  v22.4s ,  v28.4s   //// a2 - b2(part of x5)
    330 
    331     add         v28.4s,  v18.4s ,  v26.4s   //// a1 + b1(part of x1)
    332     sub         v18.4s,  v18.4s ,  v26.4s   //// a1 - b1(part of x6)
    333 
    334     add         v26.4s,  v10.4s ,  v30.4s   //// a3 + b3(part of x3)
    335     sub         v30.4s,  v10.4s ,  v30.4s   //// a3 - b3(part of x4)
    336 
    337     sqrshrn     v2.4h, v20.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
    338     sqrshrn     v15.4h, v6.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
    339     sqrshrn     v3.4h, v24.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
    340     sqrshrn     v14.4h, v22.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
    341     sqrshrn     v6.4h, v28.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
    342     sqrshrn     v11.4h, v18.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
    343     sqrshrn     v7.4h, v26.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
    344     sqrshrn     v10.4h, v30.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
    345 
    346 
    347 last4_cols:
    348 
    349 
    350     cmp         x12,#0xf0
    351     bge         skip_last4cols
    352 
    353     smull       v24.4s, v8.4h, v0.h[1]      //// y1 * cos1(part of b0)
    354     smull       v26.4s, v8.4h, v0.h[3]      //// y1 * cos3(part of b1)
    355     smull       v28.4s, v8.4h, v1.h[1]      //// y1 * sin3(part of b2)
    356     smull       v30.4s, v8.4h, v1.h[3]      //// y1 * sin1(part of b3)
    357 
    358     smlal       v24.4s, v9.4h, v0.h[3]      //// y1 * cos1 + y3 * cos3(part of b0)
    359     smlsl       v26.4s, v9.4h, v1.h[3]      //// y1 * cos3 - y3 * sin1(part of b1)
    360     smlsl       v28.4s, v9.4h, v0.h[1]      //// y1 * sin3 - y3 * cos1(part of b2)
    361     smlsl       v30.4s, v9.4h, v1.h[1]      //// y1 * sin1 - y3 * sin3(part of b3)
    362 
    363     smull       v18.4s, v5.4h, v1.h[2]      //// y2 * sin2 (q4 is freed by this time)(part of d1)
    364     smull       v8.4s, v5.4h, v0.h[2]       //// y2 * cos2(part of d0)
    365 
    366     smull       v20.4s, v4.4h, v0.h[0]      //// y0 * cos4(part of c0 and c1)
    367     smull       v22.4s, v12.4h, v0.h[0]     //// y4 * cos4(part of c0 and c1)
    368 
    369     smlal       v24.4s, v16.4h, v1.h[1]     //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    370     smlsl       v26.4s, v16.4h, v0.h[1]     //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    371     smlal       v28.4s, v16.4h, v1.h[3]     //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    372     smlal       v30.4s, v16.4h, v0.h[3]     //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
    373 
    374     smlsl       v18.4s, v13.4h, v0.h[2]     //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    375     smlal       v8.4s, v13.4h, v1.h[2]      //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
    376 
    377     add         v12.4s,  v20.4s ,  v22.4s   //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    378     sub         v20.4s,  v20.4s ,  v22.4s   //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
    379 
    380     smlal       v24.4s, v17.4h, v1.h[3]     //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7)
    381     smlsl       v26.4s, v17.4h, v1.h[1]     //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6)
    382     smlal       v28.4s, v17.4h, v0.h[3]     //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5)
    383     smlsl       v30.4s, v17.4h, v0.h[1]     //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4)
    384 
    385     add         v16.4s,  v12.4s ,  v8.4s    ////    a0 = c0 + d0(part of e0,e7)
    386     sub         v12.4s,  v12.4s ,  v8.4s    //// a3 = c0 - d0(part of e3,e4)
    387     sub         v22.4s,  v20.4s ,  v18.4s   //// a2 = c1 - d1(part of e2,e5)
    388     add         v18.4s,  v20.4s ,  v18.4s   //// a1 = c1 + d1(part of e1,e6)
    389 
    390     add         v20.4s,  v16.4s ,  v24.4s   //// a0 + b0(part of e0)
    391     sub         v8.4s,  v16.4s ,  v24.4s    //// a0 - b0(part of e7)
    392 
    393     add         v24.4s,  v22.4s ,  v28.4s   //// a2 + b2(part of e2)
    394     sub         v22.4s,  v22.4s ,  v28.4s   //// a2 - b2(part of e5)
    395 
    396     add         v28.4s,  v18.4s ,  v26.4s   //// a1 + b1(part of e1)
    397     sub         v18.4s,  v18.4s ,  v26.4s   //// a1 - b1(part of e6)
    398 
    399     add         v26.4s,  v12.4s ,  v30.4s   //// a3 + b3(part of e3)
    400     sub         v30.4s,  v12.4s ,  v30.4s   //// a3 - b3(part of e4)
    401 
    402     sqrshrn     v4.4h, v20.4s,#shift_stage1_idct //// x0 = (a0 + b0 + rnd) >> 7(shift_stage1_idct)
    403     sqrshrn     v17.4h, v8.4s,#shift_stage1_idct //// x7 = (a0 - b0 + rnd) >> 7(shift_stage1_idct)
    404     sqrshrn     v5.4h, v24.4s,#shift_stage1_idct //// x2 = (a2 + b2 + rnd) >> 7(shift_stage1_idct)
    405     sqrshrn     v16.4h, v22.4s,#shift_stage1_idct //// x5 = (a2 - b2 + rnd) >> 7(shift_stage1_idct)
    406     sqrshrn     v8.4h, v28.4s,#shift_stage1_idct //// x1 = (a1 + b1 + rnd) >> 7(shift_stage1_idct)
    407     sqrshrn     v13.4h, v18.4s,#shift_stage1_idct //// x6 = (a1 - b1 + rnd) >> 7(shift_stage1_idct)
    408     sqrshrn     v9.4h, v26.4s,#shift_stage1_idct //// x3 = (a3 + b3 + rnd) >> 7(shift_stage1_idct)
    409     sqrshrn     v12.4h, v30.4s,#shift_stage1_idct //// x4 = (a3 - b3 + rnd) >> 7(shift_stage1_idct)
    410     b           end_skip_last4cols
    411 
    412 
    413 
    414 skip_last4cols:
    415 
    416     umov        x15,v25.d[0]
    417 
    418     trn1        v25.4h, v2.4h, v6.4h
    419     trn2        v29.4h, v2.4h, v6.4h        ////[x3,x1],[x2,x0] first quadrant transposing
    420 
    421     trn1        v27.4h, v3.4h, v7.4h
    422     trn2        v31.4h, v3.4h, v7.4h        ////[x3,x1],[x2,x0] first quadrant transposing
    423 
    424     trn1        v6.2s, v29.2s, v31.2s
    425     trn2        v7.2s, v29.2s, v31.2s       ////x0,x1,x2,x3 first quadrant transposing continued.....
    426     trn1        v2.2s, v25.2s, v27.2s
    427     trn2        v3.2s, v25.2s, v27.2s       ////x0,x1,x2,x3 first quadrant transposing continued.....
    428 
    429 
    430     trn1        v25.4h, v10.4h, v14.4h
    431     trn2        v29.4h, v10.4h, v14.4h      ////[x7,x5],[x6,x4] third quadrant transposing
    432 
    433     trn1        v27.4h, v11.4h, v15.4h
    434     trn2        v31.4h, v11.4h, v15.4h      ////[x7,x5],[x6,x4] third quadrant transposing
    435 
    436     trn1        v10.2s, v25.2s, v27.2s
    437     trn2        v11.2s, v25.2s, v27.2s      ////x4,x5,x6,x7 third quadrant transposing continued.....
    438     trn1        v14.2s, v29.2s, v31.2s
    439     trn2        v15.2s, v29.2s, v31.2s      ////x4,x5,x6,x7 third quadrant transposing continued.....
    440 
    441     mov         v25.d[0],x15
    442 
    443     smull       v24.4s, v6.4h, v0.h[1]      //// y1 * cos1(part of b0)
    444     smull       v26.4s, v6.4h, v0.h[3]      //// y1 * cos3(part of b1)
    445     smull       v28.4s, v6.4h, v1.h[1]      //// y1 * sin3(part of b2)
    446     smull       v30.4s, v6.4h, v1.h[3]      //// y1 * sin1(part of b3)
    447 
    448     smlal       v24.4s, v7.4h, v0.h[3]      //// y1 * cos1 + y3 * cos3(part of b0)
    449     smlsl       v26.4s, v7.4h, v1.h[3]      //// y1 * cos3 - y3 * sin1(part of b1)
    450     smlsl       v28.4s, v7.4h, v0.h[1]      //// y1 * sin3 - y3 * cos1(part of b2)
    451     smlsl       v30.4s, v7.4h, v1.h[1]      //// y1 * sin1 - y3 * sin3(part of b3)
    452 
    453     smull       v20.4s, v2.4h, v0.h[0]      //// y0 * cos4(part of c0 and c1)
    454 //    vmull.s16    q11,d4,d0[0]                    @// y4 * cos4(part of c0 and c1)
    455 
    456     smull       v18.4s, v3.4h, v1.h[2]      //// y2 * sin2 (q3 is freed by this time)(part of d1)
    457     smull       v6.4s, v3.4h, v0.h[2]       //// y2 * cos2(part of d0)
    458 
    459 
    460 
    461 
    462     sub         v22.4s,  v20.4s ,  v6.4s    //// a3 = c0 - d0(part of x3,x4)
    463     add         v4.4s,  v20.4s ,  v6.4s     ////    a0 = c0 + d0(part of x0,x7)
    464 
    465 
    466     add         v2.4s,  v4.4s ,  v24.4s
    467 
    468     sub         v6.4s,  v4.4s ,  v24.4s
    469 
    470     add         v8.4s,  v22.4s ,  v30.4s
    471 
    472     sub         v24.4s,  v22.4s ,  v30.4s
    473 
    474     sqrshrn     v5.4h, v8.4s,#shift_stage2_idct
    475     sqrshrn     v2.4h, v2.4s,#shift_stage2_idct
    476     sqrshrn     v9.4h, v6.4s,#shift_stage2_idct
    477     sqrshrn     v6.4h, v24.4s,#shift_stage2_idct
    478 
    479     sub         v22.4s,  v20.4s ,  v18.4s   //// a2 = c1 - d1(part of x2,x5)
    480     add         v18.4s,  v20.4s ,  v18.4s   //// a1 = c1 + d1(part of x1,x6)
    481 
    482 
    483     add         v30.4s,  v22.4s ,  v28.4s
    484 
    485     sub         v24.4s,  v22.4s ,  v28.4s
    486 
    487     add         v28.4s,  v18.4s ,  v26.4s
    488 
    489     sub         v22.4s,  v18.4s ,  v26.4s
    490     sqrshrn     v4.4h, v30.4s,#shift_stage2_idct
    491     sqrshrn     v7.4h, v24.4s,#shift_stage2_idct
    492     sqrshrn     v3.4h, v28.4s,#shift_stage2_idct
    493     sqrshrn     v8.4h, v22.4s,#shift_stage2_idct
    494 
    495 
    496 
    497     umov        x19,v25.d[0]
    498     umov        x20,v25.d[1]
    499 
    500     trn1        v27.4h, v2.4h, v3.4h
    501     trn2        v29.4h, v2.4h, v3.4h
    502     trn1        v25.4h, v4.4h, v5.4h
    503     trn2        v31.4h, v4.4h, v5.4h
    504 
    505     trn1        v2.2s, v27.2s, v25.2s
    506     trn2        v4.2s, v27.2s, v25.2s
    507     trn1        v3.2s, v29.2s, v31.2s
    508     trn2        v5.2s, v29.2s, v31.2s
    509 
    510     trn1        v27.4h, v6.4h, v7.4h
    511     trn2        v29.4h, v6.4h, v7.4h
    512     trn1        v25.4h, v8.4h, v9.4h
    513     trn2        v31.4h, v8.4h, v9.4h
    514 
    515     trn1        v6.2s, v27.2s, v25.2s
    516     trn2        v8.2s, v27.2s, v25.2s
    517     trn1        v7.2s, v29.2s, v31.2s
    518     trn2        v9.2s, v29.2s, v31.2s
    519 
    520     mov         v25.d[0],x19
    521     mov         v25.d[1],x20
    522 
    523     smull       v24.4s, v14.4h, v0.h[1]     //// y1 * cos1(part of b0)
    524 
    525     smull       v26.4s, v14.4h, v0.h[3]     //// y1 * cos3(part of b1)
    526     smull       v28.4s, v14.4h, v1.h[1]     //// y1 * sin3(part of b2)
    527     smull       v30.4s, v14.4h, v1.h[3]     //// y1 * sin1(part of b3)
    528 
    529     smlal       v24.4s, v15.4h, v0.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
    530     smlsl       v26.4s, v15.4h, v1.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
    531     smlsl       v28.4s, v15.4h, v0.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
    532     smlsl       v30.4s, v15.4h, v1.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
    533     smull       v20.4s, v10.4h, v0.h[0]     //// y0 * cos4(part of c0 and c1)
    534     smull       v18.4s, v11.4h, v1.h[2]     //// y2 * sin2 (q7 is freed by this time)(part of d1)
    535     smull       v14.4s, v11.4h, v0.h[2]     //// y2 * cos2(part of d0)
    536 
    537 
    538     add         x4,x2,x8, lsl #1            // x4 = x2 + pred_strd * 2    => x4 points to 3rd row of pred data
    539 
    540 
    541     add         x5,x8,x8, lsl #1            //
    542 
    543 
    544     add         x0,x3,x7, lsl #1            // x0 points to 3rd row of dest data
    545 
    546 
    547     add         x10,x7,x7, lsl #1           //
    548 
    549     // swapping v3 and v6
    550     mov         v31.d[0], v3.d[0]
    551     mov         v3.d[0], v6.d[0]
    552     mov         v6.d[0], v31.d[0]
    553 
    554     // swapping v5 and v8
    555     mov         v31.d[0], v5.d[0]
    556     mov         v5.d[0], v8.d[0]
    557     mov         v8.d[0], v31.d[0]
    558 
    559 
    560     sub         v22.4s,  v20.4s ,  v14.4s   //// a3 = c0 - d0(part of x3,x4)
    561     add         v12.4s,  v20.4s ,  v14.4s   ////    a0 = c0 + d0(part of x0,x7)
    562 
    563 
    564     add         v0.4s,  v12.4s ,  v24.4s
    565 
    566 
    567     sub         v24.4s,  v12.4s ,  v24.4s
    568 
    569 
    570     add         v12.4s,  v22.4s ,  v30.4s
    571 
    572 
    573     sub         v14.4s,  v22.4s ,  v30.4s
    574 
    575     sqrshrn     v10.4h, v0.4s,#shift_stage2_idct
    576     sqrshrn     v17.4h, v24.4s,#shift_stage2_idct
    577     sqrshrn     v13.4h, v12.4s,#shift_stage2_idct
    578     sqrshrn     v14.4h, v14.4s,#shift_stage2_idct
    579 
    580     sub         v22.4s,  v20.4s ,  v18.4s   //// a2 = c1 - d1(part of x2,x5)
    581     add         v18.4s,  v20.4s ,  v18.4s   //// a1 = c1 + d1(part of x1,x6)
    582 
    583 
    584     add         v0.4s,  v22.4s ,  v28.4s
    585 
    586 
    587     sub         v24.4s,  v22.4s ,  v28.4s
    588 
    589 
    590     add         v28.4s,  v18.4s ,  v26.4s
    591 
    592 
    593     sub         v26.4s,  v18.4s ,  v26.4s
    594     ld1         {v18.8b},[x2],x8
    595 
    596     sqrshrn     v12.4h, v0.4s,#shift_stage2_idct
    597     ld1         {v20.8b},[x2],x5
    598 
    599 
    600     sqrshrn     v15.4h, v24.4s,#shift_stage2_idct
    601     ld1         {v19.8b},[x2],x8
    602 
    603 
    604 
    605 
    606     sqrshrn     v11.4h, v28.4s,#shift_stage2_idct
    607     ld1         {v22.8b},[x4],x8
    608 
    609 
    610 
    611 
    612     sqrshrn     v16.4h, v26.4s,#shift_stage2_idct
    613     ld1         {v21.8b},[x2],x5
    614 
    615 
    616     b           pred_buff_addition
    617 end_skip_last4cols:
    618 
    619 
    620     umov        x19,v25.d[0]
    621     umov        x20,v25.d[1]
    622 
    623 //// now the idct of columns is done; transpose so that the row idct can be done efficiently (step 5)
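        //// trn1/trn2 on .4h lanes followed by trn1/trn2 on .2s lanes transposes a
        //// 4x4 block of 16-bit values, so doing it per quadrant transposes the full
        //// 8x8 block and the row pass can reuse the same 1-D butterfly as the columns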
    624     trn1        v27.4h, v2.4h, v6.4h
    625     trn2        v29.4h, v2.4h, v6.4h        ////[x3,x1],[x2,x0] first quadrant transposing
    626     trn1        v25.4h, v3.4h, v7.4h
    627     trn2        v31.4h, v3.4h, v7.4h        ////[x3,x1],[x2,x0] first quadrant transposing
    628 
    629     trn1        v2.2s, v27.2s, v25.2s
    630     trn2        v3.2s, v27.2s, v25.2s       ////x0,x1,x2,x3 first quadrant transposing continued.....
    631     trn1        v6.2s, v29.2s, v31.2s
    632     trn2        v7.2s, v29.2s, v31.2s       ////x0,x1,x2,x3 first quadrant transposing continued.....
    633 
    634     trn1        v27.4h, v4.4h, v8.4h
    635     trn2        v29.4h, v4.4h, v8.4h        ////[x3,x1],[x2,x0] second quadrant transposing
    636     trn1        v25.4h, v5.4h, v9.4h
    637     trn2        v31.4h, v5.4h, v9.4h        ////[x3,x1],[x2,x0] second quadrant transposing
    638 
    639     trn1        v4.2s, v27.2s, v25.2s
    640     trn2        v5.2s, v27.2s, v25.2s       ////x0,x1,x2,x3 second quadrant transposing continued.....
    641     trn1        v8.2s, v29.2s, v31.2s
    642     trn2        v9.2s, v29.2s, v31.2s       ////x0,x1,x2,x3 second quadrant transposing continued.....
    643 
    644     trn1        v27.4h, v10.4h, v14.4h
    645     trn2        v29.4h, v10.4h, v14.4h      ////[x7,x5],[x6,x4] third quadrant transposing
    646     trn1        v25.4h, v11.4h, v15.4h
    647     trn2        v31.4h, v11.4h, v15.4h      ////[x7,x5],[x6,x4] third quadrant transposing
    648 
    649     trn1        v10.2s, v27.2s, v25.2s
    650     trn2        v11.2s, v27.2s, v25.2s      ////x4,x5,x6,x7 third quadrant transposing continued.....
    651     trn1        v14.2s, v29.2s, v31.2s
    652     trn2        v15.2s, v29.2s, v31.2s      ////x4,x5,x6,x7 third quadrant transposing continued.....
    653 
    654     trn1        v27.4h, v12.4h, v16.4h
    655     trn2        v29.4h, v12.4h, v16.4h      ////[x7,x5],[x6,x4] fourth quadrant transposing
    656     trn1        v25.4h, v13.4h, v17.4h
    657     trn2        v31.4h, v13.4h, v17.4h      ////[x7,x5],[x6,x4] fourth quadrant transposing
    658 
    659     trn1        v12.2s, v27.2s, v25.2s
    660     trn2        v13.2s, v27.2s, v25.2s      ////x4,x5,x6,x7 fourth quadrant transposing continued.....
    661     trn1        v16.2s, v29.2s, v31.2s
    662     trn2        v17.2s, v29.2s, v31.2s      ////x4,x5,x6,x7 fourth quadrant transposing continued.....
    663 
    664     mov         v25.d[0],x19
    665     mov         v25.d[1],x20
    666 
    667     ////step6 operate on first four rows and find their idct
    668     //// register usage - second stage (row) idct and storing
    669 ////    cosine constants     -     d0
    670 ////    sine constants         -     d1
    671 ////    element 0 first four     -     d2        -    y0
    672 ////    element 1 first four     -     d6        -    y1
    673 ////    element 2 first four     -     d3        -    y2
    674 ////    element 3 first four     -     d7        -    y3
    675 ////    element 4 first four     -     d4        -    y4
    676 ////    element 5 first four     -     d8        -    y5
    677 ////    element 6 first four     -     d5        -    y6
    678 ////    element 7 first four     -     d9        -    y7
    679 ////    element 0 second four    -     d10        -    y0
    680 ////    element 1 second four    -     d14     -    y1
    681 ////    element 2 second four    -     d11     -    y2
    682 ////    element 3 second four    -     d15     -    y3
    683 ////    element 4 second four    -     d12     -    y4
    684 ////    element 5 second four    -     d16     -    y5
    685 ////    element 6 second four    -     d13     -    y6
    686 ////    element 7 second four    -     d17     -    y7
    687 
    688     //// map between first kernel code seq and current
    689 ////        d2    ->    d2
    690 ////        d6    ->    d6
    691 ////        d3    ->    d3
    692 ////        d7    ->    d7
    693 ////        d10    ->    d4
    694 ////        d14    ->    d8
    695 ////        d11    ->    d5
    696 ////        d15    ->    d9
    697 ////        q3    ->    q3
    698 ////        q5    ->    q2
    699 ////        q7    ->    q4
    700 
    701     smull       v24.4s, v6.4h, v0.h[1]      //// y1 * cos1(part of b0)
    702     smull       v26.4s, v6.4h, v0.h[3]      //// y1 * cos3(part of b1)
    703     smull       v28.4s, v6.4h, v1.h[1]      //// y1 * sin3(part of b2)
    704     smull       v30.4s, v6.4h, v1.h[3]      //// y1 * sin1(part of b3)
    705 
    706     smlal       v24.4s, v7.4h, v0.h[3]      //// y1 * cos1 + y3 * cos3(part of b0)
    707     smlsl       v26.4s, v7.4h, v1.h[3]      //// y1 * cos3 - y3 * sin1(part of b1)
    708     smlsl       v28.4s, v7.4h, v0.h[1]      //// y1 * sin3 - y3 * cos1(part of b2)
    709     smlsl       v30.4s, v7.4h, v1.h[1]      //// y1 * sin1 - y3 * sin3(part of b3)
    710 
    711     smull       v20.4s, v2.4h, v0.h[0]      //// y0 * cos4(part of c0 and c1)
    712     smull       v22.4s, v4.4h, v0.h[0]      //// y4 * cos4(part of c0 and c1)
    713 
    714     smull       v18.4s, v3.4h, v1.h[2]      //// y2 * sin2 (q3 is freed by this time)(part of d1)
    715     smull       v6.4s, v3.4h, v0.h[2]       //// y2 * cos2(part of d0)
    716 
    717 
    718     smlal       v24.4s, v8.4h, v1.h[1]      //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    719     smlsl       v26.4s, v8.4h, v0.h[1]      //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    720     smlal       v28.4s, v8.4h, v1.h[3]      //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    721     smlal       v30.4s, v8.4h, v0.h[3]      //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
    722 
    723     smlsl       v18.4s, v5.4h, v0.h[2]      //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    724     smlal       v6.4s, v5.4h, v1.h[2]       //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
    725 
    726     add         v2.4s,  v20.4s ,  v22.4s    //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    727     sub         v20.4s,  v20.4s ,  v22.4s   //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
    728 
    729     smlal       v24.4s, v9.4h, v1.h[3]      //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
    730     smlsl       v26.4s, v9.4h, v1.h[1]      //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
    731     smlal       v28.4s, v9.4h, v0.h[3]      //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
    732     smlsl       v30.4s, v9.4h, v0.h[1]      //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
    733 
    734     sub         v22.4s,  v2.4s ,  v6.4s     //// a3 = c0 - d0(part of x3,x4)
    735     add         v4.4s,  v2.4s ,  v6.4s      ////    a0 = c0 + d0(part of x0,x7)
    736 
    737 
    738     add         v2.4s,  v4.4s ,  v24.4s
    739 
    740     sub         v6.4s,  v4.4s ,  v24.4s
    741 
    742     add         v8.4s,  v22.4s ,  v30.4s
    743 
    744     sub         v24.4s,  v22.4s ,  v30.4s
    745 
    746     sqrshrn     v5.4h, v8.4s,#shift_stage2_idct
    747     sqrshrn     v2.4h, v2.4s,#shift_stage2_idct
    748     sqrshrn     v9.4h, v6.4s,#shift_stage2_idct
    749     sqrshrn     v6.4h, v24.4s,#shift_stage2_idct
    750 
    751     sub         v22.4s,  v20.4s ,  v18.4s   //// a2 = c1 - d1(part of x2,x5)
    752     add         v18.4s,  v20.4s ,  v18.4s   //// a1 = c1 + d1(part of x1,x6)
    753 
    754 
    755     add         v30.4s,  v22.4s ,  v28.4s
    756 
    757     sub         v24.4s,  v22.4s ,  v28.4s
    758 
    759     add         v28.4s,  v18.4s ,  v26.4s
    760 
    761     sub         v22.4s,  v18.4s ,  v26.4s
    762     sqrshrn     v4.4h, v30.4s,#shift_stage2_idct
    763     sqrshrn     v7.4h, v24.4s,#shift_stage2_idct
    764     sqrshrn     v3.4h, v28.4s,#shift_stage2_idct
    765     sqrshrn     v8.4h, v22.4s,#shift_stage2_idct
    766 
    767 
    768 
    769     umov        x19,v25.d[0]
    770     umov        x20,v25.d[1]
    771 
    772     trn1        v27.4h, v2.4h, v3.4h
    773     trn2        v29.4h, v2.4h, v3.4h
    774     trn1        v25.4h, v4.4h, v5.4h
    775     trn2        v31.4h, v4.4h, v5.4h
    776 
    777     trn1        v2.2s, v27.2s, v25.2s
    778     trn2        v4.2s, v27.2s, v25.2s
    779     trn1        v3.2s, v29.2s, v31.2s
    780     trn2        v5.2s, v29.2s, v31.2s
    781 
    782     trn1        v27.4h, v6.4h, v7.4h
    783     trn2        v29.4h, v6.4h, v7.4h
    784     trn1        v25.4h, v8.4h, v9.4h
    785     trn2        v31.4h, v8.4h, v9.4h
    786 
    787     trn1        v6.2s, v27.2s, v25.2s
    788     trn2        v8.2s, v27.2s, v25.2s
    789     trn1        v7.2s, v29.2s, v31.2s
    790     trn2        v9.2s, v29.2s, v31.2s
    791 
    792     mov         v25.d[0],x19
    793     mov         v25.d[1],x20
    794 
    795 
    796 
    797     smull       v24.4s, v14.4h, v0.h[1]     //// y1 * cos1(part of b0)
    798     smull       v26.4s, v14.4h, v0.h[3]     //// y1 * cos3(part of b1)
    799     smull       v28.4s, v14.4h, v1.h[1]     //// y1 * sin3(part of b2)
    800     smull       v30.4s, v14.4h, v1.h[3]     //// y1 * sin1(part of b3)
    801     smlal       v24.4s, v15.4h, v0.h[3]     //// y1 * cos1 + y3 * cos3(part of b0)
    802     smlsl       v26.4s, v15.4h, v1.h[3]     //// y1 * cos3 - y3 * sin1(part of b1)
    803     smlsl       v28.4s, v15.4h, v0.h[1]     //// y1 * sin3 - y3 * cos1(part of b2)
    804     smlsl       v30.4s, v15.4h, v1.h[1]     //// y1 * sin1 - y3 * sin3(part of b3)
    805     smull       v20.4s, v10.4h, v0.h[0]     //// y0 * cos4(part of c0 and c1)
    806     smull       v22.4s, v12.4h, v0.h[0]     //// y4 * cos4(part of c0 and c1)
    807     smull       v18.4s, v11.4h, v1.h[2]     //// y2 * sin2 (q7 is freed by this time)(part of d1)
    808     smull       v14.4s, v11.4h, v0.h[2]     //// y2 * cos2(part of d0)
    809     smlal       v24.4s, v16.4h, v1.h[1]     //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    810 
    811     add         x4,x2,x8, lsl #1            // x4 = x2 + pred_strd * 2    => x4 points to 3rd row of pred data
    812     smlsl       v26.4s, v16.4h, v0.h[1]     //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    813 
    814     add         x5,x8,x8, lsl #1            //
    815     smlal       v28.4s, v16.4h, v1.h[3]     //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    816 
    817     add         x0,x3,x7, lsl #1            // x0 points to 3rd row of dest data
    818     smlal       v30.4s, v16.4h, v0.h[3]     //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
    819 
    820     add         x10,x7,x7, lsl #1           //
    821     smlsl       v18.4s, v13.4h, v0.h[2]     //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    822 
    823 
    824     smlal       v14.4s, v13.4h, v1.h[2]     //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
    825 
    826     add         v12.4s,  v20.4s ,  v22.4s   //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    827     sub         v20.4s,  v20.4s ,  v22.4s   //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
    828 
    829     smlal       v24.4s, v17.4h, v1.h[3]     //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
    830 
    831     // swapping v3 and v6
    832     mov         v31.d[0], v3.d[0]
    833     mov         v3.d[0], v6.d[0]
    834     mov         v6.d[0], v31.d[0]
    835 
    836     smlsl       v26.4s, v17.4h, v1.h[1]     //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
    837     // swapping v5 and v8
    838     mov         v31.d[0], v5.d[0]
    839     mov         v5.d[0], v8.d[0]
    840     mov         v8.d[0], v31.d[0]
    841 
    842     smlal       v28.4s, v17.4h, v0.h[3]     //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
    843     smlsl       v30.4s, v17.4h, v0.h[1]     //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
    844 
    845     sub         v22.4s,  v12.4s ,  v14.4s   //// a3 = c0 - d0(part of x3,x4)
    846     add         v12.4s,  v12.4s ,  v14.4s   ////    a0 = c0 + d0(part of x0,x7)
    847 
    848 
    849     add         v0.4s,  v12.4s ,  v24.4s
    850 
    851 
    852     sub         v24.4s,  v12.4s ,  v24.4s
    853 
    854 
    855     add         v12.4s,  v22.4s ,  v30.4s
    856 
    857 
    858     sub         v14.4s,  v22.4s ,  v30.4s
    859 
    860     sqrshrn     v10.4h, v0.4s,#shift_stage2_idct
    861     sqrshrn     v17.4h, v24.4s,#shift_stage2_idct
    862     sqrshrn     v13.4h, v12.4s,#shift_stage2_idct
    863     sqrshrn     v14.4h, v14.4s,#shift_stage2_idct
    864 
    865     sub         v22.4s,  v20.4s ,  v18.4s   //// a2 = c1 - d1(part of x2,x5)
    866     add         v18.4s,  v20.4s ,  v18.4s   //// a1 = c1 + d1(part of x1,x6)
    867 
    868 
    869     add         v0.4s,  v22.4s ,  v28.4s
    870 
    871 
    872     sub         v24.4s,  v22.4s ,  v28.4s
    873 
    874 
    875     add         v28.4s,  v18.4s ,  v26.4s
    876 
    877 
    878     sub         v26.4s,  v18.4s ,  v26.4s
    879     ld1         {v18.8b},[x2],x8
    880 
    881     sqrshrn     v12.4h, v0.4s,#shift_stage2_idct
    882     ld1         {v20.8b},[x2],x5
    883 
    884 
    885     sqrshrn     v15.4h, v24.4s,#shift_stage2_idct
    886     ld1         {v19.8b},[x2],x8
    887 
    888 
    889 
    890 
    891     sqrshrn     v11.4h, v28.4s,#shift_stage2_idct
    892     ld1         {v22.8b},[x4],x8
    893 
    894 
    895 
    896 
    897     sqrshrn     v16.4h, v26.4s,#shift_stage2_idct
    898     ld1         {v21.8b},[x2],x5
    899 
    900 
    901 
    902 
    903 pred_buff_addition:
    904 
    905     umov        x19,v25.d[0]
    906     umov        x20,v25.d[1]
    907 
    908     trn1        v27.4h, v10.4h, v11.4h
    909     trn2        v29.4h, v10.4h, v11.4h
    910     trn1        v25.4h, v12.4h, v13.4h
    911     trn2        v31.4h, v12.4h, v13.4h
    912 
    913     trn1        v10.2s, v27.2s, v25.2s
    914     trn2        v12.2s, v27.2s, v25.2s
    915     trn1        v11.2s, v29.2s, v31.2s
    916     trn2        v13.2s, v29.2s, v31.2s
    917 
    918     trn1        v27.4h, v14.4h, v15.4h
    919     trn2        v29.4h, v14.4h, v15.4h
    920     trn1        v25.4h, v16.4h, v17.4h
    921     trn2        v31.4h, v16.4h, v17.4h
    922 
    923     trn1        v14.2s, v27.2s, v25.2s
    924     trn2        v16.2s, v27.2s, v25.2s
    925     trn1        v15.2s, v29.2s, v31.2s
    926     trn2        v17.2s, v29.2s, v31.2s
    927 
    928 
    929     mov         v25.d[0],x19
    930     mov         v25.d[1],x20
    931 
    932 
    933     ld1         {v24.8b},[x4],x5
    934     ld1         {v23.8b},[x4],x8
    935     ld1         {v25.8b},[x4],x5
    936     mov         v2.d[1], v3.d[0]
    937     mov         v4.d[1], v5.d[0]
    938     mov         v6.d[1], v7.d[0]
    939     mov         v8.d[1], v9.d[0]
    940     uaddw       v2.8h,  v2.8h ,  v18.8b
    941     uaddw       v4.8h,  v4.8h ,  v22.8b
    942     uaddw       v6.8h,  v6.8h ,  v20.8b
    943     uaddw       v8.8h,  v8.8h ,  v24.8b
    944 
    945     // swapping v11 and v14
    946     mov         v31.d[0], v11.d[0]
    947     mov         v11.d[0], v14.d[0]
    948     mov         v14.d[0], v31.d[0]
    949 
    950     // swapping v13 and v16
    951     mov         v31.d[0], v13.d[0]
    952     mov         v13.d[0], v16.d[0]
    953     mov         v16.d[0], v31.d[0]
    954 // row values stored in the q registers.
    955 
    956 //q1: x0
    957 //q3: x1
    958 //q2: x2
    959 //q4: x3
    960 //q5: x4
    961 //q7: x5
    962 //q6: x6
    963 //q8: x7
    964 
    965 
    966 
    967 ///// adding the prediction buffer
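        //// uaddw widens each unsigned 8-bit prediction byte and adds it to the signed
        //// 16-bit residue; sqxtun then narrows the sum back with unsigned saturation,
        //// i.e. recon = clip(pred + residue, 0, 255), the 8-bit clip noted in the header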
    968 
    969 
    970 
    971 
    972 
    973 
    974 
    975 
    976 
    977     // load prediction data
    978 
    979 
    980 
    981 
    982 
    983     //adding recon with prediction
    984 
    985 
    986 
    987 
    988     mov         v10.d[1], v11.d[0]
    989     mov         v12.d[1], v13.d[0]
    990     mov         v14.d[1], v15.d[0]
    991     mov         v16.d[1], v17.d[0]
    992     uaddw       v10.8h,  v10.8h ,  v19.8b
    993     sqxtun      v2.8b, v2.8h
    994     uaddw       v14.8h,  v14.8h ,  v21.8b
    995     sqxtun      v4.8b, v4.8h
    996     uaddw       v12.8h,  v12.8h ,  v23.8b
    997     sqxtun      v6.8b, v6.8h
    998     uaddw       v16.8h,  v16.8h ,  v25.8b
    999     sqxtun      v8.8b, v8.8h
   1000 
   1001 
   1002 
   1003 
   1004 
   1005 
   1006 
   1007     st1         {v2.8b},[x3],x7
   1008     sqxtun      v10.8b, v10.8h
   1009     st1         {v6.8b},[x3],x10
   1010     sqxtun      v14.8b, v14.8h
   1011     st1         {v4.8b},[x0],x7
   1012     sqxtun      v12.8b, v12.8h
   1013     st1         {v8.8b},[x0],x10
   1014     sqxtun      v16.8b, v16.8h
   1015 
   1016 
   1017 
   1018 
   1019 
   1020 
   1021 
   1022     st1         {v10.8b},[x3],x7
   1023     st1         {v14.8b},[x3],x10
   1024     st1         {v12.8b},[x0],x7
   1025     st1         {v16.8b},[x0],x10
   1026 
   1027 
   1028 
   1029 
   1030     // ldmfd sp!,{x4-x12,pc}
   1031     ldp         x19, x20,[sp],#16
   1032     pop_v_regs
   1033     ret
   1034 
   1035 
   1036 
   1037 
   1038 
   1039