Home | History | Annotate | Download | only in armv8
      1 //******************************************************************************
      2 //*
      3 //* Copyright (C) 2015 The Android Open Source Project
      4 //*
      5 //* Licensed under the Apache License, Version 2.0 (the "License");
      6 //* you may not use this file except in compliance with the License.
      7 //* You may obtain a copy of the License at:
      8 //*
      9 //* http://www.apache.org/licenses/LICENSE-2.0
     10 //*
     11 //* Unless required by applicable law or agreed to in writing, software
     12 //* distributed under the License is distributed on an "AS IS" BASIS,
     13 //* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 //* See the License for the specific language governing permissions and
     15 //* limitations under the License.
     16 //*
     17 //*****************************************************************************
     18 //* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
     19 //*/
     20 ///**
     21 // *******************************************************************************
     22 // * @file
     23 // *  impeg2_idct.s
     24 // *
     25 // * @brief
     26 // *  contains function definitions for single stage  inverse transform
     27 // *
     28 // * @author
     29 // *  anand s
     30 // *
// * @par list of functions:
// *  - impeg2_idct_recon_dc_av8()
// *  - impeg2_idct_recon_dc_mismatch_av8()
// *  - impeg2_idct_recon_av8()
     33 // *
     34 // * @remarks
     35 // *  none
     36 // *
     37 // *******************************************************************************
     38 //*/
     39 
     40 ///**
     41 // *******************************************************************************
     42 // *
     43 // * @brief
     44 // *  this function performs inverse transform  and reconstruction for 8x8
     45 // * input block
     46 // *
     47 // * @par description:
     48 // *  performs inverse transform and adds the prediction  data and clips output
     49 // * to 8 bit
     50 // *
     51 // * @param[in] pi2_src
     52 // *  input 8x8 coefficients
     53 // *
     54 // * @param[in] pi2_tmp
     55 // *  temporary 8x8 buffer for storing inverse
     56 // *
     57 // *  transform
     58 // *  1st stage output
     59 // *
     60 // * @param[in] pu1_pred
     61 // *  prediction 8x8 block
     62 // *
     63 // * @param[out] pu1_dst
     64 // *  output 8x8 block
     65 // *
     66 // * @param[in] src_strd
     67 // *  input stride
     68 // *
     69 // * @param[in] pred_strd
     70 // *  prediction stride
     71 // *
     72 // * @param[in] dst_strd
     73 // *  output stride
     74 // *
     75 // * @param[in] shift
     76 // *  output shift
     77 // *
     78 // * @param[in] zero_cols
     79 // *  zero columns in pi2_src
     80 // *
     81 // * @returns  void
     82 // *
     83 // * @remarks
     84 // *  none
     85 // *
     86 // *******************************************************************************
     87 // */
     88 
//void impeg2_idct_recon_av8(word16 *pi2_src,
//                           word16 *pi2_tmp,
//                           uword8 *pu1_pred,
//                           uword8 *pu1_dst,
//                           word32 src_strd,
//                           word32 pred_strd,
//                           word32 dst_strd,
//                           word32 zero_cols,
//                           word32 zero_rows)
     98 
//**************variables vs registers*************************
//    x0   => *pi2_src
//    x1   => *pi2_tmp
//    x2   => *pu1_pred
//    x3   => *pu1_dst
//    w4   => src_strd
//    w5   => pred_strd
//    w6   => dst_strd
//    w7   => zero_cols
//    [sp] => zero_rows (ninth argument, passed on the stack)
    108 
    109 
    110 
    111 .text
    112 .align 4
    113 .include "impeg2_neon_macros.s"
    114 
    115 .set idct_stg1_shift       ,            12
    116 .set idct_stg2_shift       ,            16
    117 .set idct_stg1_round        ,           (1 << (idct_stg1_shift - 1))
    118 .set idct_stg2_round        ,           (1 << (idct_stg2_shift - 1))
    119 
    120 .extern gai2_impeg2_idct_q15
    121 .extern gai2_impeg2_idct_q11
    122 .extern gai2_impeg2_idct_first_col_q15
    123 .extern gai2_impeg2_idct_first_col_q11
    124 .extern gai2_impeg2_mismatch_stg2_additive
    125 
    126 .global impeg2_idct_recon_dc_av8
    127 impeg2_idct_recon_dc_av8:
    128     // STMFD sp!,{x4,x6,x12,x14}
    129     push_v_regs
    130     ////x0: pi2_src
    131     ////x1: pi2_tmp - not used, used as pred_strd
    132     ////x2: pu1_pred
    133     ////x3: pu1_dst
    134     ////x4: used as scratch
    135     ////x5: pred_strd
    136     ////x6: dst_strd
    137 
    138     ldrsh           x4, [x0]
    139     adrp            x14, :got:gai2_impeg2_idct_q15
    140     ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_q15]
    141     ldrsh           x12, [x14]
    142 
    143     ld1             {v0.8b}, [x2], x5
    144     mul             x4, x4, x12
    145 
    146     ld1             {v1.8b}, [x2], x5
    147     add             x4, x4, #idct_stg1_round
    148 
    149     ld1             {v2.8b}, [x2], x5
    150     asr             x4, x4, #idct_stg1_shift
    151 
    152     adrp            x14, :got:gai2_impeg2_idct_q11
    153     ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_q11]
    154     ldrsh           x12, [x14]
    155 
    156     ld1             {v3.8b}, [x2], x5
    157     mul             x4, x4, x12
    158 
    159     ld1             {v4.8b}, [x2], x5
    160     add             x4, x4, #idct_stg2_round
    161 
    162     ld1             {v5.8b}, [x2], x5
    163     asr             x4, x4, #idct_stg2_shift
    164 
    165     ld1             {v6.8b}, [x2], x5
    166     dup             v30.8h, w4
    167 
    168 
    169     ld1             {v7.8b}, [x2], x5
    170 
    171     uaddw           v8.8h, v30.8h , v0.8b
    172 
    173     uaddw           v10.8h, v30.8h , v1.8b
    174     sqxtun          v0.8b, v8.8h
    175 
    176     uaddw           v12.8h, v30.8h , v2.8b
    177     sqxtun          v1.8b, v10.8h
    178     st1             {v0.8b}, [x3], x6
    179 
    180     uaddw           v14.8h, v30.8h , v3.8b
    181     sqxtun          v2.8b, v12.8h
    182     st1             {v1.8b}, [x3], x6
    183 
    184     uaddw           v16.8h, v30.8h , v4.8b
    185     sqxtun          v3.8b, v14.8h
    186     st1             {v2.8b}, [x3], x6
    187 
    188     uaddw           v18.8h, v30.8h , v5.8b
    189     sqxtun          v4.8b, v16.8h
    190     st1             {v3.8b}, [x3], x6
    191 
    192     uaddw           v20.8h, v30.8h , v6.8b
    193     sqxtun          v5.8b, v18.8h
    194     st1             {v4.8b}, [x3], x6
    195 
    196     uaddw           v22.8h, v30.8h , v7.8b
    197     sqxtun          v6.8b, v20.8h
    198     st1             {v5.8b}, [x3], x6
    199 
    200     sqxtun          v7.8b, v22.8h
    201     st1             {v6.8b}, [x3], x6
    202 
    203 
    204     st1             {v7.8b}, [x3], x6
    205 
    206     // LDMFD sp!,{x4,x6,x12,pc}
    207     pop_v_regs
    208     ret
    209 
    210 
    211 
    212 .global impeg2_idct_recon_dc_mismatch_av8
    213 .extern gai2_impeg2_idct_last_row_q11
    214 .extern gai2_impeg2_mismatch_stg1_outp
    215 impeg2_idct_recon_dc_mismatch_av8:
    216     // STMFD sp!,{x4-x12,x14}
    217     push_v_regs
    218 
    219     ldrsh           x4, [x0]
    220     adrp            x14, :got:gai2_impeg2_idct_q15
    221     ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_q15]
    222     ldrsh           x12, [x14]
    223 
    224     mul             x4, x4, x12
    225     add             x4, x4, #idct_stg1_round
    226     asr             x4, x4, #idct_stg1_shift
    227 
    228     adrp            x14, :got:gai2_impeg2_idct_q11
    229     ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_q11]
    230     ldrsh           x12, [x14]
    231     mul             x4, x4, x12
    232     dup             v0.4s, w4
    233 
    234     mov             x14, #16            ////Increment for table read
    235     adrp            x4, :got:gai2_impeg2_mismatch_stg2_additive
    236     ldr             x4, [x4, #:got_lo12:gai2_impeg2_mismatch_stg2_additive]
    237 
    238     ld1             {v2.4h, v3.4h}, [x4], x14
    239     ld1             {v30.8b}, [x2], x5
    240     sxtl            v8.4s, v2.4h
    241     sxtl            v10.4s, v3.4h
    242     raddhn          v12.4h, v0.4s, v8.4s
    243     raddhn2         v12.8h, v0.4s, v10.4s
    244     uaddw           v14.8h, v12.8h , v30.8b
    245     sqxtun          v30.8b, v14.8h
    246     st1             {v30.8b}, [x3], x6
    247 
    248     ld1             {v2.4h, v3.4h}, [x4], x14
    249     ld1             {v30.8b}, [x2], x5
    250     sxtl            v8.4s, v2.4h
    251     sxtl            v10.4s, v3.4h
    252     raddhn          v12.4h, v0.4s, v8.4s
    253     raddhn2         v12.8h, v0.4s, v10.4s
    254     uaddw           v14.8h, v12.8h , v30.8b
    255     sqxtun          v30.8b, v14.8h
    256     st1             {v30.8b}, [x3], x6
    257 
    258     ld1             {v2.4h, v3.4h}, [x4], x14
    259     ld1             {v30.8b}, [x2], x5
    260     sxtl            v8.4s, v2.4h
    261     sxtl            v10.4s, v3.4h
    262     raddhn          v12.4h, v0.4s, v8.4s
    263     raddhn2         v12.8h, v0.4s, v10.4s
    264     uaddw           v14.8h, v12.8h , v30.8b
    265     sqxtun          v30.8b, v14.8h
    266     st1             {v30.8b}, [x3], x6
    267 
    268     ld1             {v2.4h, v3.4h}, [x4], x14
    269     ld1             {v30.8b}, [x2], x5
    270     sxtl            v8.4s, v2.4h
    271     sxtl            v10.4s, v3.4h
    272     raddhn          v12.4h, v0.4s, v8.4s
    273     raddhn2         v12.8h, v0.4s, v10.4s
    274     uaddw           v14.8h, v12.8h , v30.8b
    275     sqxtun          v30.8b, v14.8h
    276     st1             {v30.8b}, [x3], x6
    277 
    278     ld1             {v2.4h, v3.4h}, [x4], x14
    279     ld1             {v30.8b}, [x2], x5
    280     sxtl            v8.4s, v2.4h
    281     sxtl            v10.4s, v3.4h
    282     raddhn          v12.4h, v0.4s, v8.4s
    283     raddhn2         v12.8h, v0.4s, v10.4s
    284     uaddw           v14.8h, v12.8h , v30.8b
    285     sqxtun          v30.8b, v14.8h
    286     st1             {v30.8b}, [x3], x6
    287 
    288     ld1             {v2.4h, v3.4h}, [x4], x14
    289     ld1             {v30.8b}, [x2], x5
    290     sxtl            v8.4s, v2.4h
    291     sxtl            v10.4s, v3.4h
    292     raddhn          v12.4h, v0.4s, v8.4s
    293     raddhn2         v12.8h, v0.4s, v10.4s
    294     uaddw           v14.8h, v12.8h , v30.8b
    295     sqxtun          v30.8b, v14.8h
    296     st1             {v30.8b}, [x3], x6
    297 
    298     ld1             {v2.4h, v3.4h}, [x4], x14
    299     ld1             {v30.8b}, [x2], x5
    300     sxtl            v8.4s, v2.4h
    301     sxtl            v10.4s, v3.4h
    302     raddhn          v12.4h, v0.4s, v8.4s
    303     raddhn2         v12.8h, v0.4s, v10.4s
    304     uaddw           v14.8h, v12.8h , v30.8b
    305     sqxtun          v30.8b, v14.8h
    306     st1             {v30.8b}, [x3], x6
    307 
    308     ld1             {v2.4h, v3.4h}, [x4], x14
    309     ld1             {v30.8b}, [x2], x5
    310     sxtl            v8.4s, v2.4h
    311     sxtl            v10.4s, v3.4h
    312     raddhn          v12.4h, v0.4s, v8.4s
    313     raddhn2         v12.8h, v0.4s, v10.4s
    314     uaddw           v14.8h, v12.8h , v30.8b
    315     sqxtun          v30.8b, v14.8h
    316     st1             {v30.8b}, [x3], x6
    317 
    318 
    319     // LDMFD sp!,{x4-x12,pc}
    320     pop_v_regs
    321     ret
    322 
    323 .globl impeg2_idct_recon_av8
    324 
    325 .type impeg2_idct_recon_av8, %function
    326 
    327 impeg2_idct_recon_av8:
    328 ////register usage.extern        - loading and until idct of columns
    329 ////    cosine constants     -     d0
    330 ////    sine constants         -     d1
    331 ////    row 0 first half     -     d2        -    y0
    332 ////    row 1 first half     -     d6        -    y1
    333 ////    row 2 first half     -     d3        -    y2
    334 ////    row 3 first half     -     d7        -    y3
    335 ////    row 4 first half     -     d10        -    y4
    336 ////    row 5 first half     -     d14        -    y5
    337 ////    row 6 first half     -     d11        -    y6
    338 ////    row 7 first half     -     d15        -    y7
    339 
    340 ////    row 0 second half    -     d4        -    y0
    341 ////    row 1 second half    -     d8      -    y1
    342 ////    row 2 second half    -     d5      -    y2
    343 ////    row 3 second half    -     d9      -    y3
    344 ////    row 4 second half    -     d12     -    y4
    345 ////    row 5 second half    -     d16     -    y5
    346 ////    row 6 second half    -     d13     -    y6
    347 ////    row 7 second half    -     d17     -    y7
    348 
    349     //// copy the input pointer to another register
    350     //// step 1 : load all constants
    351     // stmfd sp!,{x4-x12,x14}
    352 
    353     ldr             w11, [sp]           // zero rows
    354 
    355     push_v_regs
    356     stp             x19, x20, [sp, #-16]!
    357 
    358     mov             x12, x7             // zero columns
    359     mov             x8, x5              // prediction stride
    360     mov             x7, x6              // destination stride
    361     mov             x6, x4              // src stride
    362     lsl             x6, x6, #1          // x sizeof(word16)
    363     add             x9, x0, x6, lsl #1  // 2 rows
    364 
    365     add             x10, x6, x6, lsl #1 // 3 rows
    366 
    367     sub             x10, x10, #8        // - 4 cols * sizeof(word16)
    368     sub             x5, x6, #8          // src_strd - 4 cols * sizeof(word16)
    369 
    370     adrp            x14, :got:gai2_impeg2_idct_first_col_q15
    371     ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q15]
    372     ld1             {v0.4h, v1.4h}, [x14] ////d0,d1 are used for storing the constant data
    373 
    374     ////step 2 load all the input data
    375     ////step 3 operate first 4 colums at a time
    376 
    377     and             x11, x11, #0xff
    378     and             x12, x12, #0xff
    379 
    380     cmp             x11, #0xf0
    381     bge             skip_last4_rows
    382 
    383 
    384     ld1             {v2.4h}, [x0], #8
    385     ld1             {v3.4h}, [x9], #8
    386     ld1             {v4.4h}, [x0], x5
    387     smull           v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
    388     ld1             {v5.4h}, [x9], x5
    389     smull           v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
    390     ld1             {v6.4h}, [x0], #8
    391     ld1             {v7.4h}, [x9], #8
    392     smull           v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
    393     ld1             {v8.4h}, [x0], x10
    394     smull           v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
    395     ld1             {v9.4h}, [x9], x10
    396     smull           v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
    397     ld1             {v10.4h}, [x0], #8
    398     smull           v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)
    399     ld1             {v11.4h}, [x9], #8
    400     smlal           v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    401     ld1             {v12.4h}, [x0], x5
    402     smlsl           v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    403     ld1             {v13.4h}, [x9], x5
    404     smlsl           v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    405     ld1             {v14.4h}, [x0], #8
    406     smlsl           v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
    407     ld1             {v15.4h}, [x9], #8
    408     smull           v22.4s, v10.4h, v0.h[0] //// y4 * cos4(part of c0 and c1)
    409     ld1             {v16.4h}, [x0], x10
    410     smull           v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0)
    411     ld1             {v17.4h}, [x9], x10
    412 
    413     ///* this following was activated when alignment is not there */
    414 ////    vld1.16        d2,[x0]!
    415 ////    vld1.16        d3,[x2]!
    416 ////    vld1.16        d4,[x0]!
    417 ////    vld1.16        d5,[x2]!
    418 ////    vld1.16        d6,[x0]!
    419 ////    vld1.16        d7,[x2]!
    420 ////    vld1.16        d8,[x0],x3
    421 ////    vld1.16        d9,[x2],x3
    422 ////    vld1.16        d10,[x0]!
    423 ////    vld1.16        d11,[x2]!
    424 ////    vld1.16        d12,[x0]!
    425 ////    vld1.16        d13,[x2]!
    426 ////    vld1.16        d14,[x0]!
    427 ////    vld1.16        d15,[x2]!
    428 ////    vld1.16        d16,[x0],x3
    429 ////    vld1.16        d17,[x2],x3
    430 
    431 
    432 
    433 
    434     smlal           v24.4s, v14.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    435     smlsl           v26.4s, v14.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    436     smlal           v28.4s, v14.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    437     smlal           v30.4s, v14.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
    438 
    439     smlsl           v18.4s, v11.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    440     smlal           v6.4s, v11.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
    441 
    442     add             v10.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    443     sub             v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
    444 
    445     smlal           v24.4s, v15.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
    446     smlsl           v26.4s, v15.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
    447     smlal           v28.4s, v15.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
    448     smlsl           v30.4s, v15.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
    449 
    450     add             v14.4s, v10.4s , v6.4s ////    a0 = c0 + d0(part of x0,x7)
    451     sub             v10.4s, v10.4s , v6.4s //// a3 = c0 - d0(part of x3,x4)
    452     sub             v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5)
    453     add             v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6)
    454 
    455     add             v20.4s, v14.4s , v24.4s //// a0 + b0(part of x0)
    456     sub             v6.4s, v14.4s , v24.4s //// a0 - b0(part of x7)
    457 
    458     add             v24.4s, v22.4s , v28.4s //// a2 + b2(part of x2)
    459     sub             v22.4s, v22.4s , v28.4s //// a2 - b2(part of x5)
    460 
    461     add             v28.4s, v18.4s , v26.4s //// a1 + b1(part of x1)
    462     sub             v18.4s, v18.4s , v26.4s //// a1 - b1(part of x6)
    463 
    464     add             v26.4s, v10.4s , v30.4s //// a3 + b3(part of x3)
    465     sub             v30.4s, v10.4s , v30.4s //// a3 - b3(part of x4)
    466 
    467     sqrshrn         v2.4h, v20.4s, #idct_stg1_shift //// x0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT)
    468     sqrshrn         v15.4h, v6.4s, #idct_stg1_shift //// x7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT)
    469     sqrshrn         v3.4h, v24.4s, #idct_stg1_shift //// x2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT)
    470     sqrshrn         v14.4h, v22.4s, #idct_stg1_shift //// x5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT)
    471     sqrshrn         v6.4h, v28.4s, #idct_stg1_shift //// x1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT)
    472     sqrshrn         v11.4h, v18.4s, #idct_stg1_shift //// x6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT)
    473     sqrshrn         v7.4h, v26.4s, #idct_stg1_shift //// x3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT)
    474     sqrshrn         v10.4h, v30.4s, #idct_stg1_shift //// x4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT)
    475 
    476 
    477     b               last4_cols
    478 
    479 
    480 
    481 skip_last4_rows:
    482     adrp            x14, :got:gai2_impeg2_idct_first_col_q15
    483     ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q15]
    484     ld1             {v0.4h, v1.4h}, [x14]
    485 
    486     ld1             {v2.4h}, [x0], #8
    487     ld1             {v3.4h}, [x9], #8
    488     ld1             {v4.4h}, [x0], x5
    489     ld1             {v5.4h}, [x9], x5
    490     ld1             {v6.4h}, [x0], #8
    491     ld1             {v7.4h}, [x9], #8
    492     ld1             {v8.4h}, [x0], x10
    493     ld1             {v9.4h}, [x9], x10
    494 
    495 
    496 
    497     movi            v12.4h, #0
    498     movi            v13.4h, #0
    499     movi            v16.4h, #0
    500     movi            v17.4h, #0
    501 
    502 
    503 
    504 
    505     smull           v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
    506     smull           v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
    507     smull           v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
    508     smull           v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)
    509 
    510     smlal           v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    511     smlsl           v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    512     smlsl           v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    513     smlsl           v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
    514 
    515     smull           v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
    516     smull           v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0)
    517 
    518     smull           v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
    519 
    520 
    521     add             v14.4s, v20.4s , v6.4s ////    a0 = c0 + d0(part of x0,x7)
    522     sub             v10.4s, v20.4s , v6.4s //// a3 = c0 - d0(part of x3,x4)
    523     sub             v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5)
    524     add             v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6)
    525 
    526     add             v20.4s, v14.4s , v24.4s //// a0 + b0(part of x0)
    527     sub             v6.4s, v14.4s , v24.4s //// a0 - b0(part of x7)
    528 
    529     add             v24.4s, v22.4s , v28.4s //// a2 + b2(part of x2)
    530     sub             v22.4s, v22.4s , v28.4s //// a2 - b2(part of x5)
    531 
    532     add             v28.4s, v18.4s , v26.4s //// a1 + b1(part of x1)
    533     sub             v18.4s, v18.4s , v26.4s //// a1 - b1(part of x6)
    534 
    535     add             v26.4s, v10.4s , v30.4s //// a3 + b3(part of x3)
    536     sub             v30.4s, v10.4s , v30.4s //// a3 - b3(part of x4)
    537 
    538     sqrshrn         v2.4h, v20.4s, #idct_stg1_shift //// x0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT)
    539     sqrshrn         v15.4h, v6.4s, #idct_stg1_shift //// x7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT)
    540     sqrshrn         v3.4h, v24.4s, #idct_stg1_shift //// x2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT)
    541     sqrshrn         v14.4h, v22.4s, #idct_stg1_shift //// x5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT)
    542     sqrshrn         v6.4h, v28.4s, #idct_stg1_shift //// x1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT)
    543     sqrshrn         v11.4h, v18.4s, #idct_stg1_shift //// x6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT)
    544     sqrshrn         v7.4h, v26.4s, #idct_stg1_shift //// x3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT)
    545     sqrshrn         v10.4h, v30.4s, #idct_stg1_shift //// x4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT)
    546 
    547 
    548 last4_cols:
    549     adrp            x14, :got:gai2_impeg2_idct_first_col_q15
    550     ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q15]
    551     ld1             {v0.4h, v1.4h}, [x14]
    552 
    553 
    554     cmp             x12, #0xf0
    555     bge             skip_last4cols
    556 
    557     smull           v24.4s, v8.4h, v0.h[1] //// y1 * cos1(part of b0)
    558     smull           v26.4s, v8.4h, v0.h[3] //// y1 * cos3(part of b1)
    559     smull           v28.4s, v8.4h, v1.h[1] //// y1 * sin3(part of b2)
    560     smull           v30.4s, v8.4h, v1.h[3] //// y1 * sin1(part of b3)
    561 
    562     smlal           v24.4s, v9.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    563     smlsl           v26.4s, v9.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    564     smlsl           v28.4s, v9.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    565     smlsl           v30.4s, v9.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
    566 
    567     smull           v18.4s, v5.4h, v1.h[2] //// y2 * sin2 (q4 is freed by this time)(part of d1)
    568     smull           v8.4s, v5.4h, v0.h[2] //// y2 * cos2(part of d0)
    569 
    570     smull           v20.4s, v4.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
    571     smull           v22.4s, v12.4h, v0.h[0] //// y4 * cos4(part of c0 and c1)
    572 
    573     smlal           v24.4s, v16.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    574     smlsl           v26.4s, v16.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    575     smlal           v28.4s, v16.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    576     smlal           v30.4s, v16.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
    577 
    578     smlsl           v18.4s, v13.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    579     smlal           v8.4s, v13.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
    580 
    581     add             v12.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    582     sub             v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
    583 
    584     smlal           v24.4s, v17.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of e0,e7)
    585     smlsl           v26.4s, v17.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of e1,e6)
    586     smlal           v28.4s, v17.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of e2,e5)
    587     smlsl           v30.4s, v17.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of e3,e4)
    588 
    589     add             v16.4s, v12.4s , v8.4s ////    a0 = c0 + d0(part of e0,e7)
    590     sub             v12.4s, v12.4s , v8.4s //// a3 = c0 - d0(part of e3,e4)
    591     sub             v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of e2,e5)
    592     add             v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of e1,e6)
    593 
    594     add             v20.4s, v16.4s , v24.4s //// a0 + b0(part of e0)
    595     sub             v8.4s, v16.4s , v24.4s //// a0 - b0(part of e7)
    596 
    597     add             v24.4s, v22.4s , v28.4s //// a2 + b2(part of e2)
    598     sub             v22.4s, v22.4s , v28.4s //// a2 - b2(part of e5)
    599 
    600     add             v28.4s, v18.4s , v26.4s //// a1 + b1(part of e1)
    601     sub             v18.4s, v18.4s , v26.4s //// a1 - b1(part of e6)
    602 
    603     add             v26.4s, v12.4s , v30.4s //// a3 + b3(part of e3)
    604     sub             v30.4s, v12.4s , v30.4s //// a3 - b3(part of x4)
    605 
    606     sqrshrn         v4.4h, v20.4s, #idct_stg1_shift //// x0 = (a0 + b0 + rnd) >> 7(IDCT_STG1_SHIFT)
    607     sqrshrn         v17.4h, v8.4s, #idct_stg1_shift //// x7 = (a0 - b0 + rnd) >> 7(IDCT_STG1_SHIFT)
    608     sqrshrn         v5.4h, v24.4s, #idct_stg1_shift //// x2 = (a2 + b2 + rnd) >> 7(IDCT_STG1_SHIFT)
    609     sqrshrn         v16.4h, v22.4s, #idct_stg1_shift //// x5 = (a2 - b2 + rnd) >> 7(IDCT_STG1_SHIFT)
    610     sqrshrn         v8.4h, v28.4s, #idct_stg1_shift //// x1 = (a1 + b1 + rnd) >> 7(IDCT_STG1_SHIFT)
    611     sqrshrn         v13.4h, v18.4s, #idct_stg1_shift //// x6 = (a1 - b1 + rnd) >> 7(IDCT_STG1_SHIFT)
    612     sqrshrn         v9.4h, v26.4s, #idct_stg1_shift //// x3 = (a3 + b3 + rnd) >> 7(IDCT_STG1_SHIFT)
    613     sqrshrn         v12.4h, v30.4s, #idct_stg1_shift //// x4 = (a3 - b3 + rnd) >> 7(IDCT_STG1_SHIFT)
    614     b               end_skip_last4cols
    615 
    616 
    617 
    618 skip_last4cols:
    619     adrp            x14, :got:gai2_impeg2_idct_first_col_q11
    620     ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q11]
    621     ld1             {v0.4h, v1.4h}, [x14]
    622 
    623     umov            x15, v25.d[0]
    624 
    625     trn1            v25.4h, v2.4h, v6.4h
    626     trn2            v29.4h, v2.4h, v6.4h ////[x3,x1],[x2,x0] first qudrant transposing
    627 
    628     trn1            v27.4h, v3.4h, v7.4h
    629     trn2            v31.4h, v3.4h, v7.4h ////[x3,x1],[x2,x0] first qudrant transposing
    630 
    631     trn1            v6.2s, v29.2s, v31.2s
    632     trn2            v7.2s, v29.2s, v31.2s ////x0,x1,x2,x3 first qudrant transposing continued.....
    633     trn1            v2.2s, v25.2s, v27.2s
    634     trn2            v3.2s, v25.2s, v27.2s ////x0,x1,x2,x3 first qudrant transposing continued.....
    635 
    636 
    637     trn1            v25.4h, v10.4h, v14.4h
    638     trn2            v29.4h, v10.4h, v14.4h ////[x7,x5],[x6,x4] third qudrant transposing
    639 
    640     trn1            v27.4h, v11.4h, v15.4h
    641     trn2            v31.4h, v11.4h, v15.4h ////[x7,x5],[x6,x4] third qudrant transposing
    642 
    643     trn1            v10.2s, v25.2s, v27.2s
    644     trn2            v11.2s, v25.2s, v27.2s ////x4,x5,x6,x7 third qudrant transposing continued.....
    645     trn1            v14.2s, v29.2s, v31.2s
    646     trn2            v15.2s, v29.2s, v31.2s ////x4,x5,x6,x7 third qudrant transposing continued.....
    647 
    648     mov             v25.d[0], x15
    649 
    650     smull           v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
    651     smull           v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
    652     smull           v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
    653     smull           v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)
    654 
    655     smlal           v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    656     smlsl           v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    657     smlsl           v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    658     smlsl           v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
    659 
    660     smull           v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
    661 //    vmull.s16    q11,d4,d0[0]                    @// y4 * cos4(part of c0 and c1)
    662 
    663     smull           v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
    664     smull           v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0)
    665 
    666 
    667 
    668 
    669     sub             v22.4s, v20.4s , v6.4s //// a3 = c0 - d0(part of x3,x4)
    670     add             v4.4s, v20.4s , v6.4s ////    a0 = c0 + d0(part of x0,x7)
    671 
    672 
    673     add             v2.4s, v4.4s , v24.4s
    674 
    675     sub             v6.4s, v4.4s , v24.4s
    676 
    677     add             v8.4s, v22.4s , v30.4s
    678 
    679     sub             v24.4s, v22.4s , v30.4s
    680 
    681     sqrshrn         v5.4h, v8.4s, #idct_stg2_shift
    682     sqrshrn         v2.4h, v2.4s, #idct_stg2_shift
    683     sqrshrn         v9.4h, v6.4s, #idct_stg2_shift
    684     sqrshrn         v6.4h, v24.4s, #idct_stg2_shift
    685 
    686     sub             v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5)
    687     add             v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6)
    688 
    689 
    690     add             v30.4s, v22.4s , v28.4s
    691 
    692     sub             v24.4s, v22.4s , v28.4s
    693 
    694     add             v28.4s, v18.4s , v26.4s
    695 
    696     sub             v22.4s, v18.4s , v26.4s
    697     sqrshrn         v4.4h, v30.4s, #idct_stg2_shift
    698     sqrshrn         v7.4h, v24.4s, #idct_stg2_shift
    699     sqrshrn         v3.4h, v28.4s, #idct_stg2_shift
    700     sqrshrn         v8.4h, v22.4s, #idct_stg2_shift
    701 
    702 
    703 
    704     umov            x19, v25.d[0]
    705     umov            x20, v25.d[1]
    706 
    707     trn1            v27.4h, v2.4h, v3.4h
    708     trn2            v29.4h, v2.4h, v3.4h
    709     trn1            v25.4h, v4.4h, v5.4h
    710     trn2            v31.4h, v4.4h, v5.4h
    711 
    712     trn1            v2.2s, v27.2s, v25.2s
    713     trn2            v4.2s, v27.2s, v25.2s
    714     trn1            v3.2s, v29.2s, v31.2s
    715     trn2            v5.2s, v29.2s, v31.2s
    716 
    717     trn1            v27.4h, v6.4h, v7.4h
    718     trn2            v29.4h, v6.4h, v7.4h
    719     trn1            v25.4h, v8.4h, v9.4h
    720     trn2            v31.4h, v8.4h, v9.4h
    721 
    722     trn1            v6.2s, v27.2s, v25.2s
    723     trn2            v8.2s, v27.2s, v25.2s
    724     trn1            v7.2s, v29.2s, v31.2s
    725     trn2            v9.2s, v29.2s, v31.2s
    726 
    727     mov             v25.d[0], x19
    728     mov             v25.d[1], x20
    729 
    730     smull           v24.4s, v14.4h, v0.h[1] //// y1 * cos1(part of b0)
    731 
    732     smull           v26.4s, v14.4h, v0.h[3] //// y1 * cos3(part of b1)
    733     smull           v28.4s, v14.4h, v1.h[1] //// y1 * sin3(part of b2)
    734     smull           v30.4s, v14.4h, v1.h[3] //// y1 * sin1(part of b3)
    735 
    736     smlal           v24.4s, v15.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    737     smlsl           v26.4s, v15.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    738     smlsl           v28.4s, v15.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    739     smlsl           v30.4s, v15.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
    740     smull           v20.4s, v10.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
    741     smull           v18.4s, v11.4h, v1.h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1)
    742     smull           v14.4s, v11.4h, v0.h[2] //// y2 * cos2(part of d0)
    743 
    744 
    745     add             x4, x2, x8, lsl #1  // x4 = x2 + pred_strd * 2    => x4 points to 3rd row of pred data
    746 
    747 
    748     add             x5, x8, x8, lsl #1  //
    749 
    750 
    751     add             x0, x3, x7, lsl #1  // x0 points to 3rd row of dest data
    752 
    753 
    754     add             x10, x7, x7, lsl #1 //
    755 
    756     // swapping v3 and v6
    757     mov             v31.d[0], v3.d[0]
    758     mov             v3.d[0], v6.d[0]
    759     mov             v6.d[0], v31.d[0]
    760 
    761     // swapping v5 and v8
    762     mov             v31.d[0], v5.d[0]
    763     mov             v5.d[0], v8.d[0]
    764     mov             v8.d[0], v31.d[0]
    765 
    766 
    767     sub             v22.4s, v20.4s , v14.4s //// a3 = c0 - d0(part of x3,x4)
    768     add             v12.4s, v20.4s , v14.4s ////    a0 = c0 + d0(part of x0,x7)
    769 
    770 
    771     add             v0.4s, v12.4s , v24.4s
    772 
    773 
    774     sub             v24.4s, v12.4s , v24.4s
    775 
    776 
    777     add             v12.4s, v22.4s , v30.4s
    778 
    779 
    780     sub             v14.4s, v22.4s , v30.4s
    781 
    782     sqrshrn         v10.4h, v0.4s, #idct_stg2_shift
    783     sqrshrn         v17.4h, v24.4s, #idct_stg2_shift
    784     sqrshrn         v13.4h, v12.4s, #idct_stg2_shift
    785     sqrshrn         v14.4h, v14.4s, #idct_stg2_shift
    786 
    787     sub             v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5)
    788     add             v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6)
    789 
    790 
    791     add             v0.4s, v22.4s , v28.4s
    792 
    793 
    794     sub             v24.4s, v22.4s , v28.4s
    795 
    796 
    797     add             v28.4s, v18.4s , v26.4s
    798 
    799 
    800     sub             v26.4s, v18.4s , v26.4s
    801     ld1             {v18.8b}, [x2], x8
    802 
    803     sqrshrn         v12.4h, v0.4s, #idct_stg2_shift
    804     ld1             {v20.8b}, [x2], x5
    805 
    806 
    807     sqrshrn         v15.4h, v24.4s, #idct_stg2_shift
    808     ld1             {v19.8b}, [x2], x8
    809 
    810 
    811 
    812 
    813     sqrshrn         v11.4h, v28.4s, #idct_stg2_shift
    814     ld1             {v22.8b}, [x4], x8
    815 
    816 
    817 
    818 
    819     sqrshrn         v16.4h, v26.4s, #idct_stg2_shift
    820     ld1             {v21.8b}, [x2], x5
    821 
    822 
    823     b               pred_buff_addition
    824 end_skip_last4cols: // second-stage (row) IDCT using all eight inputs y0..y7 of every row; falls through to pred_buff_addition
    825     adrp            x14, :got:gai2_impeg2_idct_first_col_q11 // page address of the GOT slot for the coefficient table
    826     ldr             x14, [x14, #:got_lo12:gai2_impeg2_idct_first_col_q11] // x14 = address of the table, read from the GOT
    827     ld1             {v0.4h, v1.4h}, [x14] // eight 16-bit coefficients; used below as v0 = cos4,cos1,cos2,cos3 and v1.h[1..3] = sin3,sin2,sin1
    828 // v25 is used as a scratch register by the transposes below, so its current
    829 // 128-bit value is parked in x19:x20 and put back once they are finished.
    830     umov            x19, v25.d[0]
    831     umov            x20, v25.d[1]
    832 
    833 ///* now that the idct of columns is done, transpose so that the row idct can be done efficiently (step5) */
    834     trn1            v27.4h, v2.4h, v6.4h
    835     trn2            v29.4h, v2.4h, v6.4h ////[x3,x1],[x2,x0] first quadrant transposing
    836     trn1            v25.4h, v3.4h, v7.4h
    837     trn2            v31.4h, v3.4h, v7.4h ////[x3,x1],[x2,x0] first quadrant transposing
    838 
    839     trn1            v2.2s, v27.2s, v25.2s
    840     trn2            v3.2s, v27.2s, v25.2s ////x0,x1,x2,x3 first quadrant transposing continued.....
    841     trn1            v6.2s, v29.2s, v31.2s
    842     trn2            v7.2s, v29.2s, v31.2s ////x0,x1,x2,x3 first quadrant transposing continued.....
    843 
    844     trn1            v27.4h, v4.4h, v8.4h
    845     trn2            v29.4h, v4.4h, v8.4h ////[x3,x1],[x2,x0] second quadrant transposing
    846     trn1            v25.4h, v5.4h, v9.4h
    847     trn2            v31.4h, v5.4h, v9.4h ////[x3,x1],[x2,x0] second quadrant transposing
    848 
    849     trn1            v4.2s, v27.2s, v25.2s
    850     trn2            v5.2s, v27.2s, v25.2s ////x0,x1,x2,x3 second quadrant transposing continued.....
    851     trn1            v8.2s, v29.2s, v31.2s
    852     trn2            v9.2s, v29.2s, v31.2s ////x0,x1,x2,x3 second quadrant transposing continued.....
    853 
    854     trn1            v27.4h, v10.4h, v14.4h
    855     trn2            v29.4h, v10.4h, v14.4h ////[x7,x5],[x6,x4] third quadrant transposing
    856     trn1            v25.4h, v11.4h, v15.4h
    857     trn2            v31.4h, v11.4h, v15.4h ////[x7,x5],[x6,x4] third quadrant transposing
    858 
    859     trn1            v10.2s, v27.2s, v25.2s
    860     trn2            v11.2s, v27.2s, v25.2s ////x4,x5,x6,x7 third quadrant transposing continued.....
    861     trn1            v14.2s, v29.2s, v31.2s
    862     trn2            v15.2s, v29.2s, v31.2s ////x4,x5,x6,x7 third quadrant transposing continued.....
    863 
    864     trn1            v27.4h, v12.4h, v16.4h
    865     trn2            v29.4h, v12.4h, v16.4h ////[x7,x5],[x6,x4] fourth quadrant transposing
    866     trn1            v25.4h, v13.4h, v17.4h
    867     trn2            v31.4h, v13.4h, v17.4h ////[x7,x5],[x6,x4] fourth quadrant transposing
    868 
    869     trn1            v12.2s, v27.2s, v25.2s
    870     trn2            v13.2s, v27.2s, v25.2s ////x4,x5,x6,x7 fourth quadrant transposing continued.....
    871     trn1            v16.2s, v29.2s, v31.2s
    872     trn2            v17.2s, v29.2s, v31.2s ////x4,x5,x6,x7 fourth quadrant transposing continued.....
    873 
    874     mov             v25.d[0], x19 // transposes done: put the parked v25 value back
    875     mov             v25.d[1], x20
    876 
    877     ////step6 operate on first four rows and find their idct
    878     ////register usage - storing and idct of rows (dN is the ARMv7 name; presumably the low 64 bits of vN in this code)
    879 ////    cosine constants     -     d0
    880 ////    sine constants         -     d1
    881 ////    element 0 first four     -     d2        -    y0
    882 ////    element 1 first four     -     d6        -    y1
    883 ////    element 2 first four     -     d3        -    y2
    884 ////    element 3 first four     -     d7        -    y3
    885 ////    element 4 first four     -     d4        -    y4
    886 ////    element 5 first four     -     d8        -    y5
    887 ////    element 6 first four     -     d5        -    y6
    888 ////    element 7 first four     -     d9        -    y7
    889 ////    element 0 second four    -     d10        -    y0
    890 ////    element 1 second four    -     d14     -    y1
    891 ////    element 2 second four    -     d11     -    y2
    892 ////    element 3 second four    -     d15     -    y3
    893 ////    element 4 second four    -     d12     -    y4
    894 ////    element 5 second four    -     d16     -    y5
    895 ////    element 6 second four    -     d13     -    y6
    896 ////    element 7 second four    -     d17     -    y7
    897 
    898     //// map between first kernel code seq and current
    899 ////        d2    ->    d2
    900 ////        d6    ->    d6
    901 ////        d3    ->    d3
    902 ////        d7    ->    d7
    903 ////        d10    ->    d4
    904 ////        d14    ->    d8
    905 ////        d11    ->    d5
    906 ////        d15    ->    d9
    907 ////        q3    ->    q3
    908 ////        q5    ->    q2
    909 ////        q7    ->    q4
    910 // ---- first four rows: odd part b0..b3 accumulates in v24/v26/v28/v30 (32-bit lanes) ----
    911     smull           v24.4s, v6.4h, v0.h[1] //// y1 * cos1(part of b0)
    912     smull           v26.4s, v6.4h, v0.h[3] //// y1 * cos3(part of b1)
    913     smull           v28.4s, v6.4h, v1.h[1] //// y1 * sin3(part of b2)
    914     smull           v30.4s, v6.4h, v1.h[3] //// y1 * sin1(part of b3)
    915 
    916     smlal           v24.4s, v7.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
    917     smlsl           v26.4s, v7.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
    918     smlsl           v28.4s, v7.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
    919     smlsl           v30.4s, v7.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
    920 
    921     smull           v20.4s, v2.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
    922     smull           v22.4s, v4.4h, v0.h[0] //// y4 * cos4(part of c0 and c1)
    923 
    924     smull           v18.4s, v3.4h, v1.h[2] //// y2 * sin2 (q3 is freed by this time)(part of d1)
    925     smull           v6.4s, v3.4h, v0.h[2] //// y2 * cos2(part of d0)
    926 
    927 
    928     smlal           v24.4s, v8.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
    929     smlsl           v26.4s, v8.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
    930     smlal           v28.4s, v8.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
    931     smlal           v30.4s, v8.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
    932 
    933     smlsl           v18.4s, v5.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
    934     smlal           v6.4s, v5.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
    935 
    936     add             v2.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
    937     sub             v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
    938 
    939     smlal           v24.4s, v9.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
    940     smlsl           v26.4s, v9.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
    941     smlal           v28.4s, v9.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
    942     smlsl           v30.4s, v9.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
    943 
    944     sub             v22.4s, v2.4s , v6.4s //// a3 = c0 - d0(part of x3,x4)
    945     add             v4.4s, v2.4s , v6.4s ////    a0 = c0 + d0(part of x0,x7)
    946 
    947 
    948     add             v2.4s, v4.4s , v24.4s //// a0 + b0
    949 
    950     sub             v6.4s, v4.4s , v24.4s //// a0 - b0
    951 
    952     add             v8.4s, v22.4s , v30.4s //// a3 + b3
    953 
    954     sub             v24.4s, v22.4s , v30.4s //// a3 - b3
    955 // rounding, saturating right shift by idct_stg2_shift, narrowing 32 -> 16 bit
    956     sqrshrn         v5.4h, v8.4s, #idct_stg2_shift //// v5 = a3 + b3
    957     sqrshrn         v2.4h, v2.4s, #idct_stg2_shift //// v2 = a0 + b0
    958     sqrshrn         v9.4h, v6.4s, #idct_stg2_shift //// v9 = a0 - b0
    959     sqrshrn         v6.4h, v24.4s, #idct_stg2_shift //// v6 = a3 - b3
    960 
    961     sub             v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5)
    962     add             v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6)
    963 
    964 
    965     add             v30.4s, v22.4s , v28.4s //// a2 + b2
    966 
    967     sub             v24.4s, v22.4s , v28.4s //// a2 - b2
    968 
    969     add             v28.4s, v18.4s , v26.4s //// a1 + b1
    970 
    971     sub             v22.4s, v18.4s , v26.4s //// a1 - b1
    972     sqrshrn         v4.4h, v30.4s, #idct_stg2_shift //// v4 = a2 + b2
    973     sqrshrn         v7.4h, v24.4s, #idct_stg2_shift //// v7 = a2 - b2
    974     sqrshrn         v3.4h, v28.4s, #idct_stg2_shift //// v3 = a1 + b1
    975     sqrshrn         v8.4h, v22.4s, #idct_stg2_shift //// v8 = a1 - b1
    976 
    977 // v25 is scratch again for the two 4x4 transposes below (v2..v5 and v6..v9),
    978 // so it is parked in x19:x20 once more.
    979     umov            x19, v25.d[0]
    980     umov            x20, v25.d[1]
    981 
    982     trn1            v27.4h, v2.4h, v3.4h
    983     trn2            v29.4h, v2.4h, v3.4h
    984     trn1            v25.4h, v4.4h, v5.4h
    985     trn2            v31.4h, v4.4h, v5.4h
    986 
    987     trn1            v2.2s, v27.2s, v25.2s
    988     trn2            v4.2s, v27.2s, v25.2s
    989     trn1            v3.2s, v29.2s, v31.2s
    990     trn2            v5.2s, v29.2s, v31.2s
    991 
    992     trn1            v27.4h, v6.4h, v7.4h
    993     trn2            v29.4h, v6.4h, v7.4h
    994     trn1            v25.4h, v8.4h, v9.4h
    995     trn2            v31.4h, v8.4h, v9.4h
    996 
    997     trn1            v6.2s, v27.2s, v25.2s
    998     trn2            v8.2s, v27.2s, v25.2s
    999     trn1            v7.2s, v29.2s, v31.2s
   1000     trn2            v9.2s, v29.2s, v31.2s
   1001 
   1002     mov             v25.d[0], x19
   1003     mov             v25.d[1], x20
   1004 
   1005 // ---- second four rows (inputs v10..v17): same butterfly as above, interleaved
   1006 // with the pointer/stride setup used later for the prediction loads and stores ----
   1007     smull           v24.4s, v14.4h, v0.h[1] //// y1 * cos1(part of b0)
   1008     smull           v26.4s, v14.4h, v0.h[3] //// y1 * cos3(part of b1)
   1009     smull           v28.4s, v14.4h, v1.h[1] //// y1 * sin3(part of b2)
   1010     smull           v30.4s, v14.4h, v1.h[3] //// y1 * sin1(part of b3)
   1011     smlal           v24.4s, v15.4h, v0.h[3] //// y1 * cos1 + y3 * cos3(part of b0)
   1012     smlsl           v26.4s, v15.4h, v1.h[3] //// y1 * cos3 - y3 * sin1(part of b1)
   1013     smlsl           v28.4s, v15.4h, v0.h[1] //// y1 * sin3 - y3 * cos1(part of b2)
   1014     smlsl           v30.4s, v15.4h, v1.h[1] //// y1 * sin1 - y3 * sin3(part of b3)
   1015     smull           v20.4s, v10.4h, v0.h[0] //// y0 * cos4(part of c0 and c1)
   1016     smull           v22.4s, v12.4h, v0.h[0] //// y4 * cos4(part of c0 and c1)
   1017     smull           v18.4s, v11.4h, v1.h[2] //// y2 * sin2 (q7 is freed by this time)(part of d1)
   1018     smull           v14.4s, v11.4h, v0.h[2] //// y2 * cos2(part of d0)
   1019     smlal           v24.4s, v16.4h, v1.h[1] //// y1 * cos1 + y3 * cos3 + y5 * sin3(part of b0)
   1020 
   1021     add             x4, x2, x8, lsl #1  // x4 = x2 + pred_strd * 2    => x4 points to 3rd row of pred data
   1022     smlsl           v26.4s, v16.4h, v0.h[1] //// y1 * cos3 - y3 * sin1 - y5 * cos1(part of b1)
   1023 
   1024     add             x5, x8, x8, lsl #1  // x5 = 3 * x8 (post-increment that skips three pred rows)
   1025     smlal           v28.4s, v16.4h, v1.h[3] //// y1 * sin3 - y3 * cos1 + y5 * sin1(part of b2)
   1026 
   1027     add             x0, x3, x7, lsl #1  // x0 points to 3rd row of dest data
   1028     smlal           v30.4s, v16.4h, v0.h[3] //// y1 * sin1 - y3 * sin3 + y5 * cos3(part of b3)
   1029 
   1030     add             x10, x7, x7, lsl #1 // x10 = 3 * x7 (post-increment that skips three dest rows)
   1031     smlsl           v18.4s, v13.4h, v0.h[2] //// d1 = y2 * sin2 - y6 * cos2(part of a0 and a1)
   1032 
   1033 
   1034     smlal           v14.4s, v13.4h, v1.h[2] //// d0 = y2 * cos2 + y6 * sin2(part of a0 and a1)
   1035 
   1036     add             v12.4s, v20.4s , v22.4s //// c0 = y0 * cos4 + y4 * cos4(part of a0 and a1)
   1037     sub             v20.4s, v20.4s , v22.4s //// c1 = y0 * cos4 - y4 * cos4(part of a0 and a1)
   1038 
   1039     smlal           v24.4s, v17.4h, v1.h[3] //// b0 = y1 * cos1 + y3 * cos3 + y5 * sin3 + y7 * sin1(part of x0,x7)
   1040 
   1041     // swapping v3 and v6 (low 64 bits only, v31 is the temporary)
   1042     mov             v31.d[0], v3.d[0]
   1043     mov             v3.d[0], v6.d[0]
   1044     mov             v6.d[0], v31.d[0]
   1045 
   1046     smlsl           v26.4s, v17.4h, v1.h[1] //// b1 = y1 * cos3 - y3 * sin1 - y5 * cos1 - y7 * sin3(part of x1,x6)
   1047     // swapping v5 and v8 (low 64 bits only, v31 is the temporary)
   1048     mov             v31.d[0], v5.d[0]
   1049     mov             v5.d[0], v8.d[0]
   1050     mov             v8.d[0], v31.d[0]
   1051 
   1052     smlal           v28.4s, v17.4h, v0.h[3] //// b2 = y1 * sin3 - y3 * cos1 + y5 * sin1 + y7 * cos3(part of x2,x5)
   1053     smlsl           v30.4s, v17.4h, v0.h[1] //// b3 = y1 * sin1 - y3 * sin3 + y5 * cos3 - y7 * cos1(part of x3,x4)
   1054 
   1055     sub             v22.4s, v12.4s , v14.4s //// a3 = c0 - d0(part of x3,x4)
   1056     add             v12.4s, v12.4s , v14.4s ////    a0 = c0 + d0(part of x0,x7)
   1057 
   1058 // all coefficient multiplies are done, so v0 is reused as a 32-bit temporary from here on
   1059     add             v0.4s, v12.4s , v24.4s //// a0 + b0
   1060 
   1061 
   1062     sub             v24.4s, v12.4s , v24.4s //// a0 - b0
   1063 
   1064 
   1065     add             v12.4s, v22.4s , v30.4s //// a3 + b3
   1066 
   1067 
   1068     sub             v14.4s, v22.4s , v30.4s //// a3 - b3
   1069 
   1070     sqrshrn         v10.4h, v0.4s, #idct_stg2_shift //// v10 = a0 + b0
   1071     sqrshrn         v17.4h, v24.4s, #idct_stg2_shift //// v17 = a0 - b0
   1072     sqrshrn         v13.4h, v12.4s, #idct_stg2_shift //// v13 = a3 + b3
   1073     sqrshrn         v14.4h, v14.4s, #idct_stg2_shift //// v14 = a3 - b3
   1074 
   1075     sub             v22.4s, v20.4s , v18.4s //// a2 = c1 - d1(part of x2,x5)
   1076     add             v18.4s, v20.4s , v18.4s //// a1 = c1 + d1(part of x1,x6)
   1077 
   1078 
   1079     add             v0.4s, v22.4s , v28.4s //// a2 + b2
   1080 
   1081 
   1082     sub             v24.4s, v22.4s , v28.4s //// a2 - b2
   1083 
   1084 
   1085     add             v28.4s, v18.4s , v26.4s //// a1 + b1
   1086 
   1087 
   1088     sub             v26.4s, v18.4s , v26.4s //// a1 - b1
   1089     ld1             {v18.8b}, [x2], x8 // v18 = pred row 0; x2 -> row 1
   1090 
   1091     sqrshrn         v12.4h, v0.4s, #idct_stg2_shift //// v12 = a2 + b2
   1092     ld1             {v20.8b}, [x2], x5 // v20 = pred row 1; x2 -> row 4
   1093 
   1094 
   1095     sqrshrn         v15.4h, v24.4s, #idct_stg2_shift //// v15 = a2 - b2
   1096     ld1             {v19.8b}, [x2], x8 // v19 = pred row 4; x2 -> row 5
   1097 
   1098 
   1099 
   1100 
   1101     sqrshrn         v11.4h, v28.4s, #idct_stg2_shift //// v11 = a1 + b1
   1102     ld1             {v22.8b}, [x4], x8 // v22 = pred row 2; x4 -> row 3
   1103 
   1104 
   1105 
   1106 
   1107     sqrshrn         v16.4h, v26.4s, #idct_stg2_shift //// v16 = a1 - b1
   1108     ld1             {v21.8b}, [x2], x5 // v21 = pred row 5
   1109 
   1110 
   1111 
   1112 
   1113 pred_buff_addition: // residual (16 bit, v2..v17) + prediction -> saturate to 8 bit -> store the 8 rows, then return
   1114 
   1115 // v25/v27/v29/v31 are transpose scratch below. The incoming v25 is deliberately not
   1116 // preserved: nothing reads it before the pred row 7 load overwrites the register.
   1117 
   1118     trn1            v27.4h, v10.4h, v11.4h // 4x4 transpose (16-bit lanes) of v10,v11,v12,v13
   1119     trn2            v29.4h, v10.4h, v11.4h
   1120     trn1            v25.4h, v12.4h, v13.4h
   1121     trn2            v31.4h, v12.4h, v13.4h
   1122 
   1123     trn1            v10.2s, v27.2s, v25.2s
   1124     trn2            v12.2s, v27.2s, v25.2s
   1125     trn1            v11.2s, v29.2s, v31.2s
   1126     trn2            v13.2s, v29.2s, v31.2s
   1127 
   1128     trn1            v27.4h, v14.4h, v15.4h // 4x4 transpose (16-bit lanes) of v14,v15,v16,v17
   1129     trn2            v29.4h, v14.4h, v15.4h
   1130     trn1            v25.4h, v16.4h, v17.4h
   1131     trn2            v31.4h, v16.4h, v17.4h
   1132 
   1133     trn1            v14.2s, v27.2s, v25.2s
   1134     trn2            v16.2s, v27.2s, v25.2s
   1135     trn1            v15.2s, v29.2s, v31.2s
   1136     trn2            v17.2s, v29.2s, v31.2s
   1137 
   1138 
   1139 // remaining prediction rows; x4 was left on pred row 3 by the code that reaches this label
   1140 // (x8 = pred stride, x5 = 3 * pred stride)
   1141 
   1142 
   1143     ld1             {v24.8b}, [x4], x5 // v24 = pred row 3; x4 -> row 6
   1144     ld1             {v23.8b}, [x4], x8 // v23 = pred row 6; x4 -> row 7
   1145     ld1             {v25.8b}, [x4], x5 // v25 = pred row 7 (replaces the transpose scratch value)
   1146     mov             v2.d[1], v3.d[0] // join the two 4-lane halves into full 8-lane rows 0..3
   1147     mov             v4.d[1], v5.d[0]
   1148     mov             v6.d[1], v7.d[0]
   1149     mov             v8.d[1], v9.d[0]
   1150     uaddw           v2.8h, v2.8h , v18.8b // row 0 += pred row 0 (pred bytes zero-extended to 16 bit)
   1151     uaddw           v4.8h, v4.8h , v22.8b // row 2 += pred row 2
   1152     uaddw           v6.8h, v6.8h , v20.8b // row 1 += pred row 1
   1153     uaddw           v8.8h, v8.8h , v24.8b // row 3 += pred row 3
   1154 
   1155     // swapping v11 and v14 (low 64 bits only, v31 is the temporary)
   1156     mov             v31.d[0], v11.d[0]
   1157     mov             v11.d[0], v14.d[0]
   1158     mov             v14.d[0], v31.d[0]
   1159 
   1160     // swapping v13 and v16 (low 64 bits only, v31 is the temporary)
   1161     mov             v31.d[0], v13.d[0]
   1162     mov             v13.d[0], v16.d[0]
   1163     mov             v16.d[0], v31.d[0]
   1164 // output row held by each register (ARMv7 q-register name from the original comments in parentheses):
   1165 
   1166 // v2  (q1): row 0
   1167 // v6  (q3): row 1
   1168 // v4  (q2): row 2
   1169 // v8  (q4): row 3
   1170 // v10 (q5): row 4
   1171 // v14 (q7): row 5
   1172 // v12 (q6): row 6
   1173 // v16 (q8): row 7
   1174 
   1175 
   1176 ///// adding the prediction buffer
   1177 
   1178 
   1179 
   1180 
   1181 
   1182 
   1183 
   1184 
   1185 
   1186     // prediction rows 4..7 are already in v19, v21, v23, v25
   1187 
   1188 
   1189 
   1190 
   1191 
   1192     // adding recon with prediction; sqxtun saturates the signed 16-bit sums to unsigned 8 bit
   1193 
   1194 
   1195 
   1196 
   1197     mov             v10.d[1], v11.d[0] // join the two 4-lane halves into full 8-lane rows 4..7
   1198     mov             v12.d[1], v13.d[0]
   1199     mov             v14.d[1], v15.d[0]
   1200     mov             v16.d[1], v17.d[0]
   1201     uaddw           v10.8h, v10.8h , v19.8b // row 4 += pred row 4
   1202     sqxtun          v2.8b, v2.8h
   1203     uaddw           v14.8h, v14.8h , v21.8b // row 5 += pred row 5
   1204     sqxtun          v4.8b, v4.8h
   1205     uaddw           v12.8h, v12.8h , v23.8b // row 6 += pred row 6
   1206     sqxtun          v6.8b, v6.8h
   1207     uaddw           v16.8h, v16.8h , v25.8b // row 7 += pred row 7
   1208     sqxtun          v8.8b, v8.8h
   1209 
   1210 
   1211 // stores: x3 starts on dest row 0, x0 on dest row 2; x7 = dest stride, x10 = 3 * dest stride
   1212 
   1213 
   1214 
   1215 
   1216     st1             {v2.8b}, [x3], x7 // row 0; x3 -> row 1
   1217     sqxtun          v10.8b, v10.8h
   1218     st1             {v6.8b}, [x3], x10 // row 1; x3 -> row 4
   1219     sqxtun          v14.8b, v14.8h
   1220     st1             {v4.8b}, [x0], x7 // row 2; x0 -> row 3
   1221     sqxtun          v12.8b, v12.8h
   1222     st1             {v8.8b}, [x0], x10 // row 3; x0 -> row 6
   1223     sqxtun          v16.8b, v16.8h
   1224 
   1225 
   1226 
   1227 
   1228 
   1229 
   1230 
   1231     st1             {v10.8b}, [x3], x7 // row 4; x3 -> row 5
   1232     st1             {v14.8b}, [x3], x10 // row 5
   1233     st1             {v12.8b}, [x0], x7 // row 6; x0 -> row 7
   1234     st1             {v16.8b}, [x0], x10 // row 7
   1235 
   1236 
   1237 
   1238 
   1239     // epilogue (stands in for the ARMv7 "ldmfd sp!,{...,pc}"): reload x19/x20 from the stack,
   1240     ldp             x19, x20, [sp], #16
   1241     pop_v_regs // macro defined elsewhere; presumably restores the vector registers saved on entry
   1242     ret
   1243 
   1244 
   1245 
   1246 
   1247 
   1248